diff --git a/basis/bootstrap/ui/ui.factor b/basis/bootstrap/ui/ui.factor index 4f7f82a067..271a99c223 100755 --- a/basis/bootstrap/ui/ui.factor +++ b/basis/bootstrap/ui/ui.factor @@ -10,12 +10,4 @@ IN: bootstrap.ui { [ os unix? ] [ "x11" ] } } cond ] unless* "ui.backend." prepend require - - "ui-text-backend" get [ - { - { [ os macosx? ] [ "core-text" ] } - { [ os windows? ] [ "pango" ] } - { [ os unix? ] [ "pango" ] } - } cond - ] unless* "ui.text." prepend require ] when diff --git a/basis/documents/elements/elements-tests.factor b/basis/documents/elements/elements-tests.factor index a3f05d7a71..9b323ae8e9 100644 --- a/basis/documents/elements/elements-tests.factor +++ b/basis/documents/elements/elements-tests.factor @@ -3,68 +3,72 @@ USING: tools.test namespaces documents documents.elements multiline ; IN: document.elements.tests - "doc" set -"123\nabc" "doc" get set-doc-string +SYMBOL: doc + doc set +"123\nabcé" doc get set-doc-string ! char-elt -[ { 0 0 } ] [ { 0 0 } "doc" get char-elt prev-elt ] unit-test -[ { 0 0 } ] [ { 0 1 } "doc" get char-elt prev-elt ] unit-test -[ { 0 3 } ] [ { 1 0 } "doc" get char-elt prev-elt ] unit-test +[ { 0 0 } ] [ { 0 0 } doc get char-elt prev-elt ] unit-test +[ { 0 0 } ] [ { 0 1 } doc get char-elt prev-elt ] unit-test +[ { 0 3 } ] [ { 1 0 } doc get char-elt prev-elt ] unit-test +[ { 1 3 } ] [ { 1 5 } doc get char-elt prev-elt ] unit-test -[ { 1 3 } ] [ { 1 3 } "doc" get char-elt next-elt ] unit-test -[ { 0 2 } ] [ { 0 1 } "doc" get char-elt next-elt ] unit-test -[ { 1 0 } ] [ { 0 3 } "doc" get char-elt next-elt ] unit-test +[ { 1 5 } ] [ { 1 5 } doc get char-elt next-elt ] unit-test +[ { 0 2 } ] [ { 0 1 } doc get char-elt next-elt ] unit-test +[ { 1 0 } ] [ { 0 3 } doc get char-elt next-elt ] unit-test +[ { 1 5 } ] [ { 1 3 } doc get char-elt next-elt ] unit-test ! word-elt - "doc" set -"Hello world\nanother line" "doc" get set-doc-string + doc set +"Hello world\nanother line" doc get set-doc-string -[ { 0 0 } ] [ { 0 0 } "doc" get word-elt prev-elt ] unit-test -[ { 0 0 } ] [ { 0 2 } "doc" get word-elt prev-elt ] unit-test -[ { 0 0 } ] [ { 0 5 } "doc" get word-elt prev-elt ] unit-test -[ { 0 5 } ] [ { 0 6 } "doc" get word-elt prev-elt ] unit-test -[ { 0 6 } ] [ { 0 8 } "doc" get word-elt prev-elt ] unit-test -[ { 0 11 } ] [ { 1 0 } "doc" get word-elt prev-elt ] unit-test +[ { 0 0 } ] [ { 0 0 } doc get word-elt prev-elt ] unit-test +[ { 0 0 } ] [ { 0 2 } doc get word-elt prev-elt ] unit-test +[ { 0 0 } ] [ { 0 5 } doc get word-elt prev-elt ] unit-test +[ { 0 5 } ] [ { 0 6 } doc get word-elt prev-elt ] unit-test +[ { 0 6 } ] [ { 0 8 } doc get word-elt prev-elt ] unit-test +[ { 0 11 } ] [ { 1 0 } doc get word-elt prev-elt ] unit-test + +[ { 0 5 } ] [ { 0 0 } doc get word-elt next-elt ] unit-test +[ { 0 6 } ] [ { 0 5 } doc get word-elt next-elt ] unit-test +[ { 0 11 } ] [ { 0 6 } doc get word-elt next-elt ] unit-test +[ { 1 0 } ] [ { 0 11 } doc get word-elt next-elt ] unit-test -[ { 0 5 } ] [ { 0 0 } "doc" get word-elt next-elt ] unit-test -[ { 0 6 } ] [ { 0 5 } "doc" get word-elt next-elt ] unit-test -[ { 0 11 } ] [ { 0 6 } "doc" get word-elt next-elt ] unit-test -[ { 1 0 } ] [ { 0 11 } "doc" get word-elt next-elt ] unit-test ! one-word-elt -[ { 0 0 } ] [ { 0 0 } "doc" get one-word-elt prev-elt ] unit-test -[ { 0 0 } ] [ { 0 2 } "doc" get one-word-elt prev-elt ] unit-test -[ { 0 0 } ] [ { 0 5 } "doc" get one-word-elt prev-elt ] unit-test -[ { 0 5 } ] [ { 0 2 } "doc" get one-word-elt next-elt ] unit-test -[ { 0 5 } ] [ { 0 5 } "doc" get one-word-elt next-elt ] unit-test +[ { 0 0 } ] [ { 0 0 } doc get one-word-elt prev-elt ] unit-test +[ { 0 0 } ] [ { 0 2 } doc get one-word-elt prev-elt ] unit-test +[ { 0 0 } ] [ { 0 5 } doc get one-word-elt prev-elt ] unit-test +[ { 0 5 } ] [ { 0 2 } doc get one-word-elt next-elt ] unit-test +[ { 0 5 } ] [ { 0 5 } doc get one-word-elt next-elt ] unit-test ! line-elt - "doc" set -"Hello\nworld, how are\nyou?" "doc" get set-doc-string + doc set +"Hello\nworld, how are\nyou?" doc get set-doc-string -[ { 0 0 } ] [ { 0 3 } "doc" get line-elt prev-elt ] unit-test -[ { 0 3 } ] [ { 1 3 } "doc" get line-elt prev-elt ] unit-test -[ { 2 4 } ] [ { 2 1 } "doc" get line-elt next-elt ] unit-test +[ { 0 0 } ] [ { 0 3 } doc get line-elt prev-elt ] unit-test +[ { 0 3 } ] [ { 1 3 } doc get line-elt prev-elt ] unit-test +[ { 2 4 } ] [ { 2 1 } doc get line-elt next-elt ] unit-test ! one-line-elt -[ { 1 0 } ] [ { 1 3 } "doc" get one-line-elt prev-elt ] unit-test -[ { 1 14 } ] [ { 1 3 } "doc" get one-line-elt next-elt ] unit-test +[ { 1 0 } ] [ { 1 3 } doc get one-line-elt prev-elt ] unit-test +[ { 1 14 } ] [ { 1 3 } doc get one-line-elt next-elt ] unit-test ! page-elt - "doc" set + doc set <" First line Second line Third line Fourth line Fifth line -Sixth line"> "doc" get set-doc-string +Sixth line"> doc get set-doc-string -[ { 0 0 } ] [ { 3 3 } "doc" get 4 prev-elt ] unit-test -[ { 1 2 } ] [ { 5 2 } "doc" get 4 prev-elt ] unit-test +[ { 0 0 } ] [ { 3 3 } doc get 4 prev-elt ] unit-test +[ { 1 2 } ] [ { 5 2 } doc get 4 prev-elt ] unit-test -[ { 4 3 } ] [ { 0 3 } "doc" get 4 next-elt ] unit-test -[ { 5 10 } ] [ { 4 2 } "doc" get 4 next-elt ] unit-test +[ { 4 3 } ] [ { 0 3 } doc get 4 next-elt ] unit-test +[ { 5 10 } ] [ { 4 2 } doc get 4 next-elt ] unit-test ! doc-elt -[ { 0 0 } ] [ { 3 4 } "doc" get doc-elt prev-elt ] unit-test -[ { 5 10 } ] [ { 3 4 } "doc" get doc-elt next-elt ] unit-test \ No newline at end of file +[ { 0 0 } ] [ { 3 4 } doc get doc-elt prev-elt ] unit-test +[ { 5 10 } ] [ { 3 4 } doc get doc-elt next-elt ] unit-test diff --git a/basis/documents/elements/elements.factor b/basis/documents/elements/elements.factor index adb498df13..f485f1bec1 100644 --- a/basis/documents/elements/elements.factor +++ b/basis/documents/elements/elements.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2006, 2009 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. USING: arrays combinators documents fry kernel math sequences -unicode.categories accessors ; +accessors unicode.categories unicode.breaks combinators.short-circuit ; IN: documents.elements GENERIC: prev-elt ( loc document elt -- newloc ) @@ -20,27 +20,32 @@ SINGLETON: char-elt M: char-elt prev-elt - drop [ drop -1 +col ] (prev-char) ; + drop [ [ last-grapheme-from ] modify-col ] prev ; M: char-elt next-elt - drop [ drop 1 +col ] (next-char) ; + drop [ [ first-grapheme-from ] modify-col ] next ; SINGLETON: one-char-elt @@ -50,21 +55,16 @@ M: one-char-elt next-elt 2drop ; @@ -73,23 +73,23 @@ SINGLETON: one-word-elt M: one-word-elt prev-elt drop - [ [ 1- ] dip f (prev-word) ] (word-elt) ; + [ [ 1- ] dip f prev-word ] modify-col ; M: one-word-elt next-elt drop - [ f (next-word) ] (word-elt) ; + [ f next-word ] modify-col ; SINGLETON: word-elt M: word-elt prev-elt drop - [ [ [ 1- ] dip ((word-elt)) (prev-word) ] (word-elt) ] - (prev-char) ; + [ [ [ 1- ] dip blank-at? prev-word ] modify-col ] + prev ; M: word-elt next-elt drop - [ [ ((word-elt)) (next-word) ] (word-elt) ] - (next-char) ; + [ [ blank-at? next-word ] modify-col ] + next ; SINGLETON: one-line-elt @@ -118,4 +118,4 @@ SINGLETON: doc-elt M: doc-elt prev-elt 3drop { 0 0 } ; -M: doc-elt next-elt drop nip doc-end ; \ No newline at end of file +M: doc-elt next-elt drop nip doc-end ; diff --git a/basis/peg/peg.factor b/basis/peg/peg.factor index ce34beb725..dda36432e7 100644 --- a/basis/peg/peg.factor +++ b/basis/peg/peg.factor @@ -155,18 +155,21 @@ TUPLE: peg-head rule-id involved-set eval-set ; dup pos>> pos set ans>> ; inline -:: (setup-lr) ( r l s -- ) - s head>> l head>> eq? [ - l head>> s (>>head) - l head>> [ s rule-id>> suffix ] change-involved-set drop - r l s next>> (setup-lr) - ] unless ; +:: (setup-lr) ( l s -- ) + s [ + s left-recursion? [ s throw ] unless + s head>> l head>> eq? [ + l head>> s (>>head) + l head>> [ s rule-id>> suffix ] change-involved-set drop + l s next>> (setup-lr) + ] unless + ] when ; :: setup-lr ( r l -- ) l head>> [ r rule-id V{ } clone V{ } clone peg-head boa l (>>head) ] unless - r l lrstack get (setup-lr) ; + l lrstack get (setup-lr) ; :: lr-answer ( r p m -- ast ) [let* | @@ -216,8 +219,10 @@ TUPLE: peg-head rule-id involved-set eval-set ; lrstack get next>> lrstack set pos get m (>>pos) lr head>> [ - ans lr (>>seed) - r p m lr-answer + m ans>> left-recursion? [ + ans lr (>>seed) + r p m lr-answer + ] [ ans ] if ] [ ans m (>>ans) ans diff --git a/basis/ui/gadgets/worlds/worlds.factor b/basis/ui/gadgets/worlds/worlds.factor index 163dbff514..655c9ba49d 100644 --- a/basis/ui/gadgets/worlds/worlds.factor +++ b/basis/ui/gadgets/worlds/worlds.factor @@ -3,8 +3,7 @@ USING: accessors arrays assocs continuations kernel math models namespaces opengl sequences io combinators combinators.short-circuit fry math.vectors math.rectangles cache ui.gadgets ui.gestures -ui.render ui.text ui.text.private ui.backend ui.gadgets.tracks -ui.commands ; +ui.render ui.backend ui.gadgets.tracks ui.commands ; IN: ui.gadgets.worlds TUPLE: world < track @@ -53,7 +52,6 @@ M: world request-focus-on ( child gadget -- ) swap >>status swap >>title swap 1 track-add - dup init-text-rendering dup request-focus ; : ( gadget title status -- world ) @@ -74,15 +72,20 @@ M: world remove-gadget 2dup layers>> memq? [ layers>> delq ] [ call-next-method ] if ; +SYMBOL: flush-layout-cache-hook + +flush-layout-cache-hook [ [ ] ] initialize + : (draw-world) ( world -- ) dup handle>> [ { [ init-gl ] [ draw-gadget ] - [ finish-text-rendering ] + [ text-handle>> [ purge-cache ] when* ] [ images>> [ purge-cache ] when* ] } cleave - ] with-gl-context ; + ] with-gl-context + flush-layout-cache-hook get call( -- ) ; : draw-world? ( world -- ? ) #! We don't draw deactivated worlds, or those with 0 size. diff --git a/basis/ui/text/core-text/core-text.factor b/basis/ui/text/core-text/core-text.factor index 785a9366cb..3704189e48 100644 --- a/basis/ui/text/core-text/core-text.factor +++ b/basis/ui/text/core-text/core-text.factor @@ -18,12 +18,11 @@ M: core-text-renderer string-dim [ cached-line dim>> ] if-empty ; -M: core-text-renderer finish-text-rendering - text-handle>> purge-cache +M: core-text-renderer flush-layout-cache cached-lines get purge-cache ; : rendered-line ( font string -- texture ) - world get text-handle>> + world get world-text-handle [ cached-line [ image>> ] [ loc>> ] bi ] 2cache ; diff --git a/basis/ui/text/pango/pango.factor b/basis/ui/text/pango/pango.factor index 8b644be469..017a4b2cf2 100755 --- a/basis/ui/text/pango/pango.factor +++ b/basis/ui/text/pango/pango.factor @@ -14,12 +14,11 @@ M: pango-renderer string-dim [ " " string-dim { 0 1 } v* ] [ cached-layout logical-rect>> dim>> [ >integer ] map ] if-empty ; -M: pango-renderer finish-text-rendering - text-handle>> purge-cache +M: pango-renderer flush-layout-cache cached-layouts get purge-cache ; : rendered-layout ( font string -- texture ) - world get text-handle>> + world get world-text-handle [ cached-layout [ image>> ] [ text-position vneg ] bi ] 2cache ; diff --git a/basis/ui/text/text.factor b/basis/ui/text/text.factor index d0766e9ee6..ebf4b9cce0 100644 --- a/basis/ui/text/text.factor +++ b/basis/ui/text/text.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2009 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. USING: kernel arrays sequences math math.order opengl opengl.gl -strings fonts colors accessors ; +strings fonts colors accessors namespaces ui.gadgets.worlds ; IN: ui.text > [ dup init-text-rendering ] unless + text-handle>> ; -M: object finish-text-rendering drop ; +HOOK: flush-layout-cache font-renderer ( -- ) + +[ flush-layout-cache ] flush-layout-cache-hook set-global HOOK: string-dim font-renderer ( font string -- dim ) @@ -68,4 +72,14 @@ M: array draw-text [ draw-string ] [ [ 0.0 ] 2dip string-height 0.0 glTranslated ] 2bi ] with each - ] do-matrix ; \ No newline at end of file + ] do-matrix ; + +USING: vocabs.loader namespaces system combinators ; + +"ui-backend" get [ + { + { [ os macosx? ] [ "core-text" ] } + { [ os windows? ] [ "pango" ] } + { [ os unix? ] [ "pango" ] } + } cond +] unless* "ui.text." prepend require \ No newline at end of file diff --git a/basis/ui/ui.factor b/basis/ui/ui.factor index 8ce8f57cf0..bf17e455f8 100644 --- a/basis/ui/ui.factor +++ b/basis/ui/ui.factor @@ -4,8 +4,7 @@ USING: arrays assocs io kernel math models namespaces make dlists deques sequences threads sequences words continuations init combinators hashtables concurrency.flags sets accessors calendar fry destructors ui.gadgets ui.gadgets.private ui.gadgets.worlds -ui.gadgets.tracks ui.gestures ui.backend ui.render ui.text -ui.text.private ; +ui.gadgets.tracks ui.gestures ui.backend ui.render ; IN: ui > select-gl-context ] - [ text-handle>> dispose ] + [ text-handle>> [ dispose ] when* ] [ images>> [ dispose ] when* ] [ hand-clicked close-global ] [ hand-gadget close-global ] @@ -95,8 +94,7 @@ M: world ungraft* : restore-world ( world -- ) { [ reset-world ] - [ init-text-rendering ] - [ f >>images drop ] + [ f >>text-handle f >>images drop ] [ restore-gadget ] } cleave ; diff --git a/basis/unicode/breaks/breaks-tests.factor b/basis/unicode/breaks/breaks-tests.factor index 493c2db0c2..3a26b01213 100644 --- a/basis/unicode/breaks/breaks-tests.factor +++ b/basis/unicode/breaks/breaks-tests.factor @@ -9,6 +9,9 @@ IN: unicode.breaks.tests [ 3 ] [ "\u001112\u001161\u0011abA\u000300a" dup last-grapheme head last-grapheme ] unit-test +[ 3 ] [ 2 "hello" first-grapheme-from ] unit-test +[ 1 ] [ 2 "hello" last-grapheme-from ] unit-test + : grapheme-break-test ( -- filename ) "vocab:unicode/breaks/GraphemeBreakTest.txt" ; diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor index 12314505d9..1b1d9434f8 100644 --- a/basis/unicode/breaks/breaks.factor +++ b/basis/unicode/breaks/breaks.factor @@ -101,6 +101,16 @@ PRIVATE> [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop nip swap length or 1+ ; +: first-grapheme-from ( start str -- i ) + over tail-slice first-grapheme + ; + +: last-grapheme ( str -- i ) + unclip-last-slice grapheme-class swap + [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ; + +: last-grapheme-from ( end str -- i ) + swap head-slice last-grapheme ; + pieces ( str quot: ( str -- i ) -- graphemes ) @@ -114,10 +124,6 @@ PRIVATE> : string-reverse ( str -- rts ) >graphemes reverse concat ; -: last-grapheme ( str -- i ) - unclip-last-slice grapheme-class swap - [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ; - > define-writer-generic ] [ diff --git a/extra/html/parser/parser.factor b/extra/html/parser/parser.factor index 677737618b..94ef59bdfd 100644 --- a/extra/html/parser/parser.factor +++ b/extra/html/parser/parser.factor @@ -68,10 +68,10 @@ SYMBOL: tagstack [ blank? ] trim ; : read-comment ( state-parser -- ) - "-->" take-until-string make-comment-tag push-tag ; + "-->" take-until-sequence make-comment-tag push-tag ; : read-dtd ( state-parser -- ) - ">" take-until-string make-dtd-tag push-tag ; + ">" take-until-sequence make-dtd-tag push-tag ; : read-bang ( state-parser -- ) next dup { [ get-char CHAR: - = ] [ get-next CHAR: - = ] } 1&& [ @@ -93,7 +93,7 @@ SYMBOL: tagstack : (parse-attributes) ( state-parser -- ) skip-whitespace - dup string-parse-end? [ + dup state-parse-end? [ drop ] [ [ @@ -108,7 +108,7 @@ SYMBOL: tagstack : (parse-tag) ( string -- string' hashtable ) [ [ read-token >lower ] [ parse-attributes ] bi - ] string-parse ; + ] state-parse ; : read-< ( state-parser -- string/f ) next dup get-char [ @@ -126,7 +126,7 @@ SYMBOL: tagstack ] [ drop ] if ; : tag-parse ( quot -- vector ) - V{ } clone tagstack [ string-parse ] with-variable ; inline + V{ } clone tagstack [ state-parse ] with-variable ; inline : parse-html ( string -- vector ) [ (parse-html) tagstack get ] tag-parse ; diff --git a/extra/html/parser/state/state-tests.factor b/extra/html/parser/state/state-tests.factor index f676649aa8..f9862e1e69 100644 --- a/extra/html/parser/state/state-tests.factor +++ b/extra/html/parser/state/state-tests.factor @@ -2,29 +2,35 @@ USING: tools.test html.parser.state ascii kernel accessors ; IN: html.parser.state.tests [ "hello" ] -[ "hello" [ take-rest ] string-parse ] unit-test +[ "hello" [ take-rest ] state-parse ] unit-test [ "hi" " how are you?" ] [ "hi how are you?" - [ [ [ blank? ] take-until ] [ take-rest ] bi ] string-parse + [ [ [ blank? ] take-until ] [ take-rest ] bi ] state-parse ] unit-test [ "foo" ";bar" ] [ "foo;bar" [ - [ CHAR: ; take-until-char ] [ take-rest ] bi - ] string-parse + [ CHAR: ; take-until-object ] [ take-rest ] bi + ] state-parse ] unit-test [ "foo " " bar" ] [ "foo and bar" [ - [ "and" take-until-string ] [ take-rest ] bi - ] string-parse + [ "and" take-until-sequence ] [ take-rest ] bi + ] state-parse ] unit-test [ 6 ] [ - " foo " [ skip-whitespace i>> ] string-parse + " foo " [ skip-whitespace n>> ] state-parse ] unit-test + +[ { 1 2 } ] +[ { 1 2 3 } [ 3 = ] take-until ] unit-test + +[ { 1 2 } ] +[ { 1 2 3 4 } { 3 4 } take-until-sequence ] unit-test diff --git a/extra/html/parser/state/state.factor b/extra/html/parser/state/state.factor index c69fd76af5..2369b1d750 100644 --- a/extra/html/parser/state/state.factor +++ b/extra/html/parser/state/state.factor @@ -2,31 +2,32 @@ ! See http://factorcode.org/license.txt for BSD license. USING: namespaces math kernel sequences accessors fry circular unicode.case unicode.categories locals ; + IN: html.parser.state -TUPLE: state-parser string i ; +TUPLE: state-parser sequence n ; -: ( string -- state-parser ) +: ( sequence -- state-parser ) state-parser new - swap >>string - 0 >>i ; + swap >>sequence + 0 >>n ; -: (get-char) ( i state -- char/f ) - string>> ?nth ; inline +: (get-char) ( n state -- char/f ) + sequence>> ?nth ; inline : get-char ( state -- char/f ) - [ i>> ] keep (get-char) ; inline + [ n>> ] keep (get-char) ; inline : get-next ( state -- char/f ) - [ i>> 1+ ] keep (get-char) ; inline + [ n>> 1 + ] keep (get-char) ; inline : next ( state -- state ) - [ 1+ ] change-i ; inline + [ 1 + ] change-n ; inline : get+increment ( state -- char/f ) [ get-char ] [ next drop ] bi ; inline -: string-parse ( string quot -- ) +: state-parse ( sequence quot -- ) [ ] dip call ; inline :: skip-until ( state quot: ( obj -- ? ) -- ) @@ -34,17 +35,23 @@ TUPLE: state-parser string i ; quot call [ state next quot skip-until ] unless ] when* ; inline recursive -: take-until ( state quot: ( obj -- ? ) -- string ) - [ drop i>> ] - [ skip-until ] - [ drop [ i>> ] [ string>> ] bi ] 2tri subseq ; inline +: state-parse-end? ( state -- ? ) get-next not ; -:: take-until-string ( state-parser string -- string' ) - string length :> growing +: take-until ( state quot: ( obj -- ? ) -- sequence/f ) + over state-parse-end? [ + 2drop f + ] [ + [ drop n>> ] + [ skip-until ] + [ drop [ n>> ] [ sequence>> ] bi ] 2tri subseq + ] if ; inline + +:: take-until-sequence ( state-parser sequence -- sequence' ) + sequence length :> growing state-parser [ growing push-growing-circular - string growing sequence= + sequence growing sequence= ] take-until :> found found dup length growing length 1- - head @@ -53,10 +60,8 @@ TUPLE: state-parser string i ; : skip-whitespace ( state -- state ) [ [ blank? not ] take-until drop ] keep ; -: take-rest ( state -- string ) +: take-rest ( state -- sequence ) [ drop f ] take-until ; inline -: take-until-char ( state ch -- string ) +: take-until-object ( state obj -- sequence ) '[ _ = ] take-until ; - -: string-parse-end? ( state -- ? ) get-next not ; diff --git a/extra/spider/spider-docs.factor b/extra/spider/spider-docs.factor index cdbd5e7e09..4ed00d39f6 100644 --- a/extra/spider/spider-docs.factor +++ b/extra/spider/spider-docs.factor @@ -16,11 +16,6 @@ HELP: run-spider { "spider" spider } } { $description "Runs a spider until completion. See the " { $subsection "spider-tutorial" } " for a complete description of the tuple slots that affect how thet spider works." } ; -HELP: slurp-heap-while -{ $values - { "heap" "a heap" } { "quot1" quotation } { "quot2" quotation } } -{ $description "Removes values from a heap that match the predicate quotation " { $snippet "quot1" } " and processes them with " { $snippet "quot2" } " until the predicate quotation no longer matches." } ; - ARTICLE: "spider-tutorial" "Spider tutorial" "To create a new spider, call the " { $link } " word with a link to the site you wish to spider." { $code <" "http://concatenative.org" "> } diff --git a/extra/spider/spider.factor b/extra/spider/spider.factor index bd5b2668be..d08276a9bb 100644 --- a/extra/spider/spider.factor +++ b/extra/spider/spider.factor @@ -3,8 +3,8 @@ USING: accessors fry html.parser html.parser.analyzer http.client kernel tools.time sets assocs sequences concurrency.combinators io threads namespaces math multiline -heaps math.parser inspector urls assoc-heaps logging -combinators.short-circuit continuations calendar prettyprint ; +math.parser inspector urls logging combinators.short-circuit +continuations calendar prettyprint dlists deques locals ; IN: spider TUPLE: spider base count max-count sleep max-depth initial-links @@ -13,12 +13,33 @@ filters spidered todo nonmatching quiet ; TUPLE: spider-result url depth headers fetch-time parsed-html links processing-time timestamp ; +TUPLE: todo-url url depth ; + +: ( url depth -- todo-url ) + todo-url new + swap >>depth + swap >>url ; + +TUPLE: unique-deque assoc deque ; + +: ( -- unique-deque ) + H{ } clone unique-deque boa ; + +: push-url ( url depth unique-deque -- ) + [ ] dip + [ [ [ t ] dip url>> ] [ assoc>> ] bi* set-at ] + [ deque>> push-back ] 2bi ; + +: pop-url ( unique-deque -- todo-url ) deque>> pop-front ; + +: peek-url ( unique-deque -- todo-url ) deque>> peek-front ; + : ( base -- spider ) >url spider new over >>base - swap 0 [ heap-push ] keep >>todo - >>nonmatching + swap 0 [ push-url ] keep >>todo + >>nonmatching 0 >>max-depth 0 >>count 1/0. >>max-count @@ -27,10 +48,10 @@ links processing-time timestamp ; > [ '[ _ 1&& ] filter ] when* ; + filters>> [ '[ [ _ 1&& ] filter ] call( seq -- seq' ) ] when* ; -: push-links ( links level assoc-heap -- ) - '[ _ _ heap-push ] each ; +: push-links ( links level unique-deque -- ) + '[ _ _ push-url ] each ; : add-todo ( links level spider -- ) todo>> push-links ; @@ -38,64 +59,72 @@ links processing-time timestamp ; : add-nonmatching ( links level spider -- ) nonmatching>> push-links ; -: filter-base ( spider spider-result -- base-links nonmatching-links ) +: filter-base-links ( spider spider-result -- base-links nonmatching-links ) [ base>> host>> ] [ links>> prune ] bi* [ host>> = ] with partition ; : add-spidered ( spider spider-result -- ) [ [ 1+ ] change-count ] dip 2dup [ spidered>> ] [ dup url>> ] bi* rot set-at - [ filter-base ] 2keep + [ filter-base-links ] 2keep depth>> 1+ swap [ add-nonmatching ] [ tuck [ apply-filters ] 2dip add-todo ] 2bi ; -: normalize-hrefs ( links -- links' ) - [ >url ] map - spider get base>> swap [ derive-url ] with map ; +: normalize-hrefs ( links spider -- links' ) + [ [ >url ] map ] dip + base>> swap [ derive-url ] with map ; : print-spidering ( url depth -- ) "depth: " write number>string write ", spidering: " write . yield ; -: (spider-page) ( url depth -- spider-result ) - f pick spider get spidered>> set-at - over '[ _ http-get ] benchmark swap - [ parse-html dup find-hrefs normalize-hrefs ] benchmark +:: new-spidered-result ( spider url depth -- spider-result ) + f url spider spidered>> set-at + [ url http-get ] benchmark :> fetch-time :> html :> headers + [ + html parse-html [ ] [ find-hrefs spider normalize-hrefs ] bi + ] benchmark :> processing-time :> links :> parsed-html + url depth headers fetch-time parsed-html links processing-time now spider-result boa ; -: spider-page ( url depth -- ) - spider get quiet>> [ 2dup print-spidering ] unless - (spider-page) - spider get [ quiet>> [ dup describe ] unless ] - [ swap add-spidered ] bi ; +:: spider-page ( spider url depth -- ) + spider quiet>> [ url depth print-spidering ] unless + spider url depth new-spidered-result :> spidered-result + spider quiet>> [ spidered-result describe ] unless + spider spidered-result add-spidered ; \ spider-page ERROR add-error-logging -: spider-sleep ( -- ) - spider get sleep>> [ sleep ] when* ; +: spider-sleep ( spider -- ) + sleep>> [ sleep ] when* ; -: queue-initial-links ( spider -- spider ) - [ initial-links>> normalize-hrefs 0 ] keep - [ add-todo ] keep ; +:: queue-initial-links ( spider -- spider ) + spider initial-links>> spider normalize-hrefs 0 spider add-todo spider ; -: slurp-heap-while ( heap quot1 quot2: ( value key -- ) -- ) - pick heap-empty? [ 3drop ] [ - [ [ heap-pop dup ] 2dip slip [ t ] compose [ 2drop f ] if ] - [ roll [ slurp-heap-while ] [ 3drop ] if ] 3bi - ] if ; inline recursive +: spider-page? ( spider -- ? ) + { + [ todo>> deque>> deque-empty? not ] + [ [ todo>> peek-url depth>> ] [ max-depth>> ] bi < ] + [ [ count>> ] [ max-count>> ] bi < ] + } 1&& ; + +: setup-next-url ( spider -- spider url depth ) + dup todo>> pop-url [ url>> ] [ depth>> ] bi ; + +: spider-next-page ( spider -- ) + setup-next-url spider-page ; PRIVATE> +: run-spider-loop ( spider -- ) + dup spider-page? [ + [ spider-next-page ] [ run-spider-loop ] bi + ] [ + drop + ] if ; + : run-spider ( spider -- spider ) "spider" [ - dup spider [ - queue-initial-links - [ todo>> ] [ max-depth>> ] bi - '[ - _ <= spider get - [ count>> ] [ max-count>> ] bi < and - ] [ spider-page spider-sleep ] slurp-heap-while - spider get - ] with-variable + queue-initial-links [ run-spider-loop ] keep ] with-logging ;