From ce04d1dfa96f0cfaf37155b8b8095c8122a172c0 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Tue, 31 Mar 2009 18:21:15 -0500 Subject: [PATCH 1/5] redo spider without dynamic variables --- extra/spider/spider-docs.factor | 5 -- extra/spider/spider.factor | 114 ++++++++++++++++++++------------ 2 files changed, 73 insertions(+), 46 deletions(-) diff --git a/extra/spider/spider-docs.factor b/extra/spider/spider-docs.factor index cdbd5e7e09..4ed00d39f6 100644 --- a/extra/spider/spider-docs.factor +++ b/extra/spider/spider-docs.factor @@ -16,11 +16,6 @@ HELP: run-spider { "spider" spider } } { $description "Runs a spider until completion. See the " { $subsection "spider-tutorial" } " for a complete description of the tuple slots that affect how thet spider works." } ; -HELP: slurp-heap-while -{ $values - { "heap" "a heap" } { "quot1" quotation } { "quot2" quotation } } -{ $description "Removes values from a heap that match the predicate quotation " { $snippet "quot1" } " and processes them with " { $snippet "quot2" } " until the predicate quotation no longer matches." } ; - ARTICLE: "spider-tutorial" "Spider tutorial" "To create a new spider, call the " { $link } " word with a link to the site you wish to spider." 
{ $code <" "http://concatenative.org" "> } diff --git a/extra/spider/spider.factor b/extra/spider/spider.factor index bd5b2668be..42f2485ebe 100644 --- a/extra/spider/spider.factor +++ b/extra/spider/spider.factor @@ -3,22 +3,44 @@ USING: accessors fry html.parser html.parser.analyzer http.client kernel tools.time sets assocs sequences concurrency.combinators io threads namespaces math multiline -heaps math.parser inspector urls assoc-heaps logging -combinators.short-circuit continuations calendar prettyprint ; +math.parser inspector urls logging combinators.short-circuit +continuations calendar prettyprint dlists deques locals ; IN: spider TUPLE: spider base count max-count sleep max-depth initial-links -filters spidered todo nonmatching quiet ; +filters spidered todo nonmatching filtered quiet ; TUPLE: spider-result url depth headers fetch-time parsed-html links processing-time timestamp ; +TUPLE: todo-url url depth ; + +: ( url depth -- todo-url ) + todo-url new + swap >>depth + swap >>url ; + +TUPLE: unique-deque assoc deque ; + +: ( -- unique-deque ) + H{ } clone unique-deque boa ; + +: store-url ( url depth unique-deque -- ) + [ ] dip + [ [ [ t ] dip url>> ] [ assoc>> ] bi* set-at ] + [ deque>> push-back ] 2bi ; + +: pop-url ( unique-deque -- todo-url ) deque>> pop-front ; + +: peek-url ( unique-deque -- todo-url ) deque>> peek-front ; + : ( base -- spider ) >url spider new over >>base - swap 0 [ heap-push ] keep >>todo - >>nonmatching + swap 0 [ store-url ] keep >>todo + >>nonmatching + >>filtered 0 >>max-depth 0 >>count 1/0. 
>>max-count @@ -27,10 +49,10 @@ links processing-time timestamp ; > [ '[ _ 1&& ] filter ] when* ; + filters>> [ '[ [ _ 1&& ] filter ] call( seq -- seq' ) ] when* ; -: push-links ( links level assoc-heap -- ) - '[ _ _ heap-push ] each ; +: push-links ( links level unique-deque -- ) + '[ _ _ store-url ] each ; : add-todo ( links level spider -- ) todo>> push-links ; @@ -38,64 +60,74 @@ links processing-time timestamp ; : add-nonmatching ( links level spider -- ) nonmatching>> push-links ; -: filter-base ( spider spider-result -- base-links nonmatching-links ) +: add-filtered ( links level spider -- ) + filtered>> push-links ; + +: filter-base-links ( spider spider-result -- base-links nonmatching-links ) [ base>> host>> ] [ links>> prune ] bi* [ host>> = ] with partition ; : add-spidered ( spider spider-result -- ) [ [ 1+ ] change-count ] dip 2dup [ spidered>> ] [ dup url>> ] bi* rot set-at - [ filter-base ] 2keep + [ filter-base-links ] 2keep depth>> 1+ swap [ add-nonmatching ] [ tuck [ apply-filters ] 2dip add-todo ] 2bi ; -: normalize-hrefs ( links -- links' ) - [ >url ] map - spider get base>> swap [ derive-url ] with map ; +: normalize-hrefs ( links spider -- links' ) + [ [ >url ] map ] dip + base>> swap [ derive-url ] with map ; : print-spidering ( url depth -- ) "depth: " write number>string write ", spidering: " write . 
yield ; -: (spider-page) ( url depth -- spider-result ) - f pick spider get spidered>> set-at - over '[ _ http-get ] benchmark swap - [ parse-html dup find-hrefs normalize-hrefs ] benchmark +:: new-spidered-result ( spider url depth -- spider-result ) + f url spider spidered>> set-at + [ url http-get ] benchmark :> fetch-time :> html :> headers + [ + html parse-html [ ] [ find-hrefs spider normalize-hrefs ] bi + ] benchmark :> processing-time :> links :> parsed-html + url depth headers fetch-time parsed-html links processing-time now spider-result boa ; -: spider-page ( url depth -- ) - spider get quiet>> [ 2dup print-spidering ] unless - (spider-page) - spider get [ quiet>> [ dup describe ] unless ] - [ swap add-spidered ] bi ; +:: spider-page ( spider url depth -- ) + spider quiet>> [ url depth print-spidering ] unless + spider url depth new-spidered-result :> spidered-result + spider quiet>> [ spidered-result describe ] unless + spider spidered-result add-spidered ; \ spider-page ERROR add-error-logging -: spider-sleep ( -- ) - spider get sleep>> [ sleep ] when* ; +: spider-sleep ( spider -- ) + sleep>> [ sleep ] when* ; -: queue-initial-links ( spider -- spider ) - [ initial-links>> normalize-hrefs 0 ] keep - [ add-todo ] keep ; +:: queue-initial-links ( spider -- spider ) + spider initial-links>> spider normalize-hrefs 0 spider add-todo spider ; -: slurp-heap-while ( heap quot1 quot2: ( value key -- ) -- ) - pick heap-empty? [ 3drop ] [ - [ [ heap-pop dup ] 2dip slip [ t ] compose [ 2drop f ] if ] - [ roll [ slurp-heap-while ] [ 3drop ] if ] 3bi - ] if ; inline recursive +: spider-page? ( spider -- ? ) + { + [ todo>> deque>> deque-empty? not ] + [ [ todo>> peek-url depth>> ] [ max-depth>> ] bi < ] + } 1&& ; + +: setup-next-url ( spider -- spider url depth ) + dup todo>> pop-url [ url>> ] [ depth>> ] bi ; + +: spider-next-page ( spider -- ) + setup-next-url spider-page ; PRIVATE> +: run-spider-loop ( spider -- ) + dup spider-page? 
[ + [ spider-next-page ] [ spider-sleep ] [ run-spider-loop ] tri + ] [ + drop + ] if ; + : run-spider ( spider -- spider ) "spider" [ - dup spider [ - queue-initial-links - [ todo>> ] [ max-depth>> ] bi - '[ - _ <= spider get - [ count>> ] [ max-count>> ] bi < and - ] [ spider-page spider-sleep ] slurp-heap-while - spider get - ] with-variable + queue-initial-links [ run-spider-loop ] keep ] with-logging ; From e22823f2c44ba8769664178b66c2ea9f69d73705 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Tue, 31 Mar 2009 18:23:02 -0500 Subject: [PATCH 2/5] rename word --- extra/spider/spider.factor | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/extra/spider/spider.factor b/extra/spider/spider.factor index 42f2485ebe..0287d50692 100644 --- a/extra/spider/spider.factor +++ b/extra/spider/spider.factor @@ -25,7 +25,7 @@ TUPLE: unique-deque assoc deque ; : ( -- unique-deque ) H{ } clone unique-deque boa ; -: store-url ( url depth unique-deque -- ) +: push-url ( url depth unique-deque -- ) [ ] dip [ [ [ t ] dip url>> ] [ assoc>> ] bi* set-at ] [ deque>> push-back ] 2bi ; @@ -38,7 +38,7 @@ TUPLE: unique-deque assoc deque ; >url spider new over >>base - swap 0 [ store-url ] keep >>todo + swap 0 [ push-url ] keep >>todo >>nonmatching >>filtered 0 >>max-depth @@ -52,7 +52,7 @@ TUPLE: unique-deque assoc deque ; filters>> [ '[ [ _ 1&& ] filter ] call( seq -- seq' ) ] when* ; : push-links ( links level unique-deque -- ) - '[ _ _ store-url ] each ; + '[ _ _ push-url ] each ; : add-todo ( links level spider -- ) todo>> push-links ; From 8e26b19cc0aa008af012af16f2f1055a10faa251 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Tue, 31 Mar 2009 18:49:41 -0500 Subject: [PATCH 3/5] state-parser works with sequences, not strings fix bug with take-until --- extra/html/parser/parser.factor | 10 ++--- extra/html/parser/state/state-tests.factor | 20 +++++---- extra/html/parser/state/state.factor | 47 ++++++++++++---------- 3 files changed, 44 insertions(+), 33 deletions(-) diff --git 
a/extra/html/parser/parser.factor b/extra/html/parser/parser.factor index 677737618b..94ef59bdfd 100644 --- a/extra/html/parser/parser.factor +++ b/extra/html/parser/parser.factor @@ -68,10 +68,10 @@ SYMBOL: tagstack [ blank? ] trim ; : read-comment ( state-parser -- ) - "-->" take-until-string make-comment-tag push-tag ; + "-->" take-until-sequence make-comment-tag push-tag ; : read-dtd ( state-parser -- ) - ">" take-until-string make-dtd-tag push-tag ; + ">" take-until-sequence make-dtd-tag push-tag ; : read-bang ( state-parser -- ) next dup { [ get-char CHAR: - = ] [ get-next CHAR: - = ] } 1&& [ @@ -93,7 +93,7 @@ SYMBOL: tagstack : (parse-attributes) ( state-parser -- ) skip-whitespace - dup string-parse-end? [ + dup state-parse-end? [ drop ] [ [ @@ -108,7 +108,7 @@ SYMBOL: tagstack : (parse-tag) ( string -- string' hashtable ) [ [ read-token >lower ] [ parse-attributes ] bi - ] string-parse ; + ] state-parse ; : read-< ( state-parser -- string/f ) next dup get-char [ @@ -126,7 +126,7 @@ SYMBOL: tagstack ] [ drop ] if ; : tag-parse ( quot -- vector ) - V{ } clone tagstack [ string-parse ] with-variable ; inline + V{ } clone tagstack [ state-parse ] with-variable ; inline : parse-html ( string -- vector ) [ (parse-html) tagstack get ] tag-parse ; diff --git a/extra/html/parser/state/state-tests.factor b/extra/html/parser/state/state-tests.factor index f676649aa8..f9862e1e69 100644 --- a/extra/html/parser/state/state-tests.factor +++ b/extra/html/parser/state/state-tests.factor @@ -2,29 +2,35 @@ USING: tools.test html.parser.state ascii kernel accessors ; IN: html.parser.state.tests [ "hello" ] -[ "hello" [ take-rest ] string-parse ] unit-test +[ "hello" [ take-rest ] state-parse ] unit-test [ "hi" " how are you?" ] [ "hi how are you?" - [ [ [ blank? ] take-until ] [ take-rest ] bi ] string-parse + [ [ [ blank? 
] take-until ] [ take-rest ] bi ] state-parse ] unit-test [ "foo" ";bar" ] [ "foo;bar" [ - [ CHAR: ; take-until-char ] [ take-rest ] bi - ] string-parse + [ CHAR: ; take-until-object ] [ take-rest ] bi + ] state-parse ] unit-test [ "foo " " bar" ] [ "foo and bar" [ - [ "and" take-until-string ] [ take-rest ] bi - ] string-parse + [ "and" take-until-sequence ] [ take-rest ] bi + ] state-parse ] unit-test [ 6 ] [ - " foo " [ skip-whitespace i>> ] string-parse + " foo " [ skip-whitespace n>> ] state-parse ] unit-test + +[ { 1 2 } ] +[ { 1 2 3 } [ 3 = ] take-until ] unit-test + +[ { 1 2 } ] +[ { 1 2 3 4 } { 3 4 } take-until-sequence ] unit-test diff --git a/extra/html/parser/state/state.factor b/extra/html/parser/state/state.factor index c69fd76af5..2369b1d750 100644 --- a/extra/html/parser/state/state.factor +++ b/extra/html/parser/state/state.factor @@ -2,31 +2,32 @@ ! See http://factorcode.org/license.txt for BSD license. USING: namespaces math kernel sequences accessors fry circular unicode.case unicode.categories locals ; + IN: html.parser.state -TUPLE: state-parser string i ; +TUPLE: state-parser sequence n ; -: ( string -- state-parser ) +: ( sequence -- state-parser ) state-parser new - swap >>string - 0 >>i ; + swap >>sequence + 0 >>n ; -: (get-char) ( i state -- char/f ) - string>> ?nth ; inline +: (get-char) ( n state -- char/f ) + sequence>> ?nth ; inline : get-char ( state -- char/f ) - [ i>> ] keep (get-char) ; inline + [ n>> ] keep (get-char) ; inline : get-next ( state -- char/f ) - [ i>> 1+ ] keep (get-char) ; inline + [ n>> 1 + ] keep (get-char) ; inline : next ( state -- state ) - [ 1+ ] change-i ; inline + [ 1 + ] change-n ; inline : get+increment ( state -- char/f ) [ get-char ] [ next drop ] bi ; inline -: string-parse ( string quot -- ) +: state-parse ( sequence quot -- ) [ ] dip call ; inline :: skip-until ( state quot: ( obj -- ? 
) -- ) @@ -34,17 +35,23 @@ TUPLE: state-parser string i ; quot call [ state next quot skip-until ] unless ] when* ; inline recursive -: take-until ( state quot: ( obj -- ? ) -- string ) - [ drop i>> ] - [ skip-until ] - [ drop [ i>> ] [ string>> ] bi ] 2tri subseq ; inline +: state-parse-end? ( state -- ? ) get-next not ; -:: take-until-string ( state-parser string -- string' ) - string length :> growing +: take-until ( state quot: ( obj -- ? ) -- sequence/f ) + over state-parse-end? [ + 2drop f + ] [ + [ drop n>> ] + [ skip-until ] + [ drop [ n>> ] [ sequence>> ] bi ] 2tri subseq + ] if ; inline + +:: take-until-sequence ( state-parser sequence -- sequence' ) + sequence length :> growing state-parser [ growing push-growing-circular - string growing sequence= + sequence growing sequence= ] take-until :> found found dup length growing length 1- - head @@ -53,10 +60,8 @@ TUPLE: state-parser string i ; : skip-whitespace ( state -- state ) [ [ blank? not ] take-until drop ] keep ; -: take-rest ( state -- string ) +: take-rest ( state -- sequence ) [ drop f ] take-until ; inline -: take-until-char ( state ch -- string ) +: take-until-object ( state obj -- sequence ) '[ _ = ] take-until ; - -: string-parse-end? ( state -- ? 
) get-next not ; From 19d8a6a552d30964f8d5a684fc9f1b99f96641bb Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Tue, 31 Mar 2009 23:04:59 -0500 Subject: [PATCH 4/5] remove some dead code, make spider use count and max-count again --- extra/spider/spider.factor | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/extra/spider/spider.factor b/extra/spider/spider.factor index 0287d50692..d08276a9bb 100644 --- a/extra/spider/spider.factor +++ b/extra/spider/spider.factor @@ -8,7 +8,7 @@ continuations calendar prettyprint dlists deques locals ; IN: spider TUPLE: spider base count max-count sleep max-depth initial-links -filters spidered todo nonmatching filtered quiet ; +filters spidered todo nonmatching quiet ; TUPLE: spider-result url depth headers fetch-time parsed-html links processing-time timestamp ; @@ -40,7 +40,6 @@ TUPLE: unique-deque assoc deque ; over >>base swap 0 [ push-url ] keep >>todo >>nonmatching - >>filtered 0 >>max-depth 0 >>count 1/0. >>max-count @@ -60,9 +59,6 @@ TUPLE: unique-deque assoc deque ; : add-nonmatching ( links level spider -- ) nonmatching>> push-links ; -: add-filtered ( links level spider -- ) - filtered>> push-links ; - : filter-base-links ( spider spider-result -- base-links nonmatching-links ) [ base>> host>> ] [ links>> prune ] bi* [ host>> = ] with partition ; @@ -110,6 +106,7 @@ TUPLE: unique-deque assoc deque ; { [ todo>> deque>> deque-empty? 
not ] [ [ todo>> peek-url depth>> ] [ max-depth>> ] bi < ] + [ [ count>> ] [ max-count>> ] bi < ] } 1&& ; : setup-next-url ( spider -- spider url depth ) From 9e9116f0eacd11f8daa5c4475472310f6b641615 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Tue, 31 Mar 2009 23:18:52 -0500 Subject: [PATCH 5/5] Better error message for syntax error in : foo ( : bar --- core/effects/parser/parser.factor | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/effects/parser/parser.factor b/core/effects/parser/parser.factor index b9cb0ddcc9..c8ed6da2aa 100644 --- a/core/effects/parser/parser.factor +++ b/core/effects/parser/parser.factor @@ -15,6 +15,7 @@ ERROR: bad-effect ; scan { { "(" [ ")" parse-effect ] } { f [ ")" unexpected-eof ] } + [ bad-effect ] } case 2array ] when ] if @@ -31,4 +32,4 @@ ERROR: bad-effect ; "(" expect ")" parse-effect ; : parse-call( ( accum word -- accum ) - [ ")" parse-effect ] dip 2array over push-all ; \ No newline at end of file + [ ")" parse-effect ] dip 2array over push-all ;