redo spider without dynamic variables

db4
Doug Coleman 2009-03-31 18:21:15 -05:00
parent 70d04c04e0
commit ce04d1dfa9
2 changed files with 73 additions and 46 deletions

View File

@@ -16,11 +16,6 @@ HELP: run-spider
     { "spider" spider } }
 { $description "Runs a spider until completion. See the " { $subsection "spider-tutorial" } " for a complete description of the tuple slots that affect how thet spider works." } ;
 
-HELP: slurp-heap-while
-{ $values
-    { "heap" "a heap" } { "quot1" quotation } { "quot2" quotation } }
-{ $description "Removes values from a heap that match the predicate quotation " { $snippet "quot1" } " and processes them with " { $snippet "quot2" } " until the predicate quotation no longer matches." } ;
-
 ARTICLE: "spider-tutorial" "Spider tutorial"
 "To create a new spider, call the " { $link <spider> } " word with a link to the site you wish to spider."
 { $code <" "http://concatenative.org" <spider> "> }

View File

@@ -3,22 +3,44 @@
 USING: accessors fry html.parser html.parser.analyzer
 http.client kernel tools.time sets assocs sequences
 concurrency.combinators io threads namespaces math multiline
-heaps math.parser inspector urls assoc-heaps logging
-combinators.short-circuit continuations calendar prettyprint ;
+math.parser inspector urls logging combinators.short-circuit
+continuations calendar prettyprint dlists deques locals ;
 IN: spider
 
 TUPLE: spider base count max-count sleep max-depth initial-links
-filters spidered todo nonmatching quiet ;
+filters spidered todo nonmatching filtered quiet ;
 
 TUPLE: spider-result url depth headers fetch-time parsed-html
 links processing-time timestamp ;
 
+TUPLE: todo-url url depth ;
+
+: <todo-url> ( url depth -- todo-url )
+    todo-url new
+        swap >>depth
+        swap >>url ;
+
+TUPLE: unique-deque assoc deque ;
+
+: <unique-deque> ( -- unique-deque )
+    H{ } clone <dlist> unique-deque boa ;
+
+: store-url ( url depth unique-deque -- )
+    [ <todo-url> ] dip
+    [ [ [ t ] dip url>> ] [ assoc>> ] bi* set-at ]
+    [ deque>> push-back ] 2bi ;
+
+: pop-url ( unique-deque -- todo-url ) deque>> pop-front ;
+
+: peek-url ( unique-deque -- todo-url ) deque>> peek-front ;
+
 : <spider> ( base -- spider )
     >url
     spider new
         over >>base
-        swap 0 <unique-min-heap> [ heap-push ] keep >>todo
-        <unique-min-heap> >>nonmatching
+        swap 0 <unique-deque> [ store-url ] keep >>todo
+        <unique-deque> >>nonmatching
+        <unique-deque> >>filtered
        0 >>max-depth
        0 >>count
        1/0. >>max-count
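
The hunk above swaps the old unique-min-heap todo queue for a unique-deque: a hashtable that records which URLs have been queued plus a dlist used as a FIFO, with each queued URL wrapped in a todo-url tuple that remembers its depth. A minimal listener sketch of how these words fit together (illustrative only, assuming the vocabulary loads as committed here):

    ! Enqueue one URL at depth 0, then take it back off the front.
    ! store-url records the URL in the assoc and pushes a <todo-url>
    ! onto the dlist; pop-url pops the front, so URLs come back out
    ! in the order they were discovered.
    USING: accessors deques kernel prettyprint spider urls ;

    <unique-deque>
    "http://concatenative.org" >url 0 pick store-url
    dup pop-url url>> .        ! prints the URL that was just queued
    deque>> deque-empty? .     ! t -- the queue is empty again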
@@ -27,10 +49,10 @@ links processing-time timestamp ;
 <PRIVATE
 
 : apply-filters ( links spider -- links' )
-    filters>> [ '[ _ 1&& ] filter ] when* ;
+    filters>> [ '[ [ _ 1&& ] filter ] call( seq -- seq' ) ] when* ;
 
-: push-links ( links level assoc-heap -- )
-    '[ _ _ heap-push ] each ;
+: push-links ( links level unique-deque -- )
+    '[ _ _ store-url ] each ;
 
 : add-todo ( links level spider -- )
     todo>> push-links ;
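
In the new apply-filters, the filters slot is treated as a sequence of predicate quotations that are combined with 1&& (short-circuit AND) and applied to each candidate link, which by this point has been normalized to a URL object. A hypothetical configuration sketch (the two predicates are invented examples, not part of this commit):

    ! Keep only links whose path ends in ".html" and does not start
    ! with "/private/". Every quotation must yield true for a link
    ! to survive apply-filters.
    USING: accessors kernel sequences spider ;

    "http://concatenative.org" <spider>
        { [ path>> ".html" tail? ] [ path>> "/private/" head? not ] } >>filters

The configured spider is left on the stack, ready for run-spider.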
@@ -38,64 +60,74 @@ links processing-time timestamp ;
 : add-nonmatching ( links level spider -- )
     nonmatching>> push-links ;
 
-: filter-base ( spider spider-result -- base-links nonmatching-links )
+: add-filtered ( links level spider -- )
+    filtered>> push-links ;
+
+: filter-base-links ( spider spider-result -- base-links nonmatching-links )
     [ base>> host>> ] [ links>> prune ] bi*
     [ host>> = ] with partition ;
 
 : add-spidered ( spider spider-result -- )
     [ [ 1+ ] change-count ] dip
     2dup [ spidered>> ] [ dup url>> ] bi* rot set-at
-    [ filter-base ] 2keep
+    [ filter-base-links ] 2keep
     depth>> 1+ swap
     [ add-nonmatching ]
     [ tuck [ apply-filters ] 2dip add-todo ] 2bi ;
 
-: normalize-hrefs ( links -- links' )
-    [ >url ] map
-    spider get base>> swap [ derive-url ] with map ;
+: normalize-hrefs ( links spider -- links' )
+    [ [ >url ] map ] dip
+    base>> swap [ derive-url ] with map ;
 
 : print-spidering ( url depth -- )
     "depth: " write number>string write
     ", spidering: " write . yield ;
 
-: (spider-page) ( url depth -- spider-result )
-    f pick spider get spidered>> set-at
-    over '[ _ http-get ] benchmark swap
-    [ parse-html dup find-hrefs normalize-hrefs ] benchmark
+:: new-spidered-result ( spider url depth -- spider-result )
+    f url spider spidered>> set-at
+    [ url http-get ] benchmark :> fetch-time :> html :> headers
+    [
+        html parse-html [ ] [ find-hrefs spider normalize-hrefs ] bi
+    ] benchmark :> processing-time :> links :> parsed-html
+    url depth headers fetch-time parsed-html links processing-time
     now spider-result boa ;
 
-: spider-page ( url depth -- )
-    spider get quiet>> [ 2dup print-spidering ] unless
-    (spider-page)
-    spider get [ quiet>> [ dup describe ] unless ]
-    [ swap add-spidered ] bi ;
+:: spider-page ( spider url depth -- )
+    spider quiet>> [ url depth print-spidering ] unless
+    spider url depth new-spidered-result :> spidered-result
+    spider quiet>> [ spidered-result describe ] unless
+    spider spidered-result add-spidered ;
 
 \ spider-page ERROR add-error-logging
 
-: spider-sleep ( -- )
-    spider get sleep>> [ sleep ] when* ;
+: spider-sleep ( spider -- )
+    sleep>> [ sleep ] when* ;
 
-: queue-initial-links ( spider -- spider )
-    [ initial-links>> normalize-hrefs 0 ] keep
-    [ add-todo ] keep ;
+:: queue-initial-links ( spider -- spider )
+    spider initial-links>> spider normalize-hrefs 0 spider add-todo spider ;
 
-: slurp-heap-while ( heap quot1 quot2: ( value key -- ) -- )
-    pick heap-empty? [ 3drop ] [
-        [ [ heap-pop dup ] 2dip slip [ t ] compose [ 2drop f ] if ]
-        [ roll [ slurp-heap-while ] [ 3drop ] if ] 3bi
-    ] if ; inline recursive
+: spider-page? ( spider -- ? )
+    {
+        [ todo>> deque>> deque-empty? not ]
+        [ [ todo>> peek-url depth>> ] [ max-depth>> ] bi < ]
+    } 1&& ;
+
+: setup-next-url ( spider -- spider url depth )
+    dup todo>> pop-url [ url>> ] [ depth>> ] bi ;
+
+: spider-next-page ( spider -- )
+    setup-next-url spider-page ;
 
 PRIVATE>
 
+: run-spider-loop ( spider -- )
+    dup spider-page? [
+        [ spider-next-page ] [ run-spider-loop ] bi
+    ] [
+        drop
+    ] if ;
+
 : run-spider ( spider -- spider )
     "spider" [
-        dup spider [
-            queue-initial-links
-            [ todo>> ] [ max-depth>> ] bi
-            '[
-                _ <= spider get
-                [ count>> ] [ max-count>> ] bi < and
-            ] [ spider-page spider-sleep ] slurp-heap-while
-            spider get
-        ] with-variable
+        queue-initial-links [ run-spider-loop ] keep
     ] with-logging ;
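
With the spider dynamic variable gone, the spider tuple is threaded explicitly: run-spider takes it on the stack and returns it, and configuration happens through its slots beforehand. A hedged usage sketch of the reworked entry point (slot values are arbitrary, and the call performs real HTTP requests):

    ! Build a spider, cap the crawl depth, keep it quiet, run it, and
    ! read back how many pages were processed.
    USING: accessors prettyprint spider ;

    "http://concatenative.org" <spider>
        2 >>max-depth
        t >>quiet
    run-spider
    count>> .    ! number of pages added to the spidered assoc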