refactoring spider

db4
Doug Coleman 2009-04-03 21:16:08 -05:00
parent 8875c2ba26
commit 1ee52e2090
1 changed file with 19 additions and 19 deletions

@@ -4,15 +4,15 @@ USING: accessors fry html.parser html.parser.analyzer
 http.client kernel tools.time sets assocs sequences
 concurrency.combinators io threads namespaces math multiline
 math.parser inspector urls logging combinators.short-circuit
-continuations calendar prettyprint dlists deques locals
-present ;
+continuations calendar prettyprint dlists deques locals ;

 IN: spider

 TUPLE: spider base count max-count sleep max-depth initial-links
-filters spidered todo nonmatching quiet currently-spidering ;
+filters spidered todo nonmatching quiet currently-spidering
+#threads follow-robots ;

-TUPLE: spider-result url depth headers fetch-time parsed-html
-links processing-time timestamp ;
+TUPLE: spider-result url depth headers
+fetched-in parsed-html links processed-in fetched-at ;

 TUPLE: todo-url url depth ;
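Note: the spider tuple gains #threads and follow-robots slots (follow-robots is not consumed anywhere in this diff), and the spider-result timing slots are renamed: fetch-time to fetched-in, processing-time to processed-in, timestamp to fetched-at. A minimal sketch of reading the renamed slots, assuming a spider-result value in hand; report-result is a hypothetical helper, not part of the vocabulary:

    USING: accessors io kernel prettyprint ;

    : report-result ( spider-result -- )
        [ url>> pprint nl ]           ! page that was fetched
        [ fetched-in>> pprint nl ]    ! duration of the http-get
        [ processed-in>> pprint nl ]  ! duration of parsing + link extraction
        tri ;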
@@ -51,7 +51,8 @@ TUPLE: unique-deque assoc deque ;
         0 >>max-depth
         0 >>count
         1/0. >>max-count
-        H{ } clone >>spidered ;
+        H{ } clone >>spidered
+        1 >>#threads ;

 <PRIVATE
@@ -79,12 +80,8 @@ TUPLE: unique-deque assoc deque ;
     [ add-nonmatching ]
     [ tuck [ apply-filters ] 2dip add-todo ] 2bi ;

-: url-absolute? ( url -- ? )
-    present "http://" head? ;
-
-: normalize-hrefs ( links spider -- links' )
-    currently-spidering>> present swap
-    [ [ >url ] bi@ derive-url ] with map ;
+: normalize-hrefs ( base links -- links' )
+    [ derive-url ] with map ;

 : print-spidering ( url depth -- )
     "depth: " write number>string write
@@ -94,7 +91,9 @@ TUPLE: unique-deque assoc deque ;
     f url spider spidered>> set-at
     [ url http-get ] benchmark :> fetch-time :> html :> headers
     [
-        html parse-html [ ] [ find-all-links spider normalize-hrefs ] bi
+        html parse-html
+        spider currently-spidering>>
+        over find-all-links normalize-hrefs
     ] benchmark :> processing-time :> links :> parsed-html
     url depth headers fetch-time parsed-html links processing-time
     now spider-result boa ;
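Note: spider-page now pulls the base from currently-spidering>> and feeds it to the reworked normalize-hrefs instead of threading the whole spider through. A stack walkthrough of the new parse step, with comments showing what each line leaves behind:

    html parse-html              ! parsed-html
    spider currently-spidering>> ! parsed-html base
    over find-all-links          ! parsed-html base links
    normalize-hrefs              ! parsed-html links'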
@@ -107,11 +106,12 @@ TUPLE: unique-deque assoc deque ;
 \ spider-page ERROR add-error-logging

-: spider-sleep ( spider -- )
-    sleep>> [ sleep ] when* ;
+: spider-sleep ( spider -- ) sleep>> [ sleep ] when* ;

-:: queue-initial-links ( spider -- spider )
-    spider initial-links>> spider normalize-hrefs 0 spider add-todo spider ;
+: queue-initial-links ( spider -- )
+    [
+        [ currently-spidering>> ] [ initial-links>> ] bi normalize-hrefs 0
+    ] keep add-todo ;

 : spider-page? ( spider -- ? )
     {
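Note: queue-initial-links drops the locals-based :: definition in favor of a plain [ ... ] keep, and it no longer returns the spider, so the caller now has to dup first (see the run-spider hunk below). A stack walkthrough, assuming add-todo has the effect ( links depth spider -- ):

    [
        [ currently-spidering>> ] [ initial-links>> ] bi ! base links
        normalize-hrefs 0                                ! links' 0
    ] keep                                               ! links' 0 spider
    add-todo                                             ! --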
@@ -121,7 +121,7 @@ TUPLE: unique-deque assoc deque ;
     } 1&& ;

 : setup-next-url ( spider -- spider url depth )
-    dup todo>> peek-url url>> present >>currently-spidering
+    dup todo>> peek-url url>> >>currently-spidering
     dup todo>> pop-url [ url>> ] [ depth>> ] bi ;

 : spider-next-page ( spider -- )
@@ -138,5 +138,5 @@ PRIVATE>
 : run-spider ( spider -- spider )
     "spider" [
-        queue-initial-links [ run-spider-loop ] keep
+        dup queue-initial-links [ run-spider-loop ] keep
     ] with-logging ;
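Note: run-spider keeps its ( spider -- spider ) effect; the added dup feeds queue-initial-links, which now consumes the spider. A hypothetical session, assuming <spider> still takes the base as before; the slot values are illustrative, and nothing in this diff consumes the new #threads slot yet:

    USING: accessors spider ;

    "http://concatenative.org" <spider>
        2 >>max-depth
        100 >>max-count
        t >>quiet
        run-spider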