refactoring spider

db4
Doug Coleman 2009-04-04 11:57:23 -05:00
parent 6f2c4fc02a
commit e929c397d1
4 changed files with 31 additions and 25 deletions

View File

@@ -14,7 +14,7 @@ SYMBOL: timings
     broken-pages push ;
 
 : record-page-timings ( url spider-result -- )
-    fetch-time>> 2array timings get push ;
+    fetched-in>> 2array timings get push ;
 
 : record-network-failure ( url -- )
     network-failures get push ;
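record-page-timings pairs each URL with its fetch time and accumulates the pair in the timings variable. A minimal standalone sketch of that accumulate-into-a-variable pattern (the URL and duration below are made up for illustration, not from the commit):

USING: arrays namespaces prettyprint sequences ;
SYMBOL: timings
V{ } clone timings set
"http://example.com" 120 2array timings get push
timings get . ! V{ { "http://example.com" 120 } }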

View File

@@ -4,12 +4,13 @@ USING: accessors fry html.parser html.parser.analyzer
 http.client kernel tools.time sets assocs sequences
 concurrency.combinators io threads namespaces math multiline
 math.parser inspector urls logging combinators.short-circuit
-continuations calendar prettyprint dlists deques locals ;
+continuations calendar prettyprint dlists deques locals
+spider.unique-deque ;
 IN: spider
 
 TUPLE: spider base count max-count sleep max-depth initial-links
 filters spidered todo nonmatching quiet currently-spidering
-#threads follow-robots ;
+#threads follow-robots? robots ;
 
 TUPLE: spider-result url depth headers
 fetched-in parsed-html links processed-in fetched-at ;
@@ -21,26 +22,6 @@ TUPLE: todo-url url depth ;
     swap >>depth
     swap >>url ;
-TUPLE: unique-deque assoc deque ;
-
-: <unique-deque> ( -- unique-deque )
-    H{ } clone <dlist> unique-deque boa ;
-
-: url-exists? ( url unique-deque -- ? )
-    [ url>> ] [ assoc>> ] bi* key? ;
-
-: push-url ( url depth unique-deque -- )
-    [ <todo-url> ] dip 2dup url-exists? [
-        2drop
-    ] [
-        [ [ [ t ] dip url>> ] [ assoc>> ] bi* set-at ]
-        [ deque>> push-back ] 2bi
-    ] if ;
-
-: pop-url ( unique-deque -- todo-url ) deque>> pop-front ;
-
-: peek-url ( unique-deque -- todo-url ) deque>> peek-front ;
-
 : <spider> ( base -- spider )
     >url
     spider new
@@ -89,13 +70,13 @@ TUPLE: unique-deque assoc deque ;
 
 :: new-spidered-result ( spider url depth -- spider-result )
     f url spider spidered>> set-at
-    [ url http-get ] benchmark :> fetch-time :> html :> headers
+    [ url http-get ] benchmark :> fetched-at :> html :> headers
     [
         html parse-html
         spider currently-spidering>>
         over find-all-links normalize-hrefs
     ] benchmark :> processing-time :> links :> parsed-html
-    url depth headers fetch-time parsed-html links processing-time
+    url depth headers fetched-at parsed-html links processing-time
     now spider-result boa ;
 
 :: spider-page ( spider url depth -- )
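A side note on the :> bindings above: benchmark (from tools.time, assumed here to behave as ( quot -- runtime ), calling the quotation and pushing the elapsed time) leaves the quotation's outputs beneath the runtime, and successive :> bindings pop from the top of the stack down. A tiny self-contained sketch of that pattern:

USING: locals math prettyprint tools.time ;

:: timed-sum ( -- )
    [ 1 2 + ] benchmark :> elapsed :> sum
    sum .       ! 3
    elapsed . ; ! elapsed time for the quotation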

View File

@@ -0,0 +1 @@
+Doug Coleman

View File

@@ -0,0 +1,24 @@
+! Copyright (C) 2009 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors assocs deques dlists kernel spider ;
+IN: spider.unique-deque
+
+TUPLE: unique-deque assoc deque ;
+
+: <unique-deque> ( -- unique-deque )
+    H{ } clone <dlist> unique-deque boa ;
+
+: url-exists? ( url unique-deque -- ? )
+    [ url>> ] [ assoc>> ] bi* key? ;
+
+: push-url ( url depth unique-deque -- )
+    [ <todo-url> ] dip 2dup url-exists? [
+        2drop
+    ] [
+        [ [ [ t ] dip url>> ] [ assoc>> ] bi* set-at ]
+        [ deque>> push-back ] 2bi
+    ] if ;
+
+: pop-url ( unique-deque -- todo-url ) deque>> pop-front ;
+
+: peek-url ( unique-deque -- todo-url ) deque>> peek-front ;
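A quick listener-style sketch of the extracted vocabulary in use. The URL string and depths are illustrative only; in the spider proper, push-url receives url objects:

USING: accessors io kernel prettyprint spider spider.unique-deque ;

<unique-deque>
"http://factorcode.org" 0 pick push-url
"http://factorcode.org" 1 pick push-url ! duplicate: url-exists? is t, so it is dropped
pop-url [ url>> print ] [ depth>> . ] bi
! http://factorcode.org
! 0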