refactoring spider

db4
Doug Coleman 2009-04-04 11:57:23 -05:00
parent 6f2c4fc02a
commit e929c397d1
4 changed files with 31 additions and 25 deletions

View File

@ -14,7 +14,7 @@ SYMBOL: timings
broken-pages push ;
: record-page-timings ( url spider-result -- )
fetch-time>> 2array timings get push ;
fetched-in>> 2array timings get push ;
: record-network-failure ( url -- )
network-failures get push ;

View File

@ -4,12 +4,13 @@ USING: accessors fry html.parser html.parser.analyzer
http.client kernel tools.time sets assocs sequences
concurrency.combinators io threads namespaces math multiline
math.parser inspector urls logging combinators.short-circuit
continuations calendar prettyprint dlists deques locals ;
continuations calendar prettyprint dlists deques locals
spider.unique-deque ;
IN: spider
TUPLE: spider base count max-count sleep max-depth initial-links
filters spidered todo nonmatching quiet currently-spidering
#threads follow-robots ;
#threads follow-robots? robots ;
TUPLE: spider-result url depth headers
fetched-in parsed-html links processed-in fetched-at ;
@ -21,26 +22,6 @@ TUPLE: todo-url url depth ;
swap >>depth
swap >>url ;
TUPLE: unique-deque assoc deque ;
: <unique-deque> ( -- unique-deque )
H{ } clone <dlist> unique-deque boa ;
: url-exists? ( url unique-deque -- ? )
[ url>> ] [ assoc>> ] bi* key? ;
: push-url ( url depth unique-deque -- )
[ <todo-url> ] dip 2dup url-exists? [
2drop
] [
[ [ [ t ] dip url>> ] [ assoc>> ] bi* set-at ]
[ deque>> push-back ] 2bi
] if ;
: pop-url ( unique-deque -- todo-url ) deque>> pop-front ;
: peek-url ( unique-deque -- todo-url ) deque>> peek-front ;
: <spider> ( base -- spider )
>url
spider new
@ -89,13 +70,13 @@ TUPLE: unique-deque assoc deque ;
:: new-spidered-result ( spider url depth -- spider-result )
f url spider spidered>> set-at
[ url http-get ] benchmark :> fetch-time :> html :> headers
[ url http-get ] benchmark :> fetched-at :> html :> headers
[
html parse-html
spider currently-spidering>>
over find-all-links normalize-hrefs
] benchmark :> processing-time :> links :> parsed-html
url depth headers fetch-time parsed-html links processing-time
url depth headers fetched-at parsed-html links processing-time
now spider-result boa ;
:: spider-page ( spider url depth -- )

View File

@ -0,0 +1 @@
Doug Coleman

View File

@ -0,0 +1,24 @@
! Copyright (C) 2009 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors assocs deques dlists kernel spider ;
IN: spider.unique-deque
! A deque of urls that remembers everything ever pushed, so the same
! url is only ever queued once. assoc: membership set (url -> t);
! deque: FIFO of todo-url tuples (url + depth).
! NOTE(review): this vocab USING:s spider (for <todo-url> / url>>) while
! spider.factor now USING:s spider.unique-deque — looks like a circular
! vocabulary dependency; confirm it loads.
TUPLE: unique-deque assoc deque ;
! Fresh unique-deque backed by a hashtable and a dlist.
: <unique-deque> ( -- unique-deque )
H{ } clone <dlist> unique-deque boa ;
! t when the todo-url's url is already a key in the membership assoc.
: url-exists? ( url unique-deque -- ? )
[ url>> ] [ assoc>> ] bi* key? ;
! Wrap url/depth in a todo-url; drop it if the url was seen before,
! otherwise mark the url seen in the assoc and push onto the deque's back.
: push-url ( url depth unique-deque -- )
[ <todo-url> ] dip 2dup url-exists? [
2drop
] [
[ [ [ t ] dip url>> ] [ assoc>> ] bi* set-at ]
[ deque>> push-back ] 2bi
] if ;
! Remove and return the oldest queued todo-url.
: pop-url ( unique-deque -- todo-url ) deque>> pop-front ;
! Return the oldest queued todo-url without removing it.
: peek-url ( unique-deque -- todo-url ) deque>> peek-front ;