diff --git a/extra/spider/report/report.factor b/extra/spider/report/report.factor index 8bb4f91f82..43952701d5 100644 --- a/extra/spider/report/report.factor +++ b/extra/spider/report/report.factor @@ -14,7 +14,7 @@ SYMBOL: timings broken-pages push ; : record-page-timings ( url spider-result -- ) - fetch-time>> 2array timings get push ; + fetched-in>> 2array timings get push ; : record-network-failure ( url -- ) network-failures get push ; diff --git a/extra/spider/spider.factor b/extra/spider/spider.factor index 07989860ff..39ee3b5d7b 100644 --- a/extra/spider/spider.factor +++ b/extra/spider/spider.factor @@ -4,12 +4,13 @@ USING: accessors fry html.parser html.parser.analyzer http.client kernel tools.time sets assocs sequences concurrency.combinators io threads namespaces math multiline math.parser inspector urls logging combinators.short-circuit -continuations calendar prettyprint dlists deques locals ; +continuations calendar prettyprint dlists deques locals +spider.unique-deque ; IN: spider TUPLE: spider base count max-count sleep max-depth initial-links filters spidered todo nonmatching quiet currently-spidering -#threads follow-robots ; +#threads follow-robots? robots ; TUPLE: spider-result url depth headers fetched-in parsed-html links processed-in fetched-at ; @@ -21,26 +22,6 @@ TUPLE: todo-url url depth ; swap >>depth swap >>url ; -TUPLE: unique-deque assoc deque ; - -: ( -- unique-deque ) - H{ } clone unique-deque boa ; - -: url-exists? ( url unique-deque -- ? ) - [ url>> ] [ assoc>> ] bi* key? ; - -: push-url ( url depth unique-deque -- ) - [ ] dip 2dup url-exists? [ - 2drop - ] [ - [ [ [ t ] dip url>> ] [ assoc>> ] bi* set-at ] - [ deque>> push-back ] 2bi - ] if ; - -: pop-url ( unique-deque -- todo-url ) deque>> pop-front ; - -: peek-url ( unique-deque -- todo-url ) deque>> peek-front ; - : ( base -- spider ) >url spider new @@ -89,13 +70,13 @@ TUPLE: unique-deque assoc deque ; :: new-spidered-result ( spider url depth -- spider-result ) f url spider spidered>> set-at - [ url http-get ] benchmark :> fetch-time :> html :> headers + [ url http-get ] benchmark :> fetched-at :> html :> headers [ html parse-html spider currently-spidering>> over find-all-links normalize-hrefs ] benchmark :> processing-time :> links :> parsed-html - url depth headers fetch-time parsed-html links processing-time + url depth headers fetched-at parsed-html links processing-time now spider-result boa ; :: spider-page ( spider url depth -- ) diff --git a/extra/spider/unique-deque/authors.txt b/extra/spider/unique-deque/authors.txt new file mode 100644 index 0000000000..b4bd0e7b35 --- /dev/null +++ b/extra/spider/unique-deque/authors.txt @@ -0,0 +1 @@ +Doug Coleman \ No newline at end of file diff --git a/extra/spider/unique-deque/unique-deque.factor b/extra/spider/unique-deque/unique-deque.factor new file mode 100644 index 0000000000..28d92633d1 --- /dev/null +++ b/extra/spider/unique-deque/unique-deque.factor @@ -0,0 +1,24 @@ +! Copyright (C) 2009 Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors assocs deques dlists kernel spider ; +IN: spider.unique-deque + +TUPLE: unique-deque assoc deque ; + +: ( -- unique-deque ) + H{ } clone unique-deque boa ; + +: url-exists? ( url unique-deque -- ? ) + [ url>> ] [ assoc>> ] bi* key? ; + +: push-url ( url depth unique-deque -- ) + [ ] dip 2dup url-exists? [ + 2drop + ] [ + [ [ [ t ] dip url>> ] [ assoc>> ] bi* set-at ] + [ deque>> push-back ] 2bi + ] if ; + +: pop-url ( unique-deque -- todo-url ) deque>> pop-front ; + +: peek-url ( unique-deque -- todo-url ) deque>> peek-front ;