spruce up the spider

db4
Doug Coleman 2008-10-01 19:29:18 -05:00
parent 723cd453a7
commit ebe8adfe31
2 changed files with 67 additions and 4 deletions

View File

@ -0,0 +1,57 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: help.markup help.syntax io.streams.string urls
multiline ;
IN: spider
HELP: <spider>
{ $values
{ "base" "a string or url" }
{ "spider" spider } }
{ $description "Creates a new web spider with a given base url." } ;
HELP: run-spider
{ $values
{ "spider" spider }
{ "spider" spider } }
{ $description "Runs a spider until completion. See the " { $subsection "spider-tutorial" } " for a complete description of the tuple slots that affect how thet spider works." } ;
HELP: spider
{ $description "" } ;
HELP: spider-result
{ $description "" } ;
ARTICLE: "spider-tutorial" "Spider tutorial"
"To create a new spider, call the " { $link <spider> } " word with a link to the site you wish to spider."
{ $code <" "http://concatentative.org" <spider> "> }
"The max-depth is initialized to 0, which retrieves just the initial page. Let's initialize it to something more fun:"
{ $code <" 1 >>max-depth "> }
"Now the spider will retrieve the first page and all the pages it links to in the same domain." $nl
"But suppose the front page contains thousands of links. To avoid grabbing them all, we can set " { $slot "max-count" } " to a reasonable limit."
{ $code <" 10 >>max-count "> }
"A timeout might keep the spider from hitting the server too hard:"
{ $code <" USE: calendar 1.5 seconds >>sleep "> }
"Since we happen to know that not all pages of a wiki are suitable for spidering, we will spider only the wiki view pages, not the edit or revisions pages. To do this, we add a filter through which new links are tested; links that pass the filter are added to the todo queue, while links that do not are discarded. You can add several filters to the filter array, but we'll just add a single one for now."
{ $code <" { [ path>> "/wiki/view" head? ] } >>filters "> }
"Finally, to start the spider, call the " { $link run-spider } " word."
{ $code "run-spider" }
"The full code from the tutorial."
{ $code <" USING: spider calendar sequences accessors ;
: spider-concatenative ( -- spider )
"http://concatenative.org" <spider>
1 >>max-depth
10 >>max-count
1.5 seconds >>sleep
{ [ path>> "/wiki/view" head? ] } >>filters
run-spider ;"> } ;
ARTICLE: "spider" "Spider"
"The " { $vocab-link "spider" } " vocabulary implements a simple web spider for retrieving sets of webpages."
{ $subsection "spider-tutorial" }
"Creating a new spider:"
{ $subsection <spider> }
"Running the spider:"
{ $subsection run-spider } ;
ABOUT: "spider"

View File

@ -7,11 +7,12 @@ heaps math.parser inspector urls assoc-deques logging
combinators.short-circuit continuations calendar prettyprint ;
IN: spider
TUPLE: spider base count max-count sleep max-depth secure? agent timeout
filters spidered todo nonmatching initial-links ;
TUPLE: spider base count max-count sleep max-depth initial-links
filters spidered todo nonmatching ;
! secure? agent page-timeout data-timeout overall-timeout
TUPLE: spider-result url depth headers fetch-time parsed-html
links processing-time ;
links processing-time timestamp ;
: <spider> ( base -- spider )
>url
@ -65,7 +66,7 @@ links processing-time ;
f pick spider get spidered>> set-at
over '[ _ http-get ] benchmark swap
[ parse-html dup find-hrefs normalize-hrefs ] benchmark
spider-result boa
now spider-result boa
dup describe ;
: spider-page ( url depth -- )
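For context on the now spider-result boa change above: boa fills tuple slots in declaration order, so pushing now (from the calendar vocabulary) last is what populates the new trailing timestamp slot. A minimal standalone sketch of the same pattern, using a hypothetical two-slot tuple rather than the real fetch pipeline:
USING: accessors calendar classes.tuple kernel prettyprint ;
! Hypothetical tuple mirroring the "append a slot, fill it with now"
! pattern used for spider-result.
TUPLE: fetch-record url timestamp ;
: <fetch-record> ( url -- record )
    now fetch-record boa ;
"http://concatenative.org" <fetch-record> timestamp>> .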
@ -76,11 +77,16 @@ links processing-time ;
: spider-sleep ( -- )
spider get sleep>> [ sleep ] when* ;
: queue-initial-links ( spider -- spider )
[ initial-links>> normalize-hrefs 0 ] keep
[ add-todo ] keep ;
PRIVATE>
: run-spider ( spider -- spider )
"spider" [
dup spider [
queue-initial-links
[ todo>> ] [ max-depth>> ] bi
'[
_ <= spider get