spider: Don't try to parse non-html as html.

db4
Doug Coleman 2013-12-07 11:42:45 -08:00
parent 7fb1fc97ec
commit b4037edf48
1 changed files with 15 additions and 6 deletions

View File

@ -5,7 +5,8 @@ http.client kernel tools.time sets assocs sequences
concurrency.combinators io threads namespaces math multiline
math.parser inspector urls logging combinators.short-circuit
continuations calendar prettyprint dlists deques locals
spider.unique-deque combinators concurrency.semaphores ;
spider.unique-deque combinators concurrency.semaphores
io.pathnames ;
IN: spider
TUPLE: spider
@ -85,13 +86,20 @@ fetched-in parsed-html links processed-in fetched-at ;
"depth: " write number>string write
", spidering: " write . yield ;
! True when the URL's path looks like an HTML page: its extension is
! "htm" or "html", or it has no extension at all.
! file-extension outputs the extension WITHOUT the leading dot (and f
! when the file name has none), so the candidates must be dot-less;
! with ".htm"/".html" only the f case could ever match.
: url-html? ( url -- ? )
    path>> file-extension { "htm" "html" f } member? ;
:: fill-spidered-result ( spider spider-result -- )
f spider-result url>> spider spidered>> set-at
f spider-result url>> dup :> url spider spidered>> set-at
[ spider-result url>> http-get ] benchmark :> ( headers html fetched-in )
[
html parse-html
spider currently-spidering>>
over find-all-links normalize-hrefs
url url-html? [
html parse-html
spider currently-spidering>>
over find-all-links normalize-hrefs
] [
f { }
] if
] benchmark :> ( parsed-html links processed-in )
spider-result
headers >>headers
@ -128,7 +136,8 @@ fetched-in parsed-html links processed-in fetched-at ;
dup todo>> pop-url [ url>> ] [ depth>> ] bi <spider-result> ;
: spider-next-page ( spider -- )
setup-next-url spider-page ;
setup-next-url
spider-page ;
PRIVATE>