spider: Don't try to parse non-html as html.
parent
7fb1fc97ec
commit
b4037edf48
|
@ -5,7 +5,8 @@ http.client kernel tools.time sets assocs sequences
|
||||||
concurrency.combinators io threads namespaces math multiline
|
concurrency.combinators io threads namespaces math multiline
|
||||||
math.parser inspector urls logging combinators.short-circuit
|
math.parser inspector urls logging combinators.short-circuit
|
||||||
continuations calendar prettyprint dlists deques locals
|
continuations calendar prettyprint dlists deques locals
|
||||||
spider.unique-deque combinators concurrency.semaphores ;
|
spider.unique-deque combinators concurrency.semaphores
|
||||||
|
io.pathnames ;
|
||||||
IN: spider
|
IN: spider
|
||||||
|
|
||||||
TUPLE: spider
|
TUPLE: spider
|
||||||
|
@ -85,13 +86,20 @@ fetched-in parsed-html links processed-in fetched-at ;
|
||||||
"depth: " write number>string write
|
"depth: " write number>string write
|
||||||
", spidering: " write . yield ;
|
", spidering: " write . yield ;
|
||||||
|
|
||||||
|
! Decide, from the URL's path alone, whether the fetched resource should be
! run through the HTML parser.
!
! url -- a url tuple; only its path slot is read.
! ?   -- t when the path's extension is "htm" or "html", or when the path
!        has no extension at all; f otherwise.
!
! file-extension yields the extension WITHOUT its leading dot (and f when
! there is none), so the candidates must be spelled "htm"/"html"; dotted
! spellings can never match. The f entry keeps extensionless paths
! classified as HTML.
: url-html? ( url -- ? )
    path>> file-extension { "htm" "html" f } member? ;
|
||||||
|
|
||||||
:: fill-spidered-result ( spider spider-result -- )
|
:: fill-spidered-result ( spider spider-result -- )
|
||||||
f spider-result url>> spider spidered>> set-at
|
f spider-result url>> dup :> url spider spidered>> set-at
|
||||||
[ spider-result url>> http-get ] benchmark :> ( headers html fetched-in )
|
[ spider-result url>> http-get ] benchmark :> ( headers html fetched-in )
|
||||||
[
|
[
|
||||||
|
url url-html? [
|
||||||
html parse-html
|
html parse-html
|
||||||
spider currently-spidering>>
|
spider currently-spidering>>
|
||||||
over find-all-links normalize-hrefs
|
over find-all-links normalize-hrefs
|
||||||
|
] [
|
||||||
|
f { }
|
||||||
|
] if
|
||||||
] benchmark :> ( parsed-html links processed-in )
|
] benchmark :> ( parsed-html links processed-in )
|
||||||
spider-result
|
spider-result
|
||||||
headers >>headers
|
headers >>headers
|
||||||
|
@ -128,7 +136,8 @@ fetched-in parsed-html links processed-in fetched-at ;
|
||||||
dup todo>> pop-url [ url>> ] [ depth>> ] bi <spider-result> ;
|
dup todo>> pop-url [ url>> ] [ depth>> ] bi <spider-result> ;
|
||||||
|
|
||||||
: spider-next-page ( spider -- )
|
: spider-next-page ( spider -- )
|
||||||
setup-next-url spider-page ;
|
setup-next-url
|
||||||
|
spider-page ;
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue