Merge branch 'master' of git://factorcode.org/git/factor

db4
Slava Pestov 2009-03-31 23:39:36 -05:00
commit 51e43d69ab
6 changed files with 115 additions and 79 deletions

View File

@@ -15,6 +15,7 @@ ERROR: bad-effect ;
     scan {
         { "(" [ ")" parse-effect ] }
        { f [ ")" unexpected-eof ] }
+        [ bad-effect ]
     } case 2array
 ] when
 ] if
@@ -31,4 +32,4 @@ ERROR: bad-effect ;
 "(" expect ")" parse-effect ;
 : parse-call( ( accum word -- accum )
     [ ")" parse-effect ] dip 2array over push-all ;

View File

@@ -68,10 +68,10 @@ SYMBOL: tagstack
     [ blank? ] trim ;
 : read-comment ( state-parser -- )
-    "-->" take-until-string make-comment-tag push-tag ;
+    "-->" take-until-sequence make-comment-tag push-tag ;
 : read-dtd ( state-parser -- )
-    ">" take-until-string make-dtd-tag push-tag ;
+    ">" take-until-sequence make-dtd-tag push-tag ;
 : read-bang ( state-parser -- )
     next dup { [ get-char CHAR: - = ] [ get-next CHAR: - = ] } 1&& [
@@ -93,7 +93,7 @@ SYMBOL: tagstack
 : (parse-attributes) ( state-parser -- )
     skip-whitespace
-    dup string-parse-end? [
+    dup state-parse-end? [
         drop
     ] [
         [
@@ -108,7 +108,7 @@ SYMBOL: tagstack
 : (parse-tag) ( string -- string' hashtable )
     [
         [ read-token >lower ] [ parse-attributes ] bi
-    ] string-parse ;
+    ] state-parse ;
 : read-< ( state-parser -- string/f )
     next dup get-char [
@@ -126,7 +126,7 @@ SYMBOL: tagstack
     ] [ drop ] if ;
 : tag-parse ( quot -- vector )
-    V{ } clone tagstack [ string-parse ] with-variable ; inline
+    V{ } clone tagstack [ state-parse ] with-variable ; inline
 : parse-html ( string -- vector )
     [ (parse-html) tagstack get ] tag-parse ;
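The hunks in this file are mechanical renames (string-parse to state-parse, string-parse-end? to state-parse-end?, take-until-string to take-until-sequence) tracking the generalized parser below; the public entry point is unchanged. A hedged usage sketch, assuming the vector-of-tag-tuples output this vocabulary is built around:

    USING: html.parser prettyprint ;

    "<a href='/x'>hi</a>" parse-html . ! prints a vector of tag tuples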

View File

@@ -2,29 +2,35 @@ USING: tools.test html.parser.state ascii kernel accessors ;
 IN: html.parser.state.tests
 [ "hello" ]
-[ "hello" [ take-rest ] string-parse ] unit-test
+[ "hello" [ take-rest ] state-parse ] unit-test
 [ "hi" " how are you?" ]
 [
     "hi how are you?"
-    [ [ [ blank? ] take-until ] [ take-rest ] bi ] string-parse
+    [ [ [ blank? ] take-until ] [ take-rest ] bi ] state-parse
 ] unit-test
 [ "foo" ";bar" ]
 [
     "foo;bar" [
-        [ CHAR: ; take-until-char ] [ take-rest ] bi
-    ] string-parse
+        [ CHAR: ; take-until-object ] [ take-rest ] bi
+    ] state-parse
 ] unit-test
 [ "foo " " bar" ]
 [
     "foo and bar" [
-        [ "and" take-until-string ] [ take-rest ] bi
-    ] string-parse
+        [ "and" take-until-sequence ] [ take-rest ] bi
+    ] state-parse
 ] unit-test
 [ 6 ]
 [
-    " foo " [ skip-whitespace i>> ] string-parse
+    " foo " [ skip-whitespace n>> ] state-parse
 ] unit-test
+[ { 1 2 } ]
+[ { 1 2 3 } <state-parser> [ 3 = ] take-until ] unit-test
+[ { 1 2 } ]
+[ { 1 2 3 4 } <state-parser> { 3 4 } take-until-sequence ] unit-test
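The two new tests exercise take-until and take-until-sequence on plain arrays rather than strings. Under the same generalization, the renamed take-until-object should behave analogously; a hedged extra check in the same style:

    [ { 1 2 } ]
    [ { 1 2 3 } <state-parser> 3 take-until-object ] unit-test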

View File

@@ -2,31 +2,32 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: namespaces math kernel sequences accessors fry circular
 unicode.case unicode.categories locals ;
 IN: html.parser.state
-TUPLE: state-parser string i ;
+TUPLE: state-parser sequence n ;
-: <state-parser> ( string -- state-parser )
+: <state-parser> ( sequence -- state-parser )
     state-parser new
-        swap >>string
-        0 >>i ;
+        swap >>sequence
+        0 >>n ;
-: (get-char) ( i state -- char/f )
-    string>> ?nth ; inline
+: (get-char) ( n state -- char/f )
+    sequence>> ?nth ; inline
 : get-char ( state -- char/f )
-    [ i>> ] keep (get-char) ; inline
+    [ n>> ] keep (get-char) ; inline
 : get-next ( state -- char/f )
-    [ i>> 1+ ] keep (get-char) ; inline
+    [ n>> 1 + ] keep (get-char) ; inline
 : next ( state -- state )
-    [ 1+ ] change-i ; inline
+    [ 1 + ] change-n ; inline
 : get+increment ( state -- char/f )
     [ get-char ] [ next drop ] bi ; inline
-: string-parse ( string quot -- )
+: state-parse ( sequence quot -- )
     [ <state-parser> ] dip call ; inline
 :: skip-until ( state quot: ( obj -- ? ) -- )
@@ -34,17 +35,23 @@ TUPLE: state-parser string i ;
         quot call [ state next quot skip-until ] unless
     ] when* ; inline recursive
-: take-until ( state quot: ( obj -- ? ) -- string )
-    [ drop i>> ]
-    [ skip-until ]
-    [ drop [ i>> ] [ string>> ] bi ] 2tri subseq ; inline
+: state-parse-end? ( state -- ? ) get-next not ;
+: take-until ( state quot: ( obj -- ? ) -- sequence/f )
+    over state-parse-end? [
+        2drop f
+    ] [
+        [ drop n>> ]
+        [ skip-until ]
+        [ drop [ n>> ] [ sequence>> ] bi ] 2tri subseq
+    ] if ; inline
-:: take-until-string ( state-parser string -- string' )
-    string length <growing-circular> :> growing
+:: take-until-sequence ( state-parser sequence -- sequence' )
+    sequence length <growing-circular> :> growing
     state-parser
     [
         growing push-growing-circular
-        string growing sequence=
+        sequence growing sequence=
     ] take-until :> found
     found dup length
     growing length 1- - head
@@ -53,10 +60,8 @@ TUPLE: state-parser string i ;
 : skip-whitespace ( state -- state )
     [ [ blank? not ] take-until drop ] keep ;
-: take-rest ( state -- string )
+: take-rest ( state -- sequence )
     [ drop f ] take-until ; inline
-: take-until-char ( state ch -- string )
+: take-until-object ( state obj -- sequence )
     '[ _ = ] take-until ;
-: string-parse-end? ( state -- ? ) get-next not ;
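Note the changed contract: take-until now returns f when the parser is already at the end of its input, instead of unconditionally taking a subseq. A quick hedged illustration of the early-out:

    USING: html.parser.state prettyprint ;

    "" [ [ blank? ] take-until ] state-parse . ! prints f: input already exhausted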

View File

@@ -16,11 +16,6 @@ HELP: run-spider
 { "spider" spider } }
 { $description "Runs a spider until completion. See the " { $subsection "spider-tutorial" } " for a complete description of the tuple slots that affect how the spider works." } ;
-HELP: slurp-heap-while
-{ $values
-    { "heap" "a heap" } { "quot1" quotation } { "quot2" quotation } }
-{ $description "Removes values from a heap that match the predicate quotation " { $snippet "quot1" } " and processes them with " { $snippet "quot2" } " until the predicate quotation no longer matches." } ;
 ARTICLE: "spider-tutorial" "Spider tutorial"
 "To create a new spider, call the " { $link <spider> } " word with a link to the site you wish to spider."
 { $code <" "http://concatenative.org" <spider> "> }

View File

@@ -3,8 +3,8 @@
 USING: accessors fry html.parser html.parser.analyzer
 http.client kernel tools.time sets assocs sequences
 concurrency.combinators io threads namespaces math multiline
-heaps math.parser inspector urls assoc-heaps logging
-combinators.short-circuit continuations calendar prettyprint ;
+math.parser inspector urls logging combinators.short-circuit
+continuations calendar prettyprint dlists deques locals ;
 IN: spider
 TUPLE: spider base count max-count sleep max-depth initial-links
@@ -13,12 +13,33 @@ filters spidered todo nonmatching quiet ;
 TUPLE: spider-result url depth headers fetch-time parsed-html
 links processing-time timestamp ;
+TUPLE: todo-url url depth ;
+: <todo-url> ( url depth -- todo-url )
+    todo-url new
+        swap >>depth
+        swap >>url ;
+TUPLE: unique-deque assoc deque ;
+: <unique-deque> ( -- unique-deque )
+    H{ } clone <dlist> unique-deque boa ;
+: push-url ( url depth unique-deque -- )
+    [ <todo-url> ] dip
+    [ [ [ t ] dip url>> ] [ assoc>> ] bi* set-at ]
+    [ deque>> push-back ] 2bi ;
+: pop-url ( unique-deque -- todo-url ) deque>> pop-front ;
+: peek-url ( unique-deque -- todo-url ) deque>> peek-front ;
 : <spider> ( base -- spider )
     >url
     spider new
         over >>base
-        swap 0 <unique-min-heap> [ heap-push ] keep >>todo
-        <unique-min-heap> >>nonmatching
+        swap 0 <unique-deque> [ push-url ] keep >>todo
+        <unique-deque> >>nonmatching
         0 >>max-depth
         0 >>count
         1/0. >>max-count
@@ -27,10 +48,10 @@ links processing-time timestamp ;
 <PRIVATE
 : apply-filters ( links spider -- links' )
-    filters>> [ '[ _ 1&& ] filter ] when* ;
+    filters>> [ '[ [ _ 1&& ] filter ] call( seq -- seq' ) ] when* ;
-: push-links ( links level assoc-heap -- )
-    '[ _ _ heap-push ] each ;
+: push-links ( links level unique-deque -- )
+    '[ _ _ push-url ] each ;
 : add-todo ( links level spider -- )
     todo>> push-links ;
@@ -38,64 +59,72 @@ links processing-time timestamp ;
 : add-nonmatching ( links level spider -- )
     nonmatching>> push-links ;
-: filter-base ( spider spider-result -- base-links nonmatching-links )
+: filter-base-links ( spider spider-result -- base-links nonmatching-links )
     [ base>> host>> ] [ links>> prune ] bi*
     [ host>> = ] with partition ;
 : add-spidered ( spider spider-result -- )
     [ [ 1+ ] change-count ] dip
     2dup [ spidered>> ] [ dup url>> ] bi* rot set-at
-    [ filter-base ] 2keep
+    [ filter-base-links ] 2keep
     depth>> 1+ swap
     [ add-nonmatching ]
     [ tuck [ apply-filters ] 2dip add-todo ] 2bi ;
-: normalize-hrefs ( links -- links' )
-    [ >url ] map
-    spider get base>> swap [ derive-url ] with map ;
+: normalize-hrefs ( links spider -- links' )
+    [ [ >url ] map ] dip
+    base>> swap [ derive-url ] with map ;
 : print-spidering ( url depth -- )
     "depth: " write number>string write
     ", spidering: " write . yield ;
-: (spider-page) ( url depth -- spider-result )
-    f pick spider get spidered>> set-at
-    over '[ _ http-get ] benchmark swap
-    [ parse-html dup find-hrefs normalize-hrefs ] benchmark
-    now spider-result boa ;
+:: new-spidered-result ( spider url depth -- spider-result )
+    f url spider spidered>> set-at
+    [ url http-get ] benchmark :> fetch-time :> html :> headers
+    [
+        html parse-html [ ] [ find-hrefs spider normalize-hrefs ] bi
+    ] benchmark :> processing-time :> links :> parsed-html
+    url depth headers fetch-time parsed-html links processing-time
+    now spider-result boa ;
-: spider-page ( url depth -- )
-    spider get quiet>> [ 2dup print-spidering ] unless
-    (spider-page)
-    spider get [ quiet>> [ dup describe ] unless ]
-    [ swap add-spidered ] bi ;
+:: spider-page ( spider url depth -- )
+    spider quiet>> [ url depth print-spidering ] unless
+    spider url depth new-spidered-result :> spidered-result
+    spider quiet>> [ spidered-result describe ] unless
+    spider spidered-result add-spidered ;
 \ spider-page ERROR add-error-logging
-: spider-sleep ( -- )
-    spider get sleep>> [ sleep ] when* ;
+: spider-sleep ( spider -- )
+    sleep>> [ sleep ] when* ;
-: queue-initial-links ( spider -- spider )
-    [ initial-links>> normalize-hrefs 0 ] keep
-    [ add-todo ] keep ;
+:: queue-initial-links ( spider -- spider )
+    spider initial-links>> spider normalize-hrefs 0 spider add-todo spider ;
-: slurp-heap-while ( heap quot1 quot2: ( value key -- ) -- )
-    pick heap-empty? [ 3drop ] [
-        [ [ heap-pop dup ] 2dip slip [ t ] compose [ 2drop f ] if ]
-        [ roll [ slurp-heap-while ] [ 3drop ] if ] 3bi
-    ] if ; inline recursive
+: spider-page? ( spider -- ? )
+    {
+        [ todo>> deque>> deque-empty? not ]
+        [ [ todo>> peek-url depth>> ] [ max-depth>> ] bi < ]
+        [ [ count>> ] [ max-count>> ] bi < ]
+    } 1&& ;
+: setup-next-url ( spider -- spider url depth )
+    dup todo>> pop-url [ url>> ] [ depth>> ] bi ;
+: spider-next-page ( spider -- )
+    setup-next-url spider-page ;
 PRIVATE>
+: run-spider-loop ( spider -- )
+    dup spider-page? [
+        [ spider-next-page ] [ run-spider-loop ] bi
+    ] [
+        drop
+    ] if ;
 : run-spider ( spider -- spider )
     "spider" [
-        dup spider [
-            queue-initial-links
-            [ todo>> ] [ max-depth>> ] bi
-            '[
-                _ <= spider get
-                [ count>> ] [ max-count>> ] bi < and
-            ] [ spider-page spider-sleep ] slurp-heap-while
-            spider get
-        ] with-variable
+        queue-initial-links [ run-spider-loop ] keep
     ] with-logging ;
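Taken together, the refactor replaces the unique-min-heap todo list with a FIFO work queue (a dlist plus an assoc of seen URLs) and threads the spider tuple explicitly instead of keeping it in a dynamic variable, so pages are crawled breadth-first in insertion order. A hedged sketch of the queue words defined above (the example URLs are placeholders):

    USING: accessors kernel prettyprint spider urls ;

    <unique-deque>
    URL" http://a.example/" 0 pick push-url
    URL" http://b.example/" 1 pick push-url
    dup pop-url url>> . ! URL" http://a.example/" -- first in, first out
    pop-url url>> .     ! URL" http://b.example/"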