Merge branch 'master' of git://factorcode.org/git/factor
commit cdf7091ae8
@@ -2,7 +2,7 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors kernel math namespaces make sequences random
 strings math.parser math.intervals combinators math.bitwise
-nmake db db.tuples db.types db.sql classes words shuffle arrays
+nmake db db.tuples db.types classes words shuffle arrays
 destructors continuations db.tuples.private prettyprint ;
 IN: db.queries
 
@@ -186,18 +186,6 @@ M: db query>statement ( query -- tuple )
 
 ! select ID, NAME, SCORE from EXAM limit 1 offset 3
 
-: select-tuples* ( tuple -- statement )
-    dup
-    [
-        select 0,
-        dup class db-columns [ ", " 0, ]
-        [ dup column-name>> 0, 2, ] interleave
-        from 0,
-        class name>> 0,
-    ] { { } { } { } } nmake
-    >r >r parse-sql 4drop r> r>
-    <simple-statement> maybe-make-retryable do-select ;
-
 M: db <count-statement> ( query -- statement )
     [ tuple>> dup class ] keep
     [ [ "select count(*) from " 0% 0% where-clause ] query-make ]
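A rough usage sketch (not from this commit): <count-statement> backs row counting in db.tuples. Assuming a persistent EXAM tuple mapped with define-persistent, and assuming db.tuples' count-tuples word as the usual caller, counting might look like this:

    USING: db.tuples ;
    ! exam is a hypothetical persistent tuple class
    T{ exam } count-tuples .    ! executes: select count(*) from EXAM
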
@@ -1,5 +1,5 @@
 USING: heaps.private help.markup help.syntax kernel math assocs
-math.order ;
+math.order quotations ;
 IN: heaps
 
 ARTICLE: "heaps" "Heaps"
@@ -28,7 +28,11 @@ $nl
 "Removal:"
 { $subsection heap-pop* }
 { $subsection heap-pop }
-{ $subsection heap-delete } ;
+{ $subsection heap-delete }
+$nl
+"Processing heaps:"
+{ $subsection slurp-heap }
+{ $subsection slurp-heap-when } ;
 
 ABOUT: "heaps"
 
@@ -82,3 +86,13 @@ HELP: heap-delete
 { $description "Remove the specified entry from the heap." }
 { $errors "Throws an error if the entry is from another heap or if it has already been deleted." }
 { $side-effects "heap" } ;
+
+HELP: slurp-heap
+{ $values
+    { "heap" "a heap" } { "quot" quotation } }
+{ $description "Removes values from a heap and processes them with the quotation until the heap is empty." } ;
+
+HELP: slurp-heap-when
+{ $values
+    { "heap" "a heap" } { "quot1" quotation } { "quot2" quotation } }
+{ $description "Removes values from a heap that match the predicate quotation " { $snippet "quot1" } " and processes them with " { $snippet "quot2" } " until the predicate quotation no longer matches." } ;
@@ -195,3 +195,9 @@ M: heap heap-pop ( heap -- value key )
     over heap-empty? [ 2drop ] [
         [ [ heap-pop drop ] dip call ] [ slurp-heap ] 2bi
     ] if ; inline recursive
+
+: slurp-heap-when ( heap quot1 quot2: ( value key -- ) -- )
+    pick heap-empty? [ 3drop ] [
+        [ [ heap-pop dup ] 2dip slip [ t ] compose [ 2drop f ] if ]
+        [ roll [ slurp-heap-when ] [ 3drop ] if ] 3bi
+    ] if ; inline recursive
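A minimal sketch of the new combinators with made-up keys and values: slurp-heap passes each popped value to its quotation, while slurp-heap-when's first quotation tests the key and its second receives value and key.

    USING: heaps io kernel math ;

    <min-heap>
    "first" 1 pick heap-push
    "second" 2 pick heap-push
    [ print ] slurp-heap                       ! prints "first", then "second"

    <min-heap>
    "low" 1 pick heap-push
    "high" 9 pick heap-push
    [ 5 < ] [ drop print ] slurp-heap-when     ! prints "low"; "high" stays queued
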
@@ -0,0 +1 @@
+Doug Coleman
@@ -0,0 +1,4 @@
+! Copyright (C) 2008 Slava Pestov.
+! See http://factorcode.org/license.txt for BSD license.
+USING: tools.test interpolate ;
+IN: interpolate.tests
@@ -0,0 +1,21 @@
+! Copyright (C) 2008 Slava Pestov.
+! See http://factorcode.org/license.txt for BSD license.
+USING: io kernel macros make multiline namespaces parser
+peg.ebnf present sequences strings ;
+IN: interpolate
+
+MACRO: interpolate ( string -- )
+    [EBNF
+        var = "${" [^}]+ "}" => [[ second >string [ get present write ] curry ]]
+        text = [^$]+ => [[ >string [ write ] curry ]]
+        interpolate = (var|text)* => [[ [ ] join ]]
+    EBNF] ;
+
+EBNF: interpolate-locals
+var = "${" [^}]+ "}" => [[ [ second >string search , [ present write ] % ] [ ] make ]]
+text = [^$]+ => [[ [ >string , [ write ] % ] [ ] make ]]
+interpolate = (var|text)* => [[ [ ] join ]]
+;EBNF
+
+: I[ "]I" parse-multiline-string
+    interpolate-locals parsed \ call parsed ; parsing
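A minimal sketch of the interpolate macro in use, assuming a dynamic variable named by the placeholder (the value "Jane" and variable "name" are made up):

    USING: interpolate io namespaces ;

    "Jane" "name" set
    "Hello, ${name}!" interpolate nl    ! writes: Hello, Jane!
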
@@ -13,8 +13,7 @@ HELP: help.
 { $description "Prints out scaffold help markup for a given word." } ;
 
 HELP: scaffold-help
-{ $values
-{ "vocab-root" "a vocabulary root string" } { "string" string } }
+{ $values { "string" string } }
 { $description "Takes an existing vocabulary and creates a help file with scaffolded help for each word. This word only works if no help file yet exists." } ;
 
 HELP: scaffold-undocumented
@@ -217,9 +217,9 @@ PRIVATE>
 : help. ( word -- )
     [ (help.) ] [ nl vocabulary>> link-vocab ] bi ;
 
-: scaffold-help ( vocab-root string -- )
+: scaffold-help ( string -- )
     [
-        check-vocab
+        [ find-vocab-root ] [ check-vocab ] bi
         prepare-scaffold
         [ "-docs.factor" scaffold-path ] dip
         swap [ set-scaffold-help-file ] [ 2drop ] if
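With the new signature, callers pass just the vocabulary name and the root is located automatically. A sketch, assuming this is the tools.scaffold vocabulary and using a hypothetical vocabulary name:

    USING: tools.scaffold ;

    ! writes a -docs.factor file only if no help file exists yet
    "my.vocab" scaffold-help
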
@@ -0,0 +1,32 @@
+! Copyright (C) 2008 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: help.markup help.syntax io.streams.string ;
+IN: assoc-deques
+
+HELP: <assoc-deque>
+{ $description "Constructs a new " { $link assoc-deque } " from two existing data structures." } ;
+
+HELP: <unique-max-heap>
+{ $values
+
+    { "unique-heap" assoc-deque } }
+{ $description "Creates a new " { $link assoc-deque } " where the assoc is a hashtable and the deque is a max-heap." } ;
+
+HELP: <unique-min-heap>
+{ $values
+
+    { "unique-heap" assoc-deque } }
+{ $description "Creates a new " { $link assoc-deque } " where the assoc is a hashtable and the deque is a min-heap." } ;
+
+HELP: assoc-deque
+{ $description "A data structure containing an assoc and a deque to get certain properties with better time constraints at the expense of more space and complexity. For instance, a hashtable and a heap can be combined into one assoc-deque to get a sorted data structure with O(1) lookup. Operations on assoc-deques should update both the assoc and the deque." } ;
+
+ARTICLE: "assoc-deques" "Associative deques"
+"The " { $vocab-link "assoc-deques" } " vocabulary exists to synthesize data structures with better time properties than either of the two component data structures alone." $nl
+"Associative deque constructor:"
+{ $subsection <assoc-deque> }
+"Unique heaps:"
+{ $subsection <unique-min-heap> }
+{ $subsection <unique-max-heap> } ;
+
+ABOUT: "assoc-deques"
@@ -0,0 +1,4 @@
+! Copyright (C) 2008 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: tools.test assoc-deques ;
+IN: assoc-deques.tests
@@ -0,0 +1,31 @@
+! Copyright (C) 2008 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors assocs deques hashtables heaps kernel ;
+IN: assoc-deques
+
+TUPLE: assoc-deque assoc deque ;
+
+C: <assoc-deque> assoc-deque
+
+: <unique-min-heap> ( -- unique-heap )
+    H{ } clone <min-heap> <assoc-deque> ;
+
+: <unique-max-heap> ( -- unique-heap )
+    H{ } clone <max-heap> <assoc-deque> ;
+
+M: assoc-deque heap-push* ( value key assoc-deque -- entry )
+    pick over assoc>> key? [
+        3drop f
+    ] [
+        [ assoc>> swapd set-at ] [ deque>> heap-push* ] 3bi
+    ] if ;
+
+M: assoc-deque heap-pop ( assoc-deque -- value key )
+    [ deque>> heap-pop ] keep
+    [ over ] dip assoc>> delete-at ;
+
+M: assoc-deque heap-peek ( assoc-deque -- value key )
+    deque>> heap-peek ;
+
+M: assoc-deque heap-empty? ( assoc-deque -- ? )
+    deque>> heap-empty? ;
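A minimal sketch of the unique-heap behavior with a made-up URL: the assoc remembers every value ever pushed, so duplicates are dropped, while the heap keeps entries ordered by key.

    USING: assoc-deques heaps kernel prettyprint ;

    <unique-min-heap>
    "http://site.example/" 1 pick heap-push    ! queued
    "http://site.example/" 5 pick heap-push    ! duplicate value, ignored
    heap-pop . .                               ! prints 1, then "http://site.example/"
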
@@ -0,0 +1 @@
+Doug Coleman
@@ -3,7 +3,7 @@
 USING: assocs html.parser kernel math sequences strings ascii
 arrays generalizations shuffle unicode.case namespaces make
 splitting http accessors io combinators http.client urls
-urls.encoding fry sequences.lib ;
+urls.encoding fry ;
 IN: html.parser.analyzer
 
 TUPLE: link attributes clickable ;
@@ -0,0 +1 @@
+Doug Coleman
@@ -0,0 +1,57 @@
+! Copyright (C) 2008 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: help.markup help.syntax io.streams.string urls
+multiline ;
+IN: spider
+
+HELP: <spider>
+{ $values
+    { "base" "a string or url" }
+    { "spider" spider } }
+{ $description "Creates a new web spider with a given base url." } ;
+
+HELP: run-spider
+{ $values
+    { "spider" spider }
+    { "spider" spider } }
+{ $description "Runs a spider until completion. See the " { $subsection "spider-tutorial" } " for a complete description of the tuple slots that affect how the spider works." } ;
+
+HELP: spider
+{ $description "" } ;
+
+HELP: spider-result
+{ $description "" } ;
+
+ARTICLE: "spider-tutorial" "Spider tutorial"
+"To create a new spider, call the " { $link <spider> } " word with a link to the site you wish to spider."
+{ $code <" "http://concatenative.org" <spider> "> }
+"The max-depth is initialized to 0, which retrieves just the initial page. Let's initialize it to something more fun:"
+{ $code <" 1 >>max-depth "> }
+"Now the spider will retrieve the first page and all the pages it links to in the same domain." $nl
+"But suppose the front page contains thousands of links. To avoid grabbing them all, we can set " { $slot "max-count" } " to a reasonable limit."
+{ $code <" 10 >>max-count "> }
+"A timeout might keep the spider from hitting the server too hard:"
+{ $code <" USE: calendar 1.5 seconds >>sleep "> }
+"Since we happen to know that not all pages of a wiki are suitable for spidering, we will spider only the wiki view pages, not the edit or revisions pages. To do this, we add a filter through which new links are tested; links that pass the filter are added to the todo queue, while links that do not are discarded. You can add several filters to the filter array, but we'll just add a single one for now."
+{ $code <" { [ path>> "/wiki/view" head? ] } >>filters "> }
+"Finally, to start the spider, call the " { $link run-spider } " word."
+{ $code "run-spider" }
+"The full code from the tutorial:"
+{ $code <" USING: spider calendar sequences accessors ;
+: spider-concatenative ( -- spider )
+    "http://concatenative.org" <spider>
+    1 >>max-depth
+    10 >>max-count
+    1.5 seconds >>sleep
+    { [ path>> "/wiki/view" head? ] } >>filters
+    run-spider ;"> } ;
+
+ARTICLE: "spider" "Spider"
+"The " { $vocab-link "spider" } " vocabulary implements a simple web spider for retrieving sets of webpages."
+{ $subsection "spider-tutorial" }
+"Creating a new spider:"
+{ $subsection <spider> }
+"Running the spider:"
+{ $subsection run-spider } ;
+
+ABOUT: "spider"
@@ -0,0 +1,97 @@
+! Copyright (C) 2008 Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: accessors fry html.parser html.parser.analyzer
+http.client kernel tools.time sets assocs sequences
+concurrency.combinators io threads namespaces math multiline
+heaps math.parser inspector urls assoc-deques logging
+combinators.short-circuit continuations calendar prettyprint ;
+IN: spider
+
+TUPLE: spider base count max-count sleep max-depth initial-links
+filters spidered todo nonmatching ;
+! secure? agent page-timeout data-timeout overall-timeout
+
+TUPLE: spider-result url depth headers fetch-time parsed-html
+links processing-time timestamp ;
+
+: <spider> ( base -- spider )
+    >url
+    spider new
+        over >>base
+        swap 0 <unique-min-heap> [ heap-push ] keep >>todo
+        <unique-min-heap> >>nonmatching
+        0 >>max-depth
+        0 >>count
+        1/0. >>max-count
+        H{ } clone >>spidered ;
+
+<PRIVATE
+
+: apply-filters ( links spider -- links' )
+    filters>> [ '[ _ 1&& ] filter ] when* ;
+
+: add-todo ( links level spider -- )
+    tuck [ apply-filters ] 2dip
+    tuck
+    [ spidered>> keys diff ]
+    [ todo>> ] 2bi* '[ _ _ heap-push ] each ;
+
+: add-nonmatching ( links level spider -- )
+    nonmatching>> '[ _ _ heap-push ] each ;
+
+: relative-url? ( url -- ? ) protocol>> not ;
+
+: filter-base ( spider spider-result -- base-links nonmatching-links )
+    [ base>> host>> ] [ links>> prune ] bi*
+    [ host>> = ] with partition ;
+
+: add-spidered ( spider spider-result -- )
+    [ [ 1+ ] change-count ] dip
+    2dup [ spidered>> ] [ dup url>> ] bi* rot set-at
+    [ filter-base ] 2keep
+    depth>> 1+ swap
+    [ add-nonmatching ]
+    [ add-todo ] 2bi ;
+
+: print-spidering ( url depth -- )
+    "depth: " write number>string write
+    ", spidering: " write . yield ;
+
+: normalize-hrefs ( links -- links' )
+    [ >url ] map
+    spider get base>> swap [ derive-url ] with map ;
+
+: (spider-page) ( url depth -- spider-result )
+    2dup print-spidering
+    f pick spider get spidered>> set-at
+    over '[ _ http-get ] benchmark swap
+    [ parse-html dup find-hrefs normalize-hrefs ] benchmark
+    now spider-result boa
+    dup describe ;
+
+: spider-page ( url depth -- )
+    (spider-page) spider get swap add-spidered ;
+
+\ spider-page ERROR add-error-logging
+
+: spider-sleep ( -- )
+    spider get sleep>> [ sleep ] when* ;
+
+: queue-initial-links ( spider -- spider )
+    [ initial-links>> normalize-hrefs 0 ] keep
+    [ add-todo ] keep ;
+
+PRIVATE>
+
+: run-spider ( spider -- spider )
+    "spider" [
+        dup spider [
+            queue-initial-links
+            [ todo>> ] [ max-depth>> ] bi
+            '[
+                _ <= spider get
+                [ count>> ] [ max-count>> ] bi < and
+            ] [ spider-page spider-sleep ] slurp-heap-when
+            spider get
+        ] with-variable
+    ] with-logging ;
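A short sketch of driving the spider and inspecting the outcome, with hypothetical settings: run-spider returns the spider, whose spidered slot maps each fetched URL to its spider-result.

    USING: accessors assocs prettyprint spider ;

    "http://concatenative.org" <spider>
        1 >>max-depth
        10 >>max-count
    run-spider
    spidered>> assoc-size .    ! number of pages fetched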