Merge branch 'master' of git://factorcode.org/git/factor
commit
2ef6043566
|
@ -21,12 +21,12 @@ CONSTANT: epsilon T{ tagged-epsilon { tag t } }
|
||||||
TUPLE: concatenation first second ;
|
TUPLE: concatenation first second ;
|
||||||
|
|
||||||
: <concatenation> ( seq -- concatenation )
|
: <concatenation> ( seq -- concatenation )
|
||||||
[ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
|
[ epsilon ] [ [ ] [ concatenation boa ] map-reduce ] if-empty ;
|
||||||
|
|
||||||
TUPLE: alternation first second ;
|
TUPLE: alternation first second ;
|
||||||
|
|
||||||
: <alternation> ( seq -- alternation )
|
: <alternation> ( seq -- alternation )
|
||||||
unclip [ alternation boa ] reduce ;
|
[ ] [ alternation boa ] map-reduce ;
|
||||||
|
|
||||||
TUPLE: star term ;
|
TUPLE: star term ;
|
||||||
C: <star> star
|
C: <star> star
|
||||||
|
|
|
@ -11,12 +11,7 @@ TUPLE: transition-table transitions start-state final-states ;
|
||||||
H{ } clone >>transitions
|
H{ } clone >>transitions
|
||||||
H{ } clone >>final-states ;
|
H{ } clone >>final-states ;
|
||||||
|
|
||||||
: maybe-initialize-key ( key hashtable -- )
|
|
||||||
! Why do we have to do this?
|
|
||||||
2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ;
|
|
||||||
|
|
||||||
:: (set-transition) ( from to obj hash -- )
|
:: (set-transition) ( from to obj hash -- )
|
||||||
to condition? [ to hash maybe-initialize-key ] unless
|
|
||||||
from hash at
|
from hash at
|
||||||
[ [ to obj ] dip set-at ]
|
[ [ to obj ] dip set-at ]
|
||||||
[ to obj associate from hash set-at ] if* ;
|
[ to obj associate from hash set-at ] if* ;
|
||||||
|
@ -25,7 +20,6 @@ TUPLE: transition-table transitions start-state final-states ;
|
||||||
transitions>> (set-transition) ;
|
transitions>> (set-transition) ;
|
||||||
|
|
||||||
:: (add-transition) ( from to obj hash -- )
|
:: (add-transition) ( from to obj hash -- )
|
||||||
to hash maybe-initialize-key
|
|
||||||
from hash at
|
from hash at
|
||||||
[ [ to obj ] dip push-at ]
|
[ [ to obj ] dip push-at ]
|
||||||
[ to 1vector obj associate from hash set-at ] if* ;
|
[ to 1vector obj associate from hash set-at ] if* ;
|
||||||
|
|
|
@ -3,11 +3,21 @@
|
||||||
USING: accessors http.client kernel unicode.categories
|
USING: accessors http.client kernel unicode.categories
|
||||||
sequences urls splitting combinators splitting.monotonic
|
sequences urls splitting combinators splitting.monotonic
|
||||||
combinators.short-circuit assocs unicode.case arrays
|
combinators.short-circuit assocs unicode.case arrays
|
||||||
math.parser calendar.format make ;
|
math.parser calendar.format make fry present globs
|
||||||
|
multiline regexp.combinators regexp ;
|
||||||
IN: robots
|
IN: robots
|
||||||
|
|
||||||
! visit-time is GMT, request-rate is pages/second
|
! visit-time is GMT, request-rate is pages/second
|
||||||
! crawl-rate is seconds
|
! crawl-rate is seconds
|
||||||
|
|
||||||
|
TUPLE: robots site sitemap rules rules-quot ;
|
||||||
|
|
||||||
|
: <robots> ( site sitemap rules -- robots )
|
||||||
|
\ robots new
|
||||||
|
swap >>rules
|
||||||
|
swap >>sitemap
|
||||||
|
swap >>site ;
|
||||||
|
|
||||||
TUPLE: rules user-agents allows disallows
|
TUPLE: rules user-agents allows disallows
|
||||||
visit-time request-rate crawl-delay unknowns ;
|
visit-time request-rate crawl-delay unknowns ;
|
||||||
|
|
||||||
|
@ -40,8 +50,8 @@ visit-time request-rate crawl-delay unknowns ;
|
||||||
H{ } clone >>unknowns ;
|
H{ } clone >>unknowns ;
|
||||||
|
|
||||||
: add-user-agent ( rules agent -- rules ) over user-agents>> push ;
|
: add-user-agent ( rules agent -- rules ) over user-agents>> push ;
|
||||||
: add-allow ( rules allow -- rules ) over allows>> push ;
|
: add-allow ( rules allow -- rules ) >url over allows>> push ;
|
||||||
: add-disallow ( rules disallow -- rules ) over disallows>> push ;
|
: add-disallow ( rules disallow -- rules ) >url over disallows>> push ;
|
||||||
|
|
||||||
: parse-robots.txt-line ( rules seq -- rules )
|
: parse-robots.txt-line ( rules seq -- rules )
|
||||||
first2 swap {
|
first2 swap {
|
||||||
|
@ -57,12 +67,26 @@ visit-time request-rate crawl-delay unknowns ;
|
||||||
[ pick unknowns>> push-at ]
|
[ pick unknowns>> push-at ]
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
|
: derive-urls ( url seq -- seq' )
|
||||||
|
[ derive-url present ] with { } map-as ;
|
||||||
|
|
||||||
|
: robot-rules-quot ( robots -- quot )
|
||||||
|
[
|
||||||
|
[ site>> ] [ rules>> allows>> ] bi
|
||||||
|
derive-urls [ <glob> ] map
|
||||||
|
<or>
|
||||||
|
] [
|
||||||
|
[ site>> ] [ rules>> disallows>> ] bi
|
||||||
|
derive-urls [ <glob> ] map <and> <not>
|
||||||
|
] bi 2array <or> '[ _ matches? ] ;
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
: parse-robots.txt ( string -- sitemaps rules-seq )
|
: parse-robots.txt ( string -- sitemaps rules-seq )
|
||||||
normalize-robots.txt [
|
normalize-robots.txt [
|
||||||
[ <rules> dup ] dip [ parse-robots.txt-line drop ] with each
|
[ <rules> dup ] dip [ parse-robots.txt-line drop ] with each
|
||||||
] map ;
|
] map first ;
|
||||||
|
|
||||||
: robots ( url -- sitemaps rules-seq )
|
: robots ( url -- robots )
|
||||||
get-robots.txt nip parse-robots.txt ;
|
>url
|
||||||
|
dup get-robots.txt nip parse-robots.txt <robots> ;
|
||||||
|
|
Loading…
Reference in New Issue