Merge branch 'master' of git://factorcode.org/git/factor

db4
Slava Pestov 2009-04-06 17:49:42 -05:00
commit 2ef6043566
3 changed files with 32 additions and 14 deletions

Changed file 1 of 3:

@@ -21,12 +21,12 @@ CONSTANT: epsilon T{ tagged-epsilon { tag t } }
 TUPLE: concatenation first second ;
 : <concatenation> ( seq -- concatenation )
-    [ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
+    [ epsilon ] [ [ ] [ concatenation boa ] map-reduce ] if-empty ;
 TUPLE: alternation first second ;
 : <alternation> ( seq -- alternation )
-    unclip [ alternation boa ] reduce ;
+    [ ] [ alternation boa ] map-reduce ;
 TUPLE: star term ;
 C: <star> star
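
Both constructors switch from unclip/reduce to map-reduce with an identity map quotation; for a non-empty sequence the two spellings left-fold identically. A listener sketch with plain arrays standing in for the AST nodes (hypothetical values, not part of the commit):

    USING: arrays prettyprint sequences ;

    ! [ ] [ 2array ] map-reduce builds the same left-nested pairs as
    ! unclip [ 2array ] reduce; both need a non-empty sequence, which
    ! the if-empty guard above supplies for <concatenation>.
    { 1 2 3 4 } [ ] [ 2array ] map-reduce .    ! { { { 1 2 } 3 } 4 }
    { 1 2 3 4 } unclip [ 2array ] reduce .     ! { { { 1 2 } 3 } 4 }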

Changed file 2 of 3:

@@ -11,12 +11,7 @@ TUPLE: transition-table transitions start-state final-states ;
     H{ } clone >>transitions
     H{ } clone >>final-states ;
-: maybe-initialize-key ( key hashtable -- )
-    ! Why do we have to do this?
-    2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ;
 :: (set-transition) ( from to obj hash -- )
-    to condition? [ to hash maybe-initialize-key ] unless
     from hash at
     [ [ to obj ] dip set-at ]
     [ to obj associate from hash set-at ] if* ;
@@ -25,7 +20,6 @@ TUPLE: transition-table transitions start-state final-states ;
     transitions>> (set-transition) ;
 :: (add-transition) ( from to obj hash -- )
-    to hash maybe-initialize-key
     from hash at
     [ [ to obj ] dip push-at ]
     [ to 1vector obj associate from hash set-at ] if* ;
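
With maybe-initialize-key gone, both words lean on their if* fallback: a missing from row is built with associate (wrapping the target state in a 1vector for (add-transition)), and push-at grows an existing row in place. A listener sketch of those two assoc words (hypothetical keys and values):

    USING: assocs kernel prettyprint ;

    2 "a" associate .                      ! H{ { "a" 2 } }
    3 "a" H{ } clone [ push-at ] keep .    ! H{ { "a" V{ 3 } } }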

Changed file 3 of 3:

@@ -3,11 +3,21 @@
 USING: accessors http.client kernel unicode.categories
 sequences urls splitting combinators splitting.monotonic
 combinators.short-circuit assocs unicode.case arrays
-math.parser calendar.format make ;
+math.parser calendar.format make fry present globs
+multiline regexp.combinators regexp ;
 IN: robots
 ! visit-time is GMT, request-rate is pages/second
 ! crawl-rate is seconds
+TUPLE: robots site sitemap rules rules-quot ;
+: <robots> ( site sitemap rules -- robots )
+    \ robots new
+        swap >>rules
+        swap >>sitemap
+        swap >>site ;
 TUPLE: rules user-agents allows disallows
 visit-time request-rate crawl-delay unknowns ;
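
The new robots tuple bundles what parse-robots.txt already produced; <robots> fills the slots in reverse argument order and leaves rules-quot unset. A minimal listener sketch (hypothetical arguments):

    USING: accessors prettyprint robots urls ;

    ! site, sitemap and rules are stored by the three swap >>slot calls.
    URL" http://example.com/" f { } <robots> site>> .   ! URL" http://example.com/"
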
@@ -40,8 +50,8 @@ visit-time request-rate crawl-delay unknowns ;
     H{ } clone >>unknowns ;
 : add-user-agent ( rules agent -- rules ) over user-agents>> push ;
-: add-allow ( rules allow -- rules ) over allows>> push ;
-: add-disallow ( rules disallow -- rules ) over disallows>> push ;
+: add-allow ( rules allow -- rules ) >url over allows>> push ;
+: add-disallow ( rules disallow -- rules ) >url over disallows>> push ;
 : parse-robots.txt-line ( rules seq -- rules )
     first2 swap {
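
add-allow and add-disallow now convert each Allow/Disallow value with >url before storing it, so the rules hold url objects rather than raw path strings. A quick check of that conversion (hypothetical path):

    USING: accessors prettyprint urls ;

    "/private/" >url url? .      ! t -- the stored value is a url object
    "/private/" >url path>> .    ! "/private/"
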
@@ -57,12 +67,26 @@ visit-time request-rate crawl-delay unknowns ;
         [ pick unknowns>> push-at ]
     } case ;
+: derive-urls ( url seq -- seq' )
+    [ derive-url present ] with { } map-as ;
+: robot-rules-quot ( robots -- quot )
+    [
+        [ site>> ] [ rules>> allows>> ] bi
+        derive-urls [ <glob> ] map
+        <or>
+    ] [
+        [ site>> ] [ rules>> disallows>> ] bi
+        derive-urls [ <glob> ] map <and> <not>
+    ] bi 2array <or> '[ _ matches? ] ;
 PRIVATE>
 : parse-robots.txt ( string -- sitemaps rules-seq )
     normalize-robots.txt [
         [ <rules> dup ] dip [ parse-robots.txt-line drop ] with each
-    ] map ;
+    ] map first ;
-: robots ( url -- sitemaps rules-seq )
-    get-robots.txt nip parse-robots.txt ;
+: robots ( url -- robots )
+    >url
+    dup get-robots.txt nip parse-robots.txt <robots> ;
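
The private robot-rules-quot word compiles the parsed rules into a predicate: the allowed patterns are <or>'d together, the disallowed patterns are <and>'d and negated, and '[ _ matches? ] closes over the disjunction of the two. A rough listener sketch of that shape, using plain regexps in place of the glob-derived ones and a hypothetical helper word:

    USING: arrays prettyprint regexp regexp.combinators ;

    ! allowed? mirrors or( allow-patterns, not( and( disallow-patterns ) ) )
    : allowed? ( path -- ? )
        { R/ public.*/ } <or>
        { R/ private.*/ } <and> <not>
        2array <or> matches? ;

    "public-page" allowed? .     ! t
    "private-page" allowed? .    ! f

The public robots word now fetches and parses robots.txt for a site URL and wraps the result in a robots tuple instead of returning the sitemaps and rules directly.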