diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor
index be657227e5..2916ef7c32 100644
--- a/basis/regexp/ast/ast.factor
+++ b/basis/regexp/ast/ast.factor
@@ -21,12 +21,12 @@ CONSTANT: epsilon T{ tagged-epsilon { tag t } }
 TUPLE: concatenation first second ;
 
 : <concatenation> ( seq -- concatenation )
-    [ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
+    [ epsilon ] [ [ ] [ concatenation boa ] map-reduce ] if-empty ;
 
 TUPLE: alternation first second ;
 
 : <alternation> ( seq -- alternation )
-    unclip [ alternation boa ] reduce ;
+    [ ] [ alternation boa ] map-reduce ;
 
 TUPLE: star term ;
 C: <star> star
diff --git a/basis/regexp/transition-tables/transition-tables.factor b/basis/regexp/transition-tables/transition-tables.factor
index 3c33ae8846..f452e3d24a 100644
--- a/basis/regexp/transition-tables/transition-tables.factor
+++ b/basis/regexp/transition-tables/transition-tables.factor
@@ -11,12 +11,7 @@ TUPLE: transition-table transitions start-state final-states ;
         H{ } clone >>transitions
         H{ } clone >>final-states ;
 
-: maybe-initialize-key ( key hashtable -- )
-    ! Why do we have to do this?
-    2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ;
-
 :: (set-transition) ( from to obj hash -- )
-    to condition? [ to hash maybe-initialize-key ] unless
     from hash at
     [ [ to obj ] dip set-at ]
     [ to obj associate from hash set-at ] if* ;
@@ -25,7 +20,6 @@ TUPLE: transition-table transitions start-state final-states ;
     transitions>> (set-transition) ;
 
 :: (add-transition) ( from to obj hash -- )
-    to hash maybe-initialize-key
     from hash at
     [ [ to obj ] dip push-at ]
     [ to 1vector obj associate from hash set-at ] if* ;
diff --git a/extra/robots/robots.factor b/extra/robots/robots.factor
index 1b2422f06e..242aa1dea2 100644
--- a/extra/robots/robots.factor
+++ b/extra/robots/robots.factor
@@ -3,11 +3,21 @@
 USING: accessors http.client kernel unicode.categories
 sequences urls splitting combinators splitting.monotonic
 combinators.short-circuit assocs unicode.case arrays
-math.parser calendar.format make ;
+math.parser calendar.format make fry present globs
+multiline regexp.combinators regexp ;
 IN: robots
 
 ! visit-time is GMT, request-rate is pages/second
 ! crawl-rate is seconds
+
+TUPLE: robots site sitemap rules rules-quot ;
+
+: <robots> ( site sitemap rules -- robots )
+    \ robots new
+        swap >>rules
+        swap >>sitemap
+        swap >>site ;
+
 TUPLE: rules user-agents allows disallows
 visit-time request-rate crawl-delay unknowns ;
 
@@ -40,8 +50,8 @@ visit-time request-rate crawl-delay unknowns ;
         H{ } clone >>unknowns ;
 
 : add-user-agent ( rules agent -- rules ) over user-agents>> push ;
-: add-allow ( rules allow -- rules ) over allows>> push ;
-: add-disallow ( rules disallow -- rules ) over disallows>> push ;
+: add-allow ( rules allow -- rules ) >url over allows>> push ;
+: add-disallow ( rules disallow -- rules ) >url over disallows>> push ;
 
 : parse-robots.txt-line ( rules seq -- rules )
     first2 swap {
@@ -57,12 +67,26 @@ visit-time request-rate crawl-delay unknowns ;
         [ pick unknowns>> push-at ]
     } case ;
 
+: derive-urls ( url seq -- seq' )
+    [ derive-url present ] with { } map-as ;
+
+: robot-rules-quot ( robots -- quot )
+    [
+        [ site>> ] [ rules>> allows>> ] bi
+        derive-urls [ <glob> ] map
+        <or>
+    ] [
+        [ site>> ] [ rules>> disallows>> ] bi
+        derive-urls [ <glob> ] map <and> <not>
+    ] bi 2array <or> '[ _ matches? ] ;
+
 PRIVATE>
 
 : parse-robots.txt ( string -- sitemaps rules-seq )
     normalize-robots.txt [
         [ <rules> dup ] dip [ parse-robots.txt-line drop ] with each
-    ] map ;
+    ] map first ;
 
-: robots ( url -- sitemaps rules-seq )
-    get-robots.txt nip parse-robots.txt ;
+: robots ( url -- robots )
+    >url
+    dup get-robots.txt nip parse-robots.txt <robots> ;
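
A rough usage sketch of the reworked public word (not part of the patch; the URL is only an example, the printed output depends on the remote robots.txt, and robot-rules-quot stays private here so it is not called directly):

! Fetch and parse a site's robots.txt into a robots tuple,
! then inspect its slots with the usual accessors.
USING: accessors kernel prettyprint robots ;

"http://factorcode.org" robots
[ site>> . ] [ sitemap>> . ] [ rules>> . ] tri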