From 77f99eb70cda805feb463f1e77c0eb34fb25cc7a Mon Sep 17 00:00:00 2001
From: Doug Coleman
Date: Mon, 6 Apr 2009 17:32:20 -0500
Subject: [PATCH 1/2] use map-reduce instead of unclip reduce, "Why do we need this?" --> we don't

---
 basis/regexp/ast/ast.factor                             | 4 ++--
 basis/regexp/transition-tables/transition-tables.factor | 6 ------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor
index be657227e5..2916ef7c32 100644
--- a/basis/regexp/ast/ast.factor
+++ b/basis/regexp/ast/ast.factor
@@ -21,12 +21,12 @@ CONSTANT: epsilon T{ tagged-epsilon { tag t } }
 TUPLE: concatenation first second ;
 
 : <concatenation> ( seq -- concatenation )
-    [ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
+    [ epsilon ] [ [ ] [ concatenation boa ] map-reduce ] if-empty ;
 
 TUPLE: alternation first second ;
 
 : <alternation> ( seq -- alternation )
-    unclip [ alternation boa ] reduce ;
+    [ ] [ alternation boa ] map-reduce ;
 
 TUPLE: star term ;
 C: <star> star
diff --git a/basis/regexp/transition-tables/transition-tables.factor b/basis/regexp/transition-tables/transition-tables.factor
index 3c33ae8846..f452e3d24a 100644
--- a/basis/regexp/transition-tables/transition-tables.factor
+++ b/basis/regexp/transition-tables/transition-tables.factor
@@ -11,12 +11,7 @@ TUPLE: transition-table transitions start-state final-states ;
         H{ } clone >>transitions
         H{ } clone >>final-states ;
 
-: maybe-initialize-key ( key hashtable -- )
-    ! Why do we have to do this?
-    2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ;
-
 :: (set-transition) ( from to obj hash -- )
-    to condition? [ to hash maybe-initialize-key ] unless
     from hash at
     [ [ to obj ] dip set-at ]
     [ to obj associate from hash set-at ] if* ;
@@ -25,7 +20,6 @@ TUPLE: transition-table transitions start-state final-states ;
     transitions>> (set-transition) ;
 
 :: (add-transition) ( from to obj hash -- )
-    to hash maybe-initialize-key
     from hash at
     [ [ to obj ] dip push-at ]
     [ to 1vector obj associate from hash set-at ] if* ;

From b6064813c92acb6ad64985258995dd2479356248 Mon Sep 17 00:00:00 2001
From: Doug Coleman
Date: Mon, 6 Apr 2009 17:39:44 -0500
Subject: [PATCH 2/2] construct a glob-matching quote for robots.txt files

---
 extra/robots/robots.factor | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/extra/robots/robots.factor b/extra/robots/robots.factor
index 1b2422f06e..242aa1dea2 100644
--- a/extra/robots/robots.factor
+++ b/extra/robots/robots.factor
@@ -3,11 +3,21 @@
 USING: accessors http.client kernel unicode.categories
 sequences urls splitting combinators splitting.monotonic
 combinators.short-circuit assocs unicode.case arrays
-math.parser calendar.format make ;
+math.parser calendar.format make fry present globs
+multiline regexp.combinators regexp ;
 IN: robots
 
 ! visit-time is GMT, request-rate is pages/second
 ! crawl-rate is seconds
+
+TUPLE: robots site sitemap rules rules-quot ;
+
+: <robots> ( site sitemap rules -- robots )
+    \ robots new
+        swap >>rules
+        swap >>sitemap
+        swap >>site ;
+
 
 TUPLE: rules user-agents allows disallows
 visit-time request-rate crawl-delay unknowns ;
@@ -40,8 +50,8 @@ visit-time request-rate crawl-delay unknowns ;
         H{ } clone >>unknowns ;
 
 : add-user-agent ( rules agent -- rules ) over user-agents>> push ;
-: add-allow ( rules allow -- rules ) over allows>> push ;
-: add-disallow ( rules disallow -- rules ) over disallows>> push ;
+: add-allow ( rules allow -- rules ) >url over allows>> push ;
+: add-disallow ( rules disallow -- rules ) >url over disallows>> push ;
 
 : parse-robots.txt-line ( rules seq -- rules )
     first2 swap {
@@ -57,12 +67,26 @@ visit-time request-rate crawl-delay unknowns ;
         [ pick unknowns>> push-at ]
     } case ;
 
+: derive-urls ( url seq -- seq' )
+    [ derive-url present ] with { } map-as ;
+
+: robot-rules-quot ( robots -- quot )
+    [
+        [ site>> ] [ rules>> allows>> ] bi
+        derive-urls [ <glob> ] map
+        <or>
+    ] [
+        [ site>> ] [ rules>> disallows>> ] bi
+        derive-urls [ <glob> ] map
+    ] bi 2array '[ _ matches? ] ;
+
 PRIVATE>
 
 : parse-robots.txt ( string -- sitemaps rules-seq )
     normalize-robots.txt [
         [ dup ] dip [ parse-robots.txt-line drop ] with each
-    ] map ;
+    ] map first ;
 
-: robots ( url -- sitemaps rules-seq )
-    get-robots.txt nip parse-robots.txt ;
+: robots ( url -- robots )
+    >url
+    dup get-robots.txt nip parse-robots.txt ;
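
Notes on the two patches (illustrative sketches only, not part of the committed diffs):

PATCH 1/2: map-reduce builds the same left-nested fold as the old unclip/reduce
pair; it also throws on an empty sequence, which is why <concatenation> keeps
its if-empty guard while <alternation> does not. A rough listener sketch with
regexp.ast loaded, using integers to stand in for AST nodes:

    { 1 2 3 } [ ] [ concatenation boa ] map-reduce .
    ! prints roughly T{ concatenation f T{ concatenation f 1 2 } 3 }

PATCH 2/2: the new words lean on the globs and regexp.combinators vocabularies:
each allow/disallow path is derived against the site URL, turned into a glob,
and <or> joins the globs into one matcher that matches? can test a URL string
against. A standalone sketch of that idea (the patterns and the test string
below are invented, not taken from any robots.txt):

    USING: globs prettyprint regexp regexp.combinators sequences ;
    "robots.txt" { "*.txt" "*.html" } [ <glob> ] map <or> matches? .
    ! should print t: the string matches the first glob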