From b6064813c92acb6ad64985258995dd2479356248 Mon Sep 17 00:00:00 2001
From: Doug Coleman
Date: Mon, 6 Apr 2009 17:39:44 -0500
Subject: [PATCH] construct a glob-matching quote for robots.txt files

---
 extra/robots/robots.factor | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/extra/robots/robots.factor b/extra/robots/robots.factor
index 1b2422f06e..242aa1dea2 100644
--- a/extra/robots/robots.factor
+++ b/extra/robots/robots.factor
@@ -3,11 +3,21 @@
 USING: accessors http.client kernel unicode.categories
 sequences urls splitting combinators splitting.monotonic
 combinators.short-circuit assocs unicode.case arrays
-math.parser calendar.format make ;
+math.parser calendar.format make fry present globs
+multiline regexp.combinators regexp ;
 IN: robots
 
 ! visit-time is GMT, request-rate is pages/second
 ! crawl-rate is seconds
+
+TUPLE: robots site sitemap rules rules-quot ;
+
+: <robots> ( site sitemap rules -- robots )
+    \ robots new
+        swap >>rules
+        swap >>sitemap
+        swap >>site ;
+
 TUPLE: rules user-agents allows disallows
 visit-time request-rate crawl-delay unknowns ;
 
@@ -40,8 +50,8 @@ visit-time request-rate crawl-delay unknowns ;
         H{ } clone >>unknowns ;
 
 : add-user-agent ( rules agent -- rules ) over user-agents>> push ;
-: add-allow ( rules allow -- rules ) over allows>> push ;
-: add-disallow ( rules disallow -- rules ) over disallows>> push ;
+: add-allow ( rules allow -- rules ) >url over allows>> push ;
+: add-disallow ( rules disallow -- rules ) >url over disallows>> push ;
 
 : parse-robots.txt-line ( rules seq -- rules )
     first2 swap {
@@ -57,12 +67,26 @@ visit-time request-rate crawl-delay unknowns ;
         [ pick unknowns>> push-at ]
     } case ;
 
+: derive-urls ( url seq -- seq' )
+    [ derive-url present ] with { } map-as ;
+
+: robot-rules-quot ( robots -- quot )
+    [
+        [ site>> ] [ rules>> allows>> ] bi
+        derive-urls [ <glob> ] map
+
+    ] [
+        [ site>> ] [ rules>> disallows>> ] bi
+        derive-urls [ <glob> ] map
+    ] bi 2array '[ _ matches? ] ;
+
 PRIVATE>
 
 : parse-robots.txt ( string -- sitemaps rules-seq )
     normalize-robots.txt [
         [ <rules> dup ] dip [ parse-robots.txt-line drop ] with each
-    ] map first ;
+    ] map first ;
 
-: robots ( url -- sitemaps rules-seq )
-    get-robots.txt nip parse-robots.txt ;
+: robots ( url -- robots )
+    >url
+    dup get-robots.txt nip parse-robots.txt <robots> ;
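
A rough usage sketch, not part of the patch itself: with the updated robots
vocabulary loaded, the new public robots word fetches and parses a site's
robots.txt into the robots tuple defined above. The host below is only an
example and the call assumes it is reachable over HTTP:

    USING: robots prettyprint ;
    "http://concatenative.org/" robots .   ! prints the parsed robots tuple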