From 1e14a83ee1f657477ab8ede7720ff8ffbab59d2e Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Fri, 19 Jun 2009 14:41:48 -0500 Subject: [PATCH] allow robot-identities to be set for robots.txt --- extra/robots/robots.factor | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/extra/robots/robots.factor b/extra/robots/robots.factor index 3c0eb045f7..af039ef8c4 100644 --- a/extra/robots/robots.factor +++ b/extra/robots/robots.factor @@ -1,15 +1,18 @@ ! Copyright (C) 2009 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors http.client kernel unicode.categories -sequences urls splitting combinators splitting.monotonic -combinators.short-circuit assocs unicode.case arrays -math.parser calendar.format make fry present globs -multiline regexp.combinators regexp ; +USING: accessors arrays assocs calendar.format combinators +combinators.short-circuit fry globs http.client kernel make +math.parser multiline namespaces present regexp +regexp.combinators sequences sets splitting splitting.monotonic +unicode.case unicode.categories urls ; IN: robots ! visit-time is GMT, request-rate is pages/second ! crawl-rate is seconds +SYMBOL: robot-identities +robot-identities [ { "FactorSpider" } ] initialize + TUPLE: robots site sitemap rules rules-quot ; : ( site sitemap rules -- robots ) @@ -80,6 +83,13 @@ visit-time request-rate crawl-delay unknowns ; derive-urls [ ] map ] bi 2array '[ _ matches? ] ; +: relevant-rules ( robots -- rules ) + [ + user-agents>> [ + robot-identities get [ swap glob-matches? ] with any? + ] any? + ] filter ; + PRIVATE> : parse-robots.txt ( string -- sitemaps rules-seq )