allow robot-identifiers to be set for robots.txt

db4
Doug Coleman 2009-06-19 14:41:48 -05:00
parent 9c45840b5d
commit 1e14a83ee1
1 changed files with 15 additions and 5 deletions

View File

@ -1,15 +1,18 @@
! Copyright (C) 2009 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors http.client kernel unicode.categories
sequences urls splitting combinators splitting.monotonic
combinators.short-circuit assocs unicode.case arrays
math.parser calendar.format make fry present globs
multiline regexp.combinators regexp ;
USING: accessors arrays assocs calendar.format combinators
combinators.short-circuit fry globs http.client kernel make
math.parser multiline namespaces present regexp
regexp.combinators sequences sets splitting splitting.monotonic
unicode.case unicode.categories urls ;
IN: robots
! visit-time is GMT, request-rate is pages/second
! crawl-delay is in seconds
! Variable holding the sequence of identifier strings this robot answers to.
! relevant-rules reads it and glob-matches every entry against the
! user-agents of each rule set. The default below is only installed when the
! variable has no global value yet, so callers may set their own identifiers.
SYMBOL: robot-identities
robot-identities [ { "FactorSpider" } ] initialize
! Tuple with the slots site, sitemap, rules and rules-quot.
! NOTE(review): <robots> takes only site, sitemap and rules as inputs, so
! rules-quot is presumably derived from rules by the constructor -- confirm.
TUPLE: robots site sitemap rules rules-quot ;
: <robots> ( site sitemap rules -- robots )
@ -80,6 +83,13 @@ visit-time request-rate crawl-delay unknowns ;
derive-urls [ <glob> ] map <and> <not>
] bi 2array <or> '[ _ matches? ] ;
! Keep only the elements of the input sequence that apply to this robot:
! an element is kept when at least one glob in its user-agents slot matches
! at least one of the strings stored in robot-identities.
: relevant-rules ( robots -- rules )
    [
        user-agents>>
        ! For each user-agent glob, test every identity against it; the
        ! identity is the input and the user-agent is the glob pattern.
        [ robot-identities get swap '[ _ glob-matches? ] any? ] any?
    ] filter ;
PRIVATE>
: parse-robots.txt ( string -- sitemaps rules-seq )