| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  | ! Copyright (C) 2009 Doug Coleman. | 
					
						
							|  |  |  | ! See http://factorcode.org/license.txt for BSD license. | 
					
						
							| 
									
										
										
										
											2009-06-19 15:41:48 -04:00
										 |  |  | USING: accessors arrays assocs calendar.format combinators | 
					
						
							|  |  |  | combinators.short-circuit fry globs http.client kernel make | 
					
						
							|  |  |  | math.parser multiline namespaces present regexp | 
					
						
							|  |  |  | regexp.combinators sequences sets splitting splitting.monotonic | 
					
						
							|  |  |  | unicode.case unicode.categories urls ;
 | 
					
						
							| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  | IN: robots | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ! visit-time is GMT, request-rate is pages/second  | 
					
						
							|  |  |  | ! crawl-rate is seconds | 
					
						
							| 
									
										
										
										
											2009-04-06 18:39:44 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-06-19 15:41:48 -04:00
										 |  |  | SYMBOL: robot-identities | 
					
						
							|  |  |  | robot-identities [ { "FactorSpider" } ] initialize
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-06 18:39:44 -04:00
										 |  |  | TUPLE: robots site sitemap rules rules-quot ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : <robots> ( site sitemap rules -- robots )
 | 
					
						
							|  |  |  |     \ robots new
 | 
					
						
							|  |  |  |         swap >>rules | 
					
						
							|  |  |  |         swap >>sitemap | 
					
						
							|  |  |  |         swap >>site ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  | TUPLE: rules user-agents allows disallows | 
					
						
							|  |  |  | visit-time request-rate crawl-delay unknowns ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | <PRIVATE
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : >robots.txt-url ( url -- url' )
 | 
					
						
							|  |  |  |     >url URL" robots.txt" derive-url ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-10-12 13:25:42 -04:00
										 |  |  | : get-robots.txt ( url -- robots.txt )
 | 
					
						
							| 
									
										
										
										
											2014-03-12 22:29:35 -04:00
										 |  |  |     >robots.txt-url http-get nip ;
 | 
					
						
							| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : normalize-robots.txt ( string -- sitemaps seq )
 | 
					
						
							|  |  |  |     string-lines | 
					
						
							|  |  |  |     [ [ blank? ] trim ] map
 | 
					
						
							|  |  |  |     [ "#" head? not ] filter harvest
 | 
					
						
							|  |  |  |     [ ":" split1 [ [ blank? ] trim ] bi@ [ >lower ] dip  ] { } map>assoc
 | 
					
						
							|  |  |  |     [ first "sitemap" = ] partition [ values ] dip
 | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         { | 
					
						
							| 
									
										
										
										
											2011-10-15 22:19:44 -04:00
										 |  |  |             [ [ first "user-agent" = ] both? ] | 
					
						
							| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  |             [ nip first "user-agent" = not ] | 
					
						
							|  |  |  |         } 2||  | 
					
						
							|  |  |  |     ] monotonic-split ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : <rules> ( -- rules )
 | 
					
						
							|  |  |  |     rules new
 | 
					
						
							|  |  |  |         V{ } clone >>user-agents | 
					
						
							|  |  |  |         V{ } clone >>allows | 
					
						
							|  |  |  |         V{ } clone >>disallows | 
					
						
							|  |  |  |         H{ } clone >>unknowns ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : add-user-agent ( rules agent -- rules ) over user-agents>> push ;
 | 
					
						
							| 
									
										
										
										
											2009-04-06 18:39:44 -04:00
										 |  |  | : add-allow ( rules allow -- rules ) >url over allows>> push ;
 | 
					
						
							|  |  |  | : add-disallow ( rules disallow -- rules ) >url over disallows>> push ;
 | 
					
						
							| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : parse-robots.txt-line ( rules seq -- rules )
 | 
					
						
							|  |  |  |     first2 swap { | 
					
						
							|  |  |  |         { "user-agent" [ add-user-agent ] } | 
					
						
							|  |  |  |         { "allow" [ add-allow ] } | 
					
						
							|  |  |  |         { "disallow" [ add-disallow ] } | 
					
						
							|  |  |  |         { "crawl-delay" [ string>number >>crawl-delay ] } | 
					
						
							|  |  |  |         { "request-rate" [ string>number >>request-rate ] } | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |             "visit-time" [ "-" split1 [ hhmm>timestamp ] bi@ 2array
 | 
					
						
							|  |  |  |             >>visit-time | 
					
						
							|  |  |  |         ] } | 
					
						
							|  |  |  |         [ pick unknowns>> push-at ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-06 18:39:44 -04:00
										 |  |  | : derive-urls ( url seq -- seq' )
 | 
					
						
							|  |  |  |     [ derive-url present ] with { } map-as ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : robot-rules-quot ( robots -- quot )
 | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         [ site>> ] [ rules>> allows>> ] bi
 | 
					
						
							|  |  |  |         derive-urls [ <glob> ] map
 | 
					
						
							|  |  |  |         <or> | 
					
						
							|  |  |  |     ] [ | 
					
						
							|  |  |  |         [ site>> ] [ rules>> disallows>> ] bi
 | 
					
						
							|  |  |  |         derive-urls [ <glob> ] map <and> <not> | 
					
						
							|  |  |  |     ] bi 2array <or> '[ _ matches? ] ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-06-19 15:41:48 -04:00
										 |  |  | : relevant-rules ( robots -- rules )
 | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         user-agents>> [ | 
					
						
							|  |  |  |             robot-identities get [ swap glob-matches? ] with any?
 | 
					
						
							|  |  |  |         ] any?
 | 
					
						
							|  |  |  |     ] filter ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  | PRIVATE>
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : parse-robots.txt ( string -- sitemaps rules-seq )
 | 
					
						
							|  |  |  |     normalize-robots.txt [ | 
					
						
							|  |  |  |         [ <rules> dup ] dip [ parse-robots.txt-line drop ] with each
 | 
					
						
							| 
									
										
										
										
											2009-04-07 11:08:01 -04:00
										 |  |  |     ] map ;
 | 
					
						
							| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-03-24 13:07:31 -04:00
										 |  |  | : url>robots ( url -- robots )
 | 
					
						
							| 
									
										
										
										
											2013-10-12 13:25:42 -04:00
										 |  |  |     >url dup get-robots.txt parse-robots.txt <robots> ;
 |