| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  | ! Copyright (C) 2009 Doug Coleman. | 
					
						
							|  |  |  | ! See http://factorcode.org/license.txt for BSD license. | 
					
						
							| 
									
										
										
										
											2009-04-07 11:08:01 -04:00
										 |  |  | USING: calendar io.encodings.utf8 io.files robots tools.test | 
					
						
							|  |  |  | urls ;
 | 
					
						
							| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  | IN: robots.tests | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | [ | 
					
						
							| 
									
										
										
										
											2009-04-07 11:08:01 -04:00
										 |  |  |     { "http://www.chiplist.com/sitemap.txt" } | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "*" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows | 
					
						
							|  |  |  |                 V{ | 
					
						
							|  |  |  |                     URL" /cgi-bin/" | 
					
						
							|  |  |  |                     URL" /scripts/" | 
					
						
							|  |  |  |                     URL" /ChipList2/scripts/" | 
					
						
							|  |  |  |                     URL" /ChipList2/styles/" | 
					
						
							|  |  |  |                     URL" /ads/" | 
					
						
							|  |  |  |                     URL" /ChipList2/ads/" | 
					
						
							|  |  |  |                     URL" /advertisements/" | 
					
						
							|  |  |  |                     URL" /ChipList2/advertisements/" | 
					
						
							|  |  |  |                     URL" /graphics/" | 
					
						
							|  |  |  |                     URL" /ChipList2/graphics/" | 
					
						
							|  |  |  |                 } | 
					
						
							| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  |             } | 
					
						
							| 
									
										
										
										
											2009-04-07 11:08:01 -04:00
										 |  |  |             { visit-time | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     T{ timestamp { hour 2 } } | 
					
						
							|  |  |  |                     T{ timestamp { hour 5 } } | 
					
						
							|  |  |  |                 } | 
					
						
							| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  |             } | 
					
						
							| 
									
										
										
										
											2009-04-07 11:08:01 -04:00
										 |  |  |             { request-rate 1 } | 
					
						
							|  |  |  |             { crawl-delay 1 } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  |         } | 
					
						
							| 
									
										
										
										
											2009-04-07 11:08:01 -04:00
										 |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "UbiCrawler" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "DOC" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "Zao" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "sitecheck.internetseer.com" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "Zealbot" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "MSIECrawler" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "SiteSnagger" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "WebStripper" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "WebCopier" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "Fetch" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "Offline Explorer" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "Teleport" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "TeleportPro" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "WebZIP" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "linko" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "HTTrack" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "Microsoft.URL.Control" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "Xenu" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "larbin" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "libwww" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "ZyBORG" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "Download Ninja" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "wget" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "grub-client" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "k2spider" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "NPBot" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents V{ "WebReaper" } } | 
					
						
							|  |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         T{ rules | 
					
						
							|  |  |  |             { user-agents | 
					
						
							|  |  |  |                 V{ | 
					
						
							|  |  |  |                     "abot" | 
					
						
							|  |  |  |                     "ALeadSoftbot" | 
					
						
							|  |  |  |                     "BeijingCrawler" | 
					
						
							|  |  |  |                     "BilgiBot" | 
					
						
							|  |  |  |                     "bot" | 
					
						
							|  |  |  |                     "botlist" | 
					
						
							|  |  |  |                     "BOTW Spider" | 
					
						
							|  |  |  |                     "bumblebee" | 
					
						
							|  |  |  |                     "Bumblebee" | 
					
						
							|  |  |  |                     "BuzzRankingBot" | 
					
						
							|  |  |  |                     "Charlotte" | 
					
						
							|  |  |  |                     "Clushbot" | 
					
						
							|  |  |  |                     "Crawler" | 
					
						
							|  |  |  |                     "CydralSpider" | 
					
						
							|  |  |  |                     "DataFountains" | 
					
						
							|  |  |  |                     "DiamondBot" | 
					
						
							|  |  |  |                     "Dulance bot" | 
					
						
							|  |  |  |                     "DYNAMIC" | 
					
						
							|  |  |  |                     "EARTHCOM.info" | 
					
						
							|  |  |  |                     "EDI" | 
					
						
							|  |  |  |                     "envolk" | 
					
						
							|  |  |  |                     "Exabot" | 
					
						
							|  |  |  |                     "Exabot-Images" | 
					
						
							|  |  |  |                     "Exabot-Test" | 
					
						
							|  |  |  |                     "exactseek-pagereaper" | 
					
						
							|  |  |  |                     "Exalead NG" | 
					
						
							|  |  |  |                     "FANGCrawl" | 
					
						
							|  |  |  |                     "Feed::Find" | 
					
						
							|  |  |  |                     "flatlandbot" | 
					
						
							|  |  |  |                     "Gigabot" | 
					
						
							|  |  |  |                     "GigabotSiteSearch" | 
					
						
							|  |  |  |                     "GurujiBot" | 
					
						
							|  |  |  |                     "Hatena Antenna" | 
					
						
							|  |  |  |                     "Hatena Bookmark" | 
					
						
							|  |  |  |                     "Hatena RSS" | 
					
						
							|  |  |  |                     "HatenaScreenshot" | 
					
						
							|  |  |  |                     "Helix" | 
					
						
							|  |  |  |                     "HiddenMarket" | 
					
						
							|  |  |  |                     "HyperEstraier" | 
					
						
							|  |  |  |                     "iaskspider" | 
					
						
							|  |  |  |                     "IIITBOT" | 
					
						
							|  |  |  |                     "InfociousBot" | 
					
						
							|  |  |  |                     "iVia" | 
					
						
							|  |  |  |                     "iVia Page Fetcher" | 
					
						
							|  |  |  |                     "Jetbot" | 
					
						
							|  |  |  |                     "Kolinka Forum Search" | 
					
						
							|  |  |  |                     "KRetrieve" | 
					
						
							|  |  |  |                     "LetsCrawl.com" | 
					
						
							|  |  |  |                     "Lincoln State Web Browser" | 
					
						
							|  |  |  |                     "Links4US-Crawler" | 
					
						
							|  |  |  |                     "LOOQ" | 
					
						
							|  |  |  |                     "Lsearch/sondeur" | 
					
						
							|  |  |  |                     "MapoftheInternet.com" | 
					
						
							|  |  |  |                     "NationalDirectory" | 
					
						
							|  |  |  |                     "NetCarta_WebMapper" | 
					
						
							|  |  |  |                     "NewsGator" | 
					
						
							|  |  |  |                     "NextGenSearchBot" | 
					
						
							|  |  |  |                     "ng" | 
					
						
							|  |  |  |                     "nicebot" | 
					
						
							|  |  |  |                     "NP" | 
					
						
							|  |  |  |                     "NPBot" | 
					
						
							|  |  |  |                     "Nudelsalat" | 
					
						
							|  |  |  |                     "Nutch" | 
					
						
							|  |  |  |                     "OmniExplorer_Bot" | 
					
						
							|  |  |  |                     "OpenIntelligenceData" | 
					
						
							|  |  |  |                     "Oracle Enterprise Search" | 
					
						
							|  |  |  |                     "Pajaczek" | 
					
						
							|  |  |  |                     "panscient.com" | 
					
						
							|  |  |  |                     "PeerFactor 404 crawler" | 
					
						
							|  |  |  |                     "PeerFactor Crawler" | 
					
						
							|  |  |  |                     "PlantyNet" | 
					
						
							|  |  |  |                     "PlantyNet_WebRobot" | 
					
						
							|  |  |  |                     "plinki" | 
					
						
							|  |  |  |                     "PMAFind" | 
					
						
							|  |  |  |                     "Pogodak!" | 
					
						
							|  |  |  |                     "QuickFinder Crawler" | 
					
						
							|  |  |  |                     "Radiation Retriever" | 
					
						
							|  |  |  |                     "Reaper" | 
					
						
							|  |  |  |                     "RedCarpet" | 
					
						
							|  |  |  |                     "ScorpionBot" | 
					
						
							|  |  |  |                     "Scrubby" | 
					
						
							|  |  |  |                     "Scumbot" | 
					
						
							|  |  |  |                     "searchbot" | 
					
						
							|  |  |  |                     "Seeker.lookseek.com" | 
					
						
							|  |  |  |                     "SeznamBot" | 
					
						
							|  |  |  |                     "ShowXML" | 
					
						
							|  |  |  |                     "snap.com" | 
					
						
							|  |  |  |                     "snap.com beta crawler" | 
					
						
							|  |  |  |                     "Snapbot" | 
					
						
							|  |  |  |                     "SnapPreviewBot" | 
					
						
							|  |  |  |                     "sohu" | 
					
						
							|  |  |  |                     "SpankBot" | 
					
						
							|  |  |  |                     "Speedy Spider" | 
					
						
							|  |  |  |                     "Speedy_Spider" | 
					
						
							|  |  |  |                     "SpeedySpider" | 
					
						
							|  |  |  |                     "spider" | 
					
						
							|  |  |  |                     "SquigglebotBot" | 
					
						
							|  |  |  |                     "SurveyBot" | 
					
						
							|  |  |  |                     "SynapticSearch" | 
					
						
							|  |  |  |                     "T-H-U-N-D-E-R-S-T-O-N-E" | 
					
						
							|  |  |  |                     "Talkro Web-Shot" | 
					
						
							|  |  |  |                     "Tarantula" | 
					
						
							|  |  |  |                     "TerrawizBot" | 
					
						
							|  |  |  |                     "TheInformant" | 
					
						
							|  |  |  |                     "TMCrawler" | 
					
						
							|  |  |  |                     "TridentSpider" | 
					
						
							|  |  |  |                     "Tutorial Crawler" | 
					
						
							|  |  |  |                     "Twiceler" | 
					
						
							|  |  |  |                     "unwrapbot" | 
					
						
							|  |  |  |                     "URI::Fetch" | 
					
						
							|  |  |  |                     "VengaBot" | 
					
						
							|  |  |  |                     "Vonna.com b o t" | 
					
						
							|  |  |  |                     "Vortex" | 
					
						
							|  |  |  |                     "Votay bot" | 
					
						
							|  |  |  |                     "WebAlta Crawler" | 
					
						
							|  |  |  |                     "Webbot" | 
					
						
							|  |  |  |                     "Webclipping.com" | 
					
						
							|  |  |  |                     "WebCorp" | 
					
						
							|  |  |  |                     "Webinator" | 
					
						
							|  |  |  |                     "WIRE" | 
					
						
							|  |  |  |                     "WISEbot" | 
					
						
							|  |  |  |                     "Xerka WebBot" | 
					
						
							|  |  |  |                     "XSpider" | 
					
						
							|  |  |  |                     "YodaoBot" | 
					
						
							|  |  |  |                     "Yoono" | 
					
						
							|  |  |  |                     "yoono" | 
					
						
							|  |  |  |                 } | 
					
						
							| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  |             } | 
					
						
							| 
									
										
										
										
											2009-04-07 11:08:01 -04:00
										 |  |  |             { allows V{ } } | 
					
						
							|  |  |  |             { disallows V{ URL" /" } } | 
					
						
							|  |  |  |             { unknowns H{ } } | 
					
						
							| 
									
										
										
										
											2009-03-20 22:56:48 -04:00
										 |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | ] [ "vocab:robots/robots.txt" utf8 file-contents parse-robots.txt ] unit-test |