! Listing metadata as originally displayed (presumably from the source viewer): 336 lines, 10 KiB, Factor.
! Copyright (C) 2009 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: calendar io.encodings.utf8 io.files robots tools.test
urls ;
IN: robots.tests

! Reads vocab:robots/robots.txt as UTF-8, runs parse-robots.txt on
! its contents, and checks the two resulting values against the
! literals below: an array holding one URL string, followed by an
! array of `rules` tuples.
{
    ! First expected value (presumably the sitemap entries).
    { "http://www.chiplist.com/sitemap.txt" }
    ! Second expected value: the `rules` tuples, each carrying its
    ! own user-agents vector.
    {
        T{ rules
            { user-agents V{ "*" } }
            { allows V{ } }
            { disallows
                V{
                    URL" /cgi-bin/"
                    URL" /scripts/"
                    URL" /ChipList2/scripts/"
                    URL" /ChipList2/styles/"
                    URL" /ads/"
                    URL" /ChipList2/ads/"
                    URL" /advertisements/"
                    URL" /ChipList2/advertisements/"
                    URL" /graphics/"
                    URL" /ChipList2/graphics/"
                }
            }
            { visit-time
                {
                    T{ timestamp { hour 2 } }
                    T{ timestamp { hour 5 } }
                }
            }
            { request-rate 1 }
            { crawl-delay 1 }
            { unknowns H{ } }
        }
        ! The following tuples each name a single user agent and
        ! disallow the root URL.
        T{ rules
            { user-agents V{ "UbiCrawler" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "DOC" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "Zao" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "sitecheck.internetseer.com" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "Zealbot" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "MSIECrawler" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "SiteSnagger" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "WebStripper" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "WebCopier" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "Fetch" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "Offline Explorer" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "Teleport" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "TeleportPro" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "WebZIP" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "linko" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "HTTrack" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "Microsoft.URL.Control" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "Xenu" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "larbin" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "libwww" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "ZyBORG" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "Download Ninja" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "wget" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "grub-client" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "k2spider" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "NPBot" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        T{ rules
            { user-agents V{ "WebReaper" } }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
        ! Final tuple: many user agents sharing one disallow of the
        ! root URL.
        T{ rules
            { user-agents
                V{
                    "abot"
                    "ALeadSoftbot"
                    "BeijingCrawler"
                    "BilgiBot"
                    "bot"
                    "botlist"
                    "BOTW Spider"
                    "bumblebee"
                    "Bumblebee"
                    "BuzzRankingBot"
                    "Charlotte"
                    "Clushbot"
                    "Crawler"
                    "CydralSpider"
                    "DataFountains"
                    "DiamondBot"
                    "Dulance bot"
                    "DYNAMIC"
                    "EARTHCOM.info"
                    "EDI"
                    "envolk"
                    "Exabot"
                    "Exabot-Images"
                    "Exabot-Test"
                    "exactseek-pagereaper"
                    "Exalead NG"
                    "FANGCrawl"
                    "Feed::Find"
                    "flatlandbot"
                    "Gigabot"
                    "GigabotSiteSearch"
                    "GurujiBot"
                    "Hatena Antenna"
                    "Hatena Bookmark"
                    "Hatena RSS"
                    "HatenaScreenshot"
                    "Helix"
                    "HiddenMarket"
                    "HyperEstraier"
                    "iaskspider"
                    "IIITBOT"
                    "InfociousBot"
                    "iVia"
                    "iVia Page Fetcher"
                    "Jetbot"
                    "Kolinka Forum Search"
                    "KRetrieve"
                    "LetsCrawl.com"
                    "Lincoln State Web Browser"
                    "Links4US-Crawler"
                    "LOOQ"
                    "Lsearch/sondeur"
                    "MapoftheInternet.com"
                    "NationalDirectory"
                    "NetCarta_WebMapper"
                    "NewsGator"
                    "NextGenSearchBot"
                    "ng"
                    "nicebot"
                    "NP"
                    "NPBot"
                    "Nudelsalat"
                    "Nutch"
                    "OmniExplorer_Bot"
                    "OpenIntelligenceData"
                    "Oracle Enterprise Search"
                    "Pajaczek"
                    "panscient.com"
                    "PeerFactor 404 crawler"
                    "PeerFactor Crawler"
                    "PlantyNet"
                    "PlantyNet_WebRobot"
                    "plinki"
                    "PMAFind"
                    "Pogodak!"
                    "QuickFinder Crawler"
                    "Radiation Retriever"
                    "Reaper"
                    "RedCarpet"
                    "ScorpionBot"
                    "Scrubby"
                    "Scumbot"
                    "searchbot"
                    "Seeker.lookseek.com"
                    "SeznamBot"
                    "ShowXML"
                    "snap.com"
                    "snap.com beta crawler"
                    "Snapbot"
                    "SnapPreviewBot"
                    "sohu"
                    "SpankBot"
                    "Speedy Spider"
                    "Speedy_Spider"
                    "SpeedySpider"
                    "spider"
                    "SquigglebotBot"
                    "SurveyBot"
                    "SynapticSearch"
                    "T-H-U-N-D-E-R-S-T-O-N-E"
                    "Talkro Web-Shot"
                    "Tarantula"
                    "TerrawizBot"
                    "TheInformant"
                    "TMCrawler"
                    "TridentSpider"
                    "Tutorial Crawler"
                    "Twiceler"
                    "unwrapbot"
                    "URI::Fetch"
                    "VengaBot"
                    "Vonna.com b o t"
                    "Vortex"
                    "Votay bot"
                    "WebAlta Crawler"
                    "Webbot"
                    "Webclipping.com"
                    "WebCorp"
                    "Webinator"
                    "WIRE"
                    "WISEbot"
                    "Xerka WebBot"
                    "XSpider"
                    "YodaoBot"
                    "Yoono"
                    "yoono"
                }
            }
            { allows V{ } }
            { disallows V{ URL" /" } }
            { unknowns H{ } }
        }
    }
} [ "vocab:robots/robots.txt" utf8 file-contents parse-robots.txt ] unit-test