fix unit test in robots

db4
Doug Coleman 2009-04-07 10:08:01 -05:00
parent 63cf5b04e1
commit 65802b6aaa
2 changed files with 324 additions and 323 deletions

View File

@ -1,334 +1,335 @@
! Copyright (C) 2009 Doug Coleman. ! Copyright (C) 2009 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: calendar io.encodings.utf8 io.files robots tools.test ; USING: calendar io.encodings.utf8 io.files robots tools.test
urls ;
IN: robots.tests IN: robots.tests
[ [
{ "http://www.chiplist.com/sitemap.txt" } { "http://www.chiplist.com/sitemap.txt" }
{ {
T{ rules T{ rules
{ user-agents V{ "*" } } { user-agents V{ "*" } }
{ allows V{ } } { allows V{ } }
{ disallows { disallows
V{ V{
"/cgi-bin/" URL" /cgi-bin/"
"/scripts/" URL" /scripts/"
"/ChipList2/scripts/" URL" /ChipList2/scripts/"
"/ChipList2/styles/" URL" /ChipList2/styles/"
"/ads/" URL" /ads/"
"/ChipList2/ads/" URL" /ChipList2/ads/"
"/advertisements/" URL" /advertisements/"
"/ChipList2/advertisements/" URL" /ChipList2/advertisements/"
"/graphics/" URL" /graphics/"
"/ChipList2/graphics/" URL" /ChipList2/graphics/"
}
} }
} { visit-time
{ visit-time {
{ T{ timestamp { hour 2 } }
T{ timestamp { hour 2 } } T{ timestamp { hour 5 } }
T{ timestamp { hour 5 } } }
} }
{ request-rate 1 }
{ crawl-delay 1 }
{ unknowns H{ } }
} }
{ request-rate 1 } T{ rules
{ crawl-delay 1 } { user-agents V{ "UbiCrawler" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "UbiCrawler" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "DOC" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "DOC" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Zao" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Zao" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "sitecheck.internetseer.com" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "sitecheck.internetseer.com" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Zealbot" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Zealbot" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "MSIECrawler" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "MSIECrawler" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "SiteSnagger" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "SiteSnagger" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "WebStripper" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "WebStripper" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "WebCopier" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "WebCopier" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Fetch" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Fetch" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Offline Explorer" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Offline Explorer" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Teleport" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Teleport" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "TeleportPro" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "TeleportPro" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "WebZIP" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "WebZIP" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "linko" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "linko" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "HTTrack" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "HTTrack" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Microsoft.URL.Control" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Microsoft.URL.Control" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Xenu" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Xenu" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "larbin" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "larbin" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "libwww" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "libwww" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "ZyBORG" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "ZyBORG" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Download Ninja" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Download Ninja" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "wget" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "wget" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "grub-client" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "grub-client" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "k2spider" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "k2spider" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "NPBot" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "NPBot" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "WebReaper" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "WebReaper" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents
{ unknowns H{ } } V{
} "abot"
T{ rules "ALeadSoftbot"
{ user-agents "BeijingCrawler"
V{ "BilgiBot"
"abot" "bot"
"ALeadSoftbot" "botlist"
"BeijingCrawler" "BOTW Spider"
"BilgiBot" "bumblebee"
"bot" "Bumblebee"
"botlist" "BuzzRankingBot"
"BOTW Spider" "Charlotte"
"bumblebee" "Clushbot"
"Bumblebee" "Crawler"
"BuzzRankingBot" "CydralSpider"
"Charlotte" "DataFountains"
"Clushbot" "DiamondBot"
"Crawler" "Dulance bot"
"CydralSpider" "DYNAMIC"
"DataFountains" "EARTHCOM.info"
"DiamondBot" "EDI"
"Dulance bot" "envolk"
"DYNAMIC" "Exabot"
"EARTHCOM.info" "Exabot-Images"
"EDI" "Exabot-Test"
"envolk" "exactseek-pagereaper"
"Exabot" "Exalead NG"
"Exabot-Images" "FANGCrawl"
"Exabot-Test" "Feed::Find"
"exactseek-pagereaper" "flatlandbot"
"Exalead NG" "Gigabot"
"FANGCrawl" "GigabotSiteSearch"
"Feed::Find" "GurujiBot"
"flatlandbot" "Hatena Antenna"
"Gigabot" "Hatena Bookmark"
"GigabotSiteSearch" "Hatena RSS"
"GurujiBot" "HatenaScreenshot"
"Hatena Antenna" "Helix"
"Hatena Bookmark" "HiddenMarket"
"Hatena RSS" "HyperEstraier"
"HatenaScreenshot" "iaskspider"
"Helix" "IIITBOT"
"HiddenMarket" "InfociousBot"
"HyperEstraier" "iVia"
"iaskspider" "iVia Page Fetcher"
"IIITBOT" "Jetbot"
"InfociousBot" "Kolinka Forum Search"
"iVia" "KRetrieve"
"iVia Page Fetcher" "LetsCrawl.com"
"Jetbot" "Lincoln State Web Browser"
"Kolinka Forum Search" "Links4US-Crawler"
"KRetrieve" "LOOQ"
"LetsCrawl.com" "Lsearch/sondeur"
"Lincoln State Web Browser" "MapoftheInternet.com"
"Links4US-Crawler" "NationalDirectory"
"LOOQ" "NetCarta_WebMapper"
"Lsearch/sondeur" "NewsGator"
"MapoftheInternet.com" "NextGenSearchBot"
"NationalDirectory" "ng"
"NetCarta_WebMapper" "nicebot"
"NewsGator" "NP"
"NextGenSearchBot" "NPBot"
"ng" "Nudelsalat"
"nicebot" "Nutch"
"NP" "OmniExplorer_Bot"
"NPBot" "OpenIntelligenceData"
"Nudelsalat" "Oracle Enterprise Search"
"Nutch" "Pajaczek"
"OmniExplorer_Bot" "panscient.com"
"OpenIntelligenceData" "PeerFactor 404 crawler"
"Oracle Enterprise Search" "PeerFactor Crawler"
"Pajaczek" "PlantyNet"
"panscient.com" "PlantyNet_WebRobot"
"PeerFactor 404 crawler" "plinki"
"PeerFactor Crawler" "PMAFind"
"PlantyNet" "Pogodak!"
"PlantyNet_WebRobot" "QuickFinder Crawler"
"plinki" "Radiation Retriever"
"PMAFind" "Reaper"
"Pogodak!" "RedCarpet"
"QuickFinder Crawler" "ScorpionBot"
"Radiation Retriever" "Scrubby"
"Reaper" "Scumbot"
"RedCarpet" "searchbot"
"ScorpionBot" "Seeker.lookseek.com"
"Scrubby" "SeznamBot"
"Scumbot" "ShowXML"
"searchbot" "snap.com"
"Seeker.lookseek.com" "snap.com beta crawler"
"SeznamBot" "Snapbot"
"ShowXML" "SnapPreviewBot"
"snap.com" "sohu"
"snap.com beta crawler" "SpankBot"
"Snapbot" "Speedy Spider"
"SnapPreviewBot" "Speedy_Spider"
"sohu" "SpeedySpider"
"SpankBot" "spider"
"Speedy Spider" "SquigglebotBot"
"Speedy_Spider" "SurveyBot"
"SpeedySpider" "SynapticSearch"
"spider" "T-H-U-N-D-E-R-S-T-O-N-E"
"SquigglebotBot" "Talkro Web-Shot"
"SurveyBot" "Tarantula"
"SynapticSearch" "TerrawizBot"
"T-H-U-N-D-E-R-S-T-O-N-E" "TheInformant"
"Talkro Web-Shot" "TMCrawler"
"Tarantula" "TridentSpider"
"TerrawizBot" "Tutorial Crawler"
"TheInformant" "Twiceler"
"TMCrawler" "unwrapbot"
"TridentSpider" "URI::Fetch"
"Tutorial Crawler" "VengaBot"
"Twiceler" "Vonna.com b o t"
"unwrapbot" "Vortex"
"URI::Fetch" "Votay bot"
"VengaBot" "WebAlta Crawler"
"Vonna.com b o t" "Webbot"
"Vortex" "Webclipping.com"
"Votay bot" "WebCorp"
"WebAlta Crawler" "Webinator"
"Webbot" "WIRE"
"Webclipping.com" "WISEbot"
"WebCorp" "Xerka WebBot"
"Webinator" "XSpider"
"WIRE" "YodaoBot"
"WISEbot" "Yoono"
"Xerka WebBot" "yoono"
"XSpider" }
"YodaoBot"
"Yoono"
"yoono"
} }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
} }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
} }
}
] [ "vocab:robots/robots.txt" utf8 file-contents parse-robots.txt ] unit-test ] [ "vocab:robots/robots.txt" utf8 file-contents parse-robots.txt ] unit-test

View File

@ -85,7 +85,7 @@ PRIVATE>
: parse-robots.txt ( string -- sitemaps rules-seq ) : parse-robots.txt ( string -- sitemaps rules-seq )
normalize-robots.txt [ normalize-robots.txt [
[ <rules> dup ] dip [ parse-robots.txt-line drop ] with each [ <rules> dup ] dip [ parse-robots.txt-line drop ] with each
] map first ; ] map ;
: robots ( url -- robots ) : robots ( url -- robots )
>url >url