diff --git a/extra/robots/robots-tests.factor b/extra/robots/robots-tests.factor index a590d9eee0..54b4892680 100644 --- a/extra/robots/robots-tests.factor +++ b/extra/robots/robots-tests.factor @@ -1,334 +1,335 @@ ! Copyright (C) 2009 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: calendar io.encodings.utf8 io.files robots tools.test ; +USING: calendar io.encodings.utf8 io.files robots tools.test +urls ; IN: robots.tests [ -{ "http://www.chiplist.com/sitemap.txt" } -{ - T{ rules - { user-agents V{ "*" } } - { allows V{ } } - { disallows - V{ - "/cgi-bin/" - "/scripts/" - "/ChipList2/scripts/" - "/ChipList2/styles/" - "/ads/" - "/ChipList2/ads/" - "/advertisements/" - "/ChipList2/advertisements/" - "/graphics/" - "/ChipList2/graphics/" + { "http://www.chiplist.com/sitemap.txt" } + { + T{ rules + { user-agents V{ "*" } } + { allows V{ } } + { disallows + V{ + URL" /cgi-bin/" + URL" /scripts/" + URL" /ChipList2/scripts/" + URL" /ChipList2/styles/" + URL" /ads/" + URL" /ChipList2/ads/" + URL" /advertisements/" + URL" /ChipList2/advertisements/" + URL" /graphics/" + URL" /ChipList2/graphics/" + } } - } - { visit-time - { - T{ timestamp { hour 2 } } - T{ timestamp { hour 5 } } + { visit-time + { + T{ timestamp { hour 2 } } + T{ timestamp { hour 5 } } + } } + { request-rate 1 } + { crawl-delay 1 } + { unknowns H{ } } } - { request-rate 1 } - { crawl-delay 1 } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "UbiCrawler" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "DOC" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "Zao" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "sitecheck.internetseer.com" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "Zealbot" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "MSIECrawler" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "SiteSnagger" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "WebStripper" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "WebCopier" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "Fetch" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "Offline Explorer" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "Teleport" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "TeleportPro" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "WebZIP" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "linko" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "HTTrack" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "Microsoft.URL.Control" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "Xenu" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "larbin" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "libwww" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "ZyBORG" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "Download Ninja" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "wget" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "grub-client" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "k2spider" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "NPBot" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents V{ "WebReaper" } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } - } - T{ rules - { user-agents - V{ - "abot" - "ALeadSoftbot" - "BeijingCrawler" - "BilgiBot" - "bot" - "botlist" - "BOTW Spider" - "bumblebee" - "Bumblebee" - "BuzzRankingBot" - "Charlotte" - "Clushbot" - "Crawler" - "CydralSpider" - "DataFountains" - "DiamondBot" - "Dulance bot" - "DYNAMIC" - "EARTHCOM.info" - "EDI" - "envolk" - "Exabot" - "Exabot-Images" - "Exabot-Test" - "exactseek-pagereaper" - "Exalead NG" - "FANGCrawl" - "Feed::Find" - "flatlandbot" - "Gigabot" - "GigabotSiteSearch" - "GurujiBot" - "Hatena Antenna" - "Hatena Bookmark" - "Hatena RSS" - "HatenaScreenshot" - "Helix" - "HiddenMarket" - "HyperEstraier" - "iaskspider" - "IIITBOT" - "InfociousBot" - "iVia" - "iVia Page Fetcher" - "Jetbot" - "Kolinka Forum Search" - "KRetrieve" - "LetsCrawl.com" - "Lincoln State Web Browser" - "Links4US-Crawler" - "LOOQ" - "Lsearch/sondeur" - "MapoftheInternet.com" - "NationalDirectory" - "NetCarta_WebMapper" - "NewsGator" - "NextGenSearchBot" - "ng" - "nicebot" - "NP" - "NPBot" - "Nudelsalat" - "Nutch" - "OmniExplorer_Bot" - "OpenIntelligenceData" - "Oracle Enterprise Search" - "Pajaczek" - "panscient.com" - "PeerFactor 404 crawler" - "PeerFactor Crawler" - "PlantyNet" - "PlantyNet_WebRobot" - "plinki" - "PMAFind" - "Pogodak!" - "QuickFinder Crawler" - "Radiation Retriever" - "Reaper" - "RedCarpet" - "ScorpionBot" - "Scrubby" - "Scumbot" - "searchbot" - "Seeker.lookseek.com" - "SeznamBot" - "ShowXML" - "snap.com" - "snap.com beta crawler" - "Snapbot" - "SnapPreviewBot" - "sohu" - "SpankBot" - "Speedy Spider" - "Speedy_Spider" - "SpeedySpider" - "spider" - "SquigglebotBot" - "SurveyBot" - "SynapticSearch" - "T-H-U-N-D-E-R-S-T-O-N-E" - "Talkro Web-Shot" - "Tarantula" - "TerrawizBot" - "TheInformant" - "TMCrawler" - "TridentSpider" - "Tutorial Crawler" - "Twiceler" - "unwrapbot" - "URI::Fetch" - "VengaBot" - "Vonna.com b o t" - "Vortex" - "Votay bot" - "WebAlta Crawler" - "Webbot" - "Webclipping.com" - "WebCorp" - "Webinator" - "WIRE" - "WISEbot" - "Xerka WebBot" - "XSpider" - "YodaoBot" - "Yoono" - "yoono" + T{ rules + { user-agents V{ "UbiCrawler" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "DOC" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "Zao" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "sitecheck.internetseer.com" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "Zealbot" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "MSIECrawler" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "SiteSnagger" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "WebStripper" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "WebCopier" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "Fetch" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "Offline Explorer" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "Teleport" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "TeleportPro" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "WebZIP" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "linko" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "HTTrack" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "Microsoft.URL.Control" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "Xenu" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "larbin" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "libwww" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "ZyBORG" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "Download Ninja" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "wget" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "grub-client" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "k2spider" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "NPBot" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents V{ "WebReaper" } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } + } + T{ rules + { user-agents + V{ + "abot" + "ALeadSoftbot" + "BeijingCrawler" + "BilgiBot" + "bot" + "botlist" + "BOTW Spider" + "bumblebee" + "Bumblebee" + "BuzzRankingBot" + "Charlotte" + "Clushbot" + "Crawler" + "CydralSpider" + "DataFountains" + "DiamondBot" + "Dulance bot" + "DYNAMIC" + "EARTHCOM.info" + "EDI" + "envolk" + "Exabot" + "Exabot-Images" + "Exabot-Test" + "exactseek-pagereaper" + "Exalead NG" + "FANGCrawl" + "Feed::Find" + "flatlandbot" + "Gigabot" + "GigabotSiteSearch" + "GurujiBot" + "Hatena Antenna" + "Hatena Bookmark" + "Hatena RSS" + "HatenaScreenshot" + "Helix" + "HiddenMarket" + "HyperEstraier" + "iaskspider" + "IIITBOT" + "InfociousBot" + "iVia" + "iVia Page Fetcher" + "Jetbot" + "Kolinka Forum Search" + "KRetrieve" + "LetsCrawl.com" + "Lincoln State Web Browser" + "Links4US-Crawler" + "LOOQ" + "Lsearch/sondeur" + "MapoftheInternet.com" + "NationalDirectory" + "NetCarta_WebMapper" + "NewsGator" + "NextGenSearchBot" + "ng" + "nicebot" + "NP" + "NPBot" + "Nudelsalat" + "Nutch" + "OmniExplorer_Bot" + "OpenIntelligenceData" + "Oracle Enterprise Search" + "Pajaczek" + "panscient.com" + "PeerFactor 404 crawler" + "PeerFactor Crawler" + "PlantyNet" + "PlantyNet_WebRobot" + "plinki" + "PMAFind" + "Pogodak!" + "QuickFinder Crawler" + "Radiation Retriever" + "Reaper" + "RedCarpet" + "ScorpionBot" + "Scrubby" + "Scumbot" + "searchbot" + "Seeker.lookseek.com" + "SeznamBot" + "ShowXML" + "snap.com" + "snap.com beta crawler" + "Snapbot" + "SnapPreviewBot" + "sohu" + "SpankBot" + "Speedy Spider" + "Speedy_Spider" + "SpeedySpider" + "spider" + "SquigglebotBot" + "SurveyBot" + "SynapticSearch" + "T-H-U-N-D-E-R-S-T-O-N-E" + "Talkro Web-Shot" + "Tarantula" + "TerrawizBot" + "TheInformant" + "TMCrawler" + "TridentSpider" + "Tutorial Crawler" + "Twiceler" + "unwrapbot" + "URI::Fetch" + "VengaBot" + "Vonna.com b o t" + "Vortex" + "Votay bot" + "WebAlta Crawler" + "Webbot" + "Webclipping.com" + "WebCorp" + "Webinator" + "WIRE" + "WISEbot" + "Xerka WebBot" + "XSpider" + "YodaoBot" + "Yoono" + "yoono" + } } + { allows V{ } } + { disallows V{ URL" /" } } + { unknowns H{ } } } - { allows V{ } } - { disallows V{ "/" } } - { unknowns H{ } } } -} ] [ "vocab:robots/robots.txt" utf8 file-contents parse-robots.txt ] unit-test diff --git a/extra/robots/robots.factor b/extra/robots/robots.factor index 242aa1dea2..3c0eb045f7 100644 --- a/extra/robots/robots.factor +++ b/extra/robots/robots.factor @@ -85,7 +85,7 @@ PRIVATE> : parse-robots.txt ( string -- sitemaps rules-seq ) normalize-robots.txt [ [ dup ] dip [ parse-robots.txt-line drop ] with each - ] map first ; + ] map ; : robots ( url -- robots ) >url