Merge branch 'master' of git://factorcode.org/git/factor

db4
John Benediktsson 2009-04-07 13:59:59 -07:00
commit 9f6f53e571
8 changed files with 381 additions and 338 deletions

View File

@ -6,7 +6,7 @@ math.order hashtables byte-arrays destructors
io io.sockets io.streams.string io.files io.timeouts
io.pathnames io.encodings io.encodings.string io.encodings.ascii
io.encodings.utf8 io.encodings.8-bit io.encodings.binary io.crlf
io.streams.duplex fry ascii urls urls.encoding present
io.streams.duplex fry ascii urls urls.encoding present locals
http http.parsers http.client.post-data ;
IN: http.client
@ -77,12 +77,13 @@ SYMBOL: redirects
! True when the response status code is in the 3xx (redirection) range.
: redirect? ( response -- ? )
code>> 300 399 between? ;
: do-redirect ( quot: ( chunk -- ) response -- response )
:: do-redirect ( quot: ( chunk -- ) response -- response )
redirects inc
redirects get max-redirects < [
request get clone
swap "location" header redirect-url
"GET" >>method swap (with-http-request)
response "location" header redirect-url
response code>> 307 = [ "GET" >>method ] unless
quot (with-http-request)
] [ too-many-redirects ] if ; inline recursive
: read-chunk-size ( -- n )

View File

@ -1,8 +1,8 @@
USING: http http.server http.client http.client.private tools.test multiline
io.streams.string io.encodings.utf8 io.encodings.8-bit
io.encodings.binary io.encodings.string kernel arrays splitting
sequences assocs io.sockets db db.sqlite continuations urls
hashtables accessors namespaces xml.data ;
USING: http http.server http.client http.client.private tools.test
multiline io.streams.string io.encodings.utf8 io.encodings.8-bit
io.encodings.binary io.encodings.string io.encodings.ascii kernel
arrays splitting sequences assocs io.sockets db db.sqlite
continuations urls hashtables accessors namespaces xml.data ;
IN: http.tests
[ "text/plain" latin1 ] [ "text/plain" parse-content-type ] unit-test
@ -359,4 +359,37 @@ SYMBOL: a
! Test basic auth
[ "Basic QWxhZGRpbjpvcGVuIHNlc2FtZQ==" ] [ <request> "Aladdin" "open sesame" set-basic-auth "Authorization" header ] unit-test
! Test a corner case with static responder
! Start a test server whose default responder serves a single static file.
[ ] [
<dispatcher>
add-quit-action
"vocab:http/test/foo.html" <static> >>default
test-httpd
] unit-test
! Fetching the root must return exactly the file's contents.
[ t ] [
"http://localhost/" add-port http-get nip
"vocab:http/test/foo.html" ascii file-contents =
] unit-test
! Shut the test server down.
[ ] [ "http://localhost/quit" add-port http-get 2drop ] unit-test
! Check behavior of 307 redirect (reported by Chris Double)
! Server setup: /a issues a temporary (307) redirect to /b; /b checks that the
! POST body survived the redirect (307 must NOT downgrade POST to GET).
[ ] [
<dispatcher>
add-quit-action
<action>
[ "b" <temporary-redirect> ] >>submit
"a" add-responder
<action>
[
request get post-data>> data>> "data" =
[ "OK" "text/plain" <content> ] [ "OOPS" throw ] if
] >>submit
"b" add-responder
test-httpd
] unit-test
! POSTing to /a should follow the 307 to /b and still deliver the body.
[ "OK" ] [ "data" "http://localhost/a" add-port http-post nip ] unit-test
! Shut the test server down.
[ ] [ "http://localhost/quit" add-port http-get 2drop ] unit-test

View File

@ -47,8 +47,8 @@ TUPLE: file-responder root hook special allow-listings ;
if ;
: serving-path ( filename -- filename )
[ file-responder get root>> trim-tail-separators "/" ] dip
"" or trim-head-separators 3append ;
[ file-responder get root>> trim-tail-separators ] dip
[ "/" swap trim-head-separators 3append ] unless-empty ;
: serve-file ( filename -- response )
dup mime-type

View File

@ -76,3 +76,9 @@ IN: io.streams.limited.tests
[ decoder? ] both?
] with-destructors
] unit-test
! With a limit of f, limit-input leaves the stream unwrapped (see the
! `over [ <limited-stream> ] [ 2drop ] if` clause in M: object limit),
! so reading 4 characters of "HELLO" simply returns the first four.
[ "HELL" ] [
"HELLO"
[ f stream-throws limit-input 4 read ]
with-string-reader
] unit-test

View File

@ -22,7 +22,7 @@ M: decoder limit ( stream limit mode -- stream' )
[ clone ] 2dip '[ _ _ limit ] change-stream ;
M: object limit ( stream limit mode -- stream' )
<limited-stream> ;
over [ <limited-stream> ] [ 2drop ] if ;
GENERIC: unlimited ( stream -- stream' )
@ -32,9 +32,11 @@ M: decoder unlimited ( stream -- stream' )
M: object unlimited ( stream -- stream' )
stream>> stream>> ;
: limit-input ( limit mode -- ) input-stream [ -rot limit ] change ;
! Replace the current input-stream with a copy limited to `limit`
! characters, using `mode` to decide behavior at the limit.
: limit-input ( limit mode -- )
[ input-stream ] 2dip '[ _ _ limit ] change ;
: unlimited-input ( -- ) input-stream [ unlimited ] change ;
! Strip any limit wrapper from the current input-stream.
: unlimited-input ( -- )
input-stream [ unlimited ] change ;
! Call quot with an unlimited clone of the stream; the original
! stream object (and its limit, if any) is left untouched.
: with-unlimited-stream ( stream quot -- )
[ clone unlimited ] dip call ; inline

View File

@ -1,334 +1,335 @@
! Copyright (C) 2009 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: calendar io.encodings.utf8 io.files robots tools.test ;
USING: calendar io.encodings.utf8 io.files robots tools.test
urls ;
IN: robots.tests
[
{ "http://www.chiplist.com/sitemap.txt" }
{
T{ rules
{ user-agents V{ "*" } }
{ allows V{ } }
{ disallows
V{
"/cgi-bin/"
"/scripts/"
"/ChipList2/scripts/"
"/ChipList2/styles/"
"/ads/"
"/ChipList2/ads/"
"/advertisements/"
"/ChipList2/advertisements/"
"/graphics/"
"/ChipList2/graphics/"
{ "http://www.chiplist.com/sitemap.txt" }
{
T{ rules
{ user-agents V{ "*" } }
{ allows V{ } }
{ disallows
V{
URL" /cgi-bin/"
URL" /scripts/"
URL" /ChipList2/scripts/"
URL" /ChipList2/styles/"
URL" /ads/"
URL" /ChipList2/ads/"
URL" /advertisements/"
URL" /ChipList2/advertisements/"
URL" /graphics/"
URL" /ChipList2/graphics/"
}
}
}
{ visit-time
{
T{ timestamp { hour 2 } }
T{ timestamp { hour 5 } }
{ visit-time
{
T{ timestamp { hour 2 } }
T{ timestamp { hour 5 } }
}
}
{ request-rate 1 }
{ crawl-delay 1 }
{ unknowns H{ } }
}
{ request-rate 1 }
{ crawl-delay 1 }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "UbiCrawler" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "DOC" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Zao" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "sitecheck.internetseer.com" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Zealbot" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "MSIECrawler" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "SiteSnagger" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "WebStripper" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "WebCopier" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Fetch" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Offline Explorer" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Teleport" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "TeleportPro" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "WebZIP" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "linko" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "HTTrack" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Microsoft.URL.Control" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Xenu" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "larbin" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "libwww" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "ZyBORG" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Download Ninja" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "wget" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "grub-client" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "k2spider" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "NPBot" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "WebReaper" } }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents
V{
"abot"
"ALeadSoftbot"
"BeijingCrawler"
"BilgiBot"
"bot"
"botlist"
"BOTW Spider"
"bumblebee"
"Bumblebee"
"BuzzRankingBot"
"Charlotte"
"Clushbot"
"Crawler"
"CydralSpider"
"DataFountains"
"DiamondBot"
"Dulance bot"
"DYNAMIC"
"EARTHCOM.info"
"EDI"
"envolk"
"Exabot"
"Exabot-Images"
"Exabot-Test"
"exactseek-pagereaper"
"Exalead NG"
"FANGCrawl"
"Feed::Find"
"flatlandbot"
"Gigabot"
"GigabotSiteSearch"
"GurujiBot"
"Hatena Antenna"
"Hatena Bookmark"
"Hatena RSS"
"HatenaScreenshot"
"Helix"
"HiddenMarket"
"HyperEstraier"
"iaskspider"
"IIITBOT"
"InfociousBot"
"iVia"
"iVia Page Fetcher"
"Jetbot"
"Kolinka Forum Search"
"KRetrieve"
"LetsCrawl.com"
"Lincoln State Web Browser"
"Links4US-Crawler"
"LOOQ"
"Lsearch/sondeur"
"MapoftheInternet.com"
"NationalDirectory"
"NetCarta_WebMapper"
"NewsGator"
"NextGenSearchBot"
"ng"
"nicebot"
"NP"
"NPBot"
"Nudelsalat"
"Nutch"
"OmniExplorer_Bot"
"OpenIntelligenceData"
"Oracle Enterprise Search"
"Pajaczek"
"panscient.com"
"PeerFactor 404 crawler"
"PeerFactor Crawler"
"PlantyNet"
"PlantyNet_WebRobot"
"plinki"
"PMAFind"
"Pogodak!"
"QuickFinder Crawler"
"Radiation Retriever"
"Reaper"
"RedCarpet"
"ScorpionBot"
"Scrubby"
"Scumbot"
"searchbot"
"Seeker.lookseek.com"
"SeznamBot"
"ShowXML"
"snap.com"
"snap.com beta crawler"
"Snapbot"
"SnapPreviewBot"
"sohu"
"SpankBot"
"Speedy Spider"
"Speedy_Spider"
"SpeedySpider"
"spider"
"SquigglebotBot"
"SurveyBot"
"SynapticSearch"
"T-H-U-N-D-E-R-S-T-O-N-E"
"Talkro Web-Shot"
"Tarantula"
"TerrawizBot"
"TheInformant"
"TMCrawler"
"TridentSpider"
"Tutorial Crawler"
"Twiceler"
"unwrapbot"
"URI::Fetch"
"VengaBot"
"Vonna.com b o t"
"Vortex"
"Votay bot"
"WebAlta Crawler"
"Webbot"
"Webclipping.com"
"WebCorp"
"Webinator"
"WIRE"
"WISEbot"
"Xerka WebBot"
"XSpider"
"YodaoBot"
"Yoono"
"yoono"
T{ rules
{ user-agents V{ "UbiCrawler" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "DOC" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Zao" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "sitecheck.internetseer.com" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Zealbot" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "MSIECrawler" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "SiteSnagger" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "WebStripper" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "WebCopier" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Fetch" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Offline Explorer" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Teleport" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "TeleportPro" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "WebZIP" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "linko" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "HTTrack" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Microsoft.URL.Control" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Xenu" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "larbin" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "libwww" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "ZyBORG" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "Download Ninja" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "wget" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "grub-client" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "k2spider" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "NPBot" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents V{ "WebReaper" } }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
T{ rules
{ user-agents
V{
"abot"
"ALeadSoftbot"
"BeijingCrawler"
"BilgiBot"
"bot"
"botlist"
"BOTW Spider"
"bumblebee"
"Bumblebee"
"BuzzRankingBot"
"Charlotte"
"Clushbot"
"Crawler"
"CydralSpider"
"DataFountains"
"DiamondBot"
"Dulance bot"
"DYNAMIC"
"EARTHCOM.info"
"EDI"
"envolk"
"Exabot"
"Exabot-Images"
"Exabot-Test"
"exactseek-pagereaper"
"Exalead NG"
"FANGCrawl"
"Feed::Find"
"flatlandbot"
"Gigabot"
"GigabotSiteSearch"
"GurujiBot"
"Hatena Antenna"
"Hatena Bookmark"
"Hatena RSS"
"HatenaScreenshot"
"Helix"
"HiddenMarket"
"HyperEstraier"
"iaskspider"
"IIITBOT"
"InfociousBot"
"iVia"
"iVia Page Fetcher"
"Jetbot"
"Kolinka Forum Search"
"KRetrieve"
"LetsCrawl.com"
"Lincoln State Web Browser"
"Links4US-Crawler"
"LOOQ"
"Lsearch/sondeur"
"MapoftheInternet.com"
"NationalDirectory"
"NetCarta_WebMapper"
"NewsGator"
"NextGenSearchBot"
"ng"
"nicebot"
"NP"
"NPBot"
"Nudelsalat"
"Nutch"
"OmniExplorer_Bot"
"OpenIntelligenceData"
"Oracle Enterprise Search"
"Pajaczek"
"panscient.com"
"PeerFactor 404 crawler"
"PeerFactor Crawler"
"PlantyNet"
"PlantyNet_WebRobot"
"plinki"
"PMAFind"
"Pogodak!"
"QuickFinder Crawler"
"Radiation Retriever"
"Reaper"
"RedCarpet"
"ScorpionBot"
"Scrubby"
"Scumbot"
"searchbot"
"Seeker.lookseek.com"
"SeznamBot"
"ShowXML"
"snap.com"
"snap.com beta crawler"
"Snapbot"
"SnapPreviewBot"
"sohu"
"SpankBot"
"Speedy Spider"
"Speedy_Spider"
"SpeedySpider"
"spider"
"SquigglebotBot"
"SurveyBot"
"SynapticSearch"
"T-H-U-N-D-E-R-S-T-O-N-E"
"Talkro Web-Shot"
"Tarantula"
"TerrawizBot"
"TheInformant"
"TMCrawler"
"TridentSpider"
"Tutorial Crawler"
"Twiceler"
"unwrapbot"
"URI::Fetch"
"VengaBot"
"Vonna.com b o t"
"Vortex"
"Votay bot"
"WebAlta Crawler"
"Webbot"
"Webclipping.com"
"WebCorp"
"Webinator"
"WIRE"
"WISEbot"
"Xerka WebBot"
"XSpider"
"YodaoBot"
"Yoono"
"yoono"
}
}
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
}
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
}
}
] [ "vocab:robots/robots.txt" utf8 file-contents parse-robots.txt ] unit-test

View File

@ -85,7 +85,7 @@ PRIVATE>
: parse-robots.txt ( string -- sitemaps rules-seq )
normalize-robots.txt [
[ <rules> dup ] dip [ parse-robots.txt-line drop ] with each
] map first ;
] map ;
: robots ( url -- robots )
>url

View File

@ -2,7 +2,7 @@
! See http://factorcode.org/license.txt for BSD license.
USING: db.tuples locals site-watcher site-watcher.db
site-watcher.private kernel db io.directories io.files.temp
continuations site-watcher.db.private db.sqlite
continuations db.sqlite
sequences tools.test ;
IN: site-watcher.tests