Merge branch 'master' of git://factorcode.org/git/factor

db4
John Benediktsson 2009-04-07 13:59:59 -07:00
commit 9f6f53e571
8 changed files with 381 additions and 338 deletions

View File

@ -6,7 +6,7 @@ math.order hashtables byte-arrays destructors
io io.sockets io.streams.string io.files io.timeouts io io.sockets io.streams.string io.files io.timeouts
io.pathnames io.encodings io.encodings.string io.encodings.ascii io.pathnames io.encodings io.encodings.string io.encodings.ascii
io.encodings.utf8 io.encodings.8-bit io.encodings.binary io.crlf io.encodings.utf8 io.encodings.8-bit io.encodings.binary io.crlf
io.streams.duplex fry ascii urls urls.encoding present io.streams.duplex fry ascii urls urls.encoding present locals
http http.parsers http.client.post-data ; http http.parsers http.client.post-data ;
IN: http.client IN: http.client
@ -77,12 +77,13 @@ SYMBOL: redirects
: redirect? ( response -- ? ) : redirect? ( response -- ? )
code>> 300 399 between? ; code>> 300 399 between? ;
: do-redirect ( quot: ( chunk -- ) response -- response ) :: do-redirect ( quot: ( chunk -- ) response -- response )
redirects inc redirects inc
redirects get max-redirects < [ redirects get max-redirects < [
request get clone request get clone
swap "location" header redirect-url response "location" header redirect-url
"GET" >>method swap (with-http-request) response code>> 307 = [ "GET" >>method ] unless
quot (with-http-request)
] [ too-many-redirects ] if ; inline recursive ] [ too-many-redirects ] if ; inline recursive
: read-chunk-size ( -- n ) : read-chunk-size ( -- n )

View File

@ -1,8 +1,8 @@
USING: http http.server http.client http.client.private tools.test multiline USING: http http.server http.client http.client.private tools.test
io.streams.string io.encodings.utf8 io.encodings.8-bit multiline io.streams.string io.encodings.utf8 io.encodings.8-bit
io.encodings.binary io.encodings.string kernel arrays splitting io.encodings.binary io.encodings.string io.encodings.ascii kernel
sequences assocs io.sockets db db.sqlite continuations urls arrays splitting sequences assocs io.sockets db db.sqlite
hashtables accessors namespaces xml.data ; continuations urls hashtables accessors namespaces xml.data ;
IN: http.tests IN: http.tests
[ "text/plain" latin1 ] [ "text/plain" parse-content-type ] unit-test [ "text/plain" latin1 ] [ "text/plain" parse-content-type ] unit-test
@ -359,4 +359,37 @@ SYMBOL: a
! Test basic auth ! Test basic auth
[ "Basic QWxhZGRpbjpvcGVuIHNlc2FtZQ==" ] [ <request> "Aladdin" "open sesame" set-basic-auth "Authorization" header ] unit-test [ "Basic QWxhZGRpbjpvcGVuIHNlc2FtZQ==" ] [ <request> "Aladdin" "open sesame" set-basic-auth "Authorization" header ] unit-test
! Test a corner case with static responder
[ ] [
<dispatcher>
add-quit-action
"vocab:http/test/foo.html" <static> >>default
test-httpd
] unit-test
[ t ] [
"http://localhost/" add-port http-get nip
"vocab:http/test/foo.html" ascii file-contents =
] unit-test
[ ] [ "http://localhost/quit" add-port http-get 2drop ] unit-test
! Check behavior of 307 redirect (reported by Chris Double)
[ ] [
<dispatcher>
add-quit-action
<action>
[ "b" <temporary-redirect> ] >>submit
"a" add-responder
<action>
[
request get post-data>> data>> "data" =
[ "OK" "text/plain" <content> ] [ "OOPS" throw ] if
] >>submit
"b" add-responder
test-httpd
] unit-test
[ "OK" ] [ "data" "http://localhost/a" add-port http-post nip ] unit-test
[ ] [ "http://localhost/quit" add-port http-get 2drop ] unit-test

View File

@ -47,8 +47,8 @@ TUPLE: file-responder root hook special allow-listings ;
if ; if ;
: serving-path ( filename -- filename ) : serving-path ( filename -- filename )
[ file-responder get root>> trim-tail-separators "/" ] dip [ file-responder get root>> trim-tail-separators ] dip
"" or trim-head-separators 3append ; [ "/" swap trim-head-separators 3append ] unless-empty ;
: serve-file ( filename -- response ) : serve-file ( filename -- response )
dup mime-type dup mime-type

View File

@ -76,3 +76,9 @@ IN: io.streams.limited.tests
[ decoder? ] both? [ decoder? ] both?
] with-destructors ] with-destructors
] unit-test ] unit-test
[ "HELL" ] [
"HELLO"
[ f stream-throws limit-input 4 read ]
with-string-reader
] unit-test

View File

@ -22,7 +22,7 @@ M: decoder limit ( stream limit mode -- stream' )
[ clone ] 2dip '[ _ _ limit ] change-stream ; [ clone ] 2dip '[ _ _ limit ] change-stream ;
M: object limit ( stream limit mode -- stream' ) M: object limit ( stream limit mode -- stream' )
<limited-stream> ; over [ <limited-stream> ] [ 2drop ] if ;
GENERIC: unlimited ( stream -- stream' ) GENERIC: unlimited ( stream -- stream' )
@ -32,9 +32,11 @@ M: decoder unlimited ( stream -- stream' )
M: object unlimited ( stream -- stream' ) M: object unlimited ( stream -- stream' )
stream>> stream>> ; stream>> stream>> ;
: limit-input ( limit mode -- ) input-stream [ -rot limit ] change ; : limit-input ( limit mode -- )
[ input-stream ] 2dip '[ _ _ limit ] change ;
: unlimited-input ( -- ) input-stream [ unlimited ] change ; : unlimited-input ( -- )
input-stream [ unlimited ] change ;
: with-unlimited-stream ( stream quot -- ) : with-unlimited-stream ( stream quot -- )
[ clone unlimited ] dip call ; inline [ clone unlimited ] dip call ; inline

View File

@ -1,334 +1,335 @@
! Copyright (C) 2009 Doug Coleman. ! Copyright (C) 2009 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: calendar io.encodings.utf8 io.files robots tools.test ; USING: calendar io.encodings.utf8 io.files robots tools.test
urls ;
IN: robots.tests IN: robots.tests
[ [
{ "http://www.chiplist.com/sitemap.txt" } { "http://www.chiplist.com/sitemap.txt" }
{ {
T{ rules T{ rules
{ user-agents V{ "*" } } { user-agents V{ "*" } }
{ allows V{ } } { allows V{ } }
{ disallows { disallows
V{ V{
"/cgi-bin/" URL" /cgi-bin/"
"/scripts/" URL" /scripts/"
"/ChipList2/scripts/" URL" /ChipList2/scripts/"
"/ChipList2/styles/" URL" /ChipList2/styles/"
"/ads/" URL" /ads/"
"/ChipList2/ads/" URL" /ChipList2/ads/"
"/advertisements/" URL" /advertisements/"
"/ChipList2/advertisements/" URL" /ChipList2/advertisements/"
"/graphics/" URL" /graphics/"
"/ChipList2/graphics/" URL" /ChipList2/graphics/"
}
} }
} { visit-time
{ visit-time {
{ T{ timestamp { hour 2 } }
T{ timestamp { hour 2 } } T{ timestamp { hour 5 } }
T{ timestamp { hour 5 } } }
} }
{ request-rate 1 }
{ crawl-delay 1 }
{ unknowns H{ } }
} }
{ request-rate 1 } T{ rules
{ crawl-delay 1 } { user-agents V{ "UbiCrawler" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "UbiCrawler" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "DOC" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "DOC" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Zao" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Zao" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "sitecheck.internetseer.com" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "sitecheck.internetseer.com" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Zealbot" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Zealbot" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "MSIECrawler" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "MSIECrawler" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "SiteSnagger" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "SiteSnagger" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "WebStripper" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "WebStripper" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "WebCopier" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "WebCopier" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Fetch" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Fetch" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Offline Explorer" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Offline Explorer" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Teleport" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Teleport" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "TeleportPro" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "TeleportPro" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "WebZIP" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "WebZIP" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "linko" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "linko" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "HTTrack" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "HTTrack" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Microsoft.URL.Control" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Microsoft.URL.Control" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Xenu" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Xenu" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "larbin" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "larbin" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "libwww" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "libwww" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "ZyBORG" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "ZyBORG" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "Download Ninja" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "Download Ninja" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "wget" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "wget" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "grub-client" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "grub-client" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "k2spider" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "k2spider" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "NPBot" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "NPBot" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents V{ "WebReaper" } }
{ unknowns H{ } } { allows V{ } }
} { disallows V{ URL" /" } }
T{ rules { unknowns H{ } }
{ user-agents V{ "WebReaper" } } }
{ allows V{ } } T{ rules
{ disallows V{ "/" } } { user-agents
{ unknowns H{ } } V{
} "abot"
T{ rules "ALeadSoftbot"
{ user-agents "BeijingCrawler"
V{ "BilgiBot"
"abot" "bot"
"ALeadSoftbot" "botlist"
"BeijingCrawler" "BOTW Spider"
"BilgiBot" "bumblebee"
"bot" "Bumblebee"
"botlist" "BuzzRankingBot"
"BOTW Spider" "Charlotte"
"bumblebee" "Clushbot"
"Bumblebee" "Crawler"
"BuzzRankingBot" "CydralSpider"
"Charlotte" "DataFountains"
"Clushbot" "DiamondBot"
"Crawler" "Dulance bot"
"CydralSpider" "DYNAMIC"
"DataFountains" "EARTHCOM.info"
"DiamondBot" "EDI"
"Dulance bot" "envolk"
"DYNAMIC" "Exabot"
"EARTHCOM.info" "Exabot-Images"
"EDI" "Exabot-Test"
"envolk" "exactseek-pagereaper"
"Exabot" "Exalead NG"
"Exabot-Images" "FANGCrawl"
"Exabot-Test" "Feed::Find"
"exactseek-pagereaper" "flatlandbot"
"Exalead NG" "Gigabot"
"FANGCrawl" "GigabotSiteSearch"
"Feed::Find" "GurujiBot"
"flatlandbot" "Hatena Antenna"
"Gigabot" "Hatena Bookmark"
"GigabotSiteSearch" "Hatena RSS"
"GurujiBot" "HatenaScreenshot"
"Hatena Antenna" "Helix"
"Hatena Bookmark" "HiddenMarket"
"Hatena RSS" "HyperEstraier"
"HatenaScreenshot" "iaskspider"
"Helix" "IIITBOT"
"HiddenMarket" "InfociousBot"
"HyperEstraier" "iVia"
"iaskspider" "iVia Page Fetcher"
"IIITBOT" "Jetbot"
"InfociousBot" "Kolinka Forum Search"
"iVia" "KRetrieve"
"iVia Page Fetcher" "LetsCrawl.com"
"Jetbot" "Lincoln State Web Browser"
"Kolinka Forum Search" "Links4US-Crawler"
"KRetrieve" "LOOQ"
"LetsCrawl.com" "Lsearch/sondeur"
"Lincoln State Web Browser" "MapoftheInternet.com"
"Links4US-Crawler" "NationalDirectory"
"LOOQ" "NetCarta_WebMapper"
"Lsearch/sondeur" "NewsGator"
"MapoftheInternet.com" "NextGenSearchBot"
"NationalDirectory" "ng"
"NetCarta_WebMapper" "nicebot"
"NewsGator" "NP"
"NextGenSearchBot" "NPBot"
"ng" "Nudelsalat"
"nicebot" "Nutch"
"NP" "OmniExplorer_Bot"
"NPBot" "OpenIntelligenceData"
"Nudelsalat" "Oracle Enterprise Search"
"Nutch" "Pajaczek"
"OmniExplorer_Bot" "panscient.com"
"OpenIntelligenceData" "PeerFactor 404 crawler"
"Oracle Enterprise Search" "PeerFactor Crawler"
"Pajaczek" "PlantyNet"
"panscient.com" "PlantyNet_WebRobot"
"PeerFactor 404 crawler" "plinki"
"PeerFactor Crawler" "PMAFind"
"PlantyNet" "Pogodak!"
"PlantyNet_WebRobot" "QuickFinder Crawler"
"plinki" "Radiation Retriever"
"PMAFind" "Reaper"
"Pogodak!" "RedCarpet"
"QuickFinder Crawler" "ScorpionBot"
"Radiation Retriever" "Scrubby"
"Reaper" "Scumbot"
"RedCarpet" "searchbot"
"ScorpionBot" "Seeker.lookseek.com"
"Scrubby" "SeznamBot"
"Scumbot" "ShowXML"
"searchbot" "snap.com"
"Seeker.lookseek.com" "snap.com beta crawler"
"SeznamBot" "Snapbot"
"ShowXML" "SnapPreviewBot"
"snap.com" "sohu"
"snap.com beta crawler" "SpankBot"
"Snapbot" "Speedy Spider"
"SnapPreviewBot" "Speedy_Spider"
"sohu" "SpeedySpider"
"SpankBot" "spider"
"Speedy Spider" "SquigglebotBot"
"Speedy_Spider" "SurveyBot"
"SpeedySpider" "SynapticSearch"
"spider" "T-H-U-N-D-E-R-S-T-O-N-E"
"SquigglebotBot" "Talkro Web-Shot"
"SurveyBot" "Tarantula"
"SynapticSearch" "TerrawizBot"
"T-H-U-N-D-E-R-S-T-O-N-E" "TheInformant"
"Talkro Web-Shot" "TMCrawler"
"Tarantula" "TridentSpider"
"TerrawizBot" "Tutorial Crawler"
"TheInformant" "Twiceler"
"TMCrawler" "unwrapbot"
"TridentSpider" "URI::Fetch"
"Tutorial Crawler" "VengaBot"
"Twiceler" "Vonna.com b o t"
"unwrapbot" "Vortex"
"URI::Fetch" "Votay bot"
"VengaBot" "WebAlta Crawler"
"Vonna.com b o t" "Webbot"
"Vortex" "Webclipping.com"
"Votay bot" "WebCorp"
"WebAlta Crawler" "Webinator"
"Webbot" "WIRE"
"Webclipping.com" "WISEbot"
"WebCorp" "Xerka WebBot"
"Webinator" "XSpider"
"WIRE" "YodaoBot"
"WISEbot" "Yoono"
"Xerka WebBot" "yoono"
"XSpider" }
"YodaoBot"
"Yoono"
"yoono"
} }
{ allows V{ } }
{ disallows V{ URL" /" } }
{ unknowns H{ } }
} }
{ allows V{ } }
{ disallows V{ "/" } }
{ unknowns H{ } }
} }
}
] [ "vocab:robots/robots.txt" utf8 file-contents parse-robots.txt ] unit-test ] [ "vocab:robots/robots.txt" utf8 file-contents parse-robots.txt ] unit-test

View File

@ -85,7 +85,7 @@ PRIVATE>
: parse-robots.txt ( string -- sitemaps rules-seq ) : parse-robots.txt ( string -- sitemaps rules-seq )
normalize-robots.txt [ normalize-robots.txt [
[ <rules> dup ] dip [ parse-robots.txt-line drop ] with each [ <rules> dup ] dip [ parse-robots.txt-line drop ] with each
] map first ; ] map ;
: robots ( url -- robots ) : robots ( url -- robots )
>url >url

View File

@ -2,7 +2,7 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: db.tuples locals site-watcher site-watcher.db USING: db.tuples locals site-watcher site-watcher.db
site-watcher.private kernel db io.directories io.files.temp site-watcher.private kernel db io.directories io.files.temp
continuations site-watcher.db.private db.sqlite continuations db.sqlite
sequences tools.test ; sequences tools.test ;
IN: site-watcher.tests IN: site-watcher.tests