New, more correct URL parser

db4
Slava Pestov 2008-09-26 18:24:58 -05:00
parent 943ac501a2
commit 35b5bd9898
2 changed files with 70 additions and 26 deletions

View File

@ -227,3 +227,27 @@ urls [
[ "foo#3" ] [ URL" foo" clone 3 >>anchor present ] unit-test [ "foo#3" ] [ URL" foo" clone 3 >>anchor present ] unit-test
[ "http://www.foo.com/" ] [ "http://www.foo.com:80" >url present ] unit-test [ "http://www.foo.com/" ] [ "http://www.foo.com:80" >url present ] unit-test
[ f ] [ URL" /gp/redirect.html/002-7009742-0004012?location=http://advantage.amazon.com/gp/vendor/public/join%26token%3d77E3769AB3A5B6CF611699E150DC33010761CE12" protocol>> ] unit-test
[
T{ url
{ protocol "http" }
{ host "localhost" }
{ query H{ { "foo" "bar" } } }
{ path "/" }
}
]
[ "http://localhost?foo=bar" >url ] unit-test
[
T{ url
{ protocol "http" }
{ host "localhost" }
{ query H{ { "foo" "bar" } } }
{ path "/" }
}
]
[ "http://localhost/?foo=bar" >url ] unit-test
[ "/" ] [ "http://www.jedit.org" >url path>> ] unit-test

View File

@ -4,7 +4,8 @@ USING: kernel ascii combinators combinators.short-circuit
sequences splitting fry namespaces make assocs arrays strings sequences splitting fry namespaces make assocs arrays strings
io.sockets io.sockets.secure io.encodings.string io.sockets io.sockets.secure io.encodings.string
io.encodings.utf8 math math.parser accessors parser io.encodings.utf8 math math.parser accessors parser
strings.parser lexer prettyprint.backend hashtables present ; strings.parser lexer prettyprint.backend hashtables present
peg.ebnf ;
IN: urls IN: urls
: url-quotable? ( ch -- ? ) : url-quotable? ( ch -- ? )
@ -122,38 +123,57 @@ TUPLE: url protocol username password host port path query anchor ;
] when ] when
] bi* ; ] bi* ;
<PRIVATE
: parse-host-part ( url protocol rest -- url string' )
[ >>protocol ] [
"//" ?head [ "Invalid URL" throw ] unless
"@" split1 [
[
":" split1 [ >>username ] [ >>password ] bi*
] dip
] when*
"/" split1 [
parse-host [ >>host ] [ >>port ] bi*
] [ "/" prepend ] bi*
] bi* ;
PRIVATE>
GENERIC: >url ( obj -- url ) GENERIC: >url ( obj -- url )
M: f >url drop <url> ; M: f >url drop <url> ;
M: url >url ; M: url >url ;
<PRIVATE
EBNF: parse-url
protocol = [a-z]+ => [[ url-decode ]]
username = [^/:@#?]+ => [[ url-decode ]]
password = [^/:@#?]+ => [[ url-decode ]]
pathname = [^#?]+ => [[ url-decode ]]
query = [^#]+ => [[ query>assoc ]]
anchor = .+ => [[ url-decode ]]
hostname = [^/#?]+ => [[ url-decode ]]
hostname-spec = hostname ("/"|!(.)) => [[ first ]]
auth = (username (":" password => [[ second ]])? "@"
=> [[ first2 2array ]])?
url = ((protocol "://") => [[ first ]] auth hostname)?
(pathname)?
("?" query => [[ second ]])?
("#" anchor => [[ second ]])?
;EBNF
PRIVATE>
M: string >url M: string >url
<url> swap parse-url {
":" split1 [ parse-host-part ] when* [
"#" split1 [ first [
"?" split1 [ first ] ! protocol
[ url-decode >>path ] [
[ [ query>assoc >>query ] when* ] bi* second
] [ first [ first2 ] [ f f ] if* ] ! username, password
[ url-decode >>anchor ] bi* ; [ second parse-host ] ! host, port
bi
] bi
] [ f f f f f ] if*
]
[ second ] ! pathname
[ third ] ! query
[ fourth ] ! anchor
} cleave url boa
dup host>> [ [ "/" or ] change-path ] when ;
: protocol-port ( protocol -- port ) : protocol-port ( protocol -- port )
{ {