New, more correct URL parser

Slava Pestov 2008-09-26 18:24:58 -05:00
parent 943ac501a2
commit 35b5bd9898
2 changed files with 70 additions and 26 deletions

View File

@ -227,3 +227,27 @@ urls [
! An anchor slot holding a non-string value (here the integer 3) is still
! rendered after "#" when the URL is presented.
[ "foo#3" ] [ URL" foo" clone 3 >>anchor present ] unit-test
! Converting the empty string to a url and presenting it yields the empty string.
[ "" ] [ "" >url present ] unit-test
! A URL literal consisting only of a path and a query has no protocol.
[ f ] [ URL" /gp/redirect.html/002-7009742-0004012?location=" protocol>> ] unit-test
! A query string that follows the host directly (no path component) must
! still parse: the host and query are extracted, and because a host is
! present the path is defaulted to "/".
[
    T{ url
        { protocol "http" }
        { host "localhost" }
        { query H{ { "foo" "bar" } } }
        { path "/" }
    }
] [ "http://localhost?foo=bar" >url ] unit-test
! Same URL as the previous test but with an explicit "/" path before the
! query; the parsed tuple must be identical.
[
    T{ url
        { protocol "http" }
        { host "localhost" }
        { query H{ { "foo" "bar" } } }
        { path "/" }
    }
] [ "http://localhost/?foo=bar" >url ] unit-test
! The path is defaulted to "/" only when the URL has a host, so the input
! must name a host and omit the path for "/" to be the expected result.
[ "/" ] [ "http://localhost" >url path>> ] unit-test

View File

@ -4,7 +4,8 @@ USING: kernel ascii combinators combinators.short-circuit
sequences splitting fry namespaces make assocs arrays strings
io.sockets io.encodings.string
io.encodings.utf8 math math.parser accessors parser
strings.parser lexer prettyprint.backend hashtables present ;
strings.parser lexer prettyprint.backend hashtables present
peg.ebnf ;
IN: urls
: url-quotable? ( ch -- ? )
@ -122,38 +123,57 @@ TUPLE: url protocol username password host port path query anchor ;
] when
] bi* ;
! Fill in the protocol and authority slots of a url.
!
! Inputs:  url      - the url tuple being populated
!          protocol - stored into the protocol slot
!          rest     - the text after "protocol:"; must begin with "//"
! Outputs: url      - the same tuple with protocol, username, password,
!                     host and port set as available
!          string'  - what follows the first "/" after the authority,
!                     with that "/" put back on the front
! Throws "Invalid URL" when rest does not start with "//".
: parse-host-part ( url protocol rest -- url string' )
    [ >>protocol ] [
        "//" ?head [ "Invalid URL" throw ] unless
        ! Optional "user:password@" prefix. split1 leaves f on top when
        ! there is no "@", in which case when* skips the credentials.
        "@" split1 [
            ! Keep the text after "@" on top of the stack while the
            ! credentials underneath it are stored.
            [
                ":" split1 [ >>username ] [ >>password ] bi*
            ] dip
        ] when*
        ! Split the authority from the path at the first "/".
        "/" split1 [
            ! parse-host is defined elsewhere in this vocabulary; its two
            ! results are stored as host and port.
            parse-host [ >>host ] [ >>port ] bi*
        ] [ "/" prepend ] bi*
    ] bi* ;
! Generic conversion of an object to a url tuple.
GENERIC: >url ( obj -- url )
! f is discarded and replaced by the result of <url>.
M: f >url drop <url> ;
! A url is already a url; it is returned as-is.
M: url >url ;
! PEG grammar that splits a URL string into its components.
!
! The top-level "url" rule is a sequence of four optional parts, in this
! order: the protocol/auth/host group, the pathname, the query and the
! anchor. The string >url method reads them positionally, so the order and
! number of parts in the "url" rule must not change.
!
! Text components are passed through url-decode; the query is turned into
! an assoc by query>assoc. Both words are defined elsewhere in this
! vocabulary.
!
! NOTE(review): hostname-spec is defined but not referenced by the "url"
! rule - confirm whether it is still needed.
EBNF: parse-url

protocol = [a-z]+                   => [[ url-decode ]]
username = [^/:@#?]+                => [[ url-decode ]]
password = [^/:@#?]+                => [[ url-decode ]]
pathname = [^#?]+                   => [[ url-decode ]]
query    = [^#]+                    => [[ query>assoc ]]
anchor   = .+                       => [[ url-decode ]]

hostname = [^/#?]+                  => [[ url-decode ]]

hostname-spec = hostname ("/"|!(.)) => [[ first ]]

auth     = (username (":" password  => [[ second ]])? "@"
                                    => [[ first2 2array ]])?

url      = ((protocol "://")        => [[ first ]] auth hostname)?
           (pathname)?
           ("?" query               => [[ second ]])?
           ("#" anchor              => [[ second ]])?

;EBNF
! Convert a string to a url tuple using the parse-url grammar.
!
! parse-url yields a four-element sequence: the protocol/auth/host group
! (or f), the pathname, the query and the anchor. Each element is expanded
! into slot values, which are then handed to "url boa" in the tuple's
! declared slot order:
! protocol username password host port path query anchor.
!
! NOTE(review): the host group is assumed to be shaped as
! { protocol { auth hostname } }, with auth being { username password }
! or f - confirm against the parse tree peg.ebnf builds for the "url" rule.
M: string >url
    parse-url {
        [
            first [
                [ first ] ! protocol
                [
                    second
                    [ first [ first2 ] [ f f ] if* ] ! username, password
                    [ second parse-host ] ! host, port
                    bi
                ] bi
            ] [ f f f f f ] if* ! no host group: all five slots are f
        ]
        [ second ] ! pathname
        [ third ] ! query
        [ fourth ] ! anchor
    } cleave url boa
    ! A URL that names a host but has no path gets the root path.
    dup host>> [ [ "/" or ] change-path ] when ;
: protocol-port ( protocol -- port )