From 35b5bd98981217a6b22ed286b795fad7fbba58cd Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Fri, 26 Sep 2008 18:24:58 -0500 Subject: [PATCH] New, more correct URL parser --- basis/urls/urls-tests.factor | 24 ++++++++++++ basis/urls/urls.factor | 72 +++++++++++++++++++++++------------- 2 files changed, 70 insertions(+), 26 deletions(-) diff --git a/basis/urls/urls-tests.factor b/basis/urls/urls-tests.factor index d1415a9dde..b0bf950178 100644 --- a/basis/urls/urls-tests.factor +++ b/basis/urls/urls-tests.factor @@ -227,3 +227,27 @@ urls [ [ "foo#3" ] [ URL" foo" clone 3 >>anchor present ] unit-test [ "http://www.foo.com/" ] [ "http://www.foo.com:80" >url present ] unit-test + +[ f ] [ URL" /gp/redirect.html/002-7009742-0004012?location=http://advantage.amazon.com/gp/vendor/public/join%26token%3d77E3769AB3A5B6CF611699E150DC33010761CE12" protocol>> ] unit-test + +[ + T{ url + { protocol "http" } + { host "localhost" } + { query H{ { "foo" "bar" } } } + { path "/" } + } +] +[ "http://localhost?foo=bar" >url ] unit-test + +[ + T{ url + { protocol "http" } + { host "localhost" } + { query H{ { "foo" "bar" } } } + { path "/" } + } +] +[ "http://localhost/?foo=bar" >url ] unit-test + +[ "/" ] [ "http://www.jedit.org" >url path>> ] unit-test diff --git a/basis/urls/urls.factor b/basis/urls/urls.factor index 17b309f37f..5fe9bbb5a0 100644 --- a/basis/urls/urls.factor +++ b/basis/urls/urls.factor @@ -4,7 +4,8 @@ USING: kernel ascii combinators combinators.short-circuit sequences splitting fry namespaces make assocs arrays strings io.sockets io.sockets.secure io.encodings.string io.encodings.utf8 math math.parser accessors parser -strings.parser lexer prettyprint.backend hashtables present ; +strings.parser lexer prettyprint.backend hashtables present +peg.ebnf ; IN: urls : url-quotable? ( ch -- ? ) @@ -122,38 +123,57 @@ TUPLE: url protocol username password host port path query anchor ; ] when ] bi* ; ->protocol ] [ - "//" ?head [ "Invalid URL" throw ] unless - "@" split1 [ - [ - ":" split1 [ >>username ] [ >>password ] bi* - ] dip - ] when* - "/" split1 [ - parse-host [ >>host ] [ >>port ] bi* - ] [ "/" prepend ] bi* - ] bi* ; - -PRIVATE> - GENERIC: >url ( obj -- url ) M: f >url drop ; M: url >url ; + [[ url-decode ]] +username = [^/:@#?]+ => [[ url-decode ]] +password = [^/:@#?]+ => [[ url-decode ]] +pathname = [^#?]+ => [[ url-decode ]] +query = [^#]+ => [[ query>assoc ]] +anchor = .+ => [[ url-decode ]] + +hostname = [^/#?]+ => [[ url-decode ]] + +hostname-spec = hostname ("/"|!(.)) => [[ first ]] + +auth = (username (":" password => [[ second ]])? "@" + => [[ first2 2array ]])? + +url = ((protocol "://") => [[ first ]] auth hostname)? + (pathname)? + ("?" query => [[ second ]])? + ("#" anchor => [[ second ]])? + +;EBNF + +PRIVATE> + M: string >url - swap - ":" split1 [ parse-host-part ] when* - "#" split1 [ - "?" split1 - [ url-decode >>path ] - [ [ query>assoc >>query ] when* ] bi* - ] - [ url-decode >>anchor ] bi* ; + parse-url { + [ + first [ + [ first ] ! protocol + [ + second + [ first [ first2 ] [ f f ] if* ] ! username, password + [ second parse-host ] ! host, port + bi + ] bi + ] [ f f f f f ] if* + ] + [ second ] ! pathname + [ third ] ! query + [ fourth ] ! anchor + } cleave url boa + dup host>> [ [ "/" or ] change-path ] when ; : protocol-port ( protocol -- port ) {