From c5a3f89b0409487b8bc1e178d635f668efa4405f Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sun, 21 Sep 2008 11:58:09 -0500 Subject: [PATCH 1/4] add feed:// since firefox and safari support it, throw a better error than "fall-through in case" --- basis/urls/urls.factor | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/basis/urls/urls.factor b/basis/urls/urls.factor index 4f2639975b..f4a6a7d792 100644 --- a/basis/urls/urls.factor +++ b/basis/urls/urls.factor @@ -212,11 +212,15 @@ PRIVATE> [ [ host>> ] [ port>> ] bi ] [ protocol>> ] bi secure-protocol? [ ] when ; +ERROR: no-protocol-found protocol ; + : protocol-port ( protocol -- port ) { { "http" [ 80 ] } { "https" [ 443 ] } + { "feed" [ 80 ] } { "ftp" [ 21 ] } + [ no-protocol-found ] } case ; : ensure-port ( url -- url' ) From ee4faceb4190f65a1faa232653d0123597c833a4 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sun, 21 Sep 2008 21:45:27 -0500 Subject: [PATCH 2/4] traversal flags machinery in place, lookahead works but shouldnt create capturing groups --- unfinished/regexp/dfa/dfa.factor | 16 +++++++----- unfinished/regexp/regexp-tests.factor | 4 +-- unfinished/regexp/traversal/traversal.factor | 26 +++++++++++++++----- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/unfinished/regexp/dfa/dfa.factor b/unfinished/regexp/dfa/dfa.factor index 6200a1b3c0..cd6dab6a06 100644 --- a/unfinished/regexp/dfa/dfa.factor +++ b/unfinished/regexp/dfa/dfa.factor @@ -68,12 +68,16 @@ IN: regexp.dfa 1vector >>new-states drop ; : set-traversal-flags ( regexp -- ) - [ dfa-table>> transitions>> keys ] + dup [ nfa-traversal-flags>> ] - bi 2drop ; + [ dfa-table>> transitions>> keys ] bi + [ tuck [ swap at ] with map concat ] with H{ } map>assoc + >>dfa-traversal-flags drop ; : construct-dfa ( regexp -- ) - [ set-initial-state ] - [ new-transitions ] - [ set-final-states ] tri ; - ! [ set-traversal-flags ] quad ; + { + [ set-initial-state ] + [ new-transitions ] + [ set-final-states ] + [ set-traversal-flags ] + } cleave ; diff --git a/unfinished/regexp/regexp-tests.factor b/unfinished/regexp/regexp-tests.factor index 78098952d3..ab3bca9ead 100644 --- a/unfinished/regexp/regexp-tests.factor +++ b/unfinished/regexp/regexp-tests.factor @@ -251,8 +251,8 @@ IN: regexp-tests ! [ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test ! [ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test -! [ 3 ] [ "foobar" "foo(?=bar)" match-head ] unit-test -! [ f ] [ "foobxr" "foo(?=bar)" match-head ] unit-test +[ 3 ] [ "foobar" "foo(?=bar)" match-head ] unit-test +[ f ] [ "foobxr" "foo(?=bar)" match-head ] unit-test ! [ f ] [ "foobxr" "foo\\z" match-head ] unit-test ! [ 3 ] [ "foo" "foo\\z" match-head ] unit-test diff --git a/unfinished/regexp/traversal/traversal.factor b/unfinished/regexp/traversal/traversal.factor index cfc97aff29..6f41b16d95 100644 --- a/unfinished/regexp/traversal/traversal.factor +++ b/unfinished/regexp/traversal/traversal.factor @@ -1,8 +1,8 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors assocs combinators kernel math math.ranges -quotations sequences regexp.parser regexp.classes -combinators.short-circuit regexp.utils ; +quotations sequences regexp.parser regexp.classes fry +combinators.short-circuit regexp.utils prettyprint regexp.nfa ; IN: regexp.traversal TUPLE: dfa-traverser @@ -10,7 +10,7 @@ TUPLE: dfa-traverser traversal-flags capture-groups { capture-group-index integer } - { lookahead-counter integer } + lookahead-counters last-state current-state text start-index current-index @@ -26,7 +26,8 @@ TUPLE: dfa-traverser 0 >>start-index 0 >>current-index V{ } clone >>matches - V{ } clone >>capture-groups ; + V{ } clone >>capture-groups + V{ } clone >>lookahead-counters ; : final-state? ( dfa-traverser -- ? ) [ current-state>> ] [ dfa-table>> final-states>> ] bi @@ -43,9 +44,21 @@ TUPLE: dfa-traverser dup save-final-state ] when text-finished? ; -: print-flags ( dfa-traverser -- dfa-traverser ) +GENERIC: flag-action ( dfa-traverser flag -- ) + +M: lookahead-on flag-action ( dfa-traverser flag -- ) + drop + lookahead-counters>> 0 swap push ; + +M: lookahead-off flag-action ( dfa-traverser flag -- ) + drop + dup lookahead-counters>> pop + '[ _ - ] change-current-index drop ; + +: process-flags ( dfa-traverser -- ) + [ [ 1+ ] map ] change-lookahead-counters dup [ current-state>> ] [ traversal-flags>> ] bi - ; + at [ dup . flag-action ] with each ; : increment-state ( dfa-traverser state -- dfa-traverser ) [ @@ -79,6 +92,7 @@ TUPLE: dfa-traverser [ nth ] 2dip ; : do-match ( dfa-traverser -- dfa-traverser ) + dup process-flags dup match-done? [ dup setup-match match-transition [ increment-state do-match ] when* From da35e13153f91a39e2b53971fbcdd103fa9b1145 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Mon, 22 Sep 2008 10:48:01 -0500 Subject: [PATCH 3/4] fix the parser for groups --- basis/regexp/parser/parser.factor | 76 ++++++++++++++----------------- 1 file changed, 35 insertions(+), 41 deletions(-) diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index eaee70210e..fc3f949670 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -1,10 +1,10 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators io io.streams.string -kernel math math.parser multi-methods namespaces qualified sets +kernel math math.parser namespaces qualified sets quotations sequences splitting symbols vectors math.order unicode.categories strings regexp.backend regexp.utils -unicode.case ; +unicode.case words ; IN: regexp.parser FROM: math.ranges => [a,b] ; @@ -25,11 +25,21 @@ TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node TUPLE: negation term ; INSTANCE: negation node TUPLE: constant char ; INSTANCE: constant node TUPLE: range from to ; INSTANCE: range node + +MIXIN: parentheses-group TUPLE: lookahead term ; INSTANCE: lookahead node +INSTANCE: lookahead parentheses-group TUPLE: lookbehind term ; INSTANCE: lookbehind node +INSTANCE: lookbehind parentheses-group TUPLE: capture-group term ; INSTANCE: capture-group node +INSTANCE: capture-group parentheses-group TUPLE: non-capture-group term ; INSTANCE: non-capture-group node +INSTANCE: non-capture-group parentheses-group TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group +INSTANCE: independent-group parentheses-group +TUPLE: comment-group term ; INSTANCE: comment-group node +INSTANCE: comment-group parentheses-group + TUPLE: character-class-range from to ; INSTANCE: character-class-range node SINGLETON: epsilon INSTANCE: epsilon node SINGLETON: any-char INSTANCE: any-char node @@ -98,25 +108,6 @@ left-parenthesis pipe caret dash ; ERROR: unmatched-parentheses ; -: make-positive-lookahead ( string -- ) - lookahead boa push-stack ; - -: make-negative-lookahead ( string -- ) - lookahead boa push-stack ; - -: make-independent-group ( string -- ) - #! no backtracking - independent-group boa push-stack ; - -: make-positive-lookbehind ( string -- ) - lookbehind boa push-stack ; - -: make-negative-lookbehind ( string -- ) - lookbehind boa push-stack ; - -: make-non-capturing-group ( string -- ) - non-capture-group boa push-stack ; - ERROR: bad-option ch ; : option ( ch -- singleton ) @@ -143,33 +134,38 @@ ERROR: bad-option ch ; DEFER: (parse-regexp) : parse-special-group ( -- ) - beginning-of-group push-stack - (parse-regexp) pop-stack make-non-capturing-group ; + ; + ! beginning-of-group push-stack + ! (parse-regexp) pop-stack make-non-capturing-group ; ERROR: bad-special-group string ; -DEFER: nested-parse-regexp +: nested-parse-regexp ( token ? -- ) + [ push-stack (parse-regexp) pop-stack ] dip + [ ] when pop-stack boa push-stack ; + +! non-capturing groups : (parse-special-group) ( -- ) read1 { - { [ dup CHAR: # = ] - [ drop nested-parse-regexp pop-stack drop ] } + { [ dup CHAR: # = ] ! comment + [ drop comment-group f nested-parse-regexp pop-stack drop ] } { [ dup CHAR: : = ] - [ drop nested-parse-regexp pop-stack make-non-capturing-group ] } + [ drop non-capture-group f nested-parse-regexp ] } { [ dup CHAR: = = ] - [ drop nested-parse-regexp pop-stack make-positive-lookahead ] } + [ drop lookahead f nested-parse-regexp ] } { [ dup CHAR: ! = ] - [ drop nested-parse-regexp pop-stack make-negative-lookahead ] } + [ drop lookahead t nested-parse-regexp ] } { [ dup CHAR: > = ] - [ drop nested-parse-regexp pop-stack make-independent-group ] } + [ drop non-capture-group f nested-parse-regexp ] } { [ dup CHAR: < = peek1 CHAR: = = and ] - [ drop drop1 nested-parse-regexp pop-stack make-positive-lookbehind ] } + [ drop drop1 lookbehind f nested-parse-regexp ] } { [ dup CHAR: < = peek1 CHAR: ! = and ] - [ drop drop1 nested-parse-regexp pop-stack make-negative-lookbehind ] } + [ drop drop1 lookbehind t nested-parse-regexp ] } [ ":)" read-until [ swap prefix ] dip { - { CHAR: : [ parse-options parse-special-group ] } + { CHAR: : [ parse-options (parse-special-group) ] } { CHAR: ) [ parse-options ] } [ drop bad-special-group ] } case @@ -179,7 +175,7 @@ DEFER: nested-parse-regexp : handle-left-parenthesis ( -- ) peek1 CHAR: ? = [ drop1 (parse-special-group) ] - [ nested-parse-regexp ] if ; + [ capture-group f nested-parse-regexp ] if ; : handle-dot ( -- ) any-char push-stack ; : handle-pipe ( -- ) pipe push-stack ; @@ -408,14 +404,12 @@ DEFER: handle-left-bracket [ first|concatenation ] map first|alternation ; : handle-right-parenthesis ( -- ) - stack beginning-of-group over last-index cut rest - [ current-regexp get swap >>stack drop ] - [ finish-regexp-parse push-stack ] bi* ; + stack dup [ parentheses-group "members" word-prop member? ] find-last -rot cut rest + [ [ push ] keep current-regexp get (>>stack) ] + [ finish-regexp-parse push-stack ] bi* ; -: nested-parse-regexp ( -- ) - beginning-of-group push-stack (parse-regexp) ; -: ((parse-regexp)) ( token -- ? ) +: parse-regexp-token ( token -- ? ) { { CHAR: . [ handle-dot t ] } { CHAR: ( [ handle-left-parenthesis t ] } @@ -433,7 +427,7 @@ DEFER: handle-left-bracket } case ; : (parse-regexp) ( -- ) - read1 [ ((parse-regexp)) [ (parse-regexp) ] when ] when* ; + read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ; : parse-regexp ( regexp -- ) dup current-regexp [ From 56fbeb25ff88228a270bde7c584d3714a8350568 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Mon, 22 Sep 2008 11:45:36 -0500 Subject: [PATCH 4/4] fix options before non-capturing groups --- basis/regexp/parser/parser.factor | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index fc3f949670..987d05591f 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -132,14 +132,9 @@ ERROR: bad-option ch ; : parse-options ( string -- ) "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ; -DEFER: (parse-regexp) -: parse-special-group ( -- ) - ; - ! beginning-of-group push-stack - ! (parse-regexp) pop-stack make-non-capturing-group ; - ERROR: bad-special-group string ; +DEFER: (parse-regexp) : nested-parse-regexp ( token ? -- ) [ push-stack (parse-regexp) pop-stack ] dip [ ] when pop-stack boa push-stack ; @@ -165,7 +160,7 @@ ERROR: bad-special-group string ; ":)" read-until [ swap prefix ] dip { - { CHAR: : [ parse-options (parse-special-group) ] } + { CHAR: : [ parse-options non-capture-group f nested-parse-regexp ] } { CHAR: ) [ parse-options ] } [ drop bad-special-group ] } case