From 80a914769124e6517121f7d7f52fe3b19a59be60 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Mon, 22 Sep 2008 13:37:27 -0500 Subject: [PATCH] working on lookbehind --- basis/regexp/nfa/nfa.factor | 10 ++++++ basis/regexp/parser/parser.factor | 43 ++++++++++++++++--------- basis/regexp/regexp-tests.factor | 14 ++++++++ basis/regexp/regexp.factor | 3 ++ basis/regexp/traversal/traversal.factor | 24 ++++++++++++-- 5 files changed, 75 insertions(+), 19 deletions(-) diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index f070c3528b..aebfea04e1 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -14,6 +14,8 @@ SINGLETON: eps MIXIN: traversal-flag SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag +SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag +SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag @@ -143,6 +145,14 @@ M: lookahead nfa-node ( node -- ) lookahead-off add-traversal-flag 2 [ concatenate-nodes ] times ; +M: lookbehind nfa-node ( node -- ) + eps literal-transition add-simple-entry + lookbehind-on add-traversal-flag + term>> nfa-node + eps literal-transition add-simple-entry + lookbehind-off add-traversal-flag + 2 [ concatenate-nodes ] times ; + : construct-nfa ( regexp -- ) [ reset-regexp diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 987d05591f..d2ed346bf2 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -230,8 +230,18 @@ ERROR: invalid-range a b ; [ [ nip at-most-n ] [ at-least-n ] if* ] if ] [ drop 0 max exactly-n ] if ; +SINGLETON: beginning-of-input +SINGLETON: end-of-input + +! : beginning-of-input ( -- obj ) : handle-front-anchor ( -- ) front-anchor push-stack ; -: handle-back-anchor ( -- ) back-anchor push-stack ; +: end-of-line ( -- obj ) + end-of-input + CHAR: \r + CHAR: \n + 2dup 2array 4array lookahead boa ; + +: handle-back-anchor ( -- ) end-of-line push-stack ; ERROR: bad-character-class obj ; ERROR: expected-posix-class ; @@ -277,6 +287,8 @@ ERROR: unrecognized-escape char ; read1 { { CHAR: \ [ CHAR: \ ] } + { CHAR: ^ [ CHAR: ^ ] } + { CHAR: $ [ CHAR: $ ] } { CHAR: - [ CHAR: - ] } { CHAR: { [ CHAR: { ] } { CHAR: } [ CHAR: } ] } @@ -289,7 +301,6 @@ ERROR: unrecognized-escape char ; { CHAR: + [ CHAR: + ] } { CHAR: ? [ CHAR: ? ] } { CHAR: . [ CHAR: . ] } -! xyzzy { CHAR: : [ CHAR: : ] } { CHAR: t [ CHAR: \t ] } { CHAR: n [ CHAR: \n ] } @@ -297,8 +308,6 @@ ERROR: unrecognized-escape char ; { CHAR: f [ HEX: c ] } { CHAR: a [ HEX: 7 ] } { CHAR: e [ HEX: 1b ] } - { CHAR: $ [ CHAR: $ ] } - { CHAR: ^ [ CHAR: ^ ] } { CHAR: d [ digit-class ] } { CHAR: D [ digit-class ] } @@ -320,16 +329,16 @@ ERROR: unrecognized-escape char ; ! { CHAR: G [ end of previous match ] } ! { CHAR: Z [ handle-end-of-input ] } ! { CHAR: z [ handle-end-of-input ] } ! except for terminator -! xyzzy - { CHAR: 1 [ CHAR: 1 ] } - { CHAR: 2 [ CHAR: 2 ] } - { CHAR: 3 [ CHAR: 3 ] } - { CHAR: 4 [ CHAR: 4 ] } - { CHAR: 5 [ CHAR: 5 ] } - { CHAR: 6 [ CHAR: 6 ] } - { CHAR: 7 [ CHAR: 7 ] } - { CHAR: 8 [ CHAR: 8 ] } - { CHAR: 9 [ CHAR: 9 ] } + + ! { CHAR: 1 [ CHAR: 1 ] } + ! { CHAR: 2 [ CHAR: 2 ] } + ! { CHAR: 3 [ CHAR: 3 ] } + ! { CHAR: 4 [ CHAR: 4 ] } + ! { CHAR: 5 [ CHAR: 5 ] } + ! { CHAR: 6 [ CHAR: 6 ] } + ! { CHAR: 7 [ CHAR: 7 ] } + ! { CHAR: 8 [ CHAR: 8 ] } + ! { CHAR: 9 [ CHAR: 9 ] } { CHAR: Q [ parse-escaped-literals ] } [ unrecognized-escape ] @@ -406,6 +415,10 @@ DEFER: handle-left-bracket : parse-regexp-token ( token -- ? ) { +! todo: only match these at beginning/end of regexp + { CHAR: ^ [ handle-front-anchor t ] } + { CHAR: $ [ handle-back-anchor t ] } + { CHAR: . [ handle-dot t ] } { CHAR: ( [ handle-left-parenthesis t ] } { CHAR: ) [ handle-right-parenthesis f ] } @@ -415,8 +428,6 @@ DEFER: handle-left-bracket { CHAR: + [ handle-plus t ] } { CHAR: { [ handle-left-brace t ] } { CHAR: [ [ handle-left-bracket t ] } - { CHAR: ^ [ handle-front-anchor t ] } - { CHAR: $ [ handle-back-anchor t ] } { CHAR: \ [ handle-escape t ] } [ push-stack t ] } case ; diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index ab3bca9ead..4d25a58c39 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -285,3 +285,17 @@ IN: regexp-tests ! 2. (A) ! 3. (B(C)) ! 4. (C) + +! clear "a(?=b*)" "ab" over match +! clear "a(?=b*c)" "abbbbbc" over match +! clear "a(?=b*)" "ab" over match + +! clear "^a" "a" over match +! clear "^a" "\na" over match +! clear "^a" "\r\na" over match +! clear "^a" "\ra" over match + +! clear "a$" "a" over match +! clear "a$" "a\n" over match +! clear "a$" "a\r" over match +! clear "a$" "a\r\n" over match diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index c4ab2675d2..debf94ef33 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -128,6 +128,8 @@ IN: regexp : option? ( option regexp -- ? ) options>> key? ; +USE: multiline +/* M: regexp pprint* [ [ @@ -136,3 +138,4 @@ M: regexp pprint* case-insensitive swap option? [ "i" % ] when ] "" make ] keep present-text ; +*/ diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index 6f41b16d95..d4b43c37f0 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -8,9 +8,11 @@ IN: regexp.traversal TUPLE: dfa-traverser dfa-table traversal-flags + traverse-forward capture-groups { capture-group-index integer } lookahead-counters + lookbehind-counters last-state current-state text start-index current-index @@ -23,10 +25,12 @@ TUPLE: dfa-traverser swap [ start-state>> >>current-state ] keep >>dfa-table swap >>text + t >>traverse-forward 0 >>start-index 0 >>current-index V{ } clone >>matches V{ } clone >>capture-groups + V{ } clone >>lookbehind-counters V{ } clone >>lookahead-counters ; : final-state? ( dfa-traverser -- ? ) @@ -52,8 +56,19 @@ M: lookahead-on flag-action ( dfa-traverser flag -- ) M: lookahead-off flag-action ( dfa-traverser flag -- ) drop - dup lookahead-counters>> pop - '[ _ - ] change-current-index drop ; + dup lookahead-counters>> + [ drop ] [ pop '[ _ - ] change-current-index drop ] if-empty ; + +M: lookbehind-on flag-action ( dfa-traverser flag -- ) + drop + f >>traverse-forward + lookbehind-counters>> 0 swap push ; + +M: lookbehind-off flag-action ( dfa-traverser flag -- ) + drop + t >>traverse-forward + dup lookbehind-counters>> + [ drop ] [ pop '[ _ + ] change-current-index drop ] if-empty ; : process-flags ( dfa-traverser -- ) [ [ 1+ ] map ] change-lookahead-counters @@ -62,7 +77,10 @@ M: lookahead-off flag-action ( dfa-traverser flag -- ) : increment-state ( dfa-traverser state -- dfa-traverser ) [ - [ 1+ ] change-current-index dup current-state>> >>last-state + dup traverse-forward>> + [ [ 1+ ] change-current-index ] + [ [ 1- ] change-current-index ] if + dup current-state>> >>last-state ] dip first >>current-state ;