diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index aebfea04e1..72d0fe970b 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -121,7 +121,12 @@ M: character-class-range nfa-node ( node -- ) class-transition add-simple-entry ; M: capture-group nfa-node ( node -- ) - term>> nfa-node ; + eps literal-transition add-simple-entry + capture-group-on add-traversal-flag + term>> nfa-node + eps literal-transition add-simple-entry + capture-group-off add-traversal-flag + 2 [ concatenate-nodes ] times ; ! xyzzy M: non-capture-group nfa-node ( node -- ) diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 934b635e50..46696c8c0f 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -318,7 +318,17 @@ IN: regexp-tests [ { 0 1 } ] [ "ab" "a(?=b)(?=b)" first-match ] unit-test [ { 1 2 } ] [ "ba" "a(?<=b)(?<=b)" first-match ] unit-test - - [ { 1 2 } ] [ "cab" "a(?=b)(?<=c)" first-match ] unit-test +! capture group 1: "aaaa" 2: "" +! "aaaa" "(a*)(a*)" match* +! "aaaa" "(a*)(a+)" match* + +[ { 0 2 } ] [ "ab" "(a|ab)(bc)?" first-match ] unit-test +[ { 0 3 } ] [ "abc" "(a|ab)(bc)?" first-match ] unit-test + +[ { 0 2 } ] [ "ab" "(ab|a)(bc)?" first-match ] unit-test +[ { 0 3 } ] [ "abc" "(ab|a)(bc)?" first-match ] unit-test + +[ { 23 24 } ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" first-match ] unit-test + diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index debf94ef33..73555fe953 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -28,6 +28,9 @@ IN: regexp : match ( string regexp -- pair ) do-match return-match ; +: match* ( string regexp -- pair ) + do-match [ return-match ] [ captured-groups>> ] bi ; + : matches? ( string regexp -- ? ) dupd match [ [ length ] [ length>> 1- ] bi* = ] [ drop f ] if* ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index d82e9941a2..f5a235fa7f 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors assocs combinators kernel math math.ranges -quotations sequences regexp.parser regexp.classes fry +quotations sequences regexp.parser regexp.classes fry arrays combinators.short-circuit regexp.utils prettyprint regexp.nfa ; IN: regexp.traversal @@ -9,10 +9,11 @@ TUPLE: dfa-traverser dfa-table traversal-flags traverse-forward - capture-groups - { capture-group-index integer } lookahead-counters lookbehind-counters + capture-counters + captured-groups + capture-group-index last-state current-state text start-index current-index @@ -28,10 +29,12 @@ TUPLE: dfa-traverser t >>traverse-forward 0 >>start-index 0 >>current-index + 0 >>capture-group-index V{ } clone >>matches - V{ } clone >>capture-groups + V{ } clone >>capture-counters V{ } clone >>lookbehind-counters - V{ } clone >>lookahead-counters ; + V{ } clone >>lookahead-counters + H{ } clone >>captured-groups ; : final-state? ( dfa-traverser -- ? ) [ current-state>> ] [ dfa-table>> final-states>> ] bi @@ -75,9 +78,28 @@ M: lookbehind-off flag-action ( dfa-traverser flag -- ) dup lookbehind-counters>> [ drop ] [ pop '[ _ + 2 + ] change-current-index drop ] if-empty ; +M: capture-group-on flag-action ( dfa-traverser flag -- ) + drop + [ current-index>> 0 2array ] + [ capture-counters>> ] bi push ; + +M: capture-group-off flag-action ( dfa-traverser flag -- ) + drop + dup capture-counters>> empty? [ + drop + ] [ + { + [ capture-counters>> pop first2 dupd + ] + [ text>> ] + [ [ 1+ ] change-capture-group-index capture-group-index>> ] + [ captured-groups>> set-at ] + } cleave + ] if ; + : process-flags ( dfa-traverser -- ) [ [ 1+ ] map ] change-lookahead-counters [ [ 1+ ] map ] change-lookbehind-counters + [ [ first2 1+ 2array ] map ] change-capture-counters ! dup current-state>> . dup [ current-state>> ] [ traversal-flags>> ] bi at [ dup . flag-action ] with each ;