first stab at capture groups. they work for unambiguous groups (no overlap), working on fixing it for any case

db4
Doug Coleman 2008-09-22 20:09:42 -05:00
parent 35564e4377
commit c252060bb5
4 changed files with 48 additions and 8 deletions

View File

@ -121,7 +121,12 @@ M: character-class-range nfa-node ( node -- )
class-transition add-simple-entry ;
M: capture-group nfa-node ( node -- )
term>> nfa-node ;
eps literal-transition add-simple-entry
capture-group-on add-traversal-flag
term>> nfa-node
eps literal-transition add-simple-entry
capture-group-off add-traversal-flag
2 [ concatenate-nodes ] times ;
! xyzzy
M: non-capture-group nfa-node ( node -- )

View File

@ -318,7 +318,17 @@ IN: regexp-tests
[ { 0 1 } ] [ "ab" "a(?=b)(?=b)" <regexp> first-match ] unit-test
[ { 1 2 } ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match ] unit-test
[ { 1 2 } ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match ] unit-test
! capture group 1: "aaaa" 2: ""
! "aaaa" "(a*)(a*)" <regexp> match*
! "aaaa" "(a*)(a+)" <regexp> match*
[ { 0 2 } ] [ "ab" "(a|ab)(bc)?" <regexp> first-match ] unit-test
[ { 0 3 } ] [ "abc" "(a|ab)(bc)?" <regexp> first-match ] unit-test
[ { 0 2 } ] [ "ab" "(ab|a)(bc)?" <regexp> first-match ] unit-test
[ { 0 3 } ] [ "abc" "(ab|a)(bc)?" <regexp> first-match ] unit-test
[ { 23 24 } ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> first-match ] unit-test

View File

@ -28,6 +28,9 @@ IN: regexp
: match ( string regexp -- pair )
<dfa-traverser> do-match return-match ;
: match* ( string regexp -- pair )
<dfa-traverser> do-match [ return-match ] [ captured-groups>> ] bi ;
: matches? ( string regexp -- ? )
dupd match
[ [ length ] [ length>> 1- ] bi* = ] [ drop f ] if* ;

View File

@ -1,7 +1,7 @@
! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors assocs combinators kernel math math.ranges
quotations sequences regexp.parser regexp.classes fry
quotations sequences regexp.parser regexp.classes fry arrays
combinators.short-circuit regexp.utils prettyprint regexp.nfa ;
IN: regexp.traversal
@ -9,10 +9,11 @@ TUPLE: dfa-traverser
dfa-table
traversal-flags
traverse-forward
capture-groups
{ capture-group-index integer }
lookahead-counters
lookbehind-counters
capture-counters
captured-groups
capture-group-index
last-state current-state
text
start-index current-index
@ -28,10 +29,12 @@ TUPLE: dfa-traverser
t >>traverse-forward
0 >>start-index
0 >>current-index
0 >>capture-group-index
V{ } clone >>matches
V{ } clone >>capture-groups
V{ } clone >>capture-counters
V{ } clone >>lookbehind-counters
V{ } clone >>lookahead-counters ;
V{ } clone >>lookahead-counters
H{ } clone >>captured-groups ;
: final-state? ( dfa-traverser -- ? )
[ current-state>> ] [ dfa-table>> final-states>> ] bi
@ -75,9 +78,28 @@ M: lookbehind-off flag-action ( dfa-traverser flag -- )
dup lookbehind-counters>>
[ drop ] [ pop '[ _ + 2 + ] change-current-index drop ] if-empty ;
M: capture-group-on flag-action ( dfa-traverser flag -- )
drop
[ current-index>> 0 2array ]
[ capture-counters>> ] bi push ;
M: capture-group-off flag-action ( dfa-traverser flag -- )
drop
dup capture-counters>> empty? [
drop
] [
{
[ capture-counters>> pop first2 dupd + ]
[ text>> <slice> ]
[ [ 1+ ] change-capture-group-index capture-group-index>> ]
[ captured-groups>> set-at ]
} cleave
] if ;
: process-flags ( dfa-traverser -- )
[ [ 1+ ] map ] change-lookahead-counters
[ [ 1+ ] map ] change-lookbehind-counters
[ [ first2 1+ 2array ] map ] change-capture-counters
! dup current-state>> .
dup [ current-state>> ] [ traversal-flags>> ] bi
at [ dup . flag-action ] with each ;