working on lookbehind

db4
Doug Coleman 2008-09-22 13:37:27 -05:00
parent 56fbeb25ff
commit 80a9147691
5 changed files with 75 additions and 19 deletions

View File

@ -14,6 +14,8 @@ SINGLETON: eps
MIXIN: traversal-flag
SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag
SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag
SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
@ -143,6 +145,14 @@ M: lookahead nfa-node ( node -- )
lookahead-off add-traversal-flag
2 [ concatenate-nodes ] times ;
M: lookbehind nfa-node ( node -- )
eps literal-transition add-simple-entry
lookbehind-on add-traversal-flag
term>> nfa-node
eps literal-transition add-simple-entry
lookbehind-off add-traversal-flag
2 [ concatenate-nodes ] times ;
: construct-nfa ( regexp -- )
[
reset-regexp

View File

@ -230,8 +230,18 @@ ERROR: invalid-range a b ;
[ [ nip at-most-n ] [ at-least-n ] if* ] if
] [ drop 0 max exactly-n ] if ;
SINGLETON: beginning-of-input
SINGLETON: end-of-input
! : beginning-of-input ( -- obj )
: handle-front-anchor ( -- ) front-anchor push-stack ;
: handle-back-anchor ( -- ) back-anchor push-stack ;
: end-of-line ( -- obj )
end-of-input
CHAR: \r <constant>
CHAR: \n <constant>
2dup 2array <concatenation> 4array <alternation> lookahead boa ;
: handle-back-anchor ( -- ) end-of-line push-stack ;
ERROR: bad-character-class obj ;
ERROR: expected-posix-class ;
@ -277,6 +287,8 @@ ERROR: unrecognized-escape char ;
read1
{
{ CHAR: \ [ CHAR: \ <constant> ] }
{ CHAR: ^ [ CHAR: ^ <constant> ] }
{ CHAR: $ [ CHAR: $ <constant> ] }
{ CHAR: - [ CHAR: - <constant> ] }
{ CHAR: { [ CHAR: { <constant> ] }
{ CHAR: } [ CHAR: } <constant> ] }
@ -289,7 +301,6 @@ ERROR: unrecognized-escape char ;
{ CHAR: + [ CHAR: + <constant> ] }
{ CHAR: ? [ CHAR: ? <constant> ] }
{ CHAR: . [ CHAR: . <constant> ] }
! xyzzy
{ CHAR: : [ CHAR: : <constant> ] }
{ CHAR: t [ CHAR: \t <constant> ] }
{ CHAR: n [ CHAR: \n <constant> ] }
@ -297,8 +308,6 @@ ERROR: unrecognized-escape char ;
{ CHAR: f [ HEX: c <constant> ] }
{ CHAR: a [ HEX: 7 <constant> ] }
{ CHAR: e [ HEX: 1b <constant> ] }
{ CHAR: $ [ CHAR: $ <constant> ] }
{ CHAR: ^ [ CHAR: ^ <constant> ] }
{ CHAR: d [ digit-class ] }
{ CHAR: D [ digit-class <negation> ] }
@ -320,16 +329,16 @@ ERROR: unrecognized-escape char ;
! { CHAR: G [ end of previous match ] }
! { CHAR: Z [ handle-end-of-input ] }
! { CHAR: z [ handle-end-of-input ] } ! except for terminator
! xyzzy
{ CHAR: 1 [ CHAR: 1 <constant> ] }
{ CHAR: 2 [ CHAR: 2 <constant> ] }
{ CHAR: 3 [ CHAR: 3 <constant> ] }
{ CHAR: 4 [ CHAR: 4 <constant> ] }
{ CHAR: 5 [ CHAR: 5 <constant> ] }
{ CHAR: 6 [ CHAR: 6 <constant> ] }
{ CHAR: 7 [ CHAR: 7 <constant> ] }
{ CHAR: 8 [ CHAR: 8 <constant> ] }
{ CHAR: 9 [ CHAR: 9 <constant> ] }
! { CHAR: 1 [ CHAR: 1 <constant> ] }
! { CHAR: 2 [ CHAR: 2 <constant> ] }
! { CHAR: 3 [ CHAR: 3 <constant> ] }
! { CHAR: 4 [ CHAR: 4 <constant> ] }
! { CHAR: 5 [ CHAR: 5 <constant> ] }
! { CHAR: 6 [ CHAR: 6 <constant> ] }
! { CHAR: 7 [ CHAR: 7 <constant> ] }
! { CHAR: 8 [ CHAR: 8 <constant> ] }
! { CHAR: 9 [ CHAR: 9 <constant> ] }
{ CHAR: Q [ parse-escaped-literals ] }
[ unrecognized-escape ]
@ -406,6 +415,10 @@ DEFER: handle-left-bracket
: parse-regexp-token ( token -- ? )
{
! todo: only match these at beginning/end of regexp
{ CHAR: ^ [ handle-front-anchor t ] }
{ CHAR: $ [ handle-back-anchor t ] }
{ CHAR: . [ handle-dot t ] }
{ CHAR: ( [ handle-left-parenthesis t ] }
{ CHAR: ) [ handle-right-parenthesis f ] }
@ -415,8 +428,6 @@ DEFER: handle-left-bracket
{ CHAR: + [ handle-plus t ] }
{ CHAR: { [ handle-left-brace t ] }
{ CHAR: [ [ handle-left-bracket t ] }
{ CHAR: ^ [ handle-front-anchor t ] }
{ CHAR: $ [ handle-back-anchor t ] }
{ CHAR: \ [ handle-escape t ] }
[ <constant> push-stack t ]
} case ;

View File

@ -285,3 +285,17 @@ IN: regexp-tests
! 2. (A)
! 3. (B(C))
! 4. (C)
! clear "a(?=b*)" <regexp> "ab" over match
! clear "a(?=b*c)" <regexp> "abbbbbc" over match
! clear "a(?=b*)" <regexp> "ab" over match
! clear "^a" <regexp> "a" over match
! clear "^a" <regexp> "\na" over match
! clear "^a" <regexp> "\r\na" over match
! clear "^a" <regexp> "\ra" over match
! clear "a$" <regexp> "a" over match
! clear "a$" <regexp> "a\n" over match
! clear "a$" <regexp> "a\r" over match
! clear "a$" <regexp> "a\r\n" over match

View File

@ -128,6 +128,8 @@ IN: regexp
: option? ( option regexp -- ? )
options>> key? ;
USE: multiline
/*
M: regexp pprint*
[
[
@ -136,3 +138,4 @@ M: regexp pprint*
case-insensitive swap option? [ "i" % ] when
] "" make
] keep present-text ;
*/

View File

@ -8,9 +8,11 @@ IN: regexp.traversal
TUPLE: dfa-traverser
dfa-table
traversal-flags
traverse-forward
capture-groups
{ capture-group-index integer }
lookahead-counters
lookbehind-counters
last-state current-state
text
start-index current-index
@ -23,10 +25,12 @@ TUPLE: dfa-traverser
swap [ start-state>> >>current-state ] keep
>>dfa-table
swap >>text
t >>traverse-forward
0 >>start-index
0 >>current-index
V{ } clone >>matches
V{ } clone >>capture-groups
V{ } clone >>lookbehind-counters
V{ } clone >>lookahead-counters ;
: final-state? ( dfa-traverser -- ? )
@ -52,8 +56,19 @@ M: lookahead-on flag-action ( dfa-traverser flag -- )
M: lookahead-off flag-action ( dfa-traverser flag -- )
drop
dup lookahead-counters>> pop
'[ _ - ] change-current-index drop ;
dup lookahead-counters>>
[ drop ] [ pop '[ _ - ] change-current-index drop ] if-empty ;
M: lookbehind-on flag-action ( dfa-traverser flag -- )
drop
f >>traverse-forward
lookbehind-counters>> 0 swap push ;
M: lookbehind-off flag-action ( dfa-traverser flag -- )
drop
t >>traverse-forward
dup lookbehind-counters>>
[ drop ] [ pop '[ _ + ] change-current-index drop ] if-empty ;
: process-flags ( dfa-traverser -- )
[ [ 1+ ] map ] change-lookahead-counters
@ -62,7 +77,10 @@ M: lookahead-off flag-action ( dfa-traverser flag -- )
: increment-state ( dfa-traverser state -- dfa-traverser )
[
[ 1+ ] change-current-index dup current-state>> >>last-state
dup traverse-forward>>
[ [ 1+ ] change-current-index ]
[ [ 1- ] change-current-index ] if
dup current-state>> >>last-state
] dip
first >>current-state ;