working on lookbehind
parent
56fbeb25ff
commit
80a9147691
|
@ -14,6 +14,8 @@ SINGLETON: eps
|
|||
MIXIN: traversal-flag
|
||||
SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag
|
||||
SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag
|
||||
SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
|
||||
SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
|
||||
SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
|
||||
SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
|
||||
|
||||
|
@ -143,6 +145,14 @@ M: lookahead nfa-node ( node -- )
|
|||
lookahead-off add-traversal-flag
|
||||
2 [ concatenate-nodes ] times ;
|
||||
|
||||
M: lookbehind nfa-node ( node -- )
|
||||
eps literal-transition add-simple-entry
|
||||
lookbehind-on add-traversal-flag
|
||||
term>> nfa-node
|
||||
eps literal-transition add-simple-entry
|
||||
lookbehind-off add-traversal-flag
|
||||
2 [ concatenate-nodes ] times ;
|
||||
|
||||
: construct-nfa ( regexp -- )
|
||||
[
|
||||
reset-regexp
|
||||
|
|
|
@ -230,8 +230,18 @@ ERROR: invalid-range a b ;
|
|||
[ [ nip at-most-n ] [ at-least-n ] if* ] if
|
||||
] [ drop 0 max exactly-n ] if ;
|
||||
|
||||
SINGLETON: beginning-of-input
|
||||
SINGLETON: end-of-input
|
||||
|
||||
! : beginning-of-input ( -- obj )
|
||||
: handle-front-anchor ( -- ) front-anchor push-stack ;
|
||||
: handle-back-anchor ( -- ) back-anchor push-stack ;
|
||||
: end-of-line ( -- obj )
|
||||
end-of-input
|
||||
CHAR: \r <constant>
|
||||
CHAR: \n <constant>
|
||||
2dup 2array <concatenation> 4array <alternation> lookahead boa ;
|
||||
|
||||
: handle-back-anchor ( -- ) end-of-line push-stack ;
|
||||
|
||||
ERROR: bad-character-class obj ;
|
||||
ERROR: expected-posix-class ;
|
||||
|
@ -277,6 +287,8 @@ ERROR: unrecognized-escape char ;
|
|||
read1
|
||||
{
|
||||
{ CHAR: \ [ CHAR: \ <constant> ] }
|
||||
{ CHAR: ^ [ CHAR: ^ <constant> ] }
|
||||
{ CHAR: $ [ CHAR: $ <constant> ] }
|
||||
{ CHAR: - [ CHAR: - <constant> ] }
|
||||
{ CHAR: { [ CHAR: { <constant> ] }
|
||||
{ CHAR: } [ CHAR: } <constant> ] }
|
||||
|
@ -289,7 +301,6 @@ ERROR: unrecognized-escape char ;
|
|||
{ CHAR: + [ CHAR: + <constant> ] }
|
||||
{ CHAR: ? [ CHAR: ? <constant> ] }
|
||||
{ CHAR: . [ CHAR: . <constant> ] }
|
||||
! xyzzy
|
||||
{ CHAR: : [ CHAR: : <constant> ] }
|
||||
{ CHAR: t [ CHAR: \t <constant> ] }
|
||||
{ CHAR: n [ CHAR: \n <constant> ] }
|
||||
|
@ -297,8 +308,6 @@ ERROR: unrecognized-escape char ;
|
|||
{ CHAR: f [ HEX: c <constant> ] }
|
||||
{ CHAR: a [ HEX: 7 <constant> ] }
|
||||
{ CHAR: e [ HEX: 1b <constant> ] }
|
||||
{ CHAR: $ [ CHAR: $ <constant> ] }
|
||||
{ CHAR: ^ [ CHAR: ^ <constant> ] }
|
||||
|
||||
{ CHAR: d [ digit-class ] }
|
||||
{ CHAR: D [ digit-class <negation> ] }
|
||||
|
@ -320,16 +329,16 @@ ERROR: unrecognized-escape char ;
|
|||
! { CHAR: G [ end of previous match ] }
|
||||
! { CHAR: Z [ handle-end-of-input ] }
|
||||
! { CHAR: z [ handle-end-of-input ] } ! except for terminator
|
||||
! xyzzy
|
||||
{ CHAR: 1 [ CHAR: 1 <constant> ] }
|
||||
{ CHAR: 2 [ CHAR: 2 <constant> ] }
|
||||
{ CHAR: 3 [ CHAR: 3 <constant> ] }
|
||||
{ CHAR: 4 [ CHAR: 4 <constant> ] }
|
||||
{ CHAR: 5 [ CHAR: 5 <constant> ] }
|
||||
{ CHAR: 6 [ CHAR: 6 <constant> ] }
|
||||
{ CHAR: 7 [ CHAR: 7 <constant> ] }
|
||||
{ CHAR: 8 [ CHAR: 8 <constant> ] }
|
||||
{ CHAR: 9 [ CHAR: 9 <constant> ] }
|
||||
|
||||
! { CHAR: 1 [ CHAR: 1 <constant> ] }
|
||||
! { CHAR: 2 [ CHAR: 2 <constant> ] }
|
||||
! { CHAR: 3 [ CHAR: 3 <constant> ] }
|
||||
! { CHAR: 4 [ CHAR: 4 <constant> ] }
|
||||
! { CHAR: 5 [ CHAR: 5 <constant> ] }
|
||||
! { CHAR: 6 [ CHAR: 6 <constant> ] }
|
||||
! { CHAR: 7 [ CHAR: 7 <constant> ] }
|
||||
! { CHAR: 8 [ CHAR: 8 <constant> ] }
|
||||
! { CHAR: 9 [ CHAR: 9 <constant> ] }
|
||||
|
||||
{ CHAR: Q [ parse-escaped-literals ] }
|
||||
[ unrecognized-escape ]
|
||||
|
@ -406,6 +415,10 @@ DEFER: handle-left-bracket
|
|||
|
||||
: parse-regexp-token ( token -- ? )
|
||||
{
|
||||
! todo: only match these at beginning/end of regexp
|
||||
{ CHAR: ^ [ handle-front-anchor t ] }
|
||||
{ CHAR: $ [ handle-back-anchor t ] }
|
||||
|
||||
{ CHAR: . [ handle-dot t ] }
|
||||
{ CHAR: ( [ handle-left-parenthesis t ] }
|
||||
{ CHAR: ) [ handle-right-parenthesis f ] }
|
||||
|
@ -415,8 +428,6 @@ DEFER: handle-left-bracket
|
|||
{ CHAR: + [ handle-plus t ] }
|
||||
{ CHAR: { [ handle-left-brace t ] }
|
||||
{ CHAR: [ [ handle-left-bracket t ] }
|
||||
{ CHAR: ^ [ handle-front-anchor t ] }
|
||||
{ CHAR: $ [ handle-back-anchor t ] }
|
||||
{ CHAR: \ [ handle-escape t ] }
|
||||
[ <constant> push-stack t ]
|
||||
} case ;
|
||||
|
|
|
@ -285,3 +285,17 @@ IN: regexp-tests
|
|||
! 2. (A)
|
||||
! 3. (B(C))
|
||||
! 4. (C)
|
||||
|
||||
! clear "a(?=b*)" <regexp> "ab" over match
|
||||
! clear "a(?=b*c)" <regexp> "abbbbbc" over match
|
||||
! clear "a(?=b*)" <regexp> "ab" over match
|
||||
|
||||
! clear "^a" <regexp> "a" over match
|
||||
! clear "^a" <regexp> "\na" over match
|
||||
! clear "^a" <regexp> "\r\na" over match
|
||||
! clear "^a" <regexp> "\ra" over match
|
||||
|
||||
! clear "a$" <regexp> "a" over match
|
||||
! clear "a$" <regexp> "a\n" over match
|
||||
! clear "a$" <regexp> "a\r" over match
|
||||
! clear "a$" <regexp> "a\r\n" over match
|
||||
|
|
|
@ -128,6 +128,8 @@ IN: regexp
|
|||
: option? ( option regexp -- ? )
|
||||
options>> key? ;
|
||||
|
||||
USE: multiline
|
||||
/*
|
||||
M: regexp pprint*
|
||||
[
|
||||
[
|
||||
|
@ -136,3 +138,4 @@ M: regexp pprint*
|
|||
case-insensitive swap option? [ "i" % ] when
|
||||
] "" make
|
||||
] keep present-text ;
|
||||
*/
|
||||
|
|
|
@ -8,9 +8,11 @@ IN: regexp.traversal
|
|||
TUPLE: dfa-traverser
|
||||
dfa-table
|
||||
traversal-flags
|
||||
traverse-forward
|
||||
capture-groups
|
||||
{ capture-group-index integer }
|
||||
lookahead-counters
|
||||
lookbehind-counters
|
||||
last-state current-state
|
||||
text
|
||||
start-index current-index
|
||||
|
@ -23,10 +25,12 @@ TUPLE: dfa-traverser
|
|||
swap [ start-state>> >>current-state ] keep
|
||||
>>dfa-table
|
||||
swap >>text
|
||||
t >>traverse-forward
|
||||
0 >>start-index
|
||||
0 >>current-index
|
||||
V{ } clone >>matches
|
||||
V{ } clone >>capture-groups
|
||||
V{ } clone >>lookbehind-counters
|
||||
V{ } clone >>lookahead-counters ;
|
||||
|
||||
: final-state? ( dfa-traverser -- ? )
|
||||
|
@ -52,8 +56,19 @@ M: lookahead-on flag-action ( dfa-traverser flag -- )
|
|||
|
||||
M: lookahead-off flag-action ( dfa-traverser flag -- )
|
||||
drop
|
||||
dup lookahead-counters>> pop
|
||||
'[ _ - ] change-current-index drop ;
|
||||
dup lookahead-counters>>
|
||||
[ drop ] [ pop '[ _ - ] change-current-index drop ] if-empty ;
|
||||
|
||||
M: lookbehind-on flag-action ( dfa-traverser flag -- )
|
||||
drop
|
||||
f >>traverse-forward
|
||||
lookbehind-counters>> 0 swap push ;
|
||||
|
||||
M: lookbehind-off flag-action ( dfa-traverser flag -- )
|
||||
drop
|
||||
t >>traverse-forward
|
||||
dup lookbehind-counters>>
|
||||
[ drop ] [ pop '[ _ + ] change-current-index drop ] if-empty ;
|
||||
|
||||
: process-flags ( dfa-traverser -- )
|
||||
[ [ 1+ ] map ] change-lookahead-counters
|
||||
|
@ -62,7 +77,10 @@ M: lookahead-off flag-action ( dfa-traverser flag -- )
|
|||
|
||||
: increment-state ( dfa-traverser state -- dfa-traverser )
|
||||
[
|
||||
[ 1+ ] change-current-index dup current-state>> >>last-state
|
||||
dup traverse-forward>>
|
||||
[ [ 1+ ] change-current-index ]
|
||||
[ [ 1- ] change-current-index ] if
|
||||
dup current-state>> >>last-state
|
||||
] dip
|
||||
first >>current-state ;
|
||||
|
||||
|
|
Loading…
Reference in New Issue