Reorganizing regexp matcher protocol
parent
af2d380a7f
commit
99a2b95a5b
|
@ -36,8 +36,7 @@ IN: regexp.compiler
|
||||||
|
|
||||||
: transitions>quot ( transitions final-state? -- quot )
|
: transitions>quot ( transitions final-state? -- quot )
|
||||||
[ split-literals suffix ] dip
|
[ split-literals suffix ] dip
|
||||||
'[ _ _ step ] ;
|
'[ { array-capacity string } declare _ _ step ] ;
|
||||||
! '[ { array-capacity string } declare _ _ step ] ;
|
|
||||||
|
|
||||||
: word>quot ( word dfa -- quot )
|
: word>quot ( word dfa -- quot )
|
||||||
[ transitions>> at ]
|
[ transitions>> at ]
|
||||||
|
@ -68,8 +67,11 @@ IN: regexp.compiler
|
||||||
: dfa>word ( dfa -- word )
|
: dfa>word ( dfa -- word )
|
||||||
states>words [ states>code ] keep start-state>> ;
|
states>words [ states>code ] keep start-state>> ;
|
||||||
|
|
||||||
: run-regexp ( string word -- ? )
|
: check-string ( string -- string )
|
||||||
[ f 0 ] 2dip execute ; inline
|
dup string? [ "String required" throw ] unless ;
|
||||||
|
|
||||||
|
: run-regexp ( start-index string word -- ? )
|
||||||
|
{ [ f ] [ >fixnum ] [ check-string ] [ execute ] } spread ; inline
|
||||||
|
|
||||||
: dfa>quotation ( dfa -- quot )
|
: dfa>quotation ( dfa -- quot )
|
||||||
dfa>word '[ _ run-regexp ] ;
|
dfa>word '[ _ run-regexp ] ;
|
||||||
|
@ -77,5 +79,5 @@ IN: regexp.compiler
|
||||||
TUPLE: quot-matcher quot ;
|
TUPLE: quot-matcher quot ;
|
||||||
C: <quot-matcher> quot-matcher
|
C: <quot-matcher> quot-matcher
|
||||||
|
|
||||||
M: quot-matcher match-index
|
M: quot-matcher match-index-from
|
||||||
quot>> call( string -- i/f ) ;
|
quot>> call( index string -- i/f ) ;
|
||||||
|
|
|
@ -1,61 +1,60 @@
|
||||||
! Copyright (C) 2008, 2009 Daniel Ehrenberg, Doug Coleman.
|
! Copyright (C) 2008, 2009 Daniel Ehrenberg, Doug Coleman.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: kernel sequences math splitting make fry ;
|
USING: kernel sequences math splitting make fry locals math.ranges
|
||||||
|
accessors arrays ;
|
||||||
IN: regexp.matchers
|
IN: regexp.matchers
|
||||||
|
|
||||||
! For now, a matcher is just something with a method to do the
|
! For now, a matcher is just something with a method to do the
|
||||||
! equivalent of match.
|
! equivalent of match.
|
||||||
|
|
||||||
! matcher protocol:
|
GENERIC: match-index-from ( i string matcher -- index/f )
|
||||||
GENERIC: match-index ( string matcher -- index/f )
|
|
||||||
|
|
||||||
: match ( string matcher -- slice/f )
|
: match-index-head ( string matcher -- index/f )
|
||||||
dupd match-index [ head-slice ] [ drop f ] if* ;
|
[ 0 ] 2dip match-index-from ;
|
||||||
|
|
||||||
|
: match-slice ( i string matcher -- slice/f )
|
||||||
|
[ 2dup ] dip match-index-from
|
||||||
|
[ swap <slice> ] [ 2drop f ] if* ;
|
||||||
|
|
||||||
: matches? ( string matcher -- ? )
|
: matches? ( string matcher -- ? )
|
||||||
dupd match-index
|
dupd match-index-head
|
||||||
[ swap length = ] [ drop f ] if* ;
|
[ swap length = ] [ drop f ] if* ;
|
||||||
|
|
||||||
: match-head ( string matcher -- end/f ) match [ length ] [ f ] if* ;
|
: map-find ( seq quot -- result elt )
|
||||||
|
[ f ] 2dip
|
||||||
|
'[ nip @ dup ] find
|
||||||
|
[ [ drop f ] unless ] dip ; inline
|
||||||
|
|
||||||
: match-at ( string m matcher -- n/f finished? )
|
:: match-from ( i string matcher -- slice/f )
|
||||||
[
|
i string length [a,b)
|
||||||
2dup swap length > [ 2drop f f ] [ tail-slice t ] if
|
[ string matcher match-slice ] map-find drop ;
|
||||||
] dip swap [ match-head f ] [ 2drop f t ] if ;
|
|
||||||
|
|
||||||
: match-range ( string m matcher -- a/f b/f )
|
: match-head ( str matcher -- slice/f )
|
||||||
3dup match-at over [
|
[ 0 ] 2dip match-from ;
|
||||||
drop nip rot drop dupd +
|
|
||||||
] [
|
|
||||||
[ 3drop drop f f ] [ drop [ 1+ ] dip match-range ] if
|
|
||||||
] if ;
|
|
||||||
|
|
||||||
: first-match ( string matcher -- slice/f )
|
: next-match ( i string matcher -- i match/f )
|
||||||
dupd 0 swap match-range rot over [ <slice> ] [ 3drop f ] if ;
|
match-from [ dup [ to>> ] when ] keep ;
|
||||||
|
|
||||||
: re-cut ( string matcher -- end/f start )
|
:: all-matches ( string matcher -- seq )
|
||||||
dupd first-match
|
0 [ dup ] [ string matcher next-match ] [ ] produce nip but-last ;
|
||||||
[ split1-slice swap ] [ "" like f swap ] if* ;
|
|
||||||
|
|
||||||
<PRIVATE
|
|
||||||
|
|
||||||
: (re-split) ( string matcher -- )
|
|
||||||
over [ [ re-cut , ] keep (re-split) ] [ 2drop ] if ;
|
|
||||||
|
|
||||||
PRIVATE>
|
|
||||||
|
|
||||||
: re-split ( string matcher -- seq )
|
|
||||||
[ (re-split) ] { } make ;
|
|
||||||
|
|
||||||
: re-replace ( string matcher replacement -- result )
|
|
||||||
[ re-split ] dip join ;
|
|
||||||
|
|
||||||
: next-match ( string matcher -- end/f match/f )
|
|
||||||
dupd first-match dup
|
|
||||||
[ [ split1-slice nip ] keep ] [ 2drop f f ] if ;
|
|
||||||
|
|
||||||
: all-matches ( string matcher -- seq )
|
|
||||||
[ dup ] swap '[ _ next-match ] [ ] produce nip harvest ;
|
|
||||||
|
|
||||||
: count-matches ( string matcher -- n )
|
: count-matches ( string matcher -- n )
|
||||||
all-matches length ;
|
all-matches length ;
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
|
:: split-slices ( string slices -- new-slices )
|
||||||
|
slices [ to>> ] map 0 prefix
|
||||||
|
slices [ from>> ] map string length suffix
|
||||||
|
[ string <slice> ] 2map ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
|
: re-split1 ( string matcher -- before after/f )
|
||||||
|
dupd match-head [ 1array split-slices first2 ] [ f ] if* ;
|
||||||
|
|
||||||
|
: re-split ( string matcher -- seq )
|
||||||
|
dupd all-matches split-slices ;
|
||||||
|
|
||||||
|
: re-replace ( string matcher replacement -- result )
|
||||||
|
[ re-split ] dip join ;
|
||||||
|
|
|
@ -208,8 +208,8 @@ IN: regexp-tests
|
||||||
[ f ] [ "aaaxb" "a+ab" <regexp> matches? ] unit-test
|
[ f ] [ "aaaxb" "a+ab" <regexp> matches? ] unit-test
|
||||||
[ t ] [ "aaacb" "a+cb" <regexp> matches? ] unit-test
|
[ t ] [ "aaacb" "a+cb" <regexp> matches? ] unit-test
|
||||||
|
|
||||||
[ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test
|
[ 3 ] [ "aaacb" "a*" <regexp> match-index-head ] unit-test
|
||||||
[ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test
|
[ 2 ] [ "aaacb" "aa?" <regexp> match-index-head ] unit-test
|
||||||
|
|
||||||
[ t ] [ "aaa" R/ AAA/i matches? ] unit-test
|
[ t ] [ "aaa" R/ AAA/i matches? ] unit-test
|
||||||
[ f ] [ "aax" R/ AAA/i matches? ] unit-test
|
[ f ] [ "aax" R/ AAA/i matches? ] unit-test
|
||||||
|
@ -238,11 +238,11 @@ IN: regexp-tests
|
||||||
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
|
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
|
||||||
[ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test
|
[ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test
|
||||||
|
|
||||||
[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
|
[ t ] [ "abc" reverse R/ abc/r matches? ] unit-test
|
||||||
[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
|
[ t ] [ "abc" reverse R/ a[bB][cC]/r matches? ] unit-test
|
||||||
|
|
||||||
[ t ] [ "xabc" R/ abc/ <reverse-matcher> match >boolean ] unit-test
|
[ t ] [ "xabc" R/ abc/ <reverse-matcher> match-index-head >boolean ] unit-test
|
||||||
[ t ] [ "xabc" R/ a[bB][cC]/ <reverse-matcher> match >boolean ] unit-test
|
[ t ] [ "xabc" R/ a[bB][cC]/ <reverse-matcher> match-index-head >boolean ] unit-test
|
||||||
|
|
||||||
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
||||||
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
||||||
|
@ -267,13 +267,13 @@ IN: regexp-tests
|
||||||
|
|
||||||
[ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test
|
[ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test
|
||||||
|
|
||||||
[ "ab" ] [ "ab" "(a|ab)(bc)?" <regexp> first-match >string ] unit-test
|
[ "ab" ] [ "ab" "(a|ab)(bc)?" <regexp> match-head >string ] unit-test
|
||||||
[ "abc" ] [ "abc" "(a|ab)(bc)?" <regexp> first-match >string ] unit-test
|
[ "abc" ] [ "abc" "(a|ab)(bc)?" <regexp> match-head >string ] unit-test
|
||||||
|
|
||||||
[ "ab" ] [ "ab" "(ab|a)(bc)?" <regexp> first-match >string ] unit-test
|
[ "ab" ] [ "ab" "(ab|a)(bc)?" <regexp> match-head >string ] unit-test
|
||||||
[ "abc" ] [ "abc" "(ab|a)(bc)?" <regexp> first-match >string ] unit-test
|
[ "abc" ] [ "abc" "(ab|a)(bc)?" <regexp> match-head >string ] unit-test
|
||||||
|
|
||||||
[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> first-match >string ] unit-test
|
[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> match-head >string ] unit-test
|
||||||
|
|
||||||
! [ t ] [ "a:b" ".+:?" <regexp> matches? ] unit-test
|
! [ t ] [ "a:b" ".+:?" <regexp> matches? ] unit-test
|
||||||
|
|
||||||
|
@ -304,16 +304,16 @@ IN: regexp-tests
|
||||||
[ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test
|
[ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test
|
||||||
|
|
||||||
/*
|
/*
|
||||||
[ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test
|
[ f ] [ "ab" "a(?!b)" <regexp> match-head ] unit-test
|
||||||
[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
|
[ "a" ] [ "ac" "a(?!b)" <regexp> match-head >string ] unit-test
|
||||||
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
||||||
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
||||||
[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test
|
[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> match-head >string ] unit-test
|
||||||
[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test
|
[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> match-head >string ] unit-test
|
||||||
[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test
|
[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> match-head >string ] unit-test
|
||||||
|
|
||||||
[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
|
[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-index-head ] unit-test
|
||||||
[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
|
[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-index-head ] unit-test
|
||||||
*/
|
*/
|
||||||
|
|
||||||
! Bug in parsing word
|
! Bug in parsing word
|
||||||
|
@ -393,15 +393,15 @@ IN: regexp-tests
|
||||||
! [ t ] [ "a\r" R/ a$/m matches? ] unit-test
|
! [ t ] [ "a\r" R/ a$/m matches? ] unit-test
|
||||||
! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test
|
! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test
|
||||||
|
|
||||||
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
|
! [ f ] [ "foobxr" "foo\\z" <regexp> match-index-head ] unit-test
|
||||||
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
|
! [ 3 ] [ "foo" "foo\\z" <regexp> match-index-head ] unit-test
|
||||||
|
|
||||||
! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
|
! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
|
||||||
! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
|
! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
|
||||||
! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
|
! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
|
||||||
! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
|
! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
|
||||||
|
|
||||||
! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test
|
! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-index-head ] unit-test
|
||||||
! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
|
! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
|
||||||
! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
|
! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
|
||||||
! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
|
! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
|
||||||
|
@ -409,18 +409,18 @@ IN: regexp-tests
|
||||||
! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
|
! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
|
||||||
|
|
||||||
! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test
|
! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test
|
||||||
! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-head ] unit-test
|
! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-index-head ] unit-test
|
||||||
! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test
|
! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test
|
||||||
! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
|
! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
|
||||||
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
|
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
|
||||||
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
|
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
|
||||||
|
|
||||||
! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
|
! [ 1 ] [ "aaacb" "a+?" <regexp> match-index-head ] unit-test
|
||||||
! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
|
! [ 1 ] [ "aaacb" "aa??" <regexp> match-index-head ] unit-test
|
||||||
! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
|
! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
|
||||||
! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
|
! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
|
||||||
! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
|
! [ 3 ] [ "aacb" "aa?c" <regexp> match-index-head ] unit-test
|
||||||
! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
|
! [ 3 ] [ "aacb" "aa??c" <regexp> match-index-head ] unit-test
|
||||||
|
|
||||||
! "ab" "a(?=b*)" <regexp> match
|
! "ab" "a(?=b*)" <regexp> match
|
||||||
! "abbbbbc" "a(?=b*c)" <regexp> match
|
! "abbbbbc" "a(?=b*c)" <regexp> match
|
||||||
|
|
|
@ -45,13 +45,13 @@ C: <reverse-matcher> reverse-matcher
|
||||||
: compile-reverse ( regexp -- regexp )
|
: compile-reverse ( regexp -- regexp )
|
||||||
dup '[ [ _ get-ast <reversed-option> ast>dfa ] unless* ] change-reverse-dfa ;
|
dup '[ [ _ get-ast <reversed-option> ast>dfa ] unless* ] change-reverse-dfa ;
|
||||||
|
|
||||||
M: regexp match-index ( string regexp -- index/f )
|
M: regexp match-index-from ( string regexp -- index/f )
|
||||||
dup dfa-quot>>
|
dup dfa-quot>>
|
||||||
[ <quot-matcher> ]
|
[ <quot-matcher> ]
|
||||||
[ compile-regexp dfa>> <dfa-matcher> ] ?if
|
[ compile-regexp dfa>> <dfa-matcher> ] ?if
|
||||||
match-index ;
|
match-index-from ;
|
||||||
|
|
||||||
M: reverse-matcher match-index ( string regexp -- index/f )
|
M: reverse-matcher match-index-from ( string regexp -- index/f )
|
||||||
[ <reversed> ] [ regexp>> compile-reverse reverse-dfa>> ] bi*
|
[ <reversed> ] [ regexp>> compile-reverse reverse-dfa>> ] bi*
|
||||||
<dfa-traverser> do-match match-index>> ;
|
<dfa-traverser> do-match match-index>> ;
|
||||||
|
|
||||||
|
|
|
@ -12,11 +12,11 @@ TUPLE: dfa-traverser
|
||||||
current-index
|
current-index
|
||||||
match-index ;
|
match-index ;
|
||||||
|
|
||||||
: <dfa-traverser> ( text dfa -- match )
|
: <dfa-traverser> ( start-index text dfa -- match )
|
||||||
dfa-traverser new
|
dfa-traverser new
|
||||||
swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
|
swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
|
||||||
swap >>text
|
swap >>text
|
||||||
0 >>current-index ;
|
swap >>current-index ;
|
||||||
|
|
||||||
: final-state? ( dfa-traverser -- ? )
|
: final-state? ( dfa-traverser -- ? )
|
||||||
[ current-state>> ]
|
[ current-state>> ]
|
||||||
|
@ -65,5 +65,5 @@ TUPLE: dfa-traverser
|
||||||
|
|
||||||
TUPLE: dfa-matcher dfa ;
|
TUPLE: dfa-matcher dfa ;
|
||||||
C: <dfa-matcher> dfa-matcher
|
C: <dfa-matcher> dfa-matcher
|
||||||
M: dfa-matcher match-index
|
M: dfa-matcher match-index-from
|
||||||
dfa>> <dfa-traverser> do-match match-index>> ;
|
dfa>> <dfa-traverser> do-match match-index>> ;
|
||||||
|
|
Loading…
Reference in New Issue