From 99a2b95a5b7ac189c7a7a2c90280ab33f66146aa Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Thu, 26 Feb 2009 22:14:41 -0600 Subject: [PATCH] Reorganizing regexp matcher protocol --- basis/regexp/compiler/compiler.factor | 14 +++-- basis/regexp/matchers/matchers.factor | 83 ++++++++++++------------- basis/regexp/regexp-tests.factor | 52 ++++++++-------- basis/regexp/regexp.factor | 6 +- basis/regexp/traversal/traversal.factor | 6 +- 5 files changed, 81 insertions(+), 80 deletions(-) diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor index fa3e67d1f9..7fda010351 100644 --- a/basis/regexp/compiler/compiler.factor +++ b/basis/regexp/compiler/compiler.factor @@ -36,8 +36,7 @@ IN: regexp.compiler : transitions>quot ( transitions final-state? -- quot ) [ split-literals suffix ] dip - '[ _ _ step ] ; - ! '[ { array-capacity string } declare _ _ step ] ; + '[ { array-capacity string } declare _ _ step ] ; : word>quot ( word dfa -- quot ) [ transitions>> at ] @@ -68,8 +67,11 @@ IN: regexp.compiler : dfa>word ( dfa -- word ) states>words [ states>code ] keep start-state>> ; -: run-regexp ( string word -- ? ) - [ f 0 ] 2dip execute ; inline +: check-string ( string -- string ) + dup string? [ "String required" throw ] unless ; + +: run-regexp ( start-index string word -- ? ) + { [ f ] [ >fixnum ] [ check-string ] [ execute ] } spread ; inline : dfa>quotation ( dfa -- quot ) dfa>word '[ _ run-regexp ] ; @@ -77,5 +79,5 @@ IN: regexp.compiler TUPLE: quot-matcher quot ; C: quot-matcher -M: quot-matcher match-index - quot>> call( string -- i/f ) ; +M: quot-matcher match-index-from + quot>> call( index string -- i/f ) ; diff --git a/basis/regexp/matchers/matchers.factor b/basis/regexp/matchers/matchers.factor index 7ac1edf58c..1c45dade71 100644 --- a/basis/regexp/matchers/matchers.factor +++ b/basis/regexp/matchers/matchers.factor @@ -1,61 +1,60 @@ ! Copyright (C) 2008, 2009 Daniel Ehrenberg, Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: kernel sequences math splitting make fry ; +USING: kernel sequences math splitting make fry locals math.ranges +accessors arrays ; IN: regexp.matchers ! For now, a matcher is just something with a method to do the ! equivalent of match. -! matcher protocol: -GENERIC: match-index ( string matcher -- index/f ) +GENERIC: match-index-from ( i string matcher -- index/f ) -: match ( string matcher -- slice/f ) - dupd match-index [ head-slice ] [ drop f ] if* ; +: match-index-head ( string matcher -- index/f ) + [ 0 ] 2dip match-index-from ; + +: match-slice ( i string matcher -- slice/f ) + [ 2dup ] dip match-index-from + [ swap ] [ 2drop f ] if* ; : matches? ( string matcher -- ? ) - dupd match-index + dupd match-index-head [ swap length = ] [ drop f ] if* ; -: match-head ( string matcher -- end/f ) match [ length ] [ f ] if* ; +: map-find ( seq quot -- result elt ) + [ f ] 2dip + '[ nip @ dup ] find + [ [ drop f ] unless ] dip ; inline -: match-at ( string m matcher -- n/f finished? ) - [ - 2dup swap length > [ 2drop f f ] [ tail-slice t ] if - ] dip swap [ match-head f ] [ 2drop f t ] if ; +:: match-from ( i string matcher -- slice/f ) + i string length [a,b) + [ string matcher match-slice ] map-find drop ; -: match-range ( string m matcher -- a/f b/f ) - 3dup match-at over [ - drop nip rot drop dupd + - ] [ - [ 3drop drop f f ] [ drop [ 1+ ] dip match-range ] if - ] if ; +: match-head ( str matcher -- slice/f ) + [ 0 ] 2dip match-from ; -: first-match ( string matcher -- slice/f ) - dupd 0 swap match-range rot over [ ] [ 3drop f ] if ; +: next-match ( i string matcher -- i match/f ) + match-from [ dup [ to>> ] when ] keep ; -: re-cut ( string matcher -- end/f start ) - dupd first-match - [ split1-slice swap ] [ "" like f swap ] if* ; - - - -: re-split ( string matcher -- seq ) - [ (re-split) ] { } make ; - -: re-replace ( string matcher replacement -- result ) - [ re-split ] dip join ; - -: next-match ( string matcher -- end/f match/f ) - dupd first-match dup - [ [ split1-slice nip ] keep ] [ 2drop f f ] if ; - -: all-matches ( string matcher -- seq ) - [ dup ] swap '[ _ next-match ] [ ] produce nip harvest ; +:: all-matches ( string matcher -- seq ) + 0 [ dup ] [ string matcher next-match ] [ ] produce nip but-last ; : count-matches ( string matcher -- n ) all-matches length ; + +> ] map 0 prefix + slices [ from>> ] map string length suffix + [ string ] 2map ; + +PRIVATE> + +: re-split1 ( string matcher -- before after/f ) + dupd match-head [ 1array split-slices first2 ] [ f ] if* ; + +: re-split ( string matcher -- seq ) + dupd all-matches split-slices ; + +: re-replace ( string matcher replacement -- result ) + [ re-split ] dip join ; diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index cbc582b295..f4382b5078 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -208,8 +208,8 @@ IN: regexp-tests [ f ] [ "aaaxb" "a+ab" matches? ] unit-test [ t ] [ "aaacb" "a+cb" matches? ] unit-test -[ 3 ] [ "aaacb" "a*" match-head ] unit-test -[ 2 ] [ "aaacb" "aa?" match-head ] unit-test +[ 3 ] [ "aaacb" "a*" match-index-head ] unit-test +[ 2 ] [ "aaacb" "aa?" match-index-head ] unit-test [ t ] [ "aaa" R/ AAA/i matches? ] unit-test [ f ] [ "aax" R/ AAA/i matches? ] unit-test @@ -238,11 +238,11 @@ IN: regexp-tests [ f ] [ "A" "\\p{Lower}" matches? ] unit-test [ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test -[ t ] [ "abc" R/ abc/r matches? ] unit-test -[ t ] [ "abc" R/ a[bB][cC]/r matches? ] unit-test +[ t ] [ "abc" reverse R/ abc/r matches? ] unit-test +[ t ] [ "abc" reverse R/ a[bB][cC]/r matches? ] unit-test -[ t ] [ "xabc" R/ abc/ match >boolean ] unit-test -[ t ] [ "xabc" R/ a[bB][cC]/ match >boolean ] unit-test +[ t ] [ "xabc" R/ abc/ match-index-head >boolean ] unit-test +[ t ] [ "xabc" R/ a[bB][cC]/ match-index-head >boolean ] unit-test [ t ] [ "s@f" "[a-z.-]@[a-z]" matches? ] unit-test [ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test @@ -267,13 +267,13 @@ IN: regexp-tests [ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test -[ "ab" ] [ "ab" "(a|ab)(bc)?" first-match >string ] unit-test -[ "abc" ] [ "abc" "(a|ab)(bc)?" first-match >string ] unit-test +[ "ab" ] [ "ab" "(a|ab)(bc)?" match-head >string ] unit-test +[ "abc" ] [ "abc" "(a|ab)(bc)?" match-head >string ] unit-test -[ "ab" ] [ "ab" "(ab|a)(bc)?" first-match >string ] unit-test -[ "abc" ] [ "abc" "(ab|a)(bc)?" first-match >string ] unit-test +[ "ab" ] [ "ab" "(ab|a)(bc)?" match-head >string ] unit-test +[ "abc" ] [ "abc" "(ab|a)(bc)?" match-head >string ] unit-test -[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" first-match >string ] unit-test +[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" match-head >string ] unit-test ! [ t ] [ "a:b" ".+:?" matches? ] unit-test @@ -304,16 +304,16 @@ IN: regexp-tests [ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test /* -[ f ] [ "ab" "a(?!b)" first-match ] unit-test -[ "a" ] [ "ac" "a(?!b)" first-match >string ] unit-test +[ f ] [ "ab" "a(?!b)" match-head ] unit-test +[ "a" ] [ "ac" "a(?!b)" match-head >string ] unit-test [ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test [ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test -[ "a" ] [ "ab" "a(?=b)(?=b)" first-match >string ] unit-test -[ "a" ] [ "ba" "a(?<=b)(?<=b)" first-match >string ] unit-test -[ "a" ] [ "cab" "a(?=b)(?<=c)" first-match >string ] unit-test +[ "a" ] [ "ab" "a(?=b)(?=b)" match-head >string ] unit-test +[ "a" ] [ "ba" "a(?<=b)(?<=b)" match-head >string ] unit-test +[ "a" ] [ "cab" "a(?=b)(?<=c)" match-head >string ] unit-test -[ 3 ] [ "foobar" "foo(?=bar)" match-head ] unit-test -[ f ] [ "foobxr" "foo(?=bar)" match-head ] unit-test +[ 3 ] [ "foobar" "foo(?=bar)" match-index-head ] unit-test +[ f ] [ "foobxr" "foo(?=bar)" match-index-head ] unit-test */ ! Bug in parsing word @@ -393,15 +393,15 @@ IN: regexp-tests ! [ t ] [ "a\r" R/ a$/m matches? ] unit-test ! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test -! [ f ] [ "foobxr" "foo\\z" match-head ] unit-test -! [ 3 ] [ "foo" "foo\\z" match-head ] unit-test +! [ f ] [ "foobxr" "foo\\z" match-index-head ] unit-test +! [ 3 ] [ "foo" "foo\\z" match-index-head ] unit-test ! [ t ] [ "foo" "\\bfoo\\b" matches? ] unit-test ! [ t ] [ "afoob" "\\Bfoo\\B" matches? ] unit-test ! [ t ] [ "afoob" "\\bfoo\\b" matches? ] unit-test ! [ f ] [ "foo" "\\Bfoo\\B" matches? ] unit-test -! [ 3 ] [ "foo bar" "foo\\b" match-head ] unit-test +! [ 3 ] [ "foo bar" "foo\\b" match-index-head ] unit-test ! [ f ] [ "fooxbar" "foo\\b" matches? ] unit-test ! [ t ] [ "foo" "foo\\b" matches? ] unit-test ! [ t ] [ "foo bar" "foo\\b bar" matches? ] unit-test @@ -409,18 +409,18 @@ IN: regexp-tests ! [ f ] [ "foo" "foo\\bbar" matches? ] unit-test ! [ f ] [ "foo bar" "foo\\B" matches? ] unit-test -! [ 3 ] [ "fooxbar" "foo\\B" match-head ] unit-test +! [ 3 ] [ "fooxbar" "foo\\B" match-index-head ] unit-test ! [ t ] [ "foo" "foo\\B" matches? ] unit-test ! [ f ] [ "foo bar" "foo\\B bar" matches? ] unit-test ! [ t ] [ "fooxbar" "foo\\Bxbar" matches? ] unit-test ! [ f ] [ "foo" "foo\\Bbar" matches? ] unit-test -! [ 1 ] [ "aaacb" "a+?" match-head ] unit-test -! [ 1 ] [ "aaacb" "aa??" match-head ] unit-test +! [ 1 ] [ "aaacb" "a+?" match-index-head ] unit-test +! [ 1 ] [ "aaacb" "aa??" match-index-head ] unit-test ! [ f ] [ "aaaab" "a++ab" matches? ] unit-test ! [ t ] [ "aaacb" "a++cb" matches? ] unit-test -! [ 3 ] [ "aacb" "aa?c" match-head ] unit-test -! [ 3 ] [ "aacb" "aa??c" match-head ] unit-test +! [ 3 ] [ "aacb" "aa?c" match-index-head ] unit-test +! [ 3 ] [ "aacb" "aa??c" match-index-head ] unit-test ! "ab" "a(?=b*)" match ! "abbbbbc" "a(?=b*c)" match diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 45660ad309..0502cb4d4b 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -45,13 +45,13 @@ C: reverse-matcher : compile-reverse ( regexp -- regexp ) dup '[ [ _ get-ast ast>dfa ] unless* ] change-reverse-dfa ; -M: regexp match-index ( string regexp -- index/f ) +M: regexp match-index-from ( string regexp -- index/f ) dup dfa-quot>> [ ] [ compile-regexp dfa>> ] ?if - match-index ; + match-index-from ; -M: reverse-matcher match-index ( string regexp -- index/f ) +M: reverse-matcher match-index-from ( string regexp -- index/f ) [ ] [ regexp>> compile-reverse reverse-dfa>> ] bi* do-match match-index>> ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index e215cde416..b890ca7e12 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -12,11 +12,11 @@ TUPLE: dfa-traverser current-index match-index ; -: ( text dfa -- match ) +: ( start-index text dfa -- match ) dfa-traverser new swap [ start-state>> >>current-state ] [ >>dfa-table ] bi swap >>text - 0 >>current-index ; + swap >>current-index ; : final-state? ( dfa-traverser -- ? ) [ current-state>> ] @@ -65,5 +65,5 @@ TUPLE: dfa-traverser TUPLE: dfa-matcher dfa ; C: dfa-matcher -M: dfa-matcher match-index +M: dfa-matcher match-index-from dfa>> do-match match-index>> ;