From 6c5f7615037b5220bc6156595c42dc9d9d239d63 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sat, 22 Nov 2008 21:09:31 -0600 Subject: [PATCH 1/5] re-enable a unit test --- basis/regexp/regexp-tests.factor | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 777d0985e4..0647c4b36f 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -253,7 +253,7 @@ IN: regexp-tests [ ] [ "(\\$[\\p{XDigit}]|[\\p{Digit}])" drop ] unit-test -! Comment +! Comment inside a regular expression [ t ] [ "ac" "a(?#boo)c" matches? ] unit-test [ ] [ "USING: regexp kernel ; R' -{3}[+]{1,6}(?:!!)?\\s' drop" eval ] unit-test @@ -291,6 +291,12 @@ IN: regexp-tests [ "a" ] [ "ba" "a(?<=b)(?<=b)" first-match >string ] unit-test [ "a" ] [ "cab" "a(?=b)(?<=c)" first-match >string ] unit-test +[ 3 ] [ "foobar" "foo(?=bar)" match-head ] unit-test +[ f ] [ "foobxr" "foo(?=bar)" match-head ] unit-test + +! Bug in parsing word +[ t ] [ "a" R' a' matches? ] unit-test + ! [ "{Lower}" ] [ invalid-range? ] must-fail-with ! [ 1 ] [ "aaacb" "a+?" match-head ] unit-test @@ -303,9 +309,6 @@ IN: regexp-tests ! [ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test ! [ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test -[ 3 ] [ "foobar" "foo(?=bar)" match-head ] unit-test -[ f ] [ "foobxr" "foo(?=bar)" match-head ] unit-test - ! [ f ] [ "foobxr" "foo\\z" match-head ] unit-test ! [ 3 ] [ "foo" "foo\\z" match-head ] unit-test @@ -323,10 +326,6 @@ IN: regexp-tests ! [ t ] [ "fooxbar" "foo\\Bxbar" matches? ] unit-test ! [ f ] [ "foo" "foo\\Bbar" matches? ] unit-test - -! Bug in parsing word -! [ t ] [ "a" R' a' matches? ] unit-test - ! clear "a(?=b*)" "ab" over match ! clear "a(?=b*c)" "abbbbbc" over match ! clear "a(?=b*)" "ab" over match @@ -356,7 +355,6 @@ IN: regexp-tests ! "a(?<=b)" "caba" over first-match - ! capture group 1: "aaaa" 2: "" ! "aaaa" "(a*)(a*)" match* ! "aaaa" "(a*)(a+)" match* From b00156bc85040f1f675e434a3d3c2535ee2c1f61 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sat, 22 Nov 2008 21:10:53 -0600 Subject: [PATCH 2/5] fix count-matches and add unit test for it --- basis/regexp/regexp-tests.factor | 6 ++++++ basis/regexp/regexp.factor | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 0647c4b36f..291287f8c2 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -283,6 +283,12 @@ IN: regexp-tests [ { "ABC" "DEF" "GHI" } ] [ "1ABC2DEF3GHI4" R/ [A-Z]+/ all-matches [ >string ] map ] unit-test +[ 3 ] +[ "1ABC2DEF3GHI4" R/ [A-Z]+/ count-matches ] unit-test + +[ 0 ] +[ "123" R/ [A-Z]+/ count-matches ] unit-test + [ "1.2.3.4" ] [ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 66bc39415b..652d943090 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -73,7 +73,7 @@ IN: regexp [ dup ] swap '[ _ next-match ] [ ] produce nip harvest ; : count-matches ( string regexp -- n ) - all-matches length 1- ; + all-matches length ; : initial-option ( regexp option -- regexp' ) over options>> conjoin ; From afc97627f9876b40848b89864048787e569256f6 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sat, 22 Nov 2008 23:01:24 -0600 Subject: [PATCH 3/5] remove >r r> from regexp --- basis/regexp/dfa/dfa.factor | 3 ++- basis/regexp/parser/parser.factor | 2 +- basis/regexp/transition-tables/transition-tables.factor | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor index ef985258fd..0abd1c2edc 100644 --- a/basis/regexp/dfa/dfa.factor +++ b/basis/regexp/dfa/dfa.factor @@ -43,7 +43,8 @@ IN: regexp.dfa dupd pop dup pick find-transitions rot [ [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep - >r swapd transition make-transition r> dfa-table>> add-transition + [ swapd transition make-transition ] dip + dfa-table>> add-transition ] curry with each new-transitions ] if-empty ; diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 7f1d92a1ab..1feba62f68 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -225,7 +225,7 @@ ERROR: invalid-range a b ; : handle-left-brace ( -- ) parse-repetition - >r 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ r> + [ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip [ 2dup and [ from-m-to-n ] [ [ nip at-most-n ] [ at-least-n ] if* ] if diff --git a/basis/regexp/transition-tables/transition-tables.factor b/basis/regexp/transition-tables/transition-tables.factor index 1c9a3e3001..3050be14fa 100644 --- a/basis/regexp/transition-tables/transition-tables.factor +++ b/basis/regexp/transition-tables/transition-tables.factor @@ -40,7 +40,7 @@ TUPLE: transition-table transitions start-state final-states ; 2dup [ to>> ] dip maybe-initialize-key [ [ to>> ] [ obj>> ] [ from>> ] tri ] dip 2dup at* [ 2nip insert-at ] - [ drop >r >r H{ } clone [ insert-at ] keep r> r> set-at ] if ; + [ drop [ H{ } clone [ insert-at ] keep ] 2dip set-at ] if ; : add-transition ( transition transition-table -- ) transitions>> set-transition ; From db3c21663eec4d1b30df504caf25a5b40ecb6440 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Mon, 24 Nov 2008 00:18:27 -0600 Subject: [PATCH 4/5] better literal syntax for regexps. support ^ and $ (not in multiline mode yet) but not yet in multiline mode. support for \A and \Z \z is next. removed some crud from the parser, and added more commented out unit tests... --- basis/regexp/classes/classes.factor | 10 ++ basis/regexp/nfa/nfa.factor | 19 +++- basis/regexp/parser/parser.factor | 80 ++++++++------ basis/regexp/regexp-tests.factor | 141 ++++++++++++++++-------- basis/regexp/regexp.factor | 103 ++++++++++------- basis/regexp/traversal/traversal.factor | 68 +++++++++--- 6 files changed, 283 insertions(+), 138 deletions(-) diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 7b729b2e50..f143bebdf7 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -7,6 +7,7 @@ IN: regexp.classes GENERIC: class-member? ( obj class -- ? ) M: word class-member? ( obj class -- ? ) 2drop f ; + M: integer class-member? ( obj class -- ? ) 2drop f ; M: character-class-range class-member? ( obj class -- ? ) @@ -60,3 +61,12 @@ M: java-blank-class class-member? ( obj class -- ? ) M: unmatchable-class class-member? ( obj class -- ? ) 2drop f ; + +M: terminator-class class-member? ( obj class -- ? ) + drop { + [ CHAR: \r = ] + [ CHAR: \n = ] + [ CHAR: \u000085 = ] + [ CHAR: \u002028 = ] + [ CHAR: \u002029 = ] + } 1|| ; diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 72d0fe970b..50847d6ff9 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -18,6 +18,9 @@ SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag +SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag +SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag +SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag : next-state ( regexp -- state ) [ state>> ] [ [ 1+ ] change-state drop ] bi ; @@ -135,7 +138,21 @@ M: non-capture-group nfa-node ( node -- ) M: reluctant-kleene-star nfa-node ( node -- ) term>> nfa-node ; -! + +: add-epsilon-flag ( flag -- ) + eps literal-transition add-simple-entry add-traversal-flag ; + +M: beginning-of-line nfa-node ( node -- ) + drop beginning-of-line add-epsilon-flag ; + +M: end-of-line nfa-node ( node -- ) + drop end-of-line add-epsilon-flag ; + +M: beginning-of-input nfa-node ( node -- ) + drop beginning-of-input add-epsilon-flag ; + +M: end-of-input nfa-node ( node -- ) + drop end-of-input add-epsilon-flag ; M: negation nfa-node ( node -- ) negation-mode inc diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 1feba62f68..ea8aaffcd5 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -4,7 +4,7 @@ USING: accessors arrays assocs combinators io io.streams.string kernel math math.parser namespaces qualified sets quotations sequences splitting symbols vectors math.order unicode.categories strings regexp.backend regexp.utils -unicode.case words ; +unicode.case words locals ; IN: regexp.parser FROM: math.ranges => [a,b] ; @@ -44,18 +44,21 @@ TUPLE: character-class-range from to ; INSTANCE: character-class-range node SINGLETON: epsilon INSTANCE: epsilon node SINGLETON: any-char INSTANCE: any-char node SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node -SINGLETON: front-anchor INSTANCE: front-anchor node -SINGLETON: back-anchor INSTANCE: back-anchor node +SINGLETON: beginning-of-input INSTANCE: beginning-of-input node +SINGLETON: end-of-input INSTANCE: end-of-input node +SINGLETON: beginning-of-line INSTANCE: beginning-of-line node +SINGLETON: end-of-line INSTANCE: end-of-line node TUPLE: option-on option ; INSTANCE: option-on node TUPLE: option-off option ; INSTANCE: option-off node -SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ; +SINGLETONS: unix-lines dotall multiline comments case-insensitive +unicode-case reversed-regexp ; SINGLETONS: letter-class LETTER-class Letter-class digit-class alpha-class non-newline-blank-class ascii-class punctuation-class java-printable-class blank-class control-character-class hex-digit-class java-blank-class c-identifier-class -unmatchable-class ; +terminator-class unmatchable-class word-boundary-class ; SINGLETONS: beginning-of-group end-of-group beginning-of-character-class end-of-character-class @@ -231,20 +234,6 @@ ERROR: invalid-range a b ; [ [ nip at-most-n ] [ at-least-n ] if* ] if ] [ drop 0 max exactly-n ] if ; -SINGLETON: beginning-of-input -SINGLETON: end-of-input - -: newlines ( -- obj1 obj2 obj3 ) - CHAR: \r - CHAR: \n - 2dup 2array ; - -: beginning-of-line ( -- obj ) - beginning-of-input newlines 4array lookbehind boa ; - -: end-of-line ( -- obj ) - end-of-input newlines 4array lookahead boa ; - : handle-front-anchor ( -- ) get-multiline beginning-of-line beginning-of-input ? push-stack ; @@ -281,13 +270,26 @@ ERROR: expected-posix-class ; : parse-control-character ( -- n ) read1 ; ERROR: bad-escaped-literals seq ; -: parse-escaped-literals ( -- obj ) - "\\E" read-until [ bad-escaped-literals ] unless + +: parse-til-E ( -- obj ) + "\\E" read-until [ bad-escaped-literals ] unless ; + +:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj ) + parse-til-E drop1 [ epsilon ] [ - [ ] V{ } map-as + [ quot call ] V{ } map-as first|concatenation - ] if-empty ; + ] if-empty ; inline + +: parse-escaped-literals ( -- obj ) + [ ] (parse-escaped-literals) ; + +: lower-case-literals ( -- obj ) + [ ch>lower ] (parse-escaped-literals) ; + +: upper-case-literals ( -- obj ) + [ ch>upper ] (parse-escaped-literals) ; : parse-escaped ( -- obj ) read1 @@ -299,12 +301,12 @@ ERROR: bad-escaped-literals seq ; { CHAR: a [ HEX: 7 ] } { CHAR: e [ HEX: 1b ] } - { CHAR: d [ digit-class ] } - { CHAR: D [ digit-class ] } - { CHAR: s [ java-blank-class ] } - { CHAR: S [ java-blank-class ] } { CHAR: w [ c-identifier-class ] } { CHAR: W [ c-identifier-class ] } + { CHAR: s [ java-blank-class ] } + { CHAR: S [ java-blank-class ] } + { CHAR: d [ digit-class ] } + { CHAR: D [ digit-class ] } { CHAR: p [ parse-posix-class ] } { CHAR: P [ parse-posix-class ] } @@ -313,13 +315,19 @@ ERROR: bad-escaped-literals seq ; { CHAR: 0 [ parse-octal ] } { CHAR: c [ parse-control-character ] } - ! { CHAR: b [ handle-word-boundary ] } - ! { CHAR: B [ handle-word-boundary ] } - ! { CHAR: A [ handle-beginning-of-input ] } - ! { CHAR: G [ end of previous match ] } - ! { CHAR: Z [ handle-end-of-input ] } - ! { CHAR: z [ handle-end-of-input ] } ! except for terminator + { CHAR: Q [ parse-escaped-literals ] } + ! { CHAR: b [ word-boundary-class ] } + ! { CHAR: B [ word-boundary-class ] } + ! { CHAR: A [ handle-beginning-of-input ] } + ! { CHAR: z [ handle-end-of-input ] } + + ! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator + + ! m//g mode + ! { CHAR: G [ end of previous match ] } + + ! Group capture ! { CHAR: 1 [ CHAR: 1 ] } ! { CHAR: 2 [ CHAR: 2 ] } ! { CHAR: 3 [ CHAR: 3 ] } @@ -330,7 +338,11 @@ ERROR: bad-escaped-literals seq ; ! { CHAR: 8 [ CHAR: 8 ] } ! { CHAR: 9 [ CHAR: 9 ] } - { CHAR: Q [ parse-escaped-literals ] } + ! Perl extensions + ! can't do \l and \u because \u is already a 4-hex + { CHAR: L [ lower-case-literals ] } + { CHAR: U [ upper-case-literals ] } + [ ] } case ; diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 291287f8c2..e4bab990a4 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -45,6 +45,7 @@ IN: regexp-tests ! Off by default. [ f ] [ "\n" "." matches? ] unit-test [ t ] [ "\n" "(?s)." matches? ] unit-test +[ t ] [ "\n" R/ ./s matches? ] unit-test [ f ] [ "\n\n" "(?s).(?-s)." matches? ] unit-test [ f ] [ "" ".+" matches? ] unit-test @@ -210,34 +211,34 @@ IN: regexp-tests [ 3 ] [ "aaacb" "a*" match-head ] unit-test [ 2 ] [ "aaacb" "aa?" match-head ] unit-test -[ t ] [ "aaa" "AAA" matches? ] unit-test -[ f ] [ "aax" "AAA" matches? ] unit-test -[ t ] [ "aaa" "A*" matches? ] unit-test -[ f ] [ "aaba" "A*" matches? ] unit-test -[ t ] [ "b" "[AB]" matches? ] unit-test -[ f ] [ "c" "[AB]" matches? ] unit-test -[ t ] [ "c" "[A-Z]" matches? ] unit-test -[ f ] [ "3" "[A-Z]" matches? ] unit-test +[ t ] [ "aaa" R/ AAA/i matches? ] unit-test +[ f ] [ "aax" R/ AAA/i matches? ] unit-test +[ t ] [ "aaa" R/ A*/i matches? ] unit-test +[ f ] [ "aaba" R/ A*/i matches? ] unit-test +[ t ] [ "b" R/ [AB]/i matches? ] unit-test +[ f ] [ "c" R/ [AB]/i matches? ] unit-test +[ t ] [ "c" R/ [A-Z]/i matches? ] unit-test +[ f ] [ "3" R/ [A-Z]/i matches? ] unit-test [ t ] [ "a" "(?i)a" matches? ] unit-test [ t ] [ "a" "(?i)a" matches? ] unit-test [ t ] [ "A" "(?i)a" matches? ] unit-test [ t ] [ "A" "(?i)a" matches? ] unit-test -[ t ] [ "a" "(?-i)a" matches? ] unit-test -[ t ] [ "a" "(?-i)a" matches? ] unit-test -[ f ] [ "A" "(?-i)a" matches? ] unit-test -[ f ] [ "A" "(?-i)a" matches? ] unit-test +[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test +[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test +[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test +[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test [ f ] [ "A" "[a-z]" matches? ] unit-test -[ t ] [ "A" "[a-z]" matches? ] unit-test +[ t ] [ "A" R/ [a-z]/i matches? ] unit-test [ f ] [ "A" "\\p{Lower}" matches? ] unit-test -[ t ] [ "A" "\\p{Lower}" matches? ] unit-test +[ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test -[ t ] [ "abc" "abc" matches? ] unit-test -[ t ] [ "abc" "a[bB][cC]" matches? ] unit-test -[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" matches? ] unit-test +[ t ] [ "abc" R/ abc/r matches? ] unit-test +[ t ] [ "abc" R/ a[bB][cC]/r matches? ] unit-test +[ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/r matches? ] unit-test [ t ] [ "s@f" "[a-z.-]@[a-z]" matches? ] unit-test [ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test @@ -293,6 +294,9 @@ IN: regexp-tests [ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test [ f ] [ "ab" "a(?!b)" first-match ] unit-test +[ "a" ] [ "ac" "a(?!b)" first-match >string ] unit-test +[ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test +[ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test [ "a" ] [ "ab" "a(?=b)(?=b)" first-match >string ] unit-test [ "a" ] [ "ba" "a(?<=b)(?<=b)" first-match >string ] unit-test [ "a" ] [ "cab" "a(?=b)(?<=c)" first-match >string ] unit-test @@ -305,19 +309,65 @@ IN: regexp-tests ! [ "{Lower}" ] [ invalid-range? ] must-fail-with -! [ 1 ] [ "aaacb" "a+?" match-head ] unit-test -! [ 1 ] [ "aaacb" "aa??" match-head ] unit-test -! [ f ] [ "aaaab" "a++ab" matches? ] unit-test -! [ t ] [ "aaacb" "a++cb" matches? ] unit-test -! [ 3 ] [ "aacb" "aa?c" match-head ] unit-test -! [ 3 ] [ "aacb" "aa??c" match-head ] unit-test +[ t ] [ "a" R/ ^a/ matches? ] unit-test +[ f ] [ "\na" R/ ^a/ matches? ] unit-test +[ f ] [ "\r\na" R/ ^a/ matches? ] unit-test +[ f ] [ "\ra" R/ ^a/ matches? ] unit-test -! [ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test -! [ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test +[ t ] [ "a" R/ a$/ matches? ] unit-test +[ f ] [ "a\n" R/ a$/ matches? ] unit-test +[ f ] [ "a\r" R/ a$/ matches? ] unit-test +[ f ] [ "a\r\n" R/ a$/ matches? ] unit-test + +! [ t ] [ "a" R/ \Aa/ matches? ] unit-test +! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test +! [ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test +! [ f ] [ "\ra" R/ \Aa/ matches? ] unit-test + +! [ t ] [ "a" R/ \Aa/m matches? ] unit-test +! [ f ] [ "\na" R/ \Aaa/m matches? ] unit-test +! [ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test +! [ f ] [ "\ra" R/ \Aa/m matches? ] unit-test + +! [ t ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test + +! [ t ] [ "a" R/ \Aa\z/m matches? ] unit-test +! [ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test + +! [ t ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test +! [ t ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test + +! [ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test +! [ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test +! [ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test +! [ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test + +! [ t ] [ "a" R/ ^a/m matches? ] unit-test +! [ t ] [ "\na" R/ ^a/m matches? ] unit-test +! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test +! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test + +! Convert to lowercase until E +[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test +[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test + +! Convert to uppercase until E +[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test +[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test + +! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test +! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test +! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test +! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test ! [ f ] [ "foobxr" "foo\\z" match-head ] unit-test ! [ 3 ] [ "foo" "foo\\z" match-head ] unit-test +! [ t ] [ "foo" "\\bfoo\\b" matches? ] unit-test +! [ t ] [ "afoob" "\\Bfoo\\B" matches? ] unit-test +! [ t ] [ "afoob" "\\bfoo\\b" matches? ] unit-test +! [ f ] [ "foo" "\\Bfoo\\B" matches? ] unit-test + ! [ 3 ] [ "foo bar" "foo\\b" match-head ] unit-test ! [ f ] [ "fooxbar" "foo\\b" matches? ] unit-test ! [ t ] [ "foo" "foo\\b" matches? ] unit-test @@ -332,34 +382,29 @@ IN: regexp-tests ! [ t ] [ "fooxbar" "foo\\Bxbar" matches? ] unit-test ! [ f ] [ "foo" "foo\\Bbar" matches? ] unit-test -! clear "a(?=b*)" "ab" over match -! clear "a(?=b*c)" "abbbbbc" over match -! clear "a(?=b*)" "ab" over match +! [ 1 ] [ "aaacb" "a+?" match-head ] unit-test +! [ 1 ] [ "aaacb" "aa??" match-head ] unit-test +! [ f ] [ "aaaab" "a++ab" matches? ] unit-test +! [ t ] [ "aaacb" "a++cb" matches? ] unit-test +! [ 3 ] [ "aacb" "aa?c" match-head ] unit-test +! [ 3 ] [ "aacb" "aa??c" match-head ] unit-test -! clear "^a" "a" over match -! clear "^a" "\na" over match -! clear "^a" "\r\na" over match -! clear "^a" "\ra" over match +! "ab" "a(?=b*)" match +! "abbbbbc" "a(?=b*c)" match +! "ab" "a(?=b*)" match -! clear "a$" "a" over match -! clear "a$" "a\n" over match -! clear "a$" "a\r" over match -! clear "a$" "a\r\n" over match +! "baz" "(az)(?<=b)" first-match +! "cbaz" "a(?<=b*)" first-match +! "baz" "a(?<=b)" first-match -! "(az)(?<=b)" "baz" over first-match -! "a(?<=b*)" "cbaz" over first-match -! "a(?<=b)" "baz" over first-match +! "baz" "a(? first-match +! "caz" "a(? first-match -! "a(? "baz" over first-match -! "a(? "caz" over first-match +! "abcdefg" "a(?=bcdefg)bcd" first-match +! "abcdefg" "a(?#bcdefg)bcd" first-match +! "abcdefg" "a(?:bcdefg)" first-match -! "a(?=bcdefg)bcd" "abcdefg" over first-match -! "a(?#bcdefg)bcd" "abcdefg" over first-match -! "a(?:bcdefg)" "abcdefg" over first-match - -[ "a" ] [ "ac" "a(?!b)" first-match >string ] unit-test - -! "a(?<=b)" "caba" over first-match +! "caba" "a(?<=b)" first-match ! capture group 1: "aaaa" 2: "" ! "aaaa" "(a*)(a*)" match* diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 652d943090..e61d5692f4 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -1,9 +1,10 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors combinators kernel math sequences +USING: accessors combinators kernel math sequences strings sets assocs prettyprint.backend make lexer namespaces parser arrays fry regexp.backend regexp.utils regexp.parser regexp.nfa -regexp.dfa regexp.traversal regexp.transition-tables splitting ; +regexp.dfa regexp.traversal regexp.transition-tables splitting +sorting ; IN: regexp : default-regexp ( string -- regexp ) @@ -75,40 +76,7 @@ IN: regexp : count-matches ( string regexp -- n ) all-matches length ; -: initial-option ( regexp option -- regexp' ) - over options>> conjoin ; - -: ( string -- regexp ) - default-regexp construct-regexp ; - -: ( string -- regexp ) - default-regexp - case-insensitive initial-option - construct-regexp ; - -: ( string -- regexp ) - default-regexp - reversed-regexp initial-option - construct-regexp ; - -: parsing-regexp ( accum end -- accum ) - lexer get dup skip-blank - [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column - lexer get dup still-parsing-line? - [ (parse-token) ] [ drop f ] if - "i" = [ ] [ ] if parsed ; - -: R! CHAR: ! parsing-regexp ; parsing -: R" CHAR: " parsing-regexp ; parsing -: R# CHAR: # parsing-regexp ; parsing -: R' CHAR: ' parsing-regexp ; parsing -: R( CHAR: ) parsing-regexp ; parsing -: R/ CHAR: / parsing-regexp ; parsing -: R@ CHAR: @ parsing-regexp ; parsing -: R[ CHAR: ] parsing-regexp ; parsing -: R` CHAR: ` parsing-regexp ; parsing -: R{ CHAR: } parsing-regexp ; parsing -: R| CHAR: | parsing-regexp ; parsing +> key? ; +ERROR: unknown-regexp-option option ; + +: option>ch ( option -- string ) + { + { case-insensitive [ CHAR: i ] } + { multiline [ CHAR: m ] } + { reversed-regexp [ CHAR: r ] } + { dotall [ CHAR: s ] } + [ unknown-regexp-option ] + } case ; + +: ch>option ( ch -- option ) + { + { CHAR: i [ case-insensitive ] } + { CHAR: m [ multiline ] } + { CHAR: r [ reversed-regexp ] } + { CHAR: s [ dotall ] } + [ unknown-regexp-option ] + } case ; + +: string>options ( string -- options ) + [ ch>option dup ] H{ } map>assoc ; + +: options>string ( options -- string ) + keys [ option>ch ] map natural-sort >string ; + +PRIVATE> + +: ( string option-string -- regexp ) + [ default-regexp ] [ string>options ] bi* >>options + construct-regexp ; + +: ( string -- regexp ) "" ; + + parsed ; + +PRIVATE> + +: R! CHAR: ! parsing-regexp ; parsing +: R" CHAR: " parsing-regexp ; parsing +: R# CHAR: # parsing-regexp ; parsing +: R' CHAR: ' parsing-regexp ; parsing +: R( CHAR: ) parsing-regexp ; parsing +: R/ CHAR: / parsing-regexp ; parsing +: R@ CHAR: @ parsing-regexp ; parsing +: R[ CHAR: ] parsing-regexp ; parsing +: R` CHAR: ` parsing-regexp ; parsing +: R{ CHAR: } parsing-regexp ; parsing +: R| CHAR: | parsing-regexp ; parsing M: regexp pprint* [ [ - dup raw>> - dup find-regexp-syntax swap % swap % % - case-insensitive swap option? [ "i" % ] when + [ raw>> dup find-regexp-syntax swap % swap % % ] + [ options>> options>string % ] bi ] "" make ] keep present-text ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index 86d315ee2f..c880c11c53 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -17,6 +17,7 @@ TUPLE: dfa-traverser capture-group-index last-state current-state text + match-failed? start-index current-index matches ; @@ -37,14 +38,20 @@ TUPLE: dfa-traverser H{ } clone >>captured-groups ; : final-state? ( dfa-traverser -- ? ) - [ current-state>> ] [ dfa-table>> final-states>> ] bi - key? ; + [ current-state>> ] + [ dfa-table>> final-states>> ] bi key? ; + +: beginning-of-text? ( dfa-traverser -- ? ) + current-index>> 0 <= ; inline + +: end-of-text? ( dfa-traverser -- ? ) + [ current-index>> ] [ text>> length ] bi >= ; inline : text-finished? ( dfa-traverser -- ? ) { [ current-state>> empty? ] - [ [ current-index>> ] [ text>> length ] bi >= ] - ! [ current-index>> 0 < ] + [ end-of-text? ] + [ match-failed?>> ] } 1|| ; : save-final-state ( dfa-straverser -- ) @@ -55,8 +62,47 @@ TUPLE: dfa-traverser dup save-final-state ] when text-finished? ; +: previous-text-character ( dfa-traverser -- ch ) + [ text>> ] [ current-index>> 1- ] bi nth ; + +: current-text-character ( dfa-traverser -- ch ) + [ text>> ] [ current-index>> ] bi nth ; + +: next-text-character ( dfa-traverser -- ch ) + [ text>> ] [ current-index>> 1+ ] bi nth ; + GENERIC: flag-action ( dfa-traverser flag -- ) + +M: beginning-of-input flag-action ( dfa-traverser flag -- ) + drop + dup beginning-of-text? [ t >>match-failed? ] unless drop ; + +M: end-of-input flag-action ( dfa-traverser flag -- ) + drop + dup end-of-text? [ t >>match-failed? ] unless drop ; + +M: beginning-of-line flag-action ( dfa-traverser flag -- ) + drop + dup { + [ beginning-of-text? ] + [ previous-text-character terminator-class class-member? ] + } 1|| [ t >>match-failed? ] unless drop ; + +M: end-of-line flag-action ( dfa-traverser flag -- ) + drop + dup { + [ end-of-text? ] + [ next-text-character terminator-class class-member? ] + } 1|| [ t >>match-failed? ] unless drop ; + +M: word-boundary flag-action ( dfa-traverser flag -- ) + drop + dup { + [ end-of-text? ] + [ current-text-character terminator-class class-member? ] + } 1|| [ t >>match-failed? ] unless drop ; + M: lookahead-on flag-action ( dfa-traverser flag -- ) drop lookahead-counters>> 0 swap push ; @@ -110,11 +156,7 @@ M: capture-group-off flag-action ( dfa-traverser flag -- ) [ [ 1+ ] change-current-index ] [ [ 1- ] change-current-index ] if dup current-state>> >>last-state - ] dip - first >>current-state ; - -: match-failed ( dfa-traverser -- dfa-traverser ) - V{ } clone >>matches ; + ] [ first ] bi* >>current-state ; : match-literal ( transition from-state table -- to-state/f ) transitions>> at at ; @@ -131,11 +173,9 @@ M: capture-group-off flag-action ( dfa-traverser flag -- ) { [ match-literal ] [ match-class ] [ match-default ] } 3|| ; : setup-match ( match -- obj state dfa-table ) - { - [ current-index>> ] [ text>> ] - [ current-state>> ] [ dfa-table>> ] - } cleave - [ nth ] 2dip ; + [ [ current-index>> ] [ text>> ] bi nth ] + [ current-state>> ] + [ dfa-table>> ] tri ; : do-match ( dfa-traverser -- dfa-traverser ) dup process-flags From 7c42a9ce6b5f9c6d8e06ef875efcd88efb575f56 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Mon, 24 Nov 2008 00:20:38 -0600 Subject: [PATCH 5/5] improved regexp literals caught some typos --- extra/benchmark/regex-dna/regex-dna.factor | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/extra/benchmark/regex-dna/regex-dna.factor b/extra/benchmark/regex-dna/regex-dna.factor index 0c21de0363..8c0aee596d 100644 --- a/extra/benchmark/regex-dna/regex-dna.factor +++ b/extra/benchmark/regex-dna/regex-dna.factor @@ -11,14 +11,14 @@ IN: benchmark.regex-dna : count-patterns ( string -- ) { - R/ agggtaaa|tttaccct/i, - R/ [cgt]gggtaaa|tttaccc[acg]/i, - R/ a[act]ggtaaa|tttacc[agt]t/i, - R/ ag[act]gtaaa|tttac[agt]ct/i, - R/ agg[act]taaa|ttta[agt]cct/i, - R/ aggg[acg]aaa|ttt[cgt]ccct/i, - R/ agggt[cgt]aa|tt[acg]accct/i, - R/ agggta[cgt]a|t[acg]taccct/i, + R/ agggtaaa|tttaccct/i + R/ [cgt]gggtaaa|tttaccc[acg]/i + R/ a[act]ggtaaa|tttacc[agt]t/i + R/ ag[act]gtaaa|tttac[agt]ct/i + R/ agg[act]taaa|ttta[agt]cct/i + R/ aggg[acg]aaa|ttt[cgt]ccct/i + R/ agggt[cgt]aa|tt[acg]accct/i + R/ agggta[cgt]a|t[acg]taccct/i R/ agggtaa[cgt]|[acg]ttaccct/i } [ [ raw>> write bl ]