From 7d0d2da318c043733e4466b728765e31018a9790 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Thu, 21 Aug 2008 17:55:25 -0500 Subject: [PATCH] case insensitive works --- extra/regexp2/backend/backend.factor | 1 + extra/regexp2/classes/classes.factor | 3 +++ extra/regexp2/parser/parser.factor | 36 ++++++++++++++++++---------- extra/regexp2/regexp2-tests.factor | 26 ++++++++++++++++---- 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/extra/regexp2/backend/backend.factor b/extra/regexp2/backend/backend.factor index 3d4b08d3c0..c39d67e7b8 100644 --- a/extra/regexp2/backend/backend.factor +++ b/extra/regexp2/backend/backend.factor @@ -19,6 +19,7 @@ TUPLE: regexp 0 >>state V{ } clone >>stack V{ } clone >>new-states + H{ } clone >>options H{ } clone >>visited-states ; SYMBOL: current-regexp diff --git a/extra/regexp2/classes/classes.factor b/extra/regexp2/classes/classes.factor index 0862f9cb63..f7907da7d5 100644 --- a/extra/regexp2/classes/classes.factor +++ b/extra/regexp2/classes/classes.factor @@ -21,6 +21,9 @@ M: letter-class class-member? ( obj class -- ? ) M: LETTER-class class-member? ( obj class -- ? ) drop LETTER? ; +M: Letter-class class-member? ( obj class -- ? ) + drop Letter? ; + M: ascii-class class-member? ( obj class -- ? ) drop ascii? ; diff --git a/extra/regexp2/parser/parser.factor b/extra/regexp2/parser/parser.factor index e173b71fc1..fef38cc887 100644 --- a/extra/regexp2/parser/parser.factor +++ b/extra/regexp2/parser/parser.factor @@ -31,18 +31,12 @@ SINGLETON: back-anchor INSTANCE: back-anchor node TUPLE: option-on option ; INSTANCE: option-on node TUPLE: option-off option ; INSTANCE: option-off node SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case ; -MIXIN: regexp-option -INSTANCE: unix-lines regexp-option -INSTANCE: dotall regexp-option -INSTANCE: multiline regexp-option -INSTANCE: comments regexp-option -INSTANCE: case-insensitive regexp-option -INSTANCE: unicode-case regexp-option SINGLETONS: letter-class LETTER-class Letter-class digit-class alpha-class non-newline-blank-class ascii-class punctuation-class java-printable-class blank-class -control-character-class hex-digit-class java-blank-class c-identifier-class ; +control-character-class hex-digit-class java-blank-class c-identifier-class +unmatchable-class ; SINGLETONS: beginning-of-group end-of-group beginning-of-character-class end-of-character-class @@ -75,6 +69,17 @@ left-parenthesis pipe caret dash ; : first|alternation ( seq -- first/alternation ) dup length 1 = [ first ] [ ] if ; +: ( from to -- obj ) + 2dup [ Letter? ] bi@ or get-case-insensitive and [ + [ [ ch>lower ] bi@ character-class-range boa ] + [ [ ch>upper ] bi@ character-class-range boa ] 2bi + 2array [ [ from>> ] [ to>> ] bi < ] filter + [ unmatchable-class ] [ first|alternation ] if-empty + ] [ + dup [ from>> ] [ to>> ] bi < + [ character-class-range boa ] [ 2drop unmatchable-class ] if + ] if ; + ERROR: unmatched-parentheses ; : make-positive-lookahead ( string -- ) @@ -213,10 +218,10 @@ ERROR: expected-posix-class ; read1 CHAR: { = [ expected-posix-class ] unless "}" read-until [ bad-character-class ] unless { - { "Lower" [ letter-class ] } - { "Upper" [ LETTER-class ] } - { "ASCII" [ ascii-class ] } + { "Lower" [ get-case-insensitive Letter-class letter-class ? ] } + { "Upper" [ get-case-insensitive Letter-class LETTER-class ? ] } { "Alpha" [ Letter-class ] } + { "ASCII" [ ascii-class ] } { "Digit" [ digit-class ] } { "Alnum" [ alpha-class ] } { "Punct" [ punctuation-class ] } @@ -270,6 +275,13 @@ ERROR: bad-escaped-literals seq ; { CHAR: 0 [ parse-octal ] } { CHAR: c [ parse-control-character ] } + ! { CHAR: b [ handle-word-boundary ] } + ! { CHAR: B [ handle-word-boundary ] } + ! { CHAR: A [ handle-beginning-of-input ] } + ! { CHAR: G [ end of previous match ] } + ! { CHAR: Z [ handle-end-of-input ] } + ! { CHAR: z [ handle-end-of-input ] } ! except for terminator + { CHAR: Q [ parse-escaped-literals ] } } case ; @@ -293,7 +305,7 @@ ERROR: bad-escaped-literals seq ; handle-dash handle-caret ; : apply-dash ( -- ) - stack [ pop3 nip character-class-range boa ] keep push ; + stack [ pop3 nip ] keep push ; : apply-dash? ( -- ? ) stack dup length 3 >= diff --git a/extra/regexp2/regexp2-tests.factor b/extra/regexp2/regexp2-tests.factor index dd6055117e..5e1171a28c 100644 --- a/extra/regexp2/regexp2-tests.factor +++ b/extra/regexp2/regexp2-tests.factor @@ -1,4 +1,4 @@ -USING: regexp2 tools.test kernel regexp2.traversal ; +USING: regexp2 tools.test kernel regexp2.parser regexp2.traversal ; IN: regexp2-tests [ f ] [ "b" "a*" matches? ] unit-test @@ -203,6 +203,8 @@ IN: regexp2-tests drop ] unit-test +[ "{Lower}" ] [ invalid-range? ] must-fail-with + [ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test [ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test @@ -226,9 +228,25 @@ IN: regexp2-tests ! [ t ] [ "fooxbar" "foo\\Bxbar" matches? ] unit-test ! [ f ] [ "foo" "foo\\Bbar" matches? ] unit-test -! [ t ] [ "s@f" "[a-z.-]@[a-z]" matches? ] unit-test -! [ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test -! [ t ] [ ".o" "\\.[a-z]" matches? ] unit-test +[ t ] [ "s@f" "[a-z.-]@[a-z]" matches? ] unit-test +[ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test +[ t ] [ ".o" "\\.[a-z]" matches? ] unit-test + +[ t ] [ "a" "(?i)a" matches? ] unit-test +[ t ] [ "a" "(?i)a" matches? ] unit-test +[ t ] [ "A" "(?i)a" matches? ] unit-test +[ t ] [ "A" "(?i)a" matches? ] unit-test + +[ t ] [ "a" "(?-i)a" matches? ] unit-test +[ t ] [ "a" "(?-i)a" matches? ] unit-test +[ f ] [ "A" "(?-i)a" matches? ] unit-test +[ f ] [ "A" "(?-i)a" matches? ] unit-test + +[ f ] [ "A" "[a-z]" matches? ] unit-test +[ t ] [ "A" "[a-z]" matches? ] unit-test + +[ f ] [ "A" "\\p{Lower}" matches? ] unit-test +[ t ] [ "A" "\\p{Lower}" matches? ] unit-test ! Bug in parsing word ! [ t ] [ "a" R' a' matches? ] unit-test