case insensitive works

2008-08-21 17:55:25 -05:00 · 2008-08-21 17:55:25 -05:00 · 7d0d2da318
parent 9eba6c0034
commit 7d0d2da318
4 changed files with 50 additions and 16 deletions
--- a/extra/regexp2/backend/backend.factor
+++ b/extra/regexp2/backend/backend.factor
@ -19,6 +19,7 @@ TUPLE: regexp
    0 >>state
    V{ } clone >>stack
    V{ } clone >>new-states
+    H{ } clone >>options
    H{ } clone >>visited-states ;

 SYMBOL: current-regexp
--- a/extra/regexp2/classes/classes.factor
+++ b/extra/regexp2/classes/classes.factor
@ -21,6 +21,9 @@ M: letter-class class-member? ( obj class -- ? )
 M: LETTER-class class-member? ( obj class -- ? )
    drop LETTER? ;

+M: Letter-class class-member? ( obj class -- ? )
+    drop Letter? ;
+
 M: ascii-class class-member? ( obj class -- ? )
    drop ascii? ;

--- a/extra/regexp2/parser/parser.factor
+++ b/extra/regexp2/parser/parser.factor
@ -31,18 +31,12 @@ SINGLETON: back-anchor INSTANCE: back-anchor node
 TUPLE: option-on option ; INSTANCE: option-on node
 TUPLE: option-off option ; INSTANCE: option-off node
 SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case ;
-MIXIN: regexp-option
-INSTANCE: unix-lines regexp-option
-INSTANCE: dotall regexp-option
-INSTANCE: multiline regexp-option
-INSTANCE: comments regexp-option
-INSTANCE: case-insensitive regexp-option
-INSTANCE: unicode-case regexp-option

 SINGLETONS: letter-class LETTER-class Letter-class digit-class
 alpha-class non-newline-blank-class
 ascii-class punctuation-class java-printable-class blank-class
-control-character-class hex-digit-class java-blank-class c-identifier-class ;
+control-character-class hex-digit-class java-blank-class c-identifier-class
+unmatchable-class ;

 SINGLETONS: beginning-of-group end-of-group
 beginning-of-character-class end-of-character-class
@ -75,6 +69,17 @@ left-parenthesis pipe caret dash ;
 : first|alternation ( seq -- first/alternation )
    dup length 1 = [ first ] [ <alternation> ] if ;

+: <character-class-range> ( from to -- obj )
+    2dup [ Letter? ] bi@ or get-case-insensitive and [ ! a bound is a letter and case-insensitivity is on
+        [ [ ch>lower ] bi@ character-class-range boa ]
+        [ [ ch>upper ] bi@ character-class-range boa ] 2bi ! one lower-cased range, one upper-cased range
+        2array [ [ from>> ] [ to>> ] bi < ] filter ! keep only the ranges whose from is below their to
+        [ unmatchable-class ] [ first|alternation ] if-empty
+    ] [
+        2dup < ! compare the raw bounds; no range tuple exists yet, so slot accessors cannot be used here
+        [ character-class-range boa ] [ 2drop unmatchable-class ] if
+    ] if ;
+
 ERROR: unmatched-parentheses ;

 : make-positive-lookahead ( string -- )
@ -213,10 +218,10 @@ ERROR: expected-posix-class ;
    read1 CHAR: { = [ expected-posix-class ] unless
    "}" read-until [ bad-character-class ] unless
    {
-        { "Lower" [ letter-class ] }
-        { "Upper" [ LETTER-class ] }
-        { "ASCII" [ ascii-class ] }
+        { "Lower" [ get-case-insensitive Letter-class letter-class ? ] }
+        { "Upper" [ get-case-insensitive Letter-class LETTER-class ? ] }
        { "Alpha" [ Letter-class ] }
+        { "ASCII" [ ascii-class ] }
        { "Digit" [ digit-class ] }
        { "Alnum" [ alpha-class ] }
        { "Punct" [ punctuation-class ] }
@ -270,6 +275,13 @@ ERROR: bad-escaped-literals seq ;
        { CHAR: 0 [ parse-octal <constant> ] }
        { CHAR: c [ parse-control-character ] }

+        ! { CHAR: b [ handle-word-boundary ] }
+        ! { CHAR: B [ handle-word-boundary <negation> ] }
+        ! { CHAR: A [ handle-beginning-of-input ] }
+        ! { CHAR: G [ end of previous match ] }
+        ! { CHAR: Z [ handle-end-of-input ] } ! except for terminator
+        ! { CHAR: z [ handle-end-of-input ] }
+
        { CHAR: Q [ parse-escaped-literals ] }
    } case ;

@ -293,7 +305,7 @@ ERROR: bad-escaped-literals seq ;
    handle-dash handle-caret ;

 : apply-dash ( -- )
-    stack [ pop3 nip character-class-range boa ] keep push ;
+    stack [ pop3 nip <character-class-range> ] keep push ;

 : apply-dash? ( -- ? )
    stack dup length 3 >=
--- a/extra/regexp2/regexp2-tests.factor
+++ b/extra/regexp2/regexp2-tests.factor
@ -1,4 +1,4 @@
-USING: regexp2 tools.test kernel regexp2.traversal ;
+USING: regexp2 tools.test kernel regexp2.parser regexp2.traversal ;
 IN: regexp2-tests

 [ f ] [ "b" "a*" <regexp> matches? ] unit-test
@ -203,6 +203,8 @@ IN: regexp2-tests
    <regexp> drop
 ] unit-test

+[ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
+
 [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
 [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test

@ -226,9 +228,25 @@ IN: regexp2-tests
 ! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
 ! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test

-! [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
-! [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
-! [ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
+[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
+[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
+[ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
+
+[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
+[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
+[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
+[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
+
+[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
+[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
+[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
+[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
+
+[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
+[ t ] [ "A" "[a-z]" <iregexp> matches? ] unit-test
+
+[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
+[ t ] [ "A" "\\p{Lower}" <iregexp> matches? ] unit-test

 ! Bug in parsing word
 ! [ t ] [ "a" R' a' matches?  ] unit-test