case-insensitive for constants

2008-08-21 17:12:26 -05:00 · 2008-08-21 17:12:26 -05:00 · 9eba6c0034
parent 88134e539f
commit 9eba6c0034
4 changed files with 41 additions and 20 deletions
--- a/extra/regexp2/backend/backend.factor
+++ b/extra/regexp2/backend/backend.factor
@ -7,10 +7,10 @@ TUPLE: regexp
    raw
    { stack vector }
    parse-tree
+    { options hashtable }
    nfa-table
    dfa-table
    minimized-table
-    case-insensitive
    { state integer }
    { new-states vector }
    { visited-states hashtable } ;
--- a/extra/regexp2/parser/parser.factor
+++ b/extra/regexp2/parser/parser.factor
@ -1,10 +1,10 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays assocs combinators io io.streams.string
-kernel math math.parser multi-methods namespaces qualified
+kernel math math.parser multi-methods namespaces qualified sets
 quotations sequences sequences.lib splitting symbols vectors
-dlists math.order combinators.lib unicode.categories
-sequences.lib regexp2.backend regexp2.utils ;
+dlists math.order combinators.lib unicode.categories strings
+sequences.lib regexp2.backend regexp2.utils unicode.case ;
 IN: regexp2.parser

 FROM: math.ranges => [a,b] ;
@ -48,12 +48,26 @@ SINGLETONS: beginning-of-group end-of-group
 beginning-of-character-class end-of-character-class
 left-parenthesis pipe caret dash ;

-: <constant> ( obj -- constant ) constant boa ;
+: get-option ( option -- ? ) current-regexp get options>> at ;
+: get-unix-lines ( -- ? ) unix-lines get-option ;
+: get-dotall ( -- ? ) dotall get-option ;
+: get-multiline ( -- ? ) multiline get-option ;
+: get-comments ( -- ? ) comments get-option ;
+: get-case-insensitive ( -- ? ) case-insensitive get-option ;
+: get-unicode-case ( -- ? ) unicode-case get-option ;
+
 : <negation> ( obj -- negation ) negation boa ;
 : <concatenation> ( seq -- concatenation ) >vector concatenation boa ;
 : <alternation> ( seq -- alternation ) >vector alternation boa ;
 : <capture-group> ( obj -- capture-group ) capture-group boa ;
 : <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
+: <constant> ( obj -- constant )
+    dup Letter? get-case-insensitive and [
+        [ ch>lower constant boa ]
+        [ ch>upper constant boa ] bi 2array <alternation>
+    ] [
+        constant boa
+    ] if ;

 : first|concatenation ( seq -- first/concatenation )
    dup length 1 = [ first ] [ <concatenation> ] if ;
@ -95,19 +109,20 @@ ERROR: bad-option ch ;
        { CHAR: x [ comments ] }
        [ bad-option ]
    } case ;
-    
-: option-on ( ch -- ) option \ option-on boa push-stack ;
-: option-off ( ch -- ) option \ option-off boa push-stack ;
-: toggle-option ( ch ? -- ) [ option-on ] [ option-off ] if ;
+
+: option-on ( option -- ) current-regexp get options>> conjoin ;
+: option-off ( option -- ) current-regexp get options>> delete-at ;
+
+: toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ;
 : (parse-options) ( string ? -- ) [ toggle-option ] curry each ;

 : parse-options ( string -- )
    "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;

 DEFER: (parse-regexp)
-: parse-special-group-options ( options -- )
+: parse-special-group ( -- )
    beginning-of-group push-stack
-    parse-options (parse-regexp) pop-stack make-non-capturing-group ;
+    (parse-regexp) pop-stack make-non-capturing-group ;

 ERROR: bad-special-group string ;

@ -126,8 +141,13 @@ ERROR: bad-special-group string ;
        { [ dup CHAR: < = peek1 CHAR: ! = and ]
            [ drop read1 drop nested-parse-regexp pop-stack make-negative-lookbehind ] }
        [
-            ":" read-until [ bad-special-group ] unless
-            swap prefix parse-special-group-options
+            ":)" read-until
+            [ swap prefix ] dip
+            {
+                { CHAR: : [ parse-options parse-special-group ] }
+                { CHAR: ) [ parse-options ] }
+                [ drop bad-special-group ]
+            } case
        ]
    } cond ;

@ -312,10 +332,8 @@ DEFER: handle-left-bracket
    beginning-of-character-class push-stack
    parse-character-class-first (parse-character-class) ;

-ERROR: empty-regexp ;
 : finish-regexp-parse ( stack -- obj )
    dup length {
-        { 0 [ empty-regexp ] }
        { 1 [ first ] }
        [
            drop { pipe } split
--- a/extra/regexp2/regexp2-tests.factor
+++ b/extra/regexp2/regexp2-tests.factor
@ -151,7 +151,7 @@ IN: regexp2-tests
 [ f ] [ "abc" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
 [ t ] [ "ABC" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test

-[ t ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
+[ f ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
 [ f ] [ "a" "\\Q\\E" <regexp> matches? ] unit-test
 [ t ] [ "|*+" "\\Q|*+\\E" <regexp> matches? ] unit-test
 [ f ] [ "abc" "\\Q|*+\\E" <regexp> matches? ] unit-test
--- a/extra/regexp2/regexp2.factor
+++ b/extra/regexp2/regexp2.factor
@ -1,7 +1,7 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors combinators kernel math math.ranges
-sequences regexp2.backend regexp2.utils memoize
+sequences regexp2.backend regexp2.utils memoize sets
 regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal
 regexp2.transition-tables ;
 IN: regexp2
@ -30,12 +30,15 @@ IN: regexp2

 : match-head ( string regexp -- end ) match length>> 1- ;

-MEMO: <regexp> ( string -- regexp )
+: initial-option ( regexp option -- regexp' )
+    over options>> conjoin ;
+
+: <regexp> ( string -- regexp )
    default-regexp construct-regexp ;

-MEMO: <iregexp> ( string -- regexp )
+: <iregexp> ( string -- regexp )
    default-regexp
-    t >>case-insensitive
+    case-insensitive initial-option
    construct-regexp ;

 : R! CHAR: ! <regexp> ; parsing