working on character classes

still have to do negation, &&, and :foo:
2008-05-21 00:30:22 -05:00 · 2008-05-21 00:30:22 -05:00 · 9307bb3b4a
parent 3f315e8b4a
commit 9307bb3b4a
2 changed files with 85 additions and 64 deletions
--- a/extra/regexp4/regexp4-tests.factor
+++ b/extra/regexp4/regexp4-tests.factor
@ -77,58 +77,58 @@ IN: regexp4-tests
 [ t ] [ "aaa" "a{1,3}" <regexp> matches? ] unit-test
 [ f ] [ "aaaa" "a{1,3}" <regexp> matches? ] unit-test

-! [ f ] [ "" "[a]" <regexp> matches? ] unit-test
-! [ t ] [ "a" "[a]" <regexp> matches? ] unit-test
-! [ t ] [ "a" "[abc]" <regexp> matches? ] unit-test
-! [ f ] [ "b" "[a]" <regexp> matches? ] unit-test
-! [ f ] [ "d" "[abc]" <regexp> matches? ] unit-test
-! [ t ] [ "ab" "[abc]{1,2}" <regexp> matches? ] unit-test
-! [ f ] [ "abc" "[abc]{1,2}" <regexp> matches? ] unit-test
+[ f ] [ "" "[a]" <regexp> matches? ] unit-test
+[ t ] [ "a" "[a]" <regexp> matches? ] unit-test
+[ t ] [ "a" "[abc]" <regexp> matches? ] unit-test
+[ f ] [ "b" "[a]" <regexp> matches? ] unit-test
+[ f ] [ "d" "[abc]" <regexp> matches? ] unit-test
+[ t ] [ "ab" "[abc]{1,2}" <regexp> matches? ] unit-test
+[ f ] [ "abc" "[abc]{1,2}" <regexp> matches? ] unit-test

-! [ f ] [ "" "[^a]" <regexp> matches? ] unit-test
-! [ f ] [ "a" "[^a]" <regexp> matches? ] unit-test
-! [ f ] [ "a" "[^abc]" <regexp> matches? ] unit-test
-! [ t ] [ "b" "[^a]" <regexp> matches? ] unit-test
-! [ t ] [ "d" "[^abc]" <regexp> matches? ] unit-test
-! [ f ] [ "ab" "[^abc]{1,2}" <regexp> matches? ] unit-test
-! [ f ] [ "abc" "[^abc]{1,2}" <regexp> matches? ] unit-test
+[ f ] [ "" "[^a]" <regexp> matches? ] unit-test
+[ f ] [ "a" "[^a]" <regexp> matches? ] unit-test
+[ f ] [ "a" "[^abc]" <regexp> matches? ] unit-test
+[ t ] [ "b" "[^a]" <regexp> matches? ] unit-test
+[ t ] [ "d" "[^abc]" <regexp> matches? ] unit-test
+[ f ] [ "ab" "[^abc]{1,2}" <regexp> matches? ] unit-test
+[ f ] [ "abc" "[^abc]{1,2}" <regexp> matches? ] unit-test

-! [ t ] [ "]" "[]]" <regexp> matches? ] unit-test
-! [ f ] [ "]" "[^]]" <regexp> matches? ] unit-test
+[ t ] [ "]" "[]]" <regexp> matches? ] unit-test
+[ f ] [ "]" "[^]]" <regexp> matches? ] unit-test

-! [ "^" "[^]" <regexp> matches? ] must-fail
-! [ t ] [ "^" "[]^]" <regexp> matches? ] unit-test
-! [ t ] [ "]" "[]^]" <regexp> matches? ] unit-test
+[ "^" "[^]" <regexp> matches? ] must-fail
+[ t ] [ "^" "[]^]" <regexp> matches? ] unit-test
+[ t ] [ "]" "[]^]" <regexp> matches? ] unit-test

-! [ t ] [ "[" "[[]" <regexp> matches? ] unit-test
-! [ f ] [ "^" "[^^]" <regexp> matches? ] unit-test
-! [ t ] [ "a" "[^^]" <regexp> matches? ] unit-test
+[ t ] [ "[" "[[]" <regexp> matches? ] unit-test
+[ f ] [ "^" "[^^]" <regexp> matches? ] unit-test
+[ t ] [ "a" "[^^]" <regexp> matches? ] unit-test

-! [ t ] [ "-" "[-]" <regexp> matches? ] unit-test
-! [ f ] [ "a" "[-]" <regexp> matches? ] unit-test
-! [ f ] [ "-" "[^-]" <regexp> matches? ] unit-test
-! [ t ] [ "a" "[^-]" <regexp> matches? ] unit-test
+[ t ] [ "-" "[-]" <regexp> matches? ] unit-test
+[ f ] [ "a" "[-]" <regexp> matches? ] unit-test
+[ f ] [ "-" "[^-]" <regexp> matches? ] unit-test
+[ t ] [ "a" "[^-]" <regexp> matches? ] unit-test

-! [ t ] [ "-" "[-a]" <regexp> matches? ] unit-test
-! [ t ] [ "a" "[-a]" <regexp> matches? ] unit-test
-! [ t ] [ "-" "[a-]" <regexp> matches? ] unit-test
-! [ t ] [ "a" "[a-]" <regexp> matches? ] unit-test
-! [ f ] [ "b" "[a-]" <regexp> matches? ] unit-test
-! [ f ] [ "-" "[^-]" <regexp> matches? ] unit-test
-! [ t ] [ "a" "[^-]" <regexp> matches? ] unit-test
+[ t ] [ "-" "[-a]" <regexp> matches? ] unit-test
+[ t ] [ "a" "[-a]" <regexp> matches? ] unit-test
+[ t ] [ "-" "[a-]" <regexp> matches? ] unit-test
+[ t ] [ "a" "[a-]" <regexp> matches? ] unit-test
+[ f ] [ "b" "[a-]" <regexp> matches? ] unit-test
+[ f ] [ "-" "[^-]" <regexp> matches? ] unit-test
+[ t ] [ "a" "[^-]" <regexp> matches? ] unit-test

-! [ f ] [ "-" "[a-c]" <regexp> matches? ] unit-test
-! [ t ] [ "-" "[^a-c]" <regexp> matches? ] unit-test
-! [ t ] [ "b" "[a-c]" <regexp> matches? ] unit-test
-! [ f ] [ "b" "[^a-c]" <regexp> matches? ] unit-test
+[ f ] [ "-" "[a-c]" <regexp> matches? ] unit-test
+[ t ] [ "-" "[^a-c]" <regexp> matches? ] unit-test
+[ t ] [ "b" "[a-c]" <regexp> matches? ] unit-test
+[ f ] [ "b" "[^a-c]" <regexp> matches? ] unit-test

-! [ t ] [ "-" "[a-c-]" <regexp> matches? ] unit-test
-! [ f ] [ "-" "[^a-c-]" <regexp> matches? ] unit-test
+[ t ] [ "-" "[a-c-]" <regexp> matches? ] unit-test
+[ f ] [ "-" "[^a-c-]" <regexp> matches? ] unit-test

-! [ t ] [ "\\" "[\\\\]" <regexp> matches? ] unit-test
-! [ f ] [ "a" "[\\\\]" <regexp> matches? ] unit-test
-! [ f ] [ "\\" "[^\\\\]" <regexp> matches? ] unit-test
-! [ t ] [ "a" "[^\\\\]" <regexp> matches? ] unit-test
+[ t ] [ "\\" "[\\\\]" <regexp> matches? ] unit-test
+[ f ] [ "a" "[\\\\]" <regexp> matches? ] unit-test
+[ f ] [ "\\" "[^\\\\]" <regexp> matches? ] unit-test
+[ t ] [ "a" "[^\\\\]" <regexp> matches? ] unit-test


 ! ((A)(B(C)))
--- a/extra/regexp4/regexp4.factor
+++ b/extra/regexp4/regexp4.factor
@ -4,12 +4,12 @@ USING: accessors arrays assocs combinators kernel math
 sequences namespaces locals combinators.lib state-tables
 math.parser state-parser sets dlists unicode.categories
 math.order quotations shuffle math.ranges splitting
-symbols ;
+symbols fry ;
 IN: regexp4

 SYMBOLS: eps start-state final-state beginning-of-text
 end-of-text left-parenthesis alternation left-bracket
-caret dash ampersand semicolon ;
+caret dash ampersand colon ;

 SYMBOL: runtime-epsilon

@ -84,12 +84,14 @@ ERROR: unbalanced-brackets ;
        [ (apply-alternation) ] [ (apply-concatenation) ] if apply-loop
    ] when ;

-: cut-stack ( n vector -- vector' vector )
-    <reversed> tuck index cut reverse dup pop* ;
+: cut-out ( vector n -- vector' vector ) cut rest ;
+
+: cut-stack ( obj vector -- vector' vector )
+    tuck last-index cut-out swap ;
    
-: apply-til-last ( token regexp -- )
+: apply-til-last ( regexp token -- )
    swap [ cut-stack ] change-stack
-    >r reverse r> apply-loop stack>> push-all ;
+    apply-loop stack>> push-all ;

 : concatenation-loop ( regexp -- )
    dup stack>> dup apply-concatenation?
@ -300,13 +302,13 @@ ERROR: bad-hex number ;

 : parse-escaped ( regexp -- )
    next get-char {
-        { CHAR: \ [ CHAR: \ make-nontoken-nfa ] }
-        { CHAR: t [ CHAR: \t make-nontoken-nfa ] }
-        { CHAR: n [ CHAR: \n make-nontoken-nfa ] }
-        { CHAR: r [ CHAR: \r make-nontoken-nfa ] }
-        { CHAR: f [ HEX: c make-nontoken-nfa ] }
-        { CHAR: a [ HEX: 7 make-nontoken-nfa ] }
-        { CHAR: e [ HEX: 1b make-nontoken-nfa ] }
+        { CHAR: \ [ [ CHAR: \ = ] make-nontoken-nfa ] }
+        { CHAR: t [ [ CHAR: \t = ] make-nontoken-nfa ] }
+        { CHAR: n [ [ CHAR: \n = ] make-nontoken-nfa ] }
+        { CHAR: r [ [ CHAR: \r = ] make-nontoken-nfa ] }
+        { CHAR: f [ [ HEX: c = ] make-nontoken-nfa ] }
+        { CHAR: a [ [ HEX: 7 = ] make-nontoken-nfa ] }
+        { CHAR: e [ [ HEX: 1b = ] make-nontoken-nfa ] }

        { CHAR: d [ [ digit? ] make-nontoken-nfa ] }
        { CHAR: D [ [ digit? not ] make-nontoken-nfa ] }
@ -335,31 +337,50 @@ ERROR: bad-hex number ;
    } case ;

 : make-character-set ( regexp -- )
-    stack>> throw ;
+    left-bracket over stack>> cut-stack
+    pick (>>stack)
+    [ dup number? [ '[ dup , = ] ] when ] map
+    [ [ drop t ] 2array ] map [ drop f ] suffix [ cond ] curry
+    make-nontoken-nfa ;

+: apply-dash ( regexp -- )
+    stack>> dup [ pop ] [ pop* ] [ pop ] tri
+    swap '[ dup , , between? ] swap push ;
+
+: apply-dash? ( regexp -- ? )
+    stack>> dup length 3 >=
+    [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ;
+
+DEFER: parse-character-set
 : (parse-character-set) ( regexp -- )
    [
        next get-char
        {
-            { CHAR: [ [ [ 1+ ] change-bracket-count left-bracket push-stack ] }
-            { CHAR: ] [ [ 1- ] change-bracket-count left-bracket over stack>> cut-stack ] }
+            { CHAR: [ [
+                [ 1+ ] change-bracket-count left-bracket push-stack
+                parse-character-set
+            ] }
+            { CHAR: ] [
+                [ 1- ] change-bracket-count
+                make-character-set
+            ] }
            { CHAR: - [ dash push-stack ] }
-            { CHAR: & [ ampersand push-stack ] }
-            { CHAR: : [ semicolon push-stack ] }
+            ! { CHAR: & [ ampersand push-stack ] }
+            ! { CHAR: : [ semicolon push-stack ] }
            { CHAR: \ [ parse-escaped ] }
            { f [ unbalanced-brackets ] }
-            [ make-nontoken-nfa ]
+            [ dupd push-stack dup apply-dash? [ apply-dash ] [ drop ] if ]
        } case
    ] [
        dup bracket-count>> 0 >
-        [ (parse-character-set) ]
-        [ make-character-set ] if
+        [ (parse-character-set) ] [ drop ] if
    ] bi ;

 : parse-character-set-first ( regexp -- )
    get-next
    {
        { CHAR: ^ [ caret push-stack next ] }
+        { CHAR: [ [ CHAR: [ make-nontoken-nfa next ] }
        { CHAR: ] [ CHAR: ] make-nontoken-nfa next ] }
        [ 2drop ]
    } case ;