From e41cdf5e8f6a848df14a015b70ca18612b630c35 Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <littledan@Macintosh-103.local>
Date: Fri, 20 Feb 2009 17:54:48 -0600
Subject: [PATCH] Various unfinshed regexp changes

---
 basis/regexp/ast/ast.factor         |   8 +-
 basis/regexp/classes/classes.factor |  60 +++++++------
 basis/regexp/dfa/dfa.factor         |  31 ++++++-
 basis/regexp/nfa/nfa.factor         | 126 +++++++++++++++-------------
 basis/regexp/parser/parser.factor   |   6 +-
 basis/regexp/regexp-tests.factor    |  16 ++++
 6 files changed, 153 insertions(+), 94 deletions(-)
diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor
index e1308f0855..65748005f4 100644
--- a/basis/regexp/ast/ast.factor
+++ b/basis/regexp/ast/ast.factor
@@ -18,7 +18,7 @@ SINGLETON: epsilon
 TUPLE: concatenation first second ;
 
 : <concatenation> ( seq -- concatenation )
-    epsilon [ concatenation boa ] reduce ;
+    [ epsilon ] [ unclip [ concatenation boa ] reduce ] if-empty ;
 
 TUPLE: alternation first second ;
 
@@ -54,3 +54,9 @@ M: from-to <times>
 
 : char-class ( ranges ? -- term )
     [ <or-class> ] dip [ <not-class> ] when ;
+
+TUPLE: lookahead term ;
+C: <lookahead> lookahead
+
+TUPLE: lookbehind term ;
+C: <lookbehind> lookbehind
diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor
index aaa650726c..516b6b4a1d 100644
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@@ -4,28 +4,6 @@ USING: accessors kernel math math.order words
 ascii unicode.categories combinators.short-circuit sequences ;
 IN: regexp.classes
 
-: punct? ( ch -- ? )
-    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
-
-: c-identifier-char? ( ch -- ? )
-    { [ alpha? ] [ CHAR: _ = ] } 1|| ;
-
-: java-blank? ( ch -- ? )
-    {
-        CHAR: \s CHAR: \t CHAR: \n
-        HEX: b HEX: 7 CHAR: \r
-    } member? ;
-
-: java-printable? ( ch -- ? )
-    [ [ alpha? ] [ punct? ] ] 1|| ;
-
-: hex-digit? ( ch -- ? )
-    {
-        [ CHAR: A CHAR: F between? ]
-        [ CHAR: a CHAR: f between? ]
-        [ CHAR: 0 CHAR: 9 between? ]
-    } 1|| ;
-
 SINGLETONS: any-char any-char-no-nl
 letter-class LETTER-class Letter-class digit-class
 alpha-class non-newline-blank-class
@@ -70,16 +48,24 @@ M: ascii-class class-member? ( obj class -- ? )
 M: digit-class class-member? ( obj class -- ? )
     drop digit? ;
 
+: c-identifier-char? ( ch -- ? )
+    { [ alpha? ] [ CHAR: _ = ] } 1|| ;
+
 M: c-identifier-class class-member? ( obj class -- ? )
-    drop
-    { [ digit? ] [ Letter? ] [ CHAR: _ = ] } 1|| ;
+    drop c-identifier-char? ;
 
 M: alpha-class class-member? ( obj class -- ? )
     drop alpha? ;
 
+: punct? ( ch -- ? )
+    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
+
 M: punctuation-class class-member? ( obj class -- ? )
     drop punct? ;
 
+: java-printable? ( ch -- ? )
+    { [ alpha? ] [ punct? ] } 1|| ;
+
 M: java-printable-class class-member? ( obj class -- ? )
     drop java-printable? ;
 
@@ -89,9 +75,22 @@ M: non-newline-blank-class class-member? ( obj class -- ? )
 M: control-character-class class-member? ( obj class -- ? )
     drop control? ;
 
+: hex-digit? ( ch -- ? )
+    {
+        [ CHAR: A CHAR: F between? ]
+        [ CHAR: a CHAR: f between? ]
+        [ CHAR: 0 CHAR: 9 between? ]
+    } 1|| ;
+
 M: hex-digit-class class-member? ( obj class -- ? )
     drop hex-digit? ;
 
+: java-blank? ( ch -- ? )
+    {
+        CHAR: \s CHAR: \t CHAR: \n
+        HEX: b HEX: 7 CHAR: \r
+    } member? ;
+
 M: java-blank-class class-member? ( obj class -- ? )
     drop java-blank? ;
 
@@ -99,13 +98,7 @@ M: unmatchable-class class-member? ( obj class -- ? )
     2drop f ;
 
 M: terminator-class class-member? ( obj class -- ? )
-    drop {
-        [ CHAR: \r = ]
-        [ CHAR: \n = ]
-        [ CHAR: \u000085 = ]
-        [ CHAR: \u002028 = ]
-        [ CHAR: \u002029 = ]
-    } 1|| ;
+    drop "\r\n\u000085\u002029\u002028" member? ;
 
 M: beginning-of-line class-member? ( obj class -- ? )
     2drop f ;
@@ -119,6 +112,9 @@ C: <or-class> or-class
 TUPLE: not-class class ;
 C: <not-class> not-class
 
+: <and-class> ( classes -- class )
+    [ <not-class> ] map <or-class> <not-class> ;
+
 TUPLE: primitive-class class ;
 C: <primitive-class> primitive-class
 
@@ -130,3 +126,5 @@ M: not-class class-member?
 
 M: primitive-class class-member?
     class>> class-member? ;
+
+UNION: class primitive-class not-class or-class range ;
diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor
index 88e4e8f9ff..9834ca4ca0 100644
--- a/basis/regexp/dfa/dfa.factor
+++ b/basis/regexp/dfa/dfa.factor
@@ -2,7 +2,7 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays assocs combinators fry kernel locals
 math math.order regexp.nfa regexp.transition-tables sequences
-sets sorting vectors sequences.deep ;
+sets sorting vectors sequences.deep math.functions regexp.classes ;
 USING: io prettyprint threads ;
 IN: regexp.dfa
 
@@ -17,6 +17,34 @@ IN: regexp.dfa
 : while-changes ( obj quot pred -- obj' )
     3dup nip call (while-changes) ; inline
 
+TUPLE: parts in out ;
+
+: make-partition ( choices classes -- partition )
+    zip [ first ] partition parts boa ;
+
+: powerset-partition ( classes -- partitions )
+    ! Here is where class algebra will happen, when I implement it
+    [ length [ 2^ ] keep ] keep '[
+        _ [ ] map-bits _ make-partition
+    ] map ;
+
+: partition>class ( parts -- class )
+    [ in>> ] [ out>> ] bi
+    [ <or-class> ] bi@ <not-class> 2array <and-class> ;
+
+: get-transitions ( partition state-transitions -- next-states )
+    [ in>> ] dip '[ at ] gather ;
+
+: disambiguate-overlap ( nfa -- nfa' )  
+    [
+        [
+            [ keys powerset-partition ] keep '[
+                [ partition>class ]
+                [ _ get-transitions ] bi
+            ] H{ } map>assoc
+        ] assoc-map
+    ] change-transitions ;
+
 : find-delta ( states transition nfa -- new-states )
     transitions>> '[ _ swap _ at at ] gather sift ;
 
@@ -72,6 +100,7 @@ IN: regexp.dfa
         swap find-start-state >>start-state ;
 
 : construct-dfa ( nfa -- dfa )
+    disambiguate-overlap
     dup initialize-dfa
     dup start-state>> 1vector
     H{ } clone
diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor
index 6775124e60..370b354276 100644
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@@ -3,17 +3,26 @@
 USING: accessors arrays assocs grouping kernel
 locals math namespaces sequences fry quotations
 math.order math.ranges vectors unicode.categories
-regexp.transition-tables words sets hashtables
+regexp.transition-tables words sets hashtables combinators.short-circuit
 unicode.case.private regexp.ast regexp.classes ;
+IN: regexp.nfa
+
 ! This uses unicode.case.private for ch>upper and ch>lower
 ! but case-insensitive matching should be done by case-folding everything
 ! before processing starts
-IN: regexp.nfa
 
-SYMBOL: negated?
+GENERIC: remove-lookahead ( syntax-tree -- syntax-tree' )
+! This is unfinished and does nothing right now!
 
-: negate ( -- )
-    negated? [ not ] change ;
+M: object remove-lookahead ;
+
+M: with-options remove-lookahead
+    [ tree>> remove-lookahead ] [ options>> ] bi <with-options> ;
+
+M: alternation remove-lookahead
+    [ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ ;
+
+M: concatenation remove-lookahead ;
 
 SINGLETON: eps
 
@@ -45,16 +54,9 @@ SYMBOL: nfa-table
 
 GENERIC: nfa-node ( node -- start-state end-state )
 
-:: add-simple-entry ( obj class -- start-state end-state )
-    next-state :> s0
-    next-state :> s1
-    negated? get [
-        s0 f obj class make-transition table add-transition
-        s0 s1 <default-transition> table add-transition
-    ] [
-        s0 s1 obj class make-transition table add-transition
-    ] if
-    s0 s1 ;
+: add-simple-entry ( obj class -- start-state end-state )
+    [ next-state next-state 2dup ] 2dip
+    make-transition table add-transition ;
 
 : epsilon-transition ( source target -- )
     eps <literal-transition> table add-transition ;
@@ -92,62 +94,66 @@ M: alternation nfa-node ( node -- start end )
     [ nfa-node ] bi@
     alternate-nodes ;
 
+GENERIC: modify-class ( char-class -- char-class' )
+
+M: object modify-class ;
+
+M: integer modify-class
+    case-insensitive option? [
+        dup Letter? [
+            [ ch>lower ] [ ch>upper ] bi 2array <or-class>
+        ] when
+    ] when ;
+
 M: integer nfa-node ( node -- start end )
+    modify-class dup class?
+    class-transition literal-transition ?
+    add-simple-entry ;
+
+M: primitive-class modify-class
+    class>> modify-class <primitive-class> ;
+
+M: or-class modify-class
+    seq>> [ modify-class ] map <or-class> ;
+
+M: not-class modify-class
+    class>> modify-class <not-class> ;
+
+M: any-char modify-class
+    [ dotall option? ] dip any-char-no-nl ? ;
+
+: modify-letter-class ( class -- newclass )
+    case-insensitive option? [ drop Letter-class ] when ;
+M: letter-class modify-class modify-letter-class ;
+M: LETTER-class modify-class modify-letter-class ;
+
+: cased-range? ( range -- ? )
+    [ from>> ] [ to>> ] bi {
+        [ [ letter? ] bi@ and ]
+        [ [ LETTER? ] bi@ and ]
+    } 2|| ;
+
+M: range modify-class
     case-insensitive option? [
-        dup [ ch>lower ] [ ch>upper ] bi
-        2dup = [
-            2drop
-            literal-transition add-simple-entry
-        ] [
-            [ literal-transition add-simple-entry ] bi@
-            alternate-nodes [ nip ] dip
-        ] if
-    ] [ literal-transition add-simple-entry ] if ;
-
-M: primitive-class nfa-node ( node -- start end )
-    class>> dup
-    { letter-class LETTER-class } member? case-insensitive option? and
-    [ drop Letter-class ] when
-    class-transition add-simple-entry ;
-
-M: or-class nfa-node class-transition add-simple-entry ;
-M: not-class nfa-node class-transition add-simple-entry ;
-
-M: any-char nfa-node ( node -- start end )
-    [ dotall option? ] dip any-char-no-nl ?
-    class-transition add-simple-entry ;
-
-! M: negation nfa-node ( node -- start end )
-!     negate term>> nfa-node negate ;
-
-M: range nfa-node ( node -- start end )
-    case-insensitive option? [
-        ! This should be implemented for Unicode by case-folding
-        ! the input and all strings in the regexp.
-        dup [ from>> ] [ to>> ] bi
-        2dup [ Letter? ] bi@ and [
-            rot drop
+        dup cased-range? [
+            [ from>> ] [ to>> ] bi
             [ [ ch>lower ] bi@ <range> ]
             [ [ ch>upper ] bi@ <range> ] 2bi 
-            [ class-transition add-simple-entry ] bi@
-            alternate-nodes
-        ] [
-            2drop
-            class-transition add-simple-entry
-        ] if
-    ] [
-        class-transition add-simple-entry
-    ] if ;
+            2array <or-class>
+        ] when
+    ] when ;
+
+M: class nfa-node
+    modify-class class-transition add-simple-entry ;
 
 M: with-options nfa-node ( node -- start end )
     dup options>> [ tree>> nfa-node ] using-options ;
 
 : construct-nfa ( ast -- nfa-table )
     [
-        negated? off
         0 state set
-        <transition-table> clone nfa-table set
-        nfa-node
+        <transition-table> nfa-table set
+        remove-lookahead nfa-node
         table
             swap dup associate >>final-states
             swap >>start-state
diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor
index 3a7ba12552..18b43674c4 100644
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@@ -132,11 +132,15 @@ Parenthized = "?:" Alternation:a => [[ a ]]
                 => [[ a on off parse-options <with-options> ]]
             | "?#" [^)]* => [[ f ]]
             | "?~" Alternation:a => [[ a <negation> ]]
+            | "?=" Alternation:a => [[ a <lookahead> ]]
+            | "?!" Alternation:a => [[ a <negation> <lookahead> ]]
+            | "?<=" Alternation:a => [[ a <lookbehind> ]]
+            | "?<!" Alternation:a => [[ a <negation> <lookbehind> ]]
             | Alternation
 
 Element = "(" Parenthized:p ")" => [[ p ]]
         | "[" CharClass:r "]" => [[ r ]]
-        | ".":d => [[ any-char ]]
+        | ".":d => [[ any-char <primitive-class> ]]
         | Character
 
 Number = (!(","|"}").)* => [[ string>number ensure-number ]]
diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor
index 4331eaa250..0d9ed129c8 100644
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@@ -317,6 +317,22 @@ IN: regexp-tests
 ! Bug in parsing word
 [ t ] [ "a" R' a' matches? ] unit-test
 
+! Testing negation
+[ f ] [ "a" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "aa" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "bb" R/ (?~a)/ matches? ] unit-test
+[ t ] [ "" R/ (?~a)/ matches? ] unit-test
+
+[ f ] [ "a" R/ (?~a+|b)/ matches? ] unit-test
+[ f ] [ "aa" R/ (?~a+|b)/ matches? ] unit-test
+[ t ] [ "bb" R/ (?~a+|b)/ matches? ] unit-test
+[ f ] [ "b" R/ (?~a+|b)/ matches? ] unit-test
+[ t ] [ "" R/ (?~a+|b)/ matches? ] unit-test
+
+! Intersecting classes
+[ t ] [ "ab" R/ ac|\p{Lower}b/ matches? ] unit-test
+[ t ] [ "ab" R/ ac|[a-z]b/ matches? ] unit-test
+
 ! [ t ] [ "a" R/ ^a/ matches? ] unit-test
 ! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
 ! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test