Regexp negation (partial) and cleanup of regexp.nfa

2009-02-19 16:48:46 -06:00 · 2009-02-19 16:48:46 -06:00 · 9565b59928
parent fa84f4c752
commit 9565b59928
8 changed files with 184 additions and 123 deletions
--- a/basis/regexp/ast/ast.factor
+++ b/basis/regexp/ast/ast.factor
@ -16,11 +16,17 @@ C: <from-to> from-to
 TUPLE: at-least n ;
 C: <at-least> at-least

-TUPLE: concatenation seq ;
-C: <concatenation> concatenation
+SINGLETON: epsilon

-TUPLE: alternation seq ;
-C: <alternation> alternation
+TUPLE: concatenation first second ;
+
+: <concatenation> ( seq -- concatenation )
+    epsilon [ concatenation boa ] reduce ;
+
+TUPLE: alternation first second ;
+
+: <alternation> ( seq -- alternation )
+    unclip [ alternation boa ] reduce ;

 TUPLE: star term ;
 C: <star> star
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@ -1,4 +1,4 @@
-! Copyright (C) 2008 Doug Coleman.
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors kernel math math.order words
 ascii unicode.categories combinators.short-circuit sequences ;
@ -41,9 +41,10 @@ C: <range> range

 GENERIC: class-member? ( obj class -- ? )

+! When does t get put in?
 M: t class-member? ( obj class -- ? ) 2drop f ;

-M: integer class-member? ( obj class -- ? ) 2drop f ;
+M: integer class-member? ( obj class -- ? ) = ;

 M: range class-member? ( obj class -- ? )
    [ from>> ] [ to>> ] bi between? ;
@ -111,3 +112,15 @@ M: beginning-of-line class-member? ( obj class -- ? )

 M: end-of-line class-member? ( obj class -- ? )
    2drop f ;
+
+TUPLE: or-class seq ;
+C: <or-class> or-class
+
+TUPLE: not-class class ;
+C: <not-class> not-class
+
+M: or-class class-member?
+    seq>> [ class-member? ] with any? ;
+
+M: not-class class-member?
+    class>> class-member? not ;
--- a/basis/regexp/minimize/minimize.factor
+++ b/basis/regexp/minimize/minimize.factor
@ -1,20 +1,48 @@
 ! Copyright (C) 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
 USING: kernel sequences regexp.transition-tables fry assocs
-accessors locals math sorting arrays sets hashtables regexp.dfa  ;
+accessors locals math sorting arrays sets hashtables regexp.dfa
+combinators.short-circuit ;
 IN: regexp.minimize

+: number-transitions ( transitions numbering -- new-transitions )
+    dup '[
+        [ _ at ]
+        [ [ first _ at ] assoc-map ] bi*
+    ] assoc-map ;
+
+: table>state-numbers ( table -- assoc )
+    transitions>> keys <enum> [ swap ] H{ } assoc-map-as ;
+
+: map-set ( assoc quot -- new-assoc )
+    '[ drop @ dup ] assoc-map ; inline
+
+: rewrite-transitions ( transition-table assoc quot -- transition-table )
+    [
+        [ '[ _ at ] change-start-state ]
+        [ '[ [ _ at ] map-set ] change-final-states ]
+        [ ] tri
+    ] dip '[ _ @ ] change-transitions ; inline
+
+: number-states ( table -- newtable )
+    dup table>state-numbers
+    [ number-transitions ] rewrite-transitions ;
+
+: initially-same? ( s1 s2 transition-table -- ? )
+    {
+        [ drop <= ]
+        [ transitions>> '[ _ at keys ] bi@ set= ]
+        [ final-states>> '[ _ key? ] bi@ = ]
+    } 3&& ;
+
 :: initialize-partitions ( transition-table -- partitions )
    ! Partition table is sorted-array => ?
    H{ } clone :> out
    transition-table transitions>> keys :> states
    states [| s1 |
        states [| s2 |
-            s1 s2 <= [
-                s1 s2 [ transition-table transitions>> at keys ] bi@ set=
-                s1 s2 [ transition-table final-states>> key? ] bi@ = and
-                [ t s1 s2 2array out set-at ] when
-            ] when
+            s1 s2 transition-table initially-same?
+            [ s1 s2 2array out conjoin ] when
        ] each
    ] each out ;

@ -29,7 +57,6 @@ IN: regexp.minimize
    '[ _ same-partition? ] assoc-all? ;

 : partition-more ( partitions transition-table -- partitions )
-    ! This is horribly slow!
    over '[ drop first2 _ _ stay-same? ] assoc-filter ;

 : partition>classes ( partitions -- synonyms ) ! old-state => new-state
@ -40,7 +67,7 @@ IN: regexp.minimize

 : state-classes ( transition-table -- synonyms )
    [ initialize-partitions ] keep
-    '[ _ partition-more ] [ ] while-changes
+    '[ _ partition-more ] [ assoc-size ] while-changes
    partition>classes ;

 : canonical-state? ( state state-classes -- ? )
@ -52,33 +79,12 @@ IN: regexp.minimize
 : rewrite-duplicates ( new-transitions state-classes -- new-transitions )
    '[ [ _ at ] assoc-map ] assoc-map ;

-: map-set ( assoc quot -- new-assoc )
-    '[ drop @ dup ] assoc-map ; inline
+: combine-transitions ( transitions state-classes -- new-transitions )
+    [ delete-duplicates ] [ rewrite-duplicates ] bi ;

 : combine-states ( table -- smaller-table )
    dup state-classes
-    [
-        '[
-            _ [ delete-duplicates ]
-            [ rewrite-duplicates ] bi
-        ] change-transitions
-    ]
-    [ '[ [ _ at ] map-set ] change-final-states ]
-    [ '[ _ at ] change-start-state ]
-    tri ;
-
-: number-transitions ( transitions numbering -- new-transitions )
-    [
-        [ at ]
-        [ '[ first _ at ] assoc-map ]
-        bi-curry bi*
-    ] curry assoc-map ;
-
-: number-states ( table -- newtable )
-    dup transitions>> keys <enum> [ swap ] H{ } assoc-map-as
-    [ '[ _ at ] change-start-state ]
-    [ '[ [ _ at ] map-set ] change-final-states ]
-    [ '[ _ number-transitions ] change-transitions ] tri ;
+    [ combine-transitions ] rewrite-transitions ;

 : minimize ( table -- minimal-table )
    clone number-states combine-states ;
--- a/basis/regexp/negation/negation-tests.factor
+++ b/basis/regexp/negation/negation-tests.factor
@ -0,0 +1,27 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: tools.test regexp.negation regexp.transition-tables regexp.classes ;
+IN: regexp.negation.tests
+
+[
+    ! R/ |[^a]|.+/
+    T{ transition-table
+        { transitions H{
+            { 0 H{ { CHAR: a 1 } { T{ not-class f T{ or-class f { CHAR: a } } } -1 } } }
+            { 1 H{ { T{ not-class f T{ or-class f { } } } -1 } } }
+            { -1 H{ { any-char -1 } } }
+        } } 
+        { start-state 0 }
+        { final-states H{ { 0 0 } { -1 -1 } } }
+    }
+] [
+    ! R/ a/
+    T{ transition-table
+        { transitions H{
+            { 0 H{ { CHAR: a 1 } } }
+            { 1 H{ } } 
+        } }
+        { start-state 0 }
+        { final-states H{ { 1 1 } } }
+    } negate-table
+] unit-test
--- a/basis/regexp/negation/negation.factor
+++ b/basis/regexp/negation/negation.factor
@ -0,0 +1,36 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: regexp.nfa regexp.dfa regexp.minimize kernel sequences
+assocs regexp.classes hashtables accessors ;
+IN: regexp.negation
+
+: ast>dfa ( parse-tree -- minimal-dfa )
+    construct-nfa construct-dfa minimize ;
+
+CONSTANT: fail-state -1
+
+: add-default-transition ( state's-transitions -- new-state's-transitions )
+    clone dup
+    [ [ fail-state ] dip keys <or-class> <not-class> ] keep set-at ;
+
+: fail-state-recurses ( transitions -- new-transitions )
+    clone dup
+    [ fail-state any-char associate fail-state ] dip set-at ;
+
+: add-fail-state ( transitions -- new-transitions )
+    [ add-default-transition ] assoc-map
+    fail-state-recurses ;
+
+: assoc>set ( assoc -- keys-set )
+    [ drop dup ] assoc-map ;
+
+: inverse-final-states ( transition-table -- final-states )
+    [ transitions>> assoc>set ] [ final-states>> ] bi assoc-diff ;
+
+: negate-table ( transition-table -- transition-table )
+    clone
+        [ add-fail-state ] change-transitions
+        dup inverse-final-states >>final-states ;
+
+! M: negation nfa-node ( node -- )
+!     ast>dfa negate-table adjoin-dfa ;
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@ -3,15 +3,13 @@
 USING: accessors arrays assocs grouping kernel
 locals math namespaces sequences fry quotations
 math.order math.ranges vectors unicode.categories
-regexp.transition-tables words sets 
+regexp.transition-tables words sets hashtables
 unicode.case.private regexp.ast regexp.classes ;
 ! This uses unicode.case.private for ch>upper and ch>lower
 ! but case-insensitive matching should be done by case-folding everything
 ! before processing starts
 IN: regexp.nfa

-ERROR: feature-is-broken feature ;
-
 SYMBOL: negated?

 : negate ( -- )
@ -21,14 +19,13 @@ SINGLETON: eps

 SYMBOL: option-stack

-SYMBOL: combine-stack
-
 SYMBOL: state

 : next-state ( -- state )
    state [ get ] [ inc ] bi ;

 SYMBOL: nfa-table
+: table ( -- table ) nfa-table get ;

 : set-each ( keys value hashtable -- )
    '[ _ swap _ set-at ] each ;
@ -46,84 +43,56 @@ SYMBOL: nfa-table
 : option? ( obj -- ? )
    option-stack get assoc-stack ;

-: set-start-state ( -- nfa-table )
-    nfa-table get
-        combine-stack get pop first >>start-state ;
+GENERIC: nfa-node ( node -- start-state end-state )

-GENERIC: nfa-node ( node -- )
-
-:: add-simple-entry ( obj class -- )
-    [let* | s0 [ next-state ]
-            s1 [ next-state ]
-            stack [ combine-stack get ]
-            table [ nfa-table get ] |
+:: add-simple-entry ( obj class -- start-state end-state )
+    next-state :> s0
+    next-state :> s1
    negated? get [
        s0 f obj class make-transition table add-transition
        s0 s1 <default-transition> table add-transition
    ] [
        s0 s1 obj class make-transition table add-transition
    ] if
-        s0 s1 2array stack push
-        t s1 table final-states>> set-at ] ;
+    s0 s1 ;

-:: concatenate-nodes ( -- )
-    [let* | stack [ combine-stack get ]
-            table [ nfa-table get ]
-            s2 [ stack peek first ]
-            s3 [ stack pop second ]
-            s0 [ stack peek first ]
-            s1 [ stack pop second ] |
-        s1 s2 eps <literal-transition> table add-transition
-        s1 table final-states>> delete-at
-        s0 s3 2array stack push ] ;
+: epsilon-transition ( source target -- )
+    eps <literal-transition> table add-transition ;

-:: alternate-nodes ( -- )
-    [let* | stack [ combine-stack get ]
-            table [ nfa-table get ]
-            s2 [ stack peek first ]
-            s3 [ stack pop second ]
-            s0 [ stack peek first ]
-            s1 [ stack pop second ]
-            s4 [ next-state ]
-            s5 [ next-state ] |
-        s4 s0 eps <literal-transition> table add-transition
-        s4 s2 eps <literal-transition> table add-transition
-        s1 s5 eps <literal-transition> table add-transition
-        s3 s5 eps <literal-transition> table add-transition
-        s1 table final-states>> delete-at
-        s3 table final-states>> delete-at
-        t s5 table final-states>> set-at
-        s4 s5 2array stack push ] ;
+M:: star nfa-node ( node -- start end )
+    node term>> nfa-node :> s1 :> s0
+    next-state :> s2
+    next-state :> s3
+    s1 s0 epsilon-transition
+    s2 s0 epsilon-transition
+    s2 s3 epsilon-transition
+    s1 s3 epsilon-transition
+    s2 s3 ;

-M: star nfa-node ( node -- )
-    term>> nfa-node
-    [let* | stack [ combine-stack get ]
-            s0 [ stack peek first ]
-            s1 [ stack pop second ]
-            s2 [ next-state ]
-            s3 [ next-state ]
-            table [ nfa-table get ] |
-        s1 table final-states>> delete-at
-        t s3 table final-states>> set-at
-        s1 s0 eps <literal-transition> table add-transition
-        s2 s0 eps <literal-transition> table add-transition
-        s2 s3 eps <literal-transition> table add-transition
-        s1 s3 eps <literal-transition> table add-transition
-        s2 s3 2array stack push ] ;
+M: epsilon nfa-node
+    drop eps literal-transition add-simple-entry ;

-M: concatenation nfa-node ( node -- )
-    seq>> [ eps literal-transition add-simple-entry ] [
-        reversed-regexp option? [ <reversed> ] when
-        [ [ nfa-node ] each ]
-        [ length 1- [ concatenate-nodes ] times ] bi
-    ] if-empty ;
+M: concatenation nfa-node ( node -- start end )
+    [ first>> ] [ second>> ] bi
+    reversed-regexp option? [ swap ] when
+    [ nfa-node ] bi@
+    [ epsilon-transition ] dip ;

-M: alternation nfa-node ( node -- )
-    seq>>
-    [ [ nfa-node ] each ]
-    [ length 1- [ alternate-nodes ] times ] bi ;
+:: alternate-nodes ( s0 s1 s2 s3 -- start end )
+    next-state :> s4
+    next-state :> s5
+    s4 s0 epsilon-transition
+    s4 s2 epsilon-transition
+    s1 s5 epsilon-transition
+    s3 s5 epsilon-transition
+    s4 s5 ;

-M: integer nfa-node ( node -- )
+M: alternation nfa-node ( node -- start end )
+    [ first>> ] [ second>> ] bi
+    [ nfa-node ] bi@
+    alternate-nodes ;
+
+M: integer nfa-node ( node -- start end )
    case-insensitive option? [
        dup [ ch>lower ] [ ch>upper ] bi
        2dup = [
@ -131,26 +100,26 @@ M: integer nfa-node ( node -- )
            literal-transition add-simple-entry
        ] [
            [ literal-transition add-simple-entry ] bi@
-            alternate-nodes drop
+            alternate-nodes [ nip ] dip
        ] if
    ] [
        literal-transition add-simple-entry
    ] if ;

-M: primitive-class nfa-node ( node -- )
+M: primitive-class nfa-node ( node -- start end )
    class>> dup
    { letter-class LETTER-class } member? case-insensitive option? and
    [ drop Letter-class ] when
    class-transition add-simple-entry ;

-M: any-char nfa-node ( node -- )
+M: any-char nfa-node ( node -- start end )
    [ dotall option? ] dip any-char-no-nl ?
    class-transition add-simple-entry ;

-M: negation nfa-node ( node -- )
+M: negation nfa-node ( node -- start end )
    negate term>> nfa-node negate ;

-M: range nfa-node ( node -- )
+M: range nfa-node ( node -- start end )
    case-insensitive option? [
        ! This should be implemented for Unicode by case-folding
        ! the input and all strings in the regexp.
@ -169,15 +138,16 @@ M: range nfa-node ( node -- )
        class-transition add-simple-entry
    ] if ;

-M: with-options nfa-node ( node -- )
+M: with-options nfa-node ( node -- start end )
    dup options>> [ tree>> nfa-node ] using-options ;

 : construct-nfa ( ast -- nfa-table )
    [
        negated? off
-        V{ } clone combine-stack set
        0 state set
        <transition-table> clone nfa-table set
        nfa-node
-        set-start-state
+        table
+            swap dup associate >>final-states
+            swap >>start-state
    ] with-scope ;
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@ -4,6 +4,7 @@ USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
 combinators regexp.classes strings splitting peg locals accessors
 regexp.ast ;
 IN: regexp.parser
+
 : allowed-char? ( ch -- ? )
    ".()|[*+?" member? not ;

@ -130,6 +131,7 @@ Parenthized = "?:" Alternation:a => [[ a ]]
            | "?" Options:on "-"? Options:off ":" Alternation:a
                => [[ a on off parse-options <with-options> ]]
            | "?#" [^)]* => [[ f ]]
+            | "?~" Alternation:a => [[ a <negation> ]]
            | Alternation

 Element = "(" Parenthized:p ")" => [[ p ]]
--- a/basis/regexp/regexp.factor
+++ b/basis/regexp/regexp.factor
@ -4,14 +4,15 @@ USING: accessors combinators kernel math sequences strings sets
 assocs prettyprint.backend prettyprint.custom make lexer
 namespaces parser arrays fry locals regexp.minimize
 regexp.parser regexp.nfa regexp.dfa regexp.traversal
-regexp.transition-tables splitting sorting regexp.ast ;
+regexp.transition-tables splitting sorting regexp.ast
+regexp.negation ;
 IN: regexp

 TUPLE: regexp raw parse-tree options dfa ;

 : <optioned-regexp> ( string options -- regexp )
    [ dup parse-regexp ] [ string>options ] bi*
-    2dup <with-options> construct-nfa construct-dfa minimize
+    2dup <with-options> ast>dfa
    regexp boa ;

 : <regexp> ( string -- regexp ) "" <optioned-regexp> ;