diff --git a/basis/regexp/dfa/dfa-tests.factor b/basis/regexp/dfa/dfa-tests.factor new file mode 100644 index 0000000000..b6ce13c723 --- /dev/null +++ b/basis/regexp/dfa/dfa-tests.factor @@ -0,0 +1,5 @@ +USING: regexp.dfa tools.test ; +IN: regexp.dfa.tests + +[ [ ] [ ] while-changes ] must-infer + diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor index 543c757a67..88e4e8f9ff 100644 --- a/basis/regexp/dfa/dfa.factor +++ b/basis/regexp/dfa/dfa.factor @@ -1,4 +1,4 @@ -! Copyright (C) 2008 Doug Coleman. +! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators fry kernel locals math math.order regexp.nfa regexp.transition-tables sequences @@ -6,9 +6,13 @@ sets sorting vectors sequences.deep ; USING: io prettyprint threads ; IN: regexp.dfa -: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj ) - [ [ dup slip ] dip pick over call ] dip dupd = - [ 3drop ] [ (while-changes) ] if ; inline recursive +:: (while-changes) ( obj quot: ( obj -- obj' ) comp: ( obj -- key ) old-key -- obj ) + obj quot call :> new-obj + new-obj comp call :> new-key + new-key old-key = + [ new-obj ] + [ new-obj quot comp new-key (while-changes) ] + if ; inline recursive : while-changes ( obj quot pred -- obj' ) 3dup nip call (while-changes) ; inline diff --git a/basis/regexp/minimize/minimize-tests.factor b/basis/regexp/minimize/minimize-tests.factor new file mode 100644 index 0000000000..78a90ca3ba --- /dev/null +++ b/basis/regexp/minimize/minimize-tests.factor @@ -0,0 +1,48 @@ +! Copyright (C) 2009 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: tools.test regexp.minimize assocs regexp accessors regexp.transition-tables ; +IN: regexp.minimize.tests + +[ t ] [ 1 2 H{ { { 1 2 } t } } same-partition? ] unit-test +[ t ] [ 2 1 H{ { { 1 2 } t } } same-partition? ] unit-test +[ f ] [ 2 3 H{ { { 1 2 } t } } same-partition? ] unit-test + +[ H{ { 1 1 } { 2 1 } { 3 3 } { 4 3 } } ] +[ { { 1 1 } { 1 2 } { 2 2 } { 3 3 } { 3 4 } { 4 4 } } [ t ] H{ } map>assoc partition>classes ] unit-test + +[ { { 1 2 } { 3 4 } } ] [ H{ { "elephant" 1 } { "tiger" 3 } } H{ { "elephant" 2 } { "tiger" 4 } } assemble-values ] unit-test + +[ 3 ] [ R/ ab|ac/ dfa>> transitions>> assoc-size ] unit-test +[ 3 ] [ R/ a(b|c)/ dfa>> transitions>> assoc-size ] unit-test +[ 1 ] [ R/ ((aa*)*)*/ dfa>> transitions>> assoc-size ] unit-test +[ 1 ] [ R/ a|((aa*)*)*/ dfa>> transitions>> assoc-size ] unit-test +[ 2 ] [ R/ ab|((aa*)*)*b/ dfa>> transitions>> assoc-size ] unit-test +[ 4 ] [ R/ ab|cd/ dfa>> transitions>> assoc-size ] unit-test +[ 1 ] [ R/ [a-z]*|[A-Z]*/i dfa>> transitions>> assoc-size ] unit-test + +[ + T{ transition-table + { transitions H{ + { 0 H{ { CHAR: a 1 } { CHAR: b 1 } } } + { 1 H{ { CHAR: a 2 } { CHAR: b 2 } } } + { 2 H{ { CHAR: c 3 } } } + { 3 H{ } } + } } + { start-state 0 } + { final-states H{ { 3 3 } } } + } +] [ + T{ transition-table + { transitions H{ + { 0 H{ { CHAR: a 1 } { CHAR: b 4 } } } + { 1 H{ { CHAR: a 2 } { CHAR: b 5 } } } + { 2 H{ { CHAR: c 3 } } } + { 3 H{ } } + { 4 H{ { CHAR: a 2 } { CHAR: b 5 } } } + { 5 H{ { CHAR: c 6 } } } + { 6 H{ } } + } } + { start-state 0 } + { final-states H{ { 3 3 } { 6 6 } } } + } combine-states +] unit-test diff --git a/basis/regexp/minimize/minimize.factor b/basis/regexp/minimize/minimize.factor new file mode 100644 index 0000000000..52a852af50 --- /dev/null +++ b/basis/regexp/minimize/minimize.factor @@ -0,0 +1,84 @@ +! Copyright (C) 2009 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: kernel sequences regexp.transition-tables fry assocs +accessors locals math sorting arrays sets hashtables regexp.dfa ; +IN: regexp.minimize + +:: initialize-partitions ( transition-table -- partitions ) + ! Partition table is sorted-array => ? + H{ } clone :> out + transition-table transitions>> keys :> states + states [| s1 | + states [| s2 | + s1 s2 <= [ + s1 s2 [ transition-table transitions>> at keys ] bi@ set= + s1 s2 [ transition-table final-states>> key? ] bi@ = and + [ t s1 s2 2array out set-at ] when + ] when + ] each + ] each out ; + +: same-partition? ( s1 s2 partitions -- ? ) + [ 2array natural-sort ] dip key? ; + +: assemble-values ( assoc1 assoc2 -- values ) + dup keys '[ _ swap [ at ] curry map ] bi@ zip ; + +: stay-same? ( s1 s2 transition partitions -- ? ) + [ '[ _ transitions>> at ] bi@ assemble-values ] dip + '[ _ same-partition? ] assoc-all? ; + +: partition-more ( partitions transition-table -- partitions ) + ! This is horribly slow! + over '[ drop first2 _ _ stay-same? ] assoc-filter ; + +: partition>classes ( partitions -- synonyms ) ! old-state => new-state + >alist sort-keys + [ drop first2 swap ] assoc-map + + >hashtable ; + +: state-classes ( transition-table -- synonyms ) + [ initialize-partitions ] keep + '[ _ partition-more ] [ ] while-changes + partition>classes ; + +: canonical-state? ( state state-classes -- ? ) + dupd at = ; + +: delete-duplicates ( transitions state-classes -- new-transitions ) + '[ drop _ canonical-state? ] assoc-filter ; + +: rewrite-duplicates ( new-transitions state-classes -- new-transitions ) + '[ [ _ at ] assoc-map ] assoc-map ; + +: map-set ( assoc quot -- new-assoc ) + '[ drop @ dup ] assoc-map ; inline + +: combine-states ( table -- smaller-table ) + dup state-classes + [ + '[ + _ [ delete-duplicates ] + [ rewrite-duplicates ] bi + ] change-transitions + ] + [ '[ [ _ at ] map-set ] change-final-states ] + [ '[ _ at ] change-start-state ] + tri ; + +: number-transitions ( transitions numbering -- new-transitions ) + [ + [ at ] + [ '[ first _ at ] assoc-map ] + bi-curry bi* + ] curry assoc-map ; + +: number-states ( table -- newtable ) + dup transitions>> keys [ swap ] H{ } assoc-map-as + [ '[ _ at ] change-start-state ] + [ '[ [ _ at ] map-set ] change-final-states ] + [ '[ _ number-transitions ] change-transitions ] tri ; + +: minimize ( table -- minimal-table ) + clone number-states combine-states ; diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 7491961399..b6fd32a245 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors combinators kernel math sequences strings sets assocs prettyprint.backend prettyprint.custom make lexer -namespaces parser arrays fry locals +namespaces parser arrays fry locals regexp.minimize regexp.parser regexp.nfa regexp.dfa regexp.traversal regexp.transition-tables splitting sorting regexp.ast ; IN: regexp @@ -11,7 +11,7 @@ TUPLE: regexp raw parse-tree options dfa ; : ( string options -- regexp ) [ dup parse-regexp ] [ string>options ] bi* - 2dup construct-nfa construct-dfa + 2dup construct-nfa construct-dfa minimize regexp boa ; : ( string -- regexp ) "" ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index e06efa7f80..5d48353f56 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -33,7 +33,7 @@ TUPLE: dfa-traverser : text-finished? ( dfa-traverser -- ? ) { - [ current-state>> empty? ] + [ current-state>> not ] [ end-of-text? ] [ match-failed?>> ] } 1|| ; @@ -59,8 +59,7 @@ TUPLE: dfa-traverser 1 text-character ; : increment-state ( dfa-traverser state -- dfa-traverser ) - [ [ 1 + ] change-current-index ] - [ first ] bi* >>current-state ; + [ [ 1 + ] change-current-index ] dip >>current-state ; : match-literal ( transition from-state table -- to-state/f ) transitions>> at at ;