diff --git a/basis/regexp/classes/classes-tests.factor b/basis/regexp/classes/classes-tests.factor index 4cbb2e7a57..5eac0ea352 100644 --- a/basis/regexp/classes/classes-tests.factor +++ b/basis/regexp/classes/classes-tests.factor @@ -23,3 +23,4 @@ IN: regexp.classes.tests [ 1 ] [ { 1 1 } ] unit-test [ T{ primitive-class { class letter-class } } ] [ letter-class dup 2array ] unit-test [ T{ primitive-class { class letter-class } } ] [ letter-class dup 2array ] unit-test +[ T{ or-class { seq { 2 3 1 } } } ] [ { 1 2 } { 2 3 } 2array ] unit-test diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 978be2c369..33652f7606 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -1,7 +1,8 @@ ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: accessors kernel math math.order words combinators locals -ascii unicode.categories combinators.short-circuit sequences ; +ascii unicode.categories combinators.short-circuit sequences +fry macros arrays ; IN: regexp.classes SINGLETONS: any-char any-char-no-nl @@ -150,6 +151,12 @@ M: not-class combine-or M: integer combine-or 2dup swap class-member? [ drop t ] [ 2drop f f ] if ; +MACRO: instance? ( class -- ? ) + "predicate" word-prop ; + +: flatten ( seq class -- newseq ) + '[ dup _ instance? [ seq>> ] [ 1array ] if ] map concat ; inline + : try-combine ( elt1 elt2 quot -- combined/f ? ) 3dup call [ [ 3drop ] dip t ] [ drop swapd call ] if ; inline @@ -160,7 +167,8 @@ M: integer combine-or [ seq elt prefix ] if* ; inline :: combine ( seq quot: ( elt1 elt2 -- combined/f ? ) empty class -- newseq ) - seq { } [ quot prefix-combining ] reduce + seq class flatten + { } [ quot prefix-combining ] reduce dup length { { 0 [ drop empty ] } { 1 [ first ] } @@ -179,12 +187,19 @@ M: and-class class-member? M: or-class class-member? seq>> [ class-member? ] with any? ; -: ( class -- inverse ) - { - { t [ f ] } - { f [ t ] } - [ dup not-class? [ class>> ] [ not-class boa ] if ] - } case ; +GENERIC: ( class -- inverse ) + +M: object + not-class boa ; + +M: not-class + class>> ; + +M: and-class + seq>> [ ] map ; + +M: or-class + seq>> [ ] map ; M: not-class class-member? class>> class-member? not ; @@ -192,4 +207,4 @@ M: not-class class-member? M: primitive-class class-member? class>> class-member? ; -UNION: class primitive-class not-class or-class range ; +UNION: class primitive-class not-class or-class and-class range ; diff --git a/basis/regexp/disambiguate/disambiguate.factor b/basis/regexp/disambiguate/disambiguate.factor index b8c03d7a3b..abfe76d832 100644 --- a/basis/regexp/disambiguate/disambiguate.factor +++ b/basis/regexp/disambiguate/disambiguate.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2009 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: kernel accessors regexp.classes math.bits assocs sequences -arrays sets regexp.dfa math fry regexp.minimize ; +arrays sets regexp.dfa math fry regexp.minimize regexp.ast ; IN: regexp.disambiguate TUPLE: parts in out ; @@ -20,22 +20,28 @@ TUPLE: parts in out ; prefix ; : get-transitions ( partition state-transitions -- next-states ) - [ in>> ] dip '[ _ at ] map prune ; + [ in>> ] dip '[ _ at ] gather sift ; -: disambiguate ( dfa -- nfa ) +: new-transitions ( transitions -- assoc ) ! assoc is class, partition + values [ keys ] gather + [ tagged-epsilon? not ] filter + powerset-partition + [ [ partition>class ] keep ] { } map>assoc + [ drop ] assoc-filter ; + +: preserving-epsilon ( state-transitions quot -- new-state-transitions ) + [ [ drop tagged-epsilon? ] assoc-filter ] bi + assoc-union H{ } assoc-like ; inline + +: disambiguate ( nfa -- nfa ) [ - [ - [ keys powerset-partition ] keep '[ - [ partition>class ] - [ _ get-transitions ] bi - ] H{ } map>assoc - [ drop ] assoc-filter + dup new-transitions '[ + [ + _ swap '[ _ get-transitions ] assoc-map + [ nip empty? not ] assoc-filter + ] preserving-epsilon ] assoc-map ] change-transitions ; -USE: sorting - : nfa>dfa ( nfa -- dfa ) - construct-dfa minimize - disambiguate - construct-dfa minimize ; + disambiguate construct-dfa minimize ; diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 55147a1d26..68f7761394 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -11,19 +11,6 @@ IN: regexp.nfa ! but case-insensitive matching should be done by case-folding everything ! before processing starts -GENERIC: remove-lookahead ( syntax-tree -- syntax-tree' ) -! This is unfinished and does nothing right now! - -M: object remove-lookahead ; - -M: with-options remove-lookahead - [ tree>> remove-lookahead ] [ options>> ] bi ; - -M: alternation remove-lookahead - [ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ alternation boa ; - -M: concatenation remove-lookahead ; - SYMBOL: option-stack SYMBOL: state @@ -148,7 +135,7 @@ M: with-options nfa-node ( node -- start end ) [ 0 state set nfa-table set - remove-lookahead nfa-node + nfa-node nfa-table get swap dup associate >>final-states swap >>start-state