Fixing bug in disambiguation in regexps

db4
Daniel Ehrenberg 2009-03-03 19:22:53 -06:00
parent a28a80abcf
commit 1740b85598
4 changed files with 46 additions and 37 deletions

View File

@ -23,3 +23,4 @@ IN: regexp.classes.tests
[ 1 ] [ { 1 1 } <or-class> ] unit-test
[ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <and-class> ] unit-test
[ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <or-class> ] unit-test
[ T{ or-class { seq { 2 3 1 } } } ] [ { 1 2 } <or-class> { 2 3 } <or-class> 2array <or-class> ] unit-test

View File

@ -1,7 +1,8 @@
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order words combinators locals
ascii unicode.categories combinators.short-circuit sequences ;
ascii unicode.categories combinators.short-circuit sequences
fry macros arrays ;
IN: regexp.classes
SINGLETONS: any-char any-char-no-nl
@ -150,6 +151,12 @@ M: not-class combine-or
M: integer combine-or
2dup swap class-member? [ drop t ] [ 2drop f f ] if ;
MACRO: instance? ( class -- ? )
"predicate" word-prop ;
: flatten ( seq class -- newseq )
'[ dup _ instance? [ seq>> ] [ 1array ] if ] map concat ; inline
: try-combine ( elt1 elt2 quot -- combined/f ? )
3dup call [ [ 3drop ] dip t ] [ drop swapd call ] if ; inline
@ -160,7 +167,8 @@ M: integer combine-or
[ seq elt prefix ] if* ; inline
:: combine ( seq quot: ( elt1 elt2 -- combined/f ? ) empty class -- newseq )
seq { } [ quot prefix-combining ] reduce
seq class flatten
{ } [ quot prefix-combining ] reduce
dup length {
{ 0 [ drop empty ] }
{ 1 [ first ] }
@ -179,12 +187,19 @@ M: and-class class-member?
M: or-class class-member?
seq>> [ class-member? ] with any? ;
: <not-class> ( class -- inverse )
{
{ t [ f ] }
{ f [ t ] }
[ dup not-class? [ class>> ] [ not-class boa ] if ]
} case ;
GENERIC: <not-class> ( class -- inverse )
M: object <not-class>
not-class boa ;
M: not-class <not-class>
class>> ;
M: and-class <not-class>
seq>> [ <not-class> ] map <or-class> ;
M: or-class <not-class>
seq>> [ <not-class> ] map <and-class> ;
M: not-class class-member?
class>> class-member? not ;
@ -192,4 +207,4 @@ M: not-class class-member?
M: primitive-class class-member?
class>> class-member? ;
UNION: class primitive-class not-class or-class range ;
UNION: class primitive-class not-class or-class and-class range ;

View File

@ -1,7 +1,7 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel accessors regexp.classes math.bits assocs sequences
arrays sets regexp.dfa math fry regexp.minimize ;
arrays sets regexp.dfa math fry regexp.minimize regexp.ast ;
IN: regexp.disambiguate
TUPLE: parts in out ;
@ -20,22 +20,28 @@ TUPLE: parts in out ;
prefix <and-class> ;
: get-transitions ( partition state-transitions -- next-states )
[ in>> ] dip '[ _ at ] map prune ;
[ in>> ] dip '[ _ at ] gather sift ;
: disambiguate ( dfa -- nfa )
: new-transitions ( transitions -- assoc ) ! assoc is class, partition
values [ keys ] gather
[ tagged-epsilon? not ] filter
powerset-partition
[ [ partition>class ] keep ] { } map>assoc
[ drop ] assoc-filter ;
: preserving-epsilon ( state-transitions quot -- new-state-transitions )
[ [ drop tagged-epsilon? ] assoc-filter ] bi
assoc-union H{ } assoc-like ; inline
: disambiguate ( nfa -- nfa )
[
[
[ keys powerset-partition ] keep '[
[ partition>class ]
[ _ get-transitions ] bi
] H{ } map>assoc
[ drop ] assoc-filter
dup new-transitions '[
[
_ swap '[ _ get-transitions ] assoc-map
[ nip empty? not ] assoc-filter
] preserving-epsilon
] assoc-map
] change-transitions ;
USE: sorting
: nfa>dfa ( nfa -- dfa )
construct-dfa minimize
disambiguate
construct-dfa minimize ;
disambiguate construct-dfa minimize ;

View File

@ -11,19 +11,6 @@ IN: regexp.nfa
! but case-insensitive matching should be done by case-folding everything
! before processing starts
GENERIC: remove-lookahead ( syntax-tree -- syntax-tree' )
! This is unfinished and does nothing right now!
M: object remove-lookahead ;
M: with-options remove-lookahead
[ tree>> remove-lookahead ] [ options>> ] bi <with-options> ;
M: alternation remove-lookahead
[ first>> ] [ second>> ] bi [ remove-lookahead ] bi@ alternation boa ;
M: concatenation remove-lookahead ;
SYMBOL: option-stack
SYMBOL: state
@ -148,7 +135,7 @@ M: with-options nfa-node ( node -- start end )
[
0 state set
<transition-table> nfa-table set
remove-lookahead nfa-node
nfa-node
nfa-table get
swap dup associate >>final-states
swap >>start-state