Regexp negation (partial) and cleanup of regexp.nfa

db4
Daniel Ehrenberg 2009-02-19 16:48:46 -06:00
parent fa84f4c752
commit 9565b59928
8 changed files with 184 additions and 123 deletions

View File

@ -16,11 +16,17 @@ C: <from-to> from-to
TUPLE: at-least n ;
C: <at-least> at-least
TUPLE: concatenation seq ;
C: <concatenation> concatenation
SINGLETON: epsilon
TUPLE: alternation seq ;
C: <alternation> alternation
TUPLE: concatenation first second ;
: <concatenation> ( seq -- concatenation )
epsilon [ concatenation boa ] reduce ;
TUPLE: alternation first second ;
: <alternation> ( seq -- alternation )
unclip [ alternation boa ] reduce ;
TUPLE: star term ;
C: <star> star

View File

@ -1,4 +1,4 @@
! Copyright (C) 2008 Doug Coleman.
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order words
ascii unicode.categories combinators.short-circuit sequences ;
@ -41,9 +41,10 @@ C: <range> range
GENERIC: class-member? ( obj class -- ? )
! When does t get put in?
M: t class-member? ( obj class -- ? ) 2drop f ;
M: integer class-member? ( obj class -- ? ) 2drop f ;
M: integer class-member? ( obj class -- ? ) = ;
M: range class-member? ( obj class -- ? )
[ from>> ] [ to>> ] bi between? ;
@ -111,3 +112,15 @@ M: beginning-of-line class-member? ( obj class -- ? )
M: end-of-line class-member? ( obj class -- ? )
2drop f ;
TUPLE: or-class seq ;
C: <or-class> or-class
TUPLE: not-class class ;
C: <not-class> not-class
M: or-class class-member?
seq>> [ class-member? ] with any? ;
M: not-class class-member?
class>> class-member? not ;

View File

@ -1,20 +1,48 @@
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: kernel sequences regexp.transition-tables fry assocs
accessors locals math sorting arrays sets hashtables regexp.dfa ;
accessors locals math sorting arrays sets hashtables regexp.dfa
combinators.short-circuit ;
IN: regexp.minimize
: number-transitions ( transitions numbering -- new-transitions )
dup '[
[ _ at ]
[ [ first _ at ] assoc-map ] bi*
] assoc-map ;
: table>state-numbers ( table -- assoc )
transitions>> keys <enum> [ swap ] H{ } assoc-map-as ;
: map-set ( assoc quot -- new-assoc )
'[ drop @ dup ] assoc-map ; inline
: rewrite-transitions ( transition-table assoc quot -- transition-table )
[
[ '[ _ at ] change-start-state ]
[ '[ [ _ at ] map-set ] change-final-states ]
[ ] tri
] dip '[ _ @ ] change-transitions ; inline
: number-states ( table -- newtable )
dup table>state-numbers
[ number-transitions ] rewrite-transitions ;
: initially-same? ( s1 s2 transition-table -- ? )
{
[ drop <= ]
[ transitions>> '[ _ at keys ] bi@ set= ]
[ final-states>> '[ _ key? ] bi@ = ]
} 3&& ;
:: initialize-partitions ( transition-table -- partitions )
! Partition table is sorted-array => ?
H{ } clone :> out
transition-table transitions>> keys :> states
states [| s1 |
states [| s2 |
s1 s2 <= [
s1 s2 [ transition-table transitions>> at keys ] bi@ set=
s1 s2 [ transition-table final-states>> key? ] bi@ = and
[ t s1 s2 2array out set-at ] when
] when
s1 s2 transition-table initially-same?
[ s1 s2 2array out conjoin ] when
] each
] each out ;
@ -29,7 +57,6 @@ IN: regexp.minimize
'[ _ same-partition? ] assoc-all? ;
: partition-more ( partitions transition-table -- partitions )
! This is horribly slow!
over '[ drop first2 _ _ stay-same? ] assoc-filter ;
: partition>classes ( partitions -- synonyms ) ! old-state => new-state
@ -40,7 +67,7 @@ IN: regexp.minimize
: state-classes ( transition-table -- synonyms )
[ initialize-partitions ] keep
'[ _ partition-more ] [ ] while-changes
'[ _ partition-more ] [ assoc-size ] while-changes
partition>classes ;
: canonical-state? ( state state-classes -- ? )
@ -52,33 +79,12 @@ IN: regexp.minimize
: rewrite-duplicates ( new-transitions state-classes -- new-transitions )
'[ [ _ at ] assoc-map ] assoc-map ;
: map-set ( assoc quot -- new-assoc )
'[ drop @ dup ] assoc-map ; inline
: combine-transitions ( transitions state-classes -- new-transitions )
[ delete-duplicates ] [ rewrite-duplicates ] bi ;
: combine-states ( table -- smaller-table )
dup state-classes
[
'[
_ [ delete-duplicates ]
[ rewrite-duplicates ] bi
] change-transitions
]
[ '[ [ _ at ] map-set ] change-final-states ]
[ '[ _ at ] change-start-state ]
tri ;
: number-transitions ( transitions numbering -- new-transitions )
[
[ at ]
[ '[ first _ at ] assoc-map ]
bi-curry bi*
] curry assoc-map ;
: number-states ( table -- newtable )
dup transitions>> keys <enum> [ swap ] H{ } assoc-map-as
[ '[ _ at ] change-start-state ]
[ '[ [ _ at ] map-set ] change-final-states ]
[ '[ _ number-transitions ] change-transitions ] tri ;
[ combine-transitions ] rewrite-transitions ;
: minimize ( table -- minimal-table )
clone number-states combine-states ;

View File

@ -0,0 +1,27 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: tools.test regexp.negation regexp.transition-tables regexp.classes ;
IN: regexp.negation.tests
[
! R/ |[^a]|.+/
T{ transition-table
{ transitions H{
{ 0 H{ { CHAR: a 1 } { T{ not-class f T{ or-class f { CHAR: a } } } -1 } } }
{ 1 H{ { T{ not-class f T{ or-class f { } } } -1 } } }
{ -1 H{ { any-char -1 } } }
} }
{ start-state 0 }
{ final-states H{ { 0 0 } { -1 -1 } } }
}
] [
! R/ a/
T{ transition-table
{ transitions H{
{ 0 H{ { CHAR: a 1 } } }
{ 1 H{ } }
} }
{ start-state 0 }
{ final-states H{ { 1 1 } } }
} negate-table
] unit-test

View File

@ -0,0 +1,36 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: regexp.nfa regexp.dfa regexp.minimize kernel sequences
assocs regexp.classes hashtables accessors ;
IN: regexp.negation
: ast>dfa ( parse-tree -- minimal-dfa )
construct-nfa construct-dfa minimize ;
CONSTANT: fail-state -1
: add-default-transition ( state's-transitions -- new-state's-transitions )
clone dup
[ [ fail-state ] dip keys <or-class> <not-class> ] keep set-at ;
: fail-state-recurses ( transitions -- new-transitions )
clone dup
[ fail-state any-char associate fail-state ] dip set-at ;
: add-fail-state ( transitions -- new-transitions )
[ add-default-transition ] assoc-map
fail-state-recurses ;
: assoc>set ( assoc -- keys-set )
[ drop dup ] assoc-map ;
: inverse-final-states ( transition-table -- final-states )
[ transitions>> assoc>set ] [ final-states>> ] bi assoc-diff ;
: negate-table ( transition-table -- transition-table )
clone
[ add-fail-state ] change-transitions
dup inverse-final-states >>final-states ;
! M: negation nfa-node ( node -- )
! ast>dfa negate-table adjoin-dfa ;

View File

@ -3,15 +3,13 @@
USING: accessors arrays assocs grouping kernel
locals math namespaces sequences fry quotations
math.order math.ranges vectors unicode.categories
regexp.transition-tables words sets
regexp.transition-tables words sets hashtables
unicode.case.private regexp.ast regexp.classes ;
! This uses unicode.case.private for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything
! before processing starts
IN: regexp.nfa
ERROR: feature-is-broken feature ;
SYMBOL: negated?
: negate ( -- )
@ -21,14 +19,13 @@ SINGLETON: eps
SYMBOL: option-stack
SYMBOL: combine-stack
SYMBOL: state
: next-state ( -- state )
state [ get ] [ inc ] bi ;
SYMBOL: nfa-table
: table ( -- table ) nfa-table get ;
: set-each ( keys value hashtable -- )
'[ _ swap _ set-at ] each ;
@ -46,84 +43,56 @@ SYMBOL: nfa-table
: option? ( obj -- ? )
option-stack get assoc-stack ;
: set-start-state ( -- nfa-table )
nfa-table get
combine-stack get pop first >>start-state ;
GENERIC: nfa-node ( node -- start-state end-state )
GENERIC: nfa-node ( node -- )
:: add-simple-entry ( obj class -- )
[let* | s0 [ next-state ]
s1 [ next-state ]
stack [ combine-stack get ]
table [ nfa-table get ] |
:: add-simple-entry ( obj class -- start-state end-state )
next-state :> s0
next-state :> s1
negated? get [
s0 f obj class make-transition table add-transition
s0 s1 <default-transition> table add-transition
] [
s0 s1 obj class make-transition table add-transition
] if
s0 s1 2array stack push
t s1 table final-states>> set-at ] ;
s0 s1 ;
:: concatenate-nodes ( -- )
[let* | stack [ combine-stack get ]
table [ nfa-table get ]
s2 [ stack peek first ]
s3 [ stack pop second ]
s0 [ stack peek first ]
s1 [ stack pop second ] |
s1 s2 eps <literal-transition> table add-transition
s1 table final-states>> delete-at
s0 s3 2array stack push ] ;
: epsilon-transition ( source target -- )
eps <literal-transition> table add-transition ;
:: alternate-nodes ( -- )
[let* | stack [ combine-stack get ]
table [ nfa-table get ]
s2 [ stack peek first ]
s3 [ stack pop second ]
s0 [ stack peek first ]
s1 [ stack pop second ]
s4 [ next-state ]
s5 [ next-state ] |
s4 s0 eps <literal-transition> table add-transition
s4 s2 eps <literal-transition> table add-transition
s1 s5 eps <literal-transition> table add-transition
s3 s5 eps <literal-transition> table add-transition
s1 table final-states>> delete-at
s3 table final-states>> delete-at
t s5 table final-states>> set-at
s4 s5 2array stack push ] ;
M:: star nfa-node ( node -- start end )
node term>> nfa-node :> s1 :> s0
next-state :> s2
next-state :> s3
s1 s0 epsilon-transition
s2 s0 epsilon-transition
s2 s3 epsilon-transition
s1 s3 epsilon-transition
s2 s3 ;
M: star nfa-node ( node -- )
term>> nfa-node
[let* | stack [ combine-stack get ]
s0 [ stack peek first ]
s1 [ stack pop second ]
s2 [ next-state ]
s3 [ next-state ]
table [ nfa-table get ] |
s1 table final-states>> delete-at
t s3 table final-states>> set-at
s1 s0 eps <literal-transition> table add-transition
s2 s0 eps <literal-transition> table add-transition
s2 s3 eps <literal-transition> table add-transition
s1 s3 eps <literal-transition> table add-transition
s2 s3 2array stack push ] ;
M: epsilon nfa-node
drop eps literal-transition add-simple-entry ;
M: concatenation nfa-node ( node -- )
seq>> [ eps literal-transition add-simple-entry ] [
reversed-regexp option? [ <reversed> ] when
[ [ nfa-node ] each ]
[ length 1- [ concatenate-nodes ] times ] bi
] if-empty ;
M: concatenation nfa-node ( node -- start end )
[ first>> ] [ second>> ] bi
reversed-regexp option? [ swap ] when
[ nfa-node ] bi@
[ epsilon-transition ] dip ;
M: alternation nfa-node ( node -- )
seq>>
[ [ nfa-node ] each ]
[ length 1- [ alternate-nodes ] times ] bi ;
:: alternate-nodes ( s0 s1 s2 s3 -- start end )
next-state :> s4
next-state :> s5
s4 s0 epsilon-transition
s4 s2 epsilon-transition
s1 s5 epsilon-transition
s3 s5 epsilon-transition
s4 s5 ;
M: integer nfa-node ( node -- )
M: alternation nfa-node ( node -- start end )
[ first>> ] [ second>> ] bi
[ nfa-node ] bi@
alternate-nodes ;
M: integer nfa-node ( node -- start end )
case-insensitive option? [
dup [ ch>lower ] [ ch>upper ] bi
2dup = [
@ -131,26 +100,26 @@ M: integer nfa-node ( node -- )
literal-transition add-simple-entry
] [
[ literal-transition add-simple-entry ] bi@
alternate-nodes drop
alternate-nodes [ nip ] dip
] if
] [
literal-transition add-simple-entry
] if ;
M: primitive-class nfa-node ( node -- )
M: primitive-class nfa-node ( node -- start end )
class>> dup
{ letter-class LETTER-class } member? case-insensitive option? and
[ drop Letter-class ] when
class-transition add-simple-entry ;
M: any-char nfa-node ( node -- )
M: any-char nfa-node ( node -- start end )
[ dotall option? ] dip any-char-no-nl ?
class-transition add-simple-entry ;
M: negation nfa-node ( node -- )
M: negation nfa-node ( node -- start end )
negate term>> nfa-node negate ;
M: range nfa-node ( node -- )
M: range nfa-node ( node -- start end )
case-insensitive option? [
! This should be implemented for Unicode by case-folding
! the input and all strings in the regexp.
@ -169,15 +138,16 @@ M: range nfa-node ( node -- )
class-transition add-simple-entry
] if ;
M: with-options nfa-node ( node -- )
M: with-options nfa-node ( node -- start end )
dup options>> [ tree>> nfa-node ] using-options ;
: construct-nfa ( ast -- nfa-table )
[
negated? off
V{ } clone combine-stack set
0 state set
<transition-table> clone nfa-table set
nfa-node
set-start-state
table
swap dup associate >>final-states
swap >>start-state
] with-scope ;

View File

@ -4,6 +4,7 @@ USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
combinators regexp.classes strings splitting peg locals accessors
regexp.ast ;
IN: regexp.parser
: allowed-char? ( ch -- ? )
".()|[*+?" member? not ;
@ -130,6 +131,7 @@ Parenthized = "?:" Alternation:a => [[ a ]]
| "?" Options:on "-"? Options:off ":" Alternation:a
=> [[ a on off parse-options <with-options> ]]
| "?#" [^)]* => [[ f ]]
| "?~" Alternation:a => [[ a <negation> ]]
| Alternation
Element = "(" Parenthized:p ")" => [[ p ]]

View File

@ -4,14 +4,15 @@ USING: accessors combinators kernel math sequences strings sets
assocs prettyprint.backend prettyprint.custom make lexer
namespaces parser arrays fry locals regexp.minimize
regexp.parser regexp.nfa regexp.dfa regexp.traversal
regexp.transition-tables splitting sorting regexp.ast ;
regexp.transition-tables splitting sorting regexp.ast
regexp.negation ;
IN: regexp
TUPLE: regexp raw parse-tree options dfa ;
: <optioned-regexp> ( string options -- regexp )
[ dup parse-regexp ] [ string>options ] bi*
2dup <with-options> construct-nfa construct-dfa minimize
2dup <with-options> ast>dfa
regexp boa ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;