Regexp negation (partial) and cleanup of regexp.nfa
parent
fa84f4c752
commit
9565b59928
|
@ -16,11 +16,17 @@ C: <from-to> from-to
|
|||
TUPLE: at-least n ;
|
||||
C: <at-least> at-least
|
||||
|
||||
TUPLE: concatenation seq ;
|
||||
C: <concatenation> concatenation
|
||||
SINGLETON: epsilon
|
||||
|
||||
TUPLE: alternation seq ;
|
||||
C: <alternation> alternation
|
||||
TUPLE: concatenation first second ;
|
||||
|
||||
: <concatenation> ( seq -- concatenation )
|
||||
epsilon [ concatenation boa ] reduce ;
|
||||
|
||||
TUPLE: alternation first second ;
|
||||
|
||||
: <alternation> ( seq -- alternation )
|
||||
unclip [ alternation boa ] reduce ;
|
||||
|
||||
TUPLE: star term ;
|
||||
C: <star> star
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
! Copyright (C) 2008 Doug Coleman.
|
||||
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: accessors kernel math math.order words
|
||||
ascii unicode.categories combinators.short-circuit sequences ;
|
||||
|
@ -41,9 +41,10 @@ C: <range> range
|
|||
|
||||
GENERIC: class-member? ( obj class -- ? )
|
||||
|
||||
! When does t get put in?
|
||||
M: t class-member? ( obj class -- ? ) 2drop f ;
|
||||
|
||||
M: integer class-member? ( obj class -- ? ) 2drop f ;
|
||||
M: integer class-member? ( obj class -- ? ) = ;
|
||||
|
||||
M: range class-member? ( obj class -- ? )
|
||||
[ from>> ] [ to>> ] bi between? ;
|
||||
|
@ -111,3 +112,15 @@ M: beginning-of-line class-member? ( obj class -- ? )
|
|||
|
||||
M: end-of-line class-member? ( obj class -- ? )
|
||||
2drop f ;
|
||||
|
||||
TUPLE: or-class seq ;
|
||||
C: <or-class> or-class
|
||||
|
||||
TUPLE: not-class class ;
|
||||
C: <not-class> not-class
|
||||
|
||||
M: or-class class-member?
|
||||
seq>> [ class-member? ] with any? ;
|
||||
|
||||
M: not-class class-member?
|
||||
class>> class-member? not ;
|
||||
|
|
|
@ -1,20 +1,48 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: kernel sequences regexp.transition-tables fry assocs
|
||||
accessors locals math sorting arrays sets hashtables regexp.dfa ;
|
||||
accessors locals math sorting arrays sets hashtables regexp.dfa
|
||||
combinators.short-circuit ;
|
||||
IN: regexp.minimize
|
||||
|
||||
: number-transitions ( transitions numbering -- new-transitions )
|
||||
dup '[
|
||||
[ _ at ]
|
||||
[ [ first _ at ] assoc-map ] bi*
|
||||
] assoc-map ;
|
||||
|
||||
: table>state-numbers ( table -- assoc )
|
||||
transitions>> keys <enum> [ swap ] H{ } assoc-map-as ;
|
||||
|
||||
: map-set ( assoc quot -- new-assoc )
|
||||
'[ drop @ dup ] assoc-map ; inline
|
||||
|
||||
: rewrite-transitions ( transition-table assoc quot -- transition-table )
|
||||
[
|
||||
[ '[ _ at ] change-start-state ]
|
||||
[ '[ [ _ at ] map-set ] change-final-states ]
|
||||
[ ] tri
|
||||
] dip '[ _ @ ] change-transitions ; inline
|
||||
|
||||
: number-states ( table -- newtable )
|
||||
dup table>state-numbers
|
||||
[ number-transitions ] rewrite-transitions ;
|
||||
|
||||
: initially-same? ( s1 s2 transition-table -- ? )
|
||||
{
|
||||
[ drop <= ]
|
||||
[ transitions>> '[ _ at keys ] bi@ set= ]
|
||||
[ final-states>> '[ _ key? ] bi@ = ]
|
||||
} 3&& ;
|
||||
|
||||
:: initialize-partitions ( transition-table -- partitions )
|
||||
! Partition table is sorted-array => ?
|
||||
H{ } clone :> out
|
||||
transition-table transitions>> keys :> states
|
||||
states [| s1 |
|
||||
states [| s2 |
|
||||
s1 s2 <= [
|
||||
s1 s2 [ transition-table transitions>> at keys ] bi@ set=
|
||||
s1 s2 [ transition-table final-states>> key? ] bi@ = and
|
||||
[ t s1 s2 2array out set-at ] when
|
||||
] when
|
||||
s1 s2 transition-table initially-same?
|
||||
[ s1 s2 2array out conjoin ] when
|
||||
] each
|
||||
] each out ;
|
||||
|
||||
|
@ -29,7 +57,6 @@ IN: regexp.minimize
|
|||
'[ _ same-partition? ] assoc-all? ;
|
||||
|
||||
: partition-more ( partitions transition-table -- partitions )
|
||||
! This is horribly slow!
|
||||
over '[ drop first2 _ _ stay-same? ] assoc-filter ;
|
||||
|
||||
: partition>classes ( partitions -- synonyms ) ! old-state => new-state
|
||||
|
@ -40,7 +67,7 @@ IN: regexp.minimize
|
|||
|
||||
: state-classes ( transition-table -- synonyms )
|
||||
[ initialize-partitions ] keep
|
||||
'[ _ partition-more ] [ ] while-changes
|
||||
'[ _ partition-more ] [ assoc-size ] while-changes
|
||||
partition>classes ;
|
||||
|
||||
: canonical-state? ( state state-classes -- ? )
|
||||
|
@ -52,33 +79,12 @@ IN: regexp.minimize
|
|||
: rewrite-duplicates ( new-transitions state-classes -- new-transitions )
|
||||
'[ [ _ at ] assoc-map ] assoc-map ;
|
||||
|
||||
: map-set ( assoc quot -- new-assoc )
|
||||
'[ drop @ dup ] assoc-map ; inline
|
||||
: combine-transitions ( transitions state-classes -- new-transitions )
|
||||
[ delete-duplicates ] [ rewrite-duplicates ] bi ;
|
||||
|
||||
: combine-states ( table -- smaller-table )
|
||||
dup state-classes
|
||||
[
|
||||
'[
|
||||
_ [ delete-duplicates ]
|
||||
[ rewrite-duplicates ] bi
|
||||
] change-transitions
|
||||
]
|
||||
[ '[ [ _ at ] map-set ] change-final-states ]
|
||||
[ '[ _ at ] change-start-state ]
|
||||
tri ;
|
||||
|
||||
: number-transitions ( transitions numbering -- new-transitions )
|
||||
[
|
||||
[ at ]
|
||||
[ '[ first _ at ] assoc-map ]
|
||||
bi-curry bi*
|
||||
] curry assoc-map ;
|
||||
|
||||
: number-states ( table -- newtable )
|
||||
dup transitions>> keys <enum> [ swap ] H{ } assoc-map-as
|
||||
[ '[ _ at ] change-start-state ]
|
||||
[ '[ [ _ at ] map-set ] change-final-states ]
|
||||
[ '[ _ number-transitions ] change-transitions ] tri ;
|
||||
[ combine-transitions ] rewrite-transitions ;
|
||||
|
||||
: minimize ( table -- minimal-table )
|
||||
clone number-states combine-states ;
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: tools.test regexp.negation regexp.transition-tables regexp.classes ;
|
||||
IN: regexp.negation.tests
|
||||
|
||||
[
|
||||
! R/ |[^a]|.+/
|
||||
T{ transition-table
|
||||
{ transitions H{
|
||||
{ 0 H{ { CHAR: a 1 } { T{ not-class f T{ or-class f { CHAR: a } } } -1 } } }
|
||||
{ 1 H{ { T{ not-class f T{ or-class f { } } } -1 } } }
|
||||
{ -1 H{ { any-char -1 } } }
|
||||
} }
|
||||
{ start-state 0 }
|
||||
{ final-states H{ { 0 0 } { -1 -1 } } }
|
||||
}
|
||||
] [
|
||||
! R/ a/
|
||||
T{ transition-table
|
||||
{ transitions H{
|
||||
{ 0 H{ { CHAR: a 1 } } }
|
||||
{ 1 H{ } }
|
||||
} }
|
||||
{ start-state 0 }
|
||||
{ final-states H{ { 1 1 } } }
|
||||
} negate-table
|
||||
] unit-test
|
|
@ -0,0 +1,36 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: regexp.nfa regexp.dfa regexp.minimize kernel sequences
|
||||
assocs regexp.classes hashtables accessors ;
|
||||
IN: regexp.negation
|
||||
|
||||
: ast>dfa ( parse-tree -- minimal-dfa )
|
||||
construct-nfa construct-dfa minimize ;
|
||||
|
||||
CONSTANT: fail-state -1
|
||||
|
||||
: add-default-transition ( state's-transitions -- new-state's-transitions )
|
||||
clone dup
|
||||
[ [ fail-state ] dip keys <or-class> <not-class> ] keep set-at ;
|
||||
|
||||
: fail-state-recurses ( transitions -- new-transitions )
|
||||
clone dup
|
||||
[ fail-state any-char associate fail-state ] dip set-at ;
|
||||
|
||||
: add-fail-state ( transitions -- new-transitions )
|
||||
[ add-default-transition ] assoc-map
|
||||
fail-state-recurses ;
|
||||
|
||||
: assoc>set ( assoc -- keys-set )
|
||||
[ drop dup ] assoc-map ;
|
||||
|
||||
: inverse-final-states ( transition-table -- final-states )
|
||||
[ transitions>> assoc>set ] [ final-states>> ] bi assoc-diff ;
|
||||
|
||||
: negate-table ( transition-table -- transition-table )
|
||||
clone
|
||||
[ add-fail-state ] change-transitions
|
||||
dup inverse-final-states >>final-states ;
|
||||
|
||||
! M: negation nfa-node ( node -- )
|
||||
! ast>dfa negate-table adjoin-dfa ;
|
|
@ -3,15 +3,13 @@
|
|||
USING: accessors arrays assocs grouping kernel
|
||||
locals math namespaces sequences fry quotations
|
||||
math.order math.ranges vectors unicode.categories
|
||||
regexp.transition-tables words sets
|
||||
regexp.transition-tables words sets hashtables
|
||||
unicode.case.private regexp.ast regexp.classes ;
|
||||
! This uses unicode.case.private for ch>upper and ch>lower
|
||||
! but case-insensitive matching should be done by case-folding everything
|
||||
! before processing starts
|
||||
IN: regexp.nfa
|
||||
|
||||
ERROR: feature-is-broken feature ;
|
||||
|
||||
SYMBOL: negated?
|
||||
|
||||
: negate ( -- )
|
||||
|
@ -21,14 +19,13 @@ SINGLETON: eps
|
|||
|
||||
SYMBOL: option-stack
|
||||
|
||||
SYMBOL: combine-stack
|
||||
|
||||
SYMBOL: state
|
||||
|
||||
: next-state ( -- state )
|
||||
state [ get ] [ inc ] bi ;
|
||||
|
||||
SYMBOL: nfa-table
|
||||
: table ( -- table ) nfa-table get ;
|
||||
|
||||
: set-each ( keys value hashtable -- )
|
||||
'[ _ swap _ set-at ] each ;
|
||||
|
@ -46,84 +43,56 @@ SYMBOL: nfa-table
|
|||
: option? ( obj -- ? )
|
||||
option-stack get assoc-stack ;
|
||||
|
||||
: set-start-state ( -- nfa-table )
|
||||
nfa-table get
|
||||
combine-stack get pop first >>start-state ;
|
||||
GENERIC: nfa-node ( node -- start-state end-state )
|
||||
|
||||
GENERIC: nfa-node ( node -- )
|
||||
|
||||
:: add-simple-entry ( obj class -- )
|
||||
[let* | s0 [ next-state ]
|
||||
s1 [ next-state ]
|
||||
stack [ combine-stack get ]
|
||||
table [ nfa-table get ] |
|
||||
:: add-simple-entry ( obj class -- start-state end-state )
|
||||
next-state :> s0
|
||||
next-state :> s1
|
||||
negated? get [
|
||||
s0 f obj class make-transition table add-transition
|
||||
s0 s1 <default-transition> table add-transition
|
||||
] [
|
||||
s0 s1 obj class make-transition table add-transition
|
||||
] if
|
||||
s0 s1 2array stack push
|
||||
t s1 table final-states>> set-at ] ;
|
||||
s0 s1 ;
|
||||
|
||||
:: concatenate-nodes ( -- )
|
||||
[let* | stack [ combine-stack get ]
|
||||
table [ nfa-table get ]
|
||||
s2 [ stack peek first ]
|
||||
s3 [ stack pop second ]
|
||||
s0 [ stack peek first ]
|
||||
s1 [ stack pop second ] |
|
||||
s1 s2 eps <literal-transition> table add-transition
|
||||
s1 table final-states>> delete-at
|
||||
s0 s3 2array stack push ] ;
|
||||
: epsilon-transition ( source target -- )
|
||||
eps <literal-transition> table add-transition ;
|
||||
|
||||
:: alternate-nodes ( -- )
|
||||
[let* | stack [ combine-stack get ]
|
||||
table [ nfa-table get ]
|
||||
s2 [ stack peek first ]
|
||||
s3 [ stack pop second ]
|
||||
s0 [ stack peek first ]
|
||||
s1 [ stack pop second ]
|
||||
s4 [ next-state ]
|
||||
s5 [ next-state ] |
|
||||
s4 s0 eps <literal-transition> table add-transition
|
||||
s4 s2 eps <literal-transition> table add-transition
|
||||
s1 s5 eps <literal-transition> table add-transition
|
||||
s3 s5 eps <literal-transition> table add-transition
|
||||
s1 table final-states>> delete-at
|
||||
s3 table final-states>> delete-at
|
||||
t s5 table final-states>> set-at
|
||||
s4 s5 2array stack push ] ;
|
||||
M:: star nfa-node ( node -- start end )
|
||||
node term>> nfa-node :> s1 :> s0
|
||||
next-state :> s2
|
||||
next-state :> s3
|
||||
s1 s0 epsilon-transition
|
||||
s2 s0 epsilon-transition
|
||||
s2 s3 epsilon-transition
|
||||
s1 s3 epsilon-transition
|
||||
s2 s3 ;
|
||||
|
||||
M: star nfa-node ( node -- )
|
||||
term>> nfa-node
|
||||
[let* | stack [ combine-stack get ]
|
||||
s0 [ stack peek first ]
|
||||
s1 [ stack pop second ]
|
||||
s2 [ next-state ]
|
||||
s3 [ next-state ]
|
||||
table [ nfa-table get ] |
|
||||
s1 table final-states>> delete-at
|
||||
t s3 table final-states>> set-at
|
||||
s1 s0 eps <literal-transition> table add-transition
|
||||
s2 s0 eps <literal-transition> table add-transition
|
||||
s2 s3 eps <literal-transition> table add-transition
|
||||
s1 s3 eps <literal-transition> table add-transition
|
||||
s2 s3 2array stack push ] ;
|
||||
M: epsilon nfa-node
|
||||
drop eps literal-transition add-simple-entry ;
|
||||
|
||||
M: concatenation nfa-node ( node -- )
|
||||
seq>> [ eps literal-transition add-simple-entry ] [
|
||||
reversed-regexp option? [ <reversed> ] when
|
||||
[ [ nfa-node ] each ]
|
||||
[ length 1- [ concatenate-nodes ] times ] bi
|
||||
] if-empty ;
|
||||
M: concatenation nfa-node ( node -- start end )
|
||||
[ first>> ] [ second>> ] bi
|
||||
reversed-regexp option? [ swap ] when
|
||||
[ nfa-node ] bi@
|
||||
[ epsilon-transition ] dip ;
|
||||
|
||||
M: alternation nfa-node ( node -- )
|
||||
seq>>
|
||||
[ [ nfa-node ] each ]
|
||||
[ length 1- [ alternate-nodes ] times ] bi ;
|
||||
:: alternate-nodes ( s0 s1 s2 s3 -- start end )
|
||||
next-state :> s4
|
||||
next-state :> s5
|
||||
s4 s0 epsilon-transition
|
||||
s4 s2 epsilon-transition
|
||||
s1 s5 epsilon-transition
|
||||
s3 s5 epsilon-transition
|
||||
s4 s5 ;
|
||||
|
||||
M: integer nfa-node ( node -- )
|
||||
M: alternation nfa-node ( node -- start end )
|
||||
[ first>> ] [ second>> ] bi
|
||||
[ nfa-node ] bi@
|
||||
alternate-nodes ;
|
||||
|
||||
M: integer nfa-node ( node -- start end )
|
||||
case-insensitive option? [
|
||||
dup [ ch>lower ] [ ch>upper ] bi
|
||||
2dup = [
|
||||
|
@ -131,26 +100,26 @@ M: integer nfa-node ( node -- )
|
|||
literal-transition add-simple-entry
|
||||
] [
|
||||
[ literal-transition add-simple-entry ] bi@
|
||||
alternate-nodes drop
|
||||
alternate-nodes [ nip ] dip
|
||||
] if
|
||||
] [
|
||||
literal-transition add-simple-entry
|
||||
] if ;
|
||||
|
||||
M: primitive-class nfa-node ( node -- )
|
||||
M: primitive-class nfa-node ( node -- start end )
|
||||
class>> dup
|
||||
{ letter-class LETTER-class } member? case-insensitive option? and
|
||||
[ drop Letter-class ] when
|
||||
class-transition add-simple-entry ;
|
||||
|
||||
M: any-char nfa-node ( node -- )
|
||||
M: any-char nfa-node ( node -- start end )
|
||||
[ dotall option? ] dip any-char-no-nl ?
|
||||
class-transition add-simple-entry ;
|
||||
|
||||
M: negation nfa-node ( node -- )
|
||||
M: negation nfa-node ( node -- start end )
|
||||
negate term>> nfa-node negate ;
|
||||
|
||||
M: range nfa-node ( node -- )
|
||||
M: range nfa-node ( node -- start end )
|
||||
case-insensitive option? [
|
||||
! This should be implemented for Unicode by case-folding
|
||||
! the input and all strings in the regexp.
|
||||
|
@ -169,15 +138,16 @@ M: range nfa-node ( node -- )
|
|||
class-transition add-simple-entry
|
||||
] if ;
|
||||
|
||||
M: with-options nfa-node ( node -- )
|
||||
M: with-options nfa-node ( node -- start end )
|
||||
dup options>> [ tree>> nfa-node ] using-options ;
|
||||
|
||||
: construct-nfa ( ast -- nfa-table )
|
||||
[
|
||||
negated? off
|
||||
V{ } clone combine-stack set
|
||||
0 state set
|
||||
<transition-table> clone nfa-table set
|
||||
nfa-node
|
||||
set-start-state
|
||||
table
|
||||
swap dup associate >>final-states
|
||||
swap >>start-state
|
||||
] with-scope ;
|
||||
|
|
|
@ -4,6 +4,7 @@ USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
|
|||
combinators regexp.classes strings splitting peg locals accessors
|
||||
regexp.ast ;
|
||||
IN: regexp.parser
|
||||
|
||||
: allowed-char? ( ch -- ? )
|
||||
".()|[*+?" member? not ;
|
||||
|
||||
|
@ -130,6 +131,7 @@ Parenthized = "?:" Alternation:a => [[ a ]]
|
|||
| "?" Options:on "-"? Options:off ":" Alternation:a
|
||||
=> [[ a on off parse-options <with-options> ]]
|
||||
| "?#" [^)]* => [[ f ]]
|
||||
| "?~" Alternation:a => [[ a <negation> ]]
|
||||
| Alternation
|
||||
|
||||
Element = "(" Parenthized:p ")" => [[ p ]]
|
||||
|
|
|
@ -4,14 +4,15 @@ USING: accessors combinators kernel math sequences strings sets
|
|||
assocs prettyprint.backend prettyprint.custom make lexer
|
||||
namespaces parser arrays fry locals regexp.minimize
|
||||
regexp.parser regexp.nfa regexp.dfa regexp.traversal
|
||||
regexp.transition-tables splitting sorting regexp.ast ;
|
||||
regexp.transition-tables splitting sorting regexp.ast
|
||||
regexp.negation ;
|
||||
IN: regexp
|
||||
|
||||
TUPLE: regexp raw parse-tree options dfa ;
|
||||
|
||||
: <optioned-regexp> ( string options -- regexp )
|
||||
[ dup parse-regexp ] [ string>options ] bi*
|
||||
2dup <with-options> construct-nfa construct-dfa minimize
|
||||
2dup <with-options> ast>dfa
|
||||
regexp boa ;
|
||||
|
||||
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
|
||||
|
|
Loading…
Reference in New Issue