Regexp negation (partial) and cleanup of regexp.nfa

db4
Daniel Ehrenberg 2009-02-19 16:48:46 -06:00
parent fa84f4c752
commit 9565b59928
8 changed files with 184 additions and 123 deletions

View File

@ -16,11 +16,17 @@ C: <from-to> from-to
TUPLE: at-least n ; TUPLE: at-least n ;
C: <at-least> at-least C: <at-least> at-least
TUPLE: concatenation seq ; SINGLETON: epsilon
C: <concatenation> concatenation
TUPLE: alternation seq ; TUPLE: concatenation first second ;
C: <alternation> alternation
: <concatenation> ( seq -- concatenation )
epsilon [ concatenation boa ] reduce ;
TUPLE: alternation first second ;
: <alternation> ( seq -- alternation )
unclip [ alternation boa ] reduce ;
TUPLE: star term ; TUPLE: star term ;
C: <star> star C: <star> star

View File

@ -1,4 +1,4 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order words USING: accessors kernel math math.order words
ascii unicode.categories combinators.short-circuit sequences ; ascii unicode.categories combinators.short-circuit sequences ;
@ -41,9 +41,10 @@ C: <range> range
GENERIC: class-member? ( obj class -- ? ) GENERIC: class-member? ( obj class -- ? )
! When does t get put in?
M: t class-member? ( obj class -- ? ) 2drop f ; M: t class-member? ( obj class -- ? ) 2drop f ;
M: integer class-member? ( obj class -- ? ) 2drop f ; M: integer class-member? ( obj class -- ? ) = ;
M: range class-member? ( obj class -- ? ) M: range class-member? ( obj class -- ? )
[ from>> ] [ to>> ] bi between? ; [ from>> ] [ to>> ] bi between? ;
@ -111,3 +112,15 @@ M: beginning-of-line class-member? ( obj class -- ? )
M: end-of-line class-member? ( obj class -- ? ) M: end-of-line class-member? ( obj class -- ? )
2drop f ; 2drop f ;
TUPLE: or-class seq ;
C: <or-class> or-class
TUPLE: not-class class ;
C: <not-class> not-class
M: or-class class-member?
seq>> [ class-member? ] with any? ;
M: not-class class-member?
class>> class-member? not ;

View File

@ -1,20 +1,48 @@
! Copyright (C) 2009 Daniel Ehrenberg ! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: kernel sequences regexp.transition-tables fry assocs USING: kernel sequences regexp.transition-tables fry assocs
accessors locals math sorting arrays sets hashtables regexp.dfa ; accessors locals math sorting arrays sets hashtables regexp.dfa
combinators.short-circuit ;
IN: regexp.minimize IN: regexp.minimize
: number-transitions ( transitions numbering -- new-transitions )
dup '[
[ _ at ]
[ [ first _ at ] assoc-map ] bi*
] assoc-map ;
: table>state-numbers ( table -- assoc )
transitions>> keys <enum> [ swap ] H{ } assoc-map-as ;
: map-set ( assoc quot -- new-assoc )
'[ drop @ dup ] assoc-map ; inline
: rewrite-transitions ( transition-table assoc quot -- transition-table )
[
[ '[ _ at ] change-start-state ]
[ '[ [ _ at ] map-set ] change-final-states ]
[ ] tri
] dip '[ _ @ ] change-transitions ; inline
: number-states ( table -- newtable )
dup table>state-numbers
[ number-transitions ] rewrite-transitions ;
: initially-same? ( s1 s2 transition-table -- ? )
{
[ drop <= ]
[ transitions>> '[ _ at keys ] bi@ set= ]
[ final-states>> '[ _ key? ] bi@ = ]
} 3&& ;
:: initialize-partitions ( transition-table -- partitions ) :: initialize-partitions ( transition-table -- partitions )
! Partition table is sorted-array => ? ! Partition table is sorted-array => ?
H{ } clone :> out H{ } clone :> out
transition-table transitions>> keys :> states transition-table transitions>> keys :> states
states [| s1 | states [| s1 |
states [| s2 | states [| s2 |
s1 s2 <= [ s1 s2 transition-table initially-same?
s1 s2 [ transition-table transitions>> at keys ] bi@ set= [ s1 s2 2array out conjoin ] when
s1 s2 [ transition-table final-states>> key? ] bi@ = and
[ t s1 s2 2array out set-at ] when
] when
] each ] each
] each out ; ] each out ;
@ -29,7 +57,6 @@ IN: regexp.minimize
'[ _ same-partition? ] assoc-all? ; '[ _ same-partition? ] assoc-all? ;
: partition-more ( partitions transition-table -- partitions ) : partition-more ( partitions transition-table -- partitions )
! This is horribly slow!
over '[ drop first2 _ _ stay-same? ] assoc-filter ; over '[ drop first2 _ _ stay-same? ] assoc-filter ;
: partition>classes ( partitions -- synonyms ) ! old-state => new-state : partition>classes ( partitions -- synonyms ) ! old-state => new-state
@ -40,7 +67,7 @@ IN: regexp.minimize
: state-classes ( transition-table -- synonyms ) : state-classes ( transition-table -- synonyms )
[ initialize-partitions ] keep [ initialize-partitions ] keep
'[ _ partition-more ] [ ] while-changes '[ _ partition-more ] [ assoc-size ] while-changes
partition>classes ; partition>classes ;
: canonical-state? ( state state-classes -- ? ) : canonical-state? ( state state-classes -- ? )
@ -52,33 +79,12 @@ IN: regexp.minimize
: rewrite-duplicates ( new-transitions state-classes -- new-transitions ) : rewrite-duplicates ( new-transitions state-classes -- new-transitions )
'[ [ _ at ] assoc-map ] assoc-map ; '[ [ _ at ] assoc-map ] assoc-map ;
: map-set ( assoc quot -- new-assoc ) : combine-transitions ( transitions state-classes -- new-transitions )
'[ drop @ dup ] assoc-map ; inline [ delete-duplicates ] [ rewrite-duplicates ] bi ;
: combine-states ( table -- smaller-table ) : combine-states ( table -- smaller-table )
dup state-classes dup state-classes
[ [ combine-transitions ] rewrite-transitions ;
'[
_ [ delete-duplicates ]
[ rewrite-duplicates ] bi
] change-transitions
]
[ '[ [ _ at ] map-set ] change-final-states ]
[ '[ _ at ] change-start-state ]
tri ;
: number-transitions ( transitions numbering -- new-transitions )
[
[ at ]
[ '[ first _ at ] assoc-map ]
bi-curry bi*
] curry assoc-map ;
: number-states ( table -- newtable )
dup transitions>> keys <enum> [ swap ] H{ } assoc-map-as
[ '[ _ at ] change-start-state ]
[ '[ [ _ at ] map-set ] change-final-states ]
[ '[ _ number-transitions ] change-transitions ] tri ;
: minimize ( table -- minimal-table ) : minimize ( table -- minimal-table )
clone number-states combine-states ; clone number-states combine-states ;

View File

@ -0,0 +1,27 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: tools.test regexp.negation regexp.transition-tables regexp.classes ;
IN: regexp.negation.tests
[
! R/ |[^a]|.+/
T{ transition-table
{ transitions H{
{ 0 H{ { CHAR: a 1 } { T{ not-class f T{ or-class f { CHAR: a } } } -1 } } }
{ 1 H{ { T{ not-class f T{ or-class f { } } } -1 } } }
{ -1 H{ { any-char -1 } } }
} }
{ start-state 0 }
{ final-states H{ { 0 0 } { -1 -1 } } }
}
] [
! R/ a/
T{ transition-table
{ transitions H{
{ 0 H{ { CHAR: a 1 } } }
{ 1 H{ } }
} }
{ start-state 0 }
{ final-states H{ { 1 1 } } }
} negate-table
] unit-test

View File

@ -0,0 +1,36 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: regexp.nfa regexp.dfa regexp.minimize kernel sequences
assocs regexp.classes hashtables accessors ;
IN: regexp.negation
: ast>dfa ( parse-tree -- minimal-dfa )
construct-nfa construct-dfa minimize ;
CONSTANT: fail-state -1
: add-default-transition ( state's-transitions -- new-state's-transitions )
clone dup
[ [ fail-state ] dip keys <or-class> <not-class> ] keep set-at ;
: fail-state-recurses ( transitions -- new-transitions )
clone dup
[ fail-state any-char associate fail-state ] dip set-at ;
: add-fail-state ( transitions -- new-transitions )
[ add-default-transition ] assoc-map
fail-state-recurses ;
: assoc>set ( assoc -- keys-set )
[ drop dup ] assoc-map ;
: inverse-final-states ( transition-table -- final-states )
[ transitions>> assoc>set ] [ final-states>> ] bi assoc-diff ;
: negate-table ( transition-table -- transition-table )
clone
[ add-fail-state ] change-transitions
dup inverse-final-states >>final-states ;
! M: negation nfa-node ( node -- )
! ast>dfa negate-table adjoin-dfa ;

View File

@ -3,15 +3,13 @@
USING: accessors arrays assocs grouping kernel USING: accessors arrays assocs grouping kernel
locals math namespaces sequences fry quotations locals math namespaces sequences fry quotations
math.order math.ranges vectors unicode.categories math.order math.ranges vectors unicode.categories
regexp.transition-tables words sets regexp.transition-tables words sets hashtables
unicode.case.private regexp.ast regexp.classes ; unicode.case.private regexp.ast regexp.classes ;
! This uses unicode.case.private for ch>upper and ch>lower ! This uses unicode.case.private for ch>upper and ch>lower
! but case-insensitive matching should be done by case-folding everything ! but case-insensitive matching should be done by case-folding everything
! before processing starts ! before processing starts
IN: regexp.nfa IN: regexp.nfa
ERROR: feature-is-broken feature ;
SYMBOL: negated? SYMBOL: negated?
: negate ( -- ) : negate ( -- )
@ -21,14 +19,13 @@ SINGLETON: eps
SYMBOL: option-stack SYMBOL: option-stack
SYMBOL: combine-stack
SYMBOL: state SYMBOL: state
: next-state ( -- state ) : next-state ( -- state )
state [ get ] [ inc ] bi ; state [ get ] [ inc ] bi ;
SYMBOL: nfa-table SYMBOL: nfa-table
: table ( -- table ) nfa-table get ;
: set-each ( keys value hashtable -- ) : set-each ( keys value hashtable -- )
'[ _ swap _ set-at ] each ; '[ _ swap _ set-at ] each ;
@ -46,84 +43,56 @@ SYMBOL: nfa-table
: option? ( obj -- ? ) : option? ( obj -- ? )
option-stack get assoc-stack ; option-stack get assoc-stack ;
: set-start-state ( -- nfa-table ) GENERIC: nfa-node ( node -- start-state end-state )
nfa-table get
combine-stack get pop first >>start-state ;
GENERIC: nfa-node ( node -- ) :: add-simple-entry ( obj class -- start-state end-state )
next-state :> s0
:: add-simple-entry ( obj class -- ) next-state :> s1
[let* | s0 [ next-state ]
s1 [ next-state ]
stack [ combine-stack get ]
table [ nfa-table get ] |
negated? get [ negated? get [
s0 f obj class make-transition table add-transition s0 f obj class make-transition table add-transition
s0 s1 <default-transition> table add-transition s0 s1 <default-transition> table add-transition
] [ ] [
s0 s1 obj class make-transition table add-transition s0 s1 obj class make-transition table add-transition
] if ] if
s0 s1 2array stack push s0 s1 ;
t s1 table final-states>> set-at ] ;
:: concatenate-nodes ( -- ) : epsilon-transition ( source target -- )
[let* | stack [ combine-stack get ] eps <literal-transition> table add-transition ;
table [ nfa-table get ]
s2 [ stack peek first ]
s3 [ stack pop second ]
s0 [ stack peek first ]
s1 [ stack pop second ] |
s1 s2 eps <literal-transition> table add-transition
s1 table final-states>> delete-at
s0 s3 2array stack push ] ;
:: alternate-nodes ( -- ) M:: star nfa-node ( node -- start end )
[let* | stack [ combine-stack get ] node term>> nfa-node :> s1 :> s0
table [ nfa-table get ] next-state :> s2
s2 [ stack peek first ] next-state :> s3
s3 [ stack pop second ] s1 s0 epsilon-transition
s0 [ stack peek first ] s2 s0 epsilon-transition
s1 [ stack pop second ] s2 s3 epsilon-transition
s4 [ next-state ] s1 s3 epsilon-transition
s5 [ next-state ] | s2 s3 ;
s4 s0 eps <literal-transition> table add-transition
s4 s2 eps <literal-transition> table add-transition
s1 s5 eps <literal-transition> table add-transition
s3 s5 eps <literal-transition> table add-transition
s1 table final-states>> delete-at
s3 table final-states>> delete-at
t s5 table final-states>> set-at
s4 s5 2array stack push ] ;
M: star nfa-node ( node -- ) M: epsilon nfa-node
term>> nfa-node drop eps literal-transition add-simple-entry ;
[let* | stack [ combine-stack get ]
s0 [ stack peek first ]
s1 [ stack pop second ]
s2 [ next-state ]
s3 [ next-state ]
table [ nfa-table get ] |
s1 table final-states>> delete-at
t s3 table final-states>> set-at
s1 s0 eps <literal-transition> table add-transition
s2 s0 eps <literal-transition> table add-transition
s2 s3 eps <literal-transition> table add-transition
s1 s3 eps <literal-transition> table add-transition
s2 s3 2array stack push ] ;
M: concatenation nfa-node ( node -- ) M: concatenation nfa-node ( node -- start end )
seq>> [ eps literal-transition add-simple-entry ] [ [ first>> ] [ second>> ] bi
reversed-regexp option? [ <reversed> ] when reversed-regexp option? [ swap ] when
[ [ nfa-node ] each ] [ nfa-node ] bi@
[ length 1- [ concatenate-nodes ] times ] bi [ epsilon-transition ] dip ;
] if-empty ;
M: alternation nfa-node ( node -- ) :: alternate-nodes ( s0 s1 s2 s3 -- start end )
seq>> next-state :> s4
[ [ nfa-node ] each ] next-state :> s5
[ length 1- [ alternate-nodes ] times ] bi ; s4 s0 epsilon-transition
s4 s2 epsilon-transition
s1 s5 epsilon-transition
s3 s5 epsilon-transition
s4 s5 ;
M: integer nfa-node ( node -- ) M: alternation nfa-node ( node -- start end )
[ first>> ] [ second>> ] bi
[ nfa-node ] bi@
alternate-nodes ;
M: integer nfa-node ( node -- start end )
case-insensitive option? [ case-insensitive option? [
dup [ ch>lower ] [ ch>upper ] bi dup [ ch>lower ] [ ch>upper ] bi
2dup = [ 2dup = [
@ -131,26 +100,26 @@ M: integer nfa-node ( node -- )
literal-transition add-simple-entry literal-transition add-simple-entry
] [ ] [
[ literal-transition add-simple-entry ] bi@ [ literal-transition add-simple-entry ] bi@
alternate-nodes drop alternate-nodes [ nip ] dip
] if ] if
] [ ] [
literal-transition add-simple-entry literal-transition add-simple-entry
] if ; ] if ;
M: primitive-class nfa-node ( node -- ) M: primitive-class nfa-node ( node -- start end )
class>> dup class>> dup
{ letter-class LETTER-class } member? case-insensitive option? and { letter-class LETTER-class } member? case-insensitive option? and
[ drop Letter-class ] when [ drop Letter-class ] when
class-transition add-simple-entry ; class-transition add-simple-entry ;
M: any-char nfa-node ( node -- ) M: any-char nfa-node ( node -- start end )
[ dotall option? ] dip any-char-no-nl ? [ dotall option? ] dip any-char-no-nl ?
class-transition add-simple-entry ; class-transition add-simple-entry ;
M: negation nfa-node ( node -- ) M: negation nfa-node ( node -- start end )
negate term>> nfa-node negate ; negate term>> nfa-node negate ;
M: range nfa-node ( node -- ) M: range nfa-node ( node -- start end )
case-insensitive option? [ case-insensitive option? [
! This should be implemented for Unicode by case-folding ! This should be implemented for Unicode by case-folding
! the input and all strings in the regexp. ! the input and all strings in the regexp.
@ -169,15 +138,16 @@ M: range nfa-node ( node -- )
class-transition add-simple-entry class-transition add-simple-entry
] if ; ] if ;
M: with-options nfa-node ( node -- ) M: with-options nfa-node ( node -- start end )
dup options>> [ tree>> nfa-node ] using-options ; dup options>> [ tree>> nfa-node ] using-options ;
: construct-nfa ( ast -- nfa-table ) : construct-nfa ( ast -- nfa-table )
[ [
negated? off negated? off
V{ } clone combine-stack set
0 state set 0 state set
<transition-table> clone nfa-table set <transition-table> clone nfa-table set
nfa-node nfa-node
set-start-state table
swap dup associate >>final-states
swap >>start-state
] with-scope ; ] with-scope ;

View File

@ -4,6 +4,7 @@ USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
combinators regexp.classes strings splitting peg locals accessors combinators regexp.classes strings splitting peg locals accessors
regexp.ast ; regexp.ast ;
IN: regexp.parser IN: regexp.parser
: allowed-char? ( ch -- ? ) : allowed-char? ( ch -- ? )
".()|[*+?" member? not ; ".()|[*+?" member? not ;
@ -130,6 +131,7 @@ Parenthized = "?:" Alternation:a => [[ a ]]
| "?" Options:on "-"? Options:off ":" Alternation:a | "?" Options:on "-"? Options:off ":" Alternation:a
=> [[ a on off parse-options <with-options> ]] => [[ a on off parse-options <with-options> ]]
| "?#" [^)]* => [[ f ]] | "?#" [^)]* => [[ f ]]
| "?~" Alternation:a => [[ a <negation> ]]
| Alternation | Alternation
Element = "(" Parenthized:p ")" => [[ p ]] Element = "(" Parenthized:p ")" => [[ p ]]

View File

@ -4,14 +4,15 @@ USING: accessors combinators kernel math sequences strings sets
assocs prettyprint.backend prettyprint.custom make lexer assocs prettyprint.backend prettyprint.custom make lexer
namespaces parser arrays fry locals regexp.minimize namespaces parser arrays fry locals regexp.minimize
regexp.parser regexp.nfa regexp.dfa regexp.traversal regexp.parser regexp.nfa regexp.dfa regexp.traversal
regexp.transition-tables splitting sorting regexp.ast ; regexp.transition-tables splitting sorting regexp.ast
regexp.negation ;
IN: regexp IN: regexp
TUPLE: regexp raw parse-tree options dfa ; TUPLE: regexp raw parse-tree options dfa ;
: <optioned-regexp> ( string options -- regexp ) : <optioned-regexp> ( string options -- regexp )
[ dup parse-regexp ] [ string>options ] bi* [ dup parse-regexp ] [ string>options ] bi*
2dup <with-options> construct-nfa construct-dfa minimize 2dup <with-options> ast>dfa
regexp boa ; regexp boa ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ; : <regexp> ( string -- regexp ) "" <optioned-regexp> ;