More regexp changes

db4
Daniel Ehrenberg 2009-03-07 16:31:46 -06:00
parent 81b68eac48
commit 42ff154ead
9 changed files with 63 additions and 49 deletions

View File

@ -58,15 +58,8 @@ M: from-to <times>
: char-class ( ranges ? -- term )
[ <or-class> ] dip [ <not-class> ] when ;
TUPLE: lookahead term ;
TUPLE: lookahead term positive? ;
C: <lookahead> lookahead
TUPLE: lookbehind term ;
TUPLE: lookbehind term positive? ;
C: <lookbehind> lookbehind
TUPLE: possessive-star term ;
C: <possessive-star> possessive-star
: <possessive-plus> ( term -- term' )
dup <possessive-star> 2array <concatenation> ;

View File

@ -239,6 +239,9 @@ M: not-class replace-question
'[ _ _ replace-question ] assoc-map
[ nip ] assoc-filter ;
: answers ( table questions answer -- new-table )
'[ _ answer ] each ;
DEFER: make-condition
: (make-condition) ( table questions question -- condition )

View File

@ -36,21 +36,17 @@ M: $ question>quot
M: ^ question>quot
drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ;
! Maybe the condition>quot things can be combined, given a suitable method
! for question>quot on classes, but maybe that'd make stack shuffling annoying
: execution-quot ( next-state -- quot )
: (execution-quot) ( next-state -- quot )
! The conditions here are for lookaround and anchors, etc
dup condition? [
[ question>> question>quot ] [ yes>> ] [ no>> ] tri
[ execution-quot ] bi@
[ (execution-quot) ] bi@
'[ 2dup @ _ _ if ]
] [
! There shouldn't be a condition like this!
dup sequence?
[ [ [ 2drop ] ] [ first '[ _ execute ] ] if-empty ]
[ '[ _ execute ] ] if
] if ;
] [ '[ _ execute ] ] if ;
: execution-quot ( next-state -- quot )
dup sequence? [ first ] when
(execution-quot) ;
TUPLE: box contents ;
C: <box> box
@ -66,8 +62,9 @@ C: <box> box
[ [ 3drop ] ] [ execution-quot '[ drop @ ] ] if-empty
] if ;
: non-literals>dispatch ( non-literal-transitions -- quot )
: non-literals>dispatch ( literals non-literals -- quot )
[ swap ] assoc-map ! we want state => predicate, and get the opposite as input
swap keys f answers
table>condition [ <box> ] condition-map condition>quot ;
: literals>cases ( literal-transitions -- case-body )
@ -84,7 +81,7 @@ C: <box> box
: split-literals ( transitions -- case default )
>alist expand-or [ first integer? ] partition
[ literals>cases ] [ non-literals>dispatch ] bi* ;
[ [ literals>cases ] keep ] dip non-literals>dispatch ;
:: step ( last-match index str quot final? direction -- last-index/f )
final? index last-match ?

View File

@ -1,7 +1,8 @@
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: tools.test regexp.minimize assocs regexp
accessors regexp.transition-tables regexp.parser regexp.negation ;
accessors regexp.transition-tables regexp.parser
regexp.classes regexp.negation ;
IN: regexp.minimize.tests
[ t ] [ 1 2 H{ { { 1 2 } t } } same-partition? ] unit-test
@ -52,3 +53,6 @@ IN: regexp.minimize.tests
] unit-test
[ [ ] [ ] while-changes ] must-infer
[ H{ { T{ or-class f { 1 2 } } 3 } { 4 5 } } ]
[ H{ { 1 3 } { 2 3 } { 4 5 } } combine-state-transitions ] unit-test

View File

@ -11,8 +11,8 @@ IN: regexp.minimize
: number-states ( table -- newtable )
dup table>state-numbers transitions-at ;
: no-conditions? ( state transition-table -- ? )
transitions>> at values [ condition? ] any? not ;
: has-conditions? ( state transitions -- ? )
at values [ condition? ] any? ;
: initially-same? ( s1 s2 transition-table -- ? )
{
@ -25,7 +25,8 @@ IN: regexp.minimize
! Partition table is sorted-array => ?
H{ } clone :> out
transition-table transitions>> keys
[ transition-table no-conditions? ] filter :> states
[ transition-table transitions>> has-conditions? ] partition :> states
[ dup 2array out conjoin ] each
states [| s1 |
states [| s2 |
s1 s2 transition-table initially-same?
@ -68,16 +69,27 @@ IN: regexp.minimize
'[ _ partition-more ] [ assoc-size ] while-changes
partition>classes ;
: canonical-state? ( state state-classes -- ? )
dupd at = ;
: canonical-state? ( state transitions state-classes -- ? )
'[ dup _ at = ] swap '[ _ has-conditions? ] bi or ;
: delete-duplicates ( transitions state-classes -- new-transitions )
'[ drop _ canonical-state? ] assoc-filter ;
dupd '[ drop _ _ canonical-state? ] assoc-filter ;
: combine-states ( table -- smaller-table )
dup state-classes
[ transitions-at ] keep
'[ _ delete-duplicates ] change-transitions ;
: combine-state-transitions ( hash -- hash )
H{ } clone tuck '[
_ [ 2array <or-class> ] change-at
] assoc-each [ swap ] assoc-map ;
: combine-transitions ( table -- table )
[ [ combine-state-transitions ] assoc-map ] change-transitions ;
: minimize ( table -- minimal-table )
clone number-states combine-states ;
clone
number-states
combine-states
combine-transitions ;

View File

@ -43,11 +43,11 @@ CONSTANT: fail-state -1
: unify-final-state ( transition-table -- transition-table )
dup [ final-states>> keys ] keep
'[ -2 epsilon _ add-transition ] each
'[ -2 epsilon _ set-transition ] each
H{ { -2 -2 } } >>final-states ;
: adjoin-dfa ( transition-table -- start end )
box-transitions unify-final-state renumber-states
unify-final-state renumber-states box-transitions
[ start-state>> ]
[ final-states>> keys first ]
[ nfa-table get [ transitions>> ] bi@ swap update ] tri ;

View File

@ -138,10 +138,10 @@ Parenthized = "?:" Alternation:a => [[ a ]]
=> [[ a on off parse-options <with-options> ]]
| "?#" [^)]* => [[ f ]]
| "?~" Alternation:a => [[ a <negation> ]]
| "?=" Alternation:a => [[ a <lookahead> <tagged-epsilon> ]]
| "?!" Alternation:a => [[ a <negation> <lookahead> <tagged-epsilon> ]]
| "?<=" Alternation:a => [[ a <lookbehind> <tagged-epsilon> ]]
| "?<!" Alternation:a => [[ a <negation> <lookbehind> <tagged-epsilon> ]]
| "?=" Alternation:a => [[ a t <lookahead> <tagged-epsilon> ]]
| "?!" Alternation:a => [[ a f <lookahead> <tagged-epsilon> ]]
| "?<=" Alternation:a => [[ a t <lookbehind> <tagged-epsilon> ]]
| "?<!" Alternation:a => [[ a f <lookbehind> <tagged-epsilon> ]]
| Alternation
Element = "(" Parenthized:p ")" => [[ p ]]
@ -158,8 +158,6 @@ Times = "," Number:n "}" => [[ 0 n <from-to> ]]
| Number:n "," Number:m "}" => [[ n m <from-to> ]]
Repeated = Element:e "{" Times:t => [[ e t <times> ]]
| Element:e "*+" => [[ e <possessive-star> ]]
| Element:e "++" => [[ e <possessive-plus> ]]
| Element:e "?" => [[ e <maybe> ]]
| Element:e "*" => [[ e <star> ]]
| Element:e "+" => [[ e <plus> ]]

View File

@ -24,8 +24,8 @@ IN: regexp-tests
[ t ] [ "b" "b|" <regexp> matches? ] unit-test
[ t ] [ "" "b|" <regexp> matches? ] unit-test
[ t ] [ "" "b|" <regexp> matches? ] unit-test
[ f ] [ "" "|" <regexp> matches? ] unit-test
[ f ] [ "" "|||||||" <regexp> matches? ] unit-test
[ t ] [ "" "|" <regexp> matches? ] unit-test
[ t ] [ "" "|||||||" <regexp> matches? ] unit-test
[ f ] [ "aa" "a|b|c" <regexp> matches? ] unit-test
[ f ] [ "bb" "a|b|c" <regexp> matches? ] unit-test
@ -182,7 +182,7 @@ IN: regexp-tests
[ f ] [ "abc" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
[ t ] [ "ABC" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
[ f ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
[ t ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
[ f ] [ "a" "\\Q\\E" <regexp> matches? ] unit-test
[ t ] [ "|*+" "\\Q|*+\\E" <regexp> matches? ] unit-test
[ f ] [ "abc" "\\Q|*+\\E" <regexp> matches? ] unit-test
@ -300,8 +300,10 @@ IN: regexp-tests
[ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test
[ f ] [ "ab" "a(?!b)" <regexp> match-head ] unit-test
[ "" ] [ "ab" "a(?!b)" <regexp> match-head >string ] unit-test
[ "a" ] [ "ac" "a(?!b)" <regexp> match-head >string ] unit-test
[ t ] [ "fxxbar" ".{3}(?!foo)bar" <regexp> matches? ] unit-test
[ t ] [ "foobar" ".{3}(?!foo)bar" <regexp> matches? ] unit-test
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> match-head >string ] unit-test
@ -396,9 +398,9 @@ IN: regexp-tests
[ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test
[ 1 ] [ "a" R/ \Aa\Z/m count-matches ] unit-test
[ 1 ] [ "\na" R/ \Aaa\Z/m count-matches ] unit-test
[ 1 ] [ "\r\na" R/ \Aa\Z/m count-matches ] unit-test
[ 1 ] [ "\ra" R/ \Aa\Z/m count-matches ] unit-test
[ 0 ] [ "\na" R/ \Aaa\Z/m count-matches ] unit-test
[ 0 ] [ "\r\na" R/ \Aa\Z/m count-matches ] unit-test
[ 0 ] [ "\ra" R/ \Aa\Z/m count-matches ] unit-test
[ t ] [ "a" R/ ^a/m matches? ] unit-test
[ f ] [ "\na" R/ ^a/m matches? ] unit-test

View File

@ -40,13 +40,18 @@ C: <reverse-matcher> reverse-matcher
: <reversed-option> ( ast -- reversed )
"r" string>options <with-options> ;
: maybe-negated ( lookaround quot -- regexp-quot )
'[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ;
M: lookahead question>quot ! Returns ( index string -- ? )
term>> ast>dfa dfa>shortest-quotation ;
[ ast>dfa dfa>shortest-quotation ] maybe-negated ;
M: lookbehind question>quot ! Returns ( index string -- ? )
term>> <reversed-option>
[
<reversed-option>
ast>dfa dfa>reverse-shortest-quotation
[ [ 1- ] dip ] prepose ;
[ [ 1- ] dip ] prepose
] maybe-negated ;
: compile-reverse ( regexp -- regexp )
dup '[