From 42ff154ead5d8f9e3951c77d1dc46b85b291779f Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Sat, 7 Mar 2009 16:31:46 -0600 Subject: [PATCH] More regexp changes --- basis/regexp/ast/ast.factor | 11 ++------- basis/regexp/classes/classes.factor | 3 +++ basis/regexp/compiler/compiler.factor | 23 ++++++++---------- basis/regexp/minimize/minimize-tests.factor | 6 ++++- basis/regexp/minimize/minimize.factor | 26 +++++++++++++++------ basis/regexp/negation/negation.factor | 4 ++-- basis/regexp/parser/parser.factor | 10 ++++---- basis/regexp/regexp-tests.factor | 16 +++++++------ basis/regexp/regexp.factor | 13 +++++++---- 9 files changed, 63 insertions(+), 49 deletions(-) diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor index bc808bafca..9288766888 100644 --- a/basis/regexp/ast/ast.factor +++ b/basis/regexp/ast/ast.factor @@ -58,15 +58,8 @@ M: from-to : char-class ( ranges ? -- term ) [ ] dip [ ] when ; -TUPLE: lookahead term ; +TUPLE: lookahead term positive? ; C: lookahead -TUPLE: lookbehind term ; +TUPLE: lookbehind term positive? ; C: lookbehind - -TUPLE: possessive-star term ; -C: possessive-star - -: ( term -- term' ) - dup 2array ; - diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 6ea87fbb49..8912082ec3 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -239,6 +239,9 @@ M: not-class replace-question '[ _ _ replace-question ] assoc-map [ nip ] assoc-filter ; +: answers ( table questions answer -- new-table ) + '[ _ answer ] each ; + DEFER: make-condition : (make-condition) ( table questions question -- condition ) diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor index 78dbbf9f25..4e615d15d7 100644 --- a/basis/regexp/compiler/compiler.factor +++ b/basis/regexp/compiler/compiler.factor @@ -36,21 +36,17 @@ M: $ question>quot M: ^ question>quot drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ; -! Maybe the condition>quot things can be combined, given a suitable method -! for question>quot on classes, but maybe that'd make stack shuffling annoying - -: execution-quot ( next-state -- quot ) +: (execution-quot) ( next-state -- quot ) ! The conditions here are for lookaround and anchors, etc dup condition? [ [ question>> question>quot ] [ yes>> ] [ no>> ] tri - [ execution-quot ] bi@ + [ (execution-quot) ] bi@ '[ 2dup @ _ _ if ] - ] [ - ! There shouldn't be a condition like this! - dup sequence? - [ [ [ 2drop ] ] [ first '[ _ execute ] ] if-empty ] - [ '[ _ execute ] ] if - ] if ; + ] [ '[ _ execute ] ] if ; + +: execution-quot ( next-state -- quot ) + dup sequence? [ first ] when + (execution-quot) ; TUPLE: box contents ; C: box @@ -66,8 +62,9 @@ C: box [ [ 3drop ] ] [ execution-quot '[ drop @ ] ] if-empty ] if ; -: non-literals>dispatch ( non-literal-transitions -- quot ) +: non-literals>dispatch ( literals non-literals -- quot ) [ swap ] assoc-map ! we want state => predicate, and get the opposite as input + swap keys f answers table>condition [ ] condition-map condition>quot ; : literals>cases ( literal-transitions -- case-body ) @@ -84,7 +81,7 @@ C: box : split-literals ( transitions -- case default ) >alist expand-or [ first integer? ] partition - [ literals>cases ] [ non-literals>dispatch ] bi* ; + [ [ literals>cases ] keep ] dip non-literals>dispatch ; :: step ( last-match index str quot final? direction -- last-index/f ) final? index last-match ? diff --git a/basis/regexp/minimize/minimize-tests.factor b/basis/regexp/minimize/minimize-tests.factor index 8cbfaf4a71..a7a9b50327 100644 --- a/basis/regexp/minimize/minimize-tests.factor +++ b/basis/regexp/minimize/minimize-tests.factor @@ -1,7 +1,8 @@ ! Copyright (C) 2009 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. USING: tools.test regexp.minimize assocs regexp -accessors regexp.transition-tables regexp.parser regexp.negation ; +accessors regexp.transition-tables regexp.parser +regexp.classes regexp.negation ; IN: regexp.minimize.tests [ t ] [ 1 2 H{ { { 1 2 } t } } same-partition? ] unit-test @@ -52,3 +53,6 @@ IN: regexp.minimize.tests ] unit-test [ [ ] [ ] while-changes ] must-infer + +[ H{ { T{ or-class f { 1 2 } } 3 } { 4 5 } } ] +[ H{ { 1 3 } { 2 3 } { 4 5 } } combine-state-transitions ] unit-test diff --git a/basis/regexp/minimize/minimize.factor b/basis/regexp/minimize/minimize.factor index c5b1d7e602..dd3682f937 100644 --- a/basis/regexp/minimize/minimize.factor +++ b/basis/regexp/minimize/minimize.factor @@ -11,8 +11,8 @@ IN: regexp.minimize : number-states ( table -- newtable ) dup table>state-numbers transitions-at ; -: no-conditions? ( state transition-table -- ? ) - transitions>> at values [ condition? ] any? not ; +: has-conditions? ( state transitions -- ? ) + at values [ condition? ] any? ; : initially-same? ( s1 s2 transition-table -- ? ) { @@ -25,7 +25,8 @@ IN: regexp.minimize ! Partition table is sorted-array => ? H{ } clone :> out transition-table transitions>> keys - [ transition-table no-conditions? ] filter :> states + [ transition-table transitions>> has-conditions? ] partition :> states + [ dup 2array out conjoin ] each states [| s1 | states [| s2 | s1 s2 transition-table initially-same? @@ -68,16 +69,27 @@ IN: regexp.minimize '[ _ partition-more ] [ assoc-size ] while-changes partition>classes ; -: canonical-state? ( state state-classes -- ? ) - dupd at = ; +: canonical-state? ( state transitions state-classes -- ? ) + '[ dup _ at = ] swap '[ _ has-conditions? ] bi or ; : delete-duplicates ( transitions state-classes -- new-transitions ) - '[ drop _ canonical-state? ] assoc-filter ; + dupd '[ drop _ _ canonical-state? ] assoc-filter ; : combine-states ( table -- smaller-table ) dup state-classes [ transitions-at ] keep '[ _ delete-duplicates ] change-transitions ; +: combine-state-transitions ( hash -- hash ) + H{ } clone tuck '[ + _ [ 2array ] change-at + ] assoc-each [ swap ] assoc-map ; + +: combine-transitions ( table -- table ) + [ [ combine-state-transitions ] assoc-map ] change-transitions ; + : minimize ( table -- minimal-table ) - clone number-states combine-states ; + clone + number-states + combine-states + combine-transitions ; diff --git a/basis/regexp/negation/negation.factor b/basis/regexp/negation/negation.factor index b03223fabf..fd2a4510c6 100644 --- a/basis/regexp/negation/negation.factor +++ b/basis/regexp/negation/negation.factor @@ -43,11 +43,11 @@ CONSTANT: fail-state -1 : unify-final-state ( transition-table -- transition-table ) dup [ final-states>> keys ] keep - '[ -2 epsilon _ add-transition ] each + '[ -2 epsilon _ set-transition ] each H{ { -2 -2 } } >>final-states ; : adjoin-dfa ( transition-table -- start end ) - box-transitions unify-final-state renumber-states + unify-final-state renumber-states box-transitions [ start-state>> ] [ final-states>> keys first ] [ nfa-table get [ transitions>> ] bi@ swap update ] tri ; diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 5870395b7c..1c001cdc57 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -138,10 +138,10 @@ Parenthized = "?:" Alternation:a => [[ a ]] => [[ a on off parse-options ]] | "?#" [^)]* => [[ f ]] | "?~" Alternation:a => [[ a ]] - | "?=" Alternation:a => [[ a ]] - | "?!" Alternation:a => [[ a ]] - | "?<=" Alternation:a => [[ a ]] - | "? [[ a ]] + | "?=" Alternation:a => [[ a t ]] + | "?!" Alternation:a => [[ a f ]] + | "?<=" Alternation:a => [[ a t ]] + | "? [[ a f ]] | Alternation Element = "(" Parenthized:p ")" => [[ p ]] @@ -158,8 +158,6 @@ Times = "," Number:n "}" => [[ 0 n ]] | Number:n "," Number:m "}" => [[ n m ]] Repeated = Element:e "{" Times:t => [[ e t ]] - | Element:e "*+" => [[ e ]] - | Element:e "++" => [[ e ]] | Element:e "?" => [[ e ]] | Element:e "*" => [[ e ]] | Element:e "+" => [[ e ]] diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 97b04cf62a..99cb8dbd22 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -24,8 +24,8 @@ IN: regexp-tests [ t ] [ "b" "b|" matches? ] unit-test [ t ] [ "" "b|" matches? ] unit-test [ t ] [ "" "b|" matches? ] unit-test -[ f ] [ "" "|" matches? ] unit-test -[ f ] [ "" "|||||||" matches? ] unit-test +[ t ] [ "" "|" matches? ] unit-test +[ t ] [ "" "|||||||" matches? ] unit-test [ f ] [ "aa" "a|b|c" matches? ] unit-test [ f ] [ "bb" "a|b|c" matches? ] unit-test @@ -182,7 +182,7 @@ IN: regexp-tests [ f ] [ "abc" "[\\p{Upper}]{3}" matches? ] unit-test [ t ] [ "ABC" "[\\p{Upper}]{3}" matches? ] unit-test -[ f ] [ "" "\\Q\\E" matches? ] unit-test +[ t ] [ "" "\\Q\\E" matches? ] unit-test [ f ] [ "a" "\\Q\\E" matches? ] unit-test [ t ] [ "|*+" "\\Q|*+\\E" matches? ] unit-test [ f ] [ "abc" "\\Q|*+\\E" matches? ] unit-test @@ -300,8 +300,10 @@ IN: regexp-tests [ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test -[ f ] [ "ab" "a(?!b)" match-head ] unit-test +[ "" ] [ "ab" "a(?!b)" match-head >string ] unit-test [ "a" ] [ "ac" "a(?!b)" match-head >string ] unit-test +[ t ] [ "fxxbar" ".{3}(?!foo)bar" matches? ] unit-test +[ t ] [ "foobar" ".{3}(?!foo)bar" matches? ] unit-test [ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test [ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test [ "a" ] [ "ab" "a(?=b)(?=b)" match-head >string ] unit-test @@ -396,9 +398,9 @@ IN: regexp-tests [ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test [ 1 ] [ "a" R/ \Aa\Z/m count-matches ] unit-test -[ 1 ] [ "\na" R/ \Aaa\Z/m count-matches ] unit-test -[ 1 ] [ "\r\na" R/ \Aa\Z/m count-matches ] unit-test -[ 1 ] [ "\ra" R/ \Aa\Z/m count-matches ] unit-test +[ 0 ] [ "\na" R/ \Aaa\Z/m count-matches ] unit-test +[ 0 ] [ "\r\na" R/ \Aa\Z/m count-matches ] unit-test +[ 0 ] [ "\ra" R/ \Aa\Z/m count-matches ] unit-test [ t ] [ "a" R/ ^a/m matches? ] unit-test [ f ] [ "\na" R/ ^a/m matches? ] unit-test diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 6693691ba8..970e963c73 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -40,13 +40,18 @@ C: reverse-matcher : ( ast -- reversed ) "r" string>options ; +: maybe-negated ( lookaround quot -- regexp-quot ) + '[ term>> @ ] [ positive?>> [ ] [ not ] ? ] bi compose ; + M: lookahead question>quot ! Returns ( index string -- ? ) - term>> ast>dfa dfa>shortest-quotation ; + [ ast>dfa dfa>shortest-quotation ] maybe-negated ; M: lookbehind question>quot ! Returns ( index string -- ? ) - term>> - ast>dfa dfa>reverse-shortest-quotation - [ [ 1- ] dip ] prepose ; + [ + + ast>dfa dfa>reverse-shortest-quotation + [ [ 1- ] dip ] prepose + ] maybe-negated ; : compile-reverse ( regexp -- regexp ) dup '[