From eb231df4e7c5d85ff74332c5ea7da96fb7a0dc4b Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 4 Mar 2009 00:36:03 -0600 Subject: [PATCH] Beginnings of lookahead and lookbehind --- basis/regexp/ast/ast.factor | 2 +- basis/regexp/classes/classes-tests.factor | 1 + basis/regexp/classes/classes.factor | 2 +- basis/regexp/dfa/dfa-tests.factor | 2 - basis/regexp/dfa/dfa.factor | 87 +++++++++++++++---- basis/regexp/minimize/minimize-tests.factor | 2 + basis/regexp/minimize/minimize.factor | 13 ++- basis/regexp/nfa/nfa.factor | 6 +- basis/regexp/parser/parser.factor | 8 +- .../transition-tables.factor | 13 ++- 10 files changed, 105 insertions(+), 31 deletions(-) diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor index b804eacc09..bc808bafca 100644 --- a/basis/regexp/ast/ast.factor +++ b/basis/regexp/ast/ast.factor @@ -16,7 +16,7 @@ C: at-least TUPLE: tagged-epsilon tag ; C: tagged-epsilon -CONSTANT: epsilon T{ tagged-epsilon } +CONSTANT: epsilon T{ tagged-epsilon { tag t } } TUPLE: concatenation first second ; diff --git a/basis/regexp/classes/classes-tests.factor b/basis/regexp/classes/classes-tests.factor index 5eac0ea352..8d660ffa30 100644 --- a/basis/regexp/classes/classes-tests.factor +++ b/basis/regexp/classes/classes-tests.factor @@ -21,6 +21,7 @@ IN: regexp.classes.tests [ 1 ] [ 1 ] unit-test [ 1 ] [ { 1 1 } ] unit-test [ 1 ] [ { 1 1 } ] unit-test +[ t ] [ { t t } ] unit-test [ T{ primitive-class { class letter-class } } ] [ letter-class dup 2array ] unit-test [ T{ primitive-class { class letter-class } } ] [ letter-class dup 2array ] unit-test [ T{ or-class { seq { 2 3 1 } } } ] [ { 1 2 } { 2 3 } 2array ] unit-test diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 33652f7606..c4673cf26b 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -140,7 +140,7 @@ GENERIC: combine-or ( class1 class2 -- combined ? ) M: object combine-or replace-if-= ; M: t combine-or - drop f ; + nip t ; M: f combine-or drop t ; diff --git a/basis/regexp/dfa/dfa-tests.factor b/basis/regexp/dfa/dfa-tests.factor index b6ce13c723..129a639929 100644 --- a/basis/regexp/dfa/dfa-tests.factor +++ b/basis/regexp/dfa/dfa-tests.factor @@ -1,5 +1,3 @@ USING: regexp.dfa tools.test ; IN: regexp.dfa.tests -[ [ ] [ ] while-changes ] must-infer - diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor index 01e3e01119..8839e53485 100644 --- a/basis/regexp/dfa/dfa.factor +++ b/basis/regexp/dfa/dfa.factor @@ -2,35 +2,84 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators fry kernel locals math math.order regexp.nfa regexp.transition-tables sequences -sets sorting vectors regexp.ast ; +sets sorting vectors regexp.ast regexp.classes ; IN: regexp.dfa -:: (while-changes) ( obj quot: ( obj -- obj' ) comp: ( obj -- key ) old-key -- obj ) - obj quot call :> new-obj - new-obj comp call :> new-key - new-key old-key = - [ new-obj ] - [ new-obj quot comp new-key (while-changes) ] - if ; inline recursive - -: while-changes ( obj quot pred -- obj' ) - 3dup nip call (while-changes) ; inline - : find-delta ( states transition nfa -- new-states ) transitions>> '[ _ swap _ at at ] gather sift ; -: (find-epsilon-closure) ( states nfa -- new-states ) - epsilon swap find-delta ; +TUPLE: condition question yes no ; +C: condition -: find-epsilon-closure ( states nfa -- new-states ) - '[ dup _ (find-epsilon-closure) union ] [ length ] while-changes - natural-sort ; +:: epsilon-loop ( state table nfa question -- ) + state table at :> old-value + old-value question 2array :> new-question + new-question old-value = [ + new-question state table set-at + state nfa transitions>> at + [ drop tagged-epsilon? ] assoc-filter + [| trans to | + to [ + table nfa + trans tag>> new-question 2array + epsilon-loop + ] each + ] assoc-each + ] unless ; + +GENERIC# replace-question 2 ( class from to -- new-class ) + +M: object replace-question + [ [ = ] keep ] dip swap ? ; + +: replace-compound ( class from to -- seq ) + [ seq>> ] 2dip '[ _ _ replace-question ] map ; + +M: and-class replace-question + replace-compound ; + +M: or-class replace-question + replace-compound ; + +: answer ( table question answer -- new-table ) + '[ _ _ replace-question ] assoc-map + [ nip ] assoc-filter ; + +DEFER: make-condition + +: (make-condition) ( table questions question -- condition ) + [ 2nip ] + [ swap [ t answer ] dip make-condition ] + [ swap [ f answer ] dip make-condition ] 3tri + ; + +: make-condition ( table questions -- condition ) + [ keys ] [ unclip (make-condition) ] if-empty ; + +GENERIC: class>questions ( class -- questions ) +: compound-questions ( class -- questions ) seq>> [ class>questions ] gather ; +M: or-class class>questions compound-questions ; +M: and-class class>questions compound-questions ; +M: object class>questions 1array ; + +: table>condition ( table -- condition ) + ! This is wrong, since actually an arbitrary and-class or or-class can be used + dup + values class>questions t swap remove + make-condition ; + +: epsilon-table ( states nfa -- table ) + [ H{ } clone tuck ] dip + '[ _ _ t epsilon-loop ] each ; + +: find-epsilon-closure ( states nfa -- dfa-state ) + epsilon-table table>condition ; : find-closure ( states transition nfa -- new-states ) [ find-delta ] keep find-epsilon-closure ; : find-start-state ( nfa -- state ) - [ start-state>> 1vector ] keep find-epsilon-closure ; + [ start-state>> 1array ] keep find-epsilon-closure ; : find-transitions ( dfa-state nfa -- next-dfa-state ) transitions>> @@ -49,7 +98,7 @@ IN: regexp.dfa [| trans | state trans nfa find-closure :> new-state new-state visited-states new-states add-todo-state - state new-state trans dfa add-transition + state new-state trans dfa set-transition ] each nfa dfa new-states visited-states new-transitions ] if-empty ; diff --git a/basis/regexp/minimize/minimize-tests.factor b/basis/regexp/minimize/minimize-tests.factor index ece7c8fd7c..c5564caa55 100644 --- a/basis/regexp/minimize/minimize-tests.factor +++ b/basis/regexp/minimize/minimize-tests.factor @@ -47,3 +47,5 @@ IN: regexp.minimize.tests { final-states H{ { 3 3 } { 6 6 } } } } combine-states ] unit-test + +[ [ ] [ ] while-changes ] must-infer diff --git a/basis/regexp/minimize/minimize.factor b/basis/regexp/minimize/minimize.factor index c88c2a850b..b51faff371 100644 --- a/basis/regexp/minimize/minimize.factor +++ b/basis/regexp/minimize/minimize.factor @@ -8,7 +8,7 @@ IN: regexp.minimize : number-transitions ( transitions numbering -- new-transitions ) dup '[ [ _ at ] - [ [ first _ at ] assoc-map ] bi* + [ [ _ at ] assoc-map ] bi* ] assoc-map ; : table>state-numbers ( table -- assoc ) @@ -66,6 +66,17 @@ IN: regexp.minimize >hashtable ; +:: (while-changes) ( obj quot: ( obj -- obj' ) comp: ( obj -- key ) old-key -- obj ) + obj quot call :> new-obj + new-obj comp call :> new-key + new-key old-key = + [ new-obj ] + [ new-obj quot comp new-key (while-changes) ] + if ; inline recursive + +: while-changes ( obj quot pred -- obj' ) + 3dup nip call (while-changes) ; inline + : state-classes ( transition-table -- synonyms ) [ initialize-partitions ] keep '[ _ partition-more ] [ assoc-size ] while-changes diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 68f7761394..302b1ebc55 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -55,8 +55,12 @@ M:: star nfa-node ( node -- start end ) s1 s3 epsilon-transition s2 s3 ; +GENERIC: modify-epsilon ( tag -- newtag ) + +M: object modify-epsilon ; + M: tagged-epsilon nfa-node - add-simple-entry ; + clone [ modify-epsilon ] change-tag add-simple-entry ; M: concatenation nfa-node ( node -- start end ) [ first>> ] [ second>> ] bi diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index ed0762cc3a..18aef7fa49 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -137,10 +137,10 @@ Parenthized = "?:" Alternation:a => [[ a ]] => [[ a on off parse-options ]] | "?#" [^)]* => [[ f ]] | "?~" Alternation:a => [[ a ]] - | "?=" Alternation:a => [[ a ]] - | "?!" Alternation:a => [[ a ]] - | "?<=" Alternation:a => [[ a ]] - | "? [[ a ]] + | "?=" Alternation:a => [[ a ]] + | "?!" Alternation:a => [[ a ]] + | "?<=" Alternation:a => [[ a ]] + | "? [[ a ]] | Alternation Element = "(" Parenthized:p ")" => [[ p ]] diff --git a/basis/regexp/transition-tables/transition-tables.factor b/basis/regexp/transition-tables/transition-tables.factor index 2b0a5c2bcc..2fad7451b0 100644 --- a/basis/regexp/transition-tables/transition-tables.factor +++ b/basis/regexp/transition-tables/transition-tables.factor @@ -14,11 +14,20 @@ TUPLE: transition-table transitions start-state final-states ; : maybe-initialize-key ( key hashtable -- ) 2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ; -:: set-transition ( from to obj hash -- ) +:: (set-transition) ( from to obj hash -- ) + to hash maybe-initialize-key + from hash at + [ [ to obj ] dip set-at ] + [ to obj associate from hash set-at ] if* ; + +: set-transition ( from to obj transition-table -- ) + transitions>> (set-transition) ; + +:: (add-transition) ( from to obj hash -- ) to hash maybe-initialize-key from hash at [ [ to obj ] dip push-at ] [ to 1vector obj associate from hash set-at ] if* ; : add-transition ( from to obj transition-table -- ) - transitions>> set-transition ; + transitions>> (add-transition) ;