From ca19a1b728a7f86427bf712a664d99dbbe64e1ea Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 4 Mar 2009 13:22:22 -0600 Subject: [PATCH] Unfinished changes for regexp lookaround --- basis/regexp/classes/classes-tests.factor | 27 +++++++++++ basis/regexp/classes/classes.factor | 56 ++++++++++++++++++++++- basis/regexp/compiler/compiler.factor | 21 ++++++--- basis/regexp/dfa/dfa.factor | 46 +------------------ basis/regexp/minimize/minimize.factor | 10 ++-- basis/regexp/regexp-tests.factor | 2 +- basis/regexp/regexp.factor | 27 ++++++----- 7 files changed, 119 insertions(+), 70 deletions(-) diff --git a/basis/regexp/classes/classes-tests.factor b/basis/regexp/classes/classes-tests.factor index 8d660ffa30..2253cd999a 100644 --- a/basis/regexp/classes/classes-tests.factor +++ b/basis/regexp/classes/classes-tests.factor @@ -3,6 +3,8 @@ USING: regexp.classes tools.test arrays kernel ; IN: regexp.classes.tests +! Class algebra + [ f ] [ { 1 2 } ] unit-test [ T{ or-class f { 2 1 } } ] [ { 1 2 } ] unit-test [ 3 ] [ { 1 2 } 3 2array ] unit-test @@ -25,3 +27,28 @@ IN: regexp.classes.tests [ T{ primitive-class { class letter-class } } ] [ letter-class dup 2array ] unit-test [ T{ primitive-class { class letter-class } } ] [ letter-class dup 2array ] unit-test [ T{ or-class { seq { 2 3 1 } } } ] [ { 1 2 } { 2 3 } 2array ] unit-test + +! Making classes into nested conditionals + +[ V{ 1 2 3 4 } ] [ T{ and-class f { 1 T{ not-class f 2 } T{ or-class f { 3 4 } } 2 } } class>questions ] unit-test +[ { 3 } ] [ { { t 3 } } table>condition ] unit-test +[ { T{ primitive-class } } ] [ { { t 1 } { T{ primitive-class } 2 } } table>questions ] unit-test +[ { { t 1 } { t 2 } } ] [ { { t 1 } { T{ primitive-class } 2 } } T{ primitive-class } t answer ] unit-test +[ { { t 1 } } ] [ { { t 1 } { T{ primitive-class } 2 } } T{ primitive-class } f answer ] unit-test +[ T{ condition f T{ primitive-class } { 1 2 } { 1 } } ] [ { { t 1 } { T{ primitive-class } 2 } } table>condition ] unit-test + +SYMBOL: foo +SYMBOL: bar + +[ T{ condition f T{ primitive-class f bar } T{ condition f T{ primitive-class f foo } { 1 2 3 } { 1 3 } } T{ condition f T{ primitive-class f foo } { 1 2 } { 1 } } } ] [ { { t 1 } { T{ primitive-class f foo } 2 } { T{ primitive-class f bar } 3 } } table>condition ] unit-test + +[ t ] [ foo dup t replace-question ] unit-test +[ f ] [ foo dup f replace-question ] unit-test +[ T{ primitive-class f foo } ] [ foo bar t replace-question ] unit-test +[ T{ primitive-class f foo } ] [ foo bar f replace-question ] unit-test +[ T{ primitive-class f foo } ] [ foo bar 2array bar t replace-question ] unit-test +[ T{ primitive-class f bar } ] [ foo bar 2array foo t replace-question ] unit-test +[ f ] [ foo bar 2array foo f replace-question ] unit-test +[ f ] [ foo bar 2array bar f replace-question ] unit-test +[ t ] [ foo bar 2array bar t replace-question ] unit-test +[ T{ primitive-class f foo } ] [ foo bar 2array bar f replace-question ] unit-test diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index c4673cf26b..229197e507 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors kernel math math.order words combinators locals ascii unicode.categories combinators.short-circuit sequences -fry macros arrays ; +fry macros arrays assocs sets ; IN: regexp.classes SINGLETONS: any-char any-char-no-nl @@ -208,3 +208,57 @@ M: primitive-class class-member? class>> class-member? ; UNION: class primitive-class not-class or-class and-class range ; + +TUPLE: condition question yes no ; +C: condition + +GENERIC# replace-question 2 ( class from to -- new-class ) + +M:: object replace-question ( class from to -- new-class ) + class from = to class ? ; + +: replace-compound ( class from to -- seq ) + [ seq>> ] 2dip '[ _ _ replace-question ] map ; + +M: and-class replace-question + replace-compound ; + +M: or-class replace-question + replace-compound ; + +M: not-class replace-question + class>> replace-question ; + +: answer ( table question answer -- new-table ) + '[ [ _ _ replace-question ] dip ] assoc-map + [ drop ] assoc-filter ; + +DEFER: make-condition + +: (make-condition) ( table questions question -- condition ) + [ 2nip ] + [ swap [ t answer ] dip make-condition ] + [ swap [ f answer ] dip make-condition ] 3tri + 2dup = [ 2nip ] [ ] if ; + +: make-condition ( table questions -- condition ) + [ values ] [ unclip (make-condition) ] if-empty ; + +GENERIC: class>questions ( class -- questions ) +: compound-questions ( class -- questions ) seq>> [ class>questions ] gather ; +M: or-class class>questions compound-questions ; +M: and-class class>questions compound-questions ; +M: not-class class>questions class>> class>questions ; +M: object class>questions 1array ; + +: table>questions ( table -- questions ) + keys class>questions t swap remove ; + +: table>condition ( table -- condition ) + >alist dup table>questions make-condition ; + +: condition-map ( condition quot: ( obj -- obj' ) -- new-condition ) + over condition? [ + [ [ question>> ] [ yes>> ] [ no>> ] tri ] dip + '[ _ condition-map ] bi@ + ] [ call ] if ; inline recursive diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor index 7fda010351..88fc415b42 100644 --- a/basis/regexp/compiler/compiler.factor +++ b/basis/regexp/compiler/compiler.factor @@ -9,9 +9,17 @@ IN: regexp.compiler : literals>cases ( literal-transitions -- case-body ) [ 1quotation ] assoc-map ; +: condition>quot ( condition -- quot ) + dup condition? [ + [ question>> ] [ yes>> ] [ no>> ] tri + [ condition>quot ] bi@ + '[ dup _ class-member? _ _ if ] + ] [ + [ [ 3drop ] ] [ '[ drop _ execute ] ] if-empty + ] if ; + : non-literals>dispatch ( non-literal-transitions -- quot ) - [ [ '[ dup _ class-member? ] ] [ '[ drop _ execute ] ] bi* ] assoc-map - [ 3drop ] suffix '[ _ cond ] ; + table>condition condition>quot ; : expand-one-or ( or-class transition -- alist ) [ seq>> ] dip '[ _ 2array ] map ; @@ -36,7 +44,7 @@ IN: regexp.compiler : transitions>quot ( transitions final-state? -- quot ) [ split-literals suffix ] dip - '[ { array-capacity string } declare _ _ step ] ; + '[ { array-capacity sequence } declare _ _ step ] ; : word>quot ( word dfa -- quot ) [ transitions>> at ] @@ -67,11 +75,12 @@ IN: regexp.compiler : dfa>word ( dfa -- word ) states>words [ states>code ] keep start-state>> ; -: check-string ( string -- string ) - dup string? [ "String required" throw ] unless ; +: check-sequence ( string -- string ) + ! Make this configurable + dup sequence? [ "String required" throw ] unless ; : run-regexp ( start-index string word -- ? ) - { [ f ] [ >fixnum ] [ check-string ] [ execute ] } spread ; inline + { [ f ] [ >fixnum ] [ check-sequence ] [ execute ] } spread ; inline : dfa>quotation ( dfa -- quot ) dfa>word '[ _ run-regexp ] ; diff --git a/basis/regexp/dfa/dfa.factor b/basis/regexp/dfa/dfa.factor index 8839e53485..f05f5d5c7f 100644 --- a/basis/regexp/dfa/dfa.factor +++ b/basis/regexp/dfa/dfa.factor @@ -8,9 +8,6 @@ IN: regexp.dfa : find-delta ( states transition nfa -- new-states ) transitions>> '[ _ swap _ at at ] gather sift ; -TUPLE: condition question yes no ; -C: condition - :: epsilon-loop ( state table nfa question -- ) state table at :> old-value old-value question 2array :> new-question @@ -27,53 +24,12 @@ C: condition ] assoc-each ] unless ; -GENERIC# replace-question 2 ( class from to -- new-class ) - -M: object replace-question - [ [ = ] keep ] dip swap ? ; - -: replace-compound ( class from to -- seq ) - [ seq>> ] 2dip '[ _ _ replace-question ] map ; - -M: and-class replace-question - replace-compound ; - -M: or-class replace-question - replace-compound ; - -: answer ( table question answer -- new-table ) - '[ _ _ replace-question ] assoc-map - [ nip ] assoc-filter ; - -DEFER: make-condition - -: (make-condition) ( table questions question -- condition ) - [ 2nip ] - [ swap [ t answer ] dip make-condition ] - [ swap [ f answer ] dip make-condition ] 3tri - ; - -: make-condition ( table questions -- condition ) - [ keys ] [ unclip (make-condition) ] if-empty ; - -GENERIC: class>questions ( class -- questions ) -: compound-questions ( class -- questions ) seq>> [ class>questions ] gather ; -M: or-class class>questions compound-questions ; -M: and-class class>questions compound-questions ; -M: object class>questions 1array ; - -: table>condition ( table -- condition ) - ! This is wrong, since actually an arbitrary and-class or or-class can be used - dup - values class>questions t swap remove - make-condition ; - : epsilon-table ( states nfa -- table ) [ H{ } clone tuck ] dip '[ _ _ t epsilon-loop ] each ; : find-epsilon-closure ( states nfa -- dfa-state ) - epsilon-table table>condition ; + epsilon-table [ swap ] assoc-map table>condition ; : find-closure ( states transition nfa -- new-states ) [ find-delta ] keep find-epsilon-closure ; diff --git a/basis/regexp/minimize/minimize.factor b/basis/regexp/minimize/minimize.factor index b51faff371..c98cf131cb 100644 --- a/basis/regexp/minimize/minimize.factor +++ b/basis/regexp/minimize/minimize.factor @@ -2,13 +2,13 @@ ! See http://factorcode.org/license.txt for BSD license. USING: kernel sequences regexp.transition-tables fry assocs accessors locals math sorting arrays sets hashtables regexp.dfa -combinators.short-circuit ; +combinators.short-circuit regexp.classes ; IN: regexp.minimize : number-transitions ( transitions numbering -- new-transitions ) dup '[ [ _ at ] - [ [ _ at ] assoc-map ] bi* + [ [ [ _ at ] condition-map ] assoc-map ] bi* ] assoc-map ; : table>state-numbers ( table -- assoc ) @@ -29,6 +29,9 @@ IN: regexp.minimize dup table>state-numbers [ number-transitions ] rewrite-transitions ; +: no-conditions? ( state transition-table -- ? ) + transitions>> at values [ condition? ] any? not ; + : initially-same? ( s1 s2 transition-table -- ? ) { [ drop <= ] @@ -39,7 +42,8 @@ IN: regexp.minimize :: initialize-partitions ( transition-table -- partitions ) ! Partition table is sorted-array => ? H{ } clone :> out - transition-table transitions>> keys :> states + transition-table transitions>> keys + [ transition-table no-conditions? ] filter :> states states [| s1 | states [| s2 | s1 s2 transition-table initially-same? diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 21653077a8..9425e38727 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -1,5 +1,5 @@ USING: regexp tools.test kernel sequences regexp.parser regexp.private -regexp.traversal eval strings multiline accessors regexp.matchers ; +eval strings multiline accessors regexp.matchers ; IN: regexp-tests \ must-infer diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 0502cb4d4b..ab091a7682 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -3,7 +3,7 @@ USING: accessors combinators kernel math sequences strings sets assocs prettyprint.backend prettyprint.custom make lexer namespaces parser arrays fry locals regexp.minimize -regexp.parser regexp.nfa regexp.dfa regexp.traversal +regexp.parser regexp.nfa regexp.dfa regexp.transition-tables splitting sorting regexp.ast regexp.negation regexp.matchers regexp.compiler ; IN: regexp @@ -12,16 +12,16 @@ TUPLE: regexp { raw read-only } { parse-tree read-only } { options read-only } - dfa reverse-dfa dfa-quot ; + dfa reverse-dfa ; : make-regexp ( string ast -- regexp ) - f f f f f regexp boa ; foldable + f f f f regexp boa ; foldable ! Foldable because, when the dfa slot is set, ! it'll be set to the same thing regardless of who sets it : ( string options -- regexp ) [ dup parse-regexp ] [ string>options ] bi* - f f f regexp boa ; + f f regexp boa ; : ( string -- regexp ) "" ; @@ -34,26 +34,25 @@ C: reverse-matcher [ parse-tree>> ] [ options>> ] bi ; : compile-regexp ( regexp -- regexp ) - dup '[ [ _ get-ast ast>dfa ] unless* ] change-dfa ; - -: compile-dfa-quot ( regexp -- regexp ) - dup '[ [ _ compile-regexp dfa>> dfa>quotation ] unless* ] change-dfa-quot ; + dup '[ [ _ get-ast ast>dfa dfa>quotation ] unless* ] change-dfa ; : ( ast -- reversed ) "r" string>options ; : compile-reverse ( regexp -- regexp ) - dup '[ [ _ get-ast ast>dfa ] unless* ] change-reverse-dfa ; + dup '[ + [ + _ get-ast + ast>dfa dfa>quotation + ] unless* + ] change-reverse-dfa ; M: regexp match-index-from ( string regexp -- index/f ) - dup dfa-quot>> - [ ] - [ compile-regexp dfa>> ] ?if - match-index-from ; + compile-regexp dfa-quot>> match-index-from ; M: reverse-matcher match-index-from ( string regexp -- index/f ) [ ] [ regexp>> compile-reverse reverse-dfa>> ] bi* - do-match match-index>> ; + match-index-from ; : find-regexp-syntax ( string -- prefix suffix ) {