Unfinished changes for regexp lookaround

db4
Daniel Ehrenberg 2009-03-04 13:22:22 -06:00
parent 4f306518dc
commit ca19a1b728
7 changed files with 119 additions and 70 deletions

View File

@ -3,6 +3,8 @@
USING: regexp.classes tools.test arrays kernel ; USING: regexp.classes tools.test arrays kernel ;
IN: regexp.classes.tests IN: regexp.classes.tests
! Class algebra
[ f ] [ { 1 2 } <and-class> ] unit-test [ f ] [ { 1 2 } <and-class> ] unit-test
[ T{ or-class f { 2 1 } } ] [ { 1 2 } <or-class> ] unit-test [ T{ or-class f { 2 1 } } ] [ { 1 2 } <or-class> ] unit-test
[ 3 ] [ { 1 2 } <and-class> 3 2array <or-class> ] unit-test [ 3 ] [ { 1 2 } <and-class> 3 2array <or-class> ] unit-test
@ -25,3 +27,28 @@ IN: regexp.classes.tests
[ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <and-class> ] unit-test [ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <and-class> ] unit-test
[ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <or-class> ] unit-test [ T{ primitive-class { class letter-class } } ] [ letter-class <primitive-class> dup 2array <or-class> ] unit-test
[ T{ or-class { seq { 2 3 1 } } } ] [ { 1 2 } <or-class> { 2 3 } <or-class> 2array <or-class> ] unit-test [ T{ or-class { seq { 2 3 1 } } } ] [ { 1 2 } <or-class> { 2 3 } <or-class> 2array <or-class> ] unit-test
! Making classes into nested conditionals
[ V{ 1 2 3 4 } ] [ T{ and-class f { 1 T{ not-class f 2 } T{ or-class f { 3 4 } } 2 } } class>questions ] unit-test
[ { 3 } ] [ { { t 3 } } table>condition ] unit-test
[ { T{ primitive-class } } ] [ { { t 1 } { T{ primitive-class } 2 } } table>questions ] unit-test
[ { { t 1 } { t 2 } } ] [ { { t 1 } { T{ primitive-class } 2 } } T{ primitive-class } t answer ] unit-test
[ { { t 1 } } ] [ { { t 1 } { T{ primitive-class } 2 } } T{ primitive-class } f answer ] unit-test
[ T{ condition f T{ primitive-class } { 1 2 } { 1 } } ] [ { { t 1 } { T{ primitive-class } 2 } } table>condition ] unit-test
SYMBOL: foo
SYMBOL: bar
[ T{ condition f T{ primitive-class f bar } T{ condition f T{ primitive-class f foo } { 1 2 3 } { 1 3 } } T{ condition f T{ primitive-class f foo } { 1 2 } { 1 } } } ] [ { { t 1 } { T{ primitive-class f foo } 2 } { T{ primitive-class f bar } 3 } } table>condition ] unit-test
[ t ] [ foo <primitive-class> dup t replace-question ] unit-test
[ f ] [ foo <primitive-class> dup f replace-question ] unit-test
[ T{ primitive-class f foo } ] [ foo <primitive-class> bar <primitive-class> t replace-question ] unit-test
[ T{ primitive-class f foo } ] [ foo <primitive-class> bar <primitive-class> f replace-question ] unit-test
[ T{ primitive-class f foo } ] [ foo <primitive-class> bar <primitive-class> 2array <and-class> bar <primitive-class> t replace-question ] unit-test
[ T{ primitive-class f bar } ] [ foo <primitive-class> bar <primitive-class> 2array <and-class> foo <primitive-class> t replace-question ] unit-test
[ f ] [ foo <primitive-class> bar <primitive-class> 2array <and-class> foo <primitive-class> f replace-question ] unit-test
[ f ] [ foo <primitive-class> bar <primitive-class> 2array <and-class> bar <primitive-class> f replace-question ] unit-test
[ t ] [ foo <primitive-class> bar <primitive-class> 2array <or-class> bar <primitive-class> t replace-question ] unit-test
[ T{ primitive-class f foo } ] [ foo <primitive-class> bar <primitive-class> 2array <or-class> bar <primitive-class> f replace-question ] unit-test

View File

@ -2,7 +2,7 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel math math.order words combinators locals USING: accessors kernel math math.order words combinators locals
ascii unicode.categories combinators.short-circuit sequences ascii unicode.categories combinators.short-circuit sequences
fry macros arrays ; fry macros arrays assocs sets ;
IN: regexp.classes IN: regexp.classes
SINGLETONS: any-char any-char-no-nl SINGLETONS: any-char any-char-no-nl
@ -208,3 +208,57 @@ M: primitive-class class-member?
class>> class-member? ; class>> class-member? ;
UNION: class primitive-class not-class or-class and-class range ; UNION: class primitive-class not-class or-class and-class range ;
TUPLE: condition question yes no ;
C: <condition> condition
GENERIC# replace-question 2 ( class from to -- new-class )
M:: object replace-question ( class from to -- new-class )
class from = to class ? ;
: replace-compound ( class from to -- seq )
[ seq>> ] 2dip '[ _ _ replace-question ] map ;
M: and-class replace-question
replace-compound <and-class> ;
M: or-class replace-question
replace-compound <or-class> ;
M: not-class replace-question
class>> replace-question <not-class> ;
: answer ( table question answer -- new-table )
'[ [ _ _ replace-question ] dip ] assoc-map
[ drop ] assoc-filter ;
DEFER: make-condition
: (make-condition) ( table questions question -- condition )
[ 2nip ]
[ swap [ t answer ] dip make-condition ]
[ swap [ f answer ] dip make-condition ] 3tri
2dup = [ 2nip ] [ <condition> ] if ;
: make-condition ( table questions -- condition )
[ values ] [ unclip (make-condition) ] if-empty ;
GENERIC: class>questions ( class -- questions )
: compound-questions ( class -- questions ) seq>> [ class>questions ] gather ;
M: or-class class>questions compound-questions ;
M: and-class class>questions compound-questions ;
M: not-class class>questions class>> class>questions ;
M: object class>questions 1array ;
: table>questions ( table -- questions )
keys <and-class> class>questions t swap remove ;
: table>condition ( table -- condition )
>alist dup table>questions make-condition ;
: condition-map ( condition quot: ( obj -- obj' ) -- new-condition )
over condition? [
[ [ question>> ] [ yes>> ] [ no>> ] tri ] dip
'[ _ condition-map ] bi@ <condition>
] [ call ] if ; inline recursive

View File

@ -9,9 +9,17 @@ IN: regexp.compiler
: literals>cases ( literal-transitions -- case-body ) : literals>cases ( literal-transitions -- case-body )
[ 1quotation ] assoc-map ; [ 1quotation ] assoc-map ;
: condition>quot ( condition -- quot )
dup condition? [
[ question>> ] [ yes>> ] [ no>> ] tri
[ condition>quot ] bi@
'[ dup _ class-member? _ _ if ]
] [
[ [ 3drop ] ] [ '[ drop _ execute ] ] if-empty
] if ;
: non-literals>dispatch ( non-literal-transitions -- quot ) : non-literals>dispatch ( non-literal-transitions -- quot )
[ [ '[ dup _ class-member? ] ] [ '[ drop _ execute ] ] bi* ] assoc-map table>condition condition>quot ;
[ 3drop ] suffix '[ _ cond ] ;
: expand-one-or ( or-class transition -- alist ) : expand-one-or ( or-class transition -- alist )
[ seq>> ] dip '[ _ 2array ] map ; [ seq>> ] dip '[ _ 2array ] map ;
@ -36,7 +44,7 @@ IN: regexp.compiler
: transitions>quot ( transitions final-state? -- quot ) : transitions>quot ( transitions final-state? -- quot )
[ split-literals suffix ] dip [ split-literals suffix ] dip
'[ { array-capacity string } declare _ _ step ] ; '[ { array-capacity sequence } declare _ _ step ] ;
: word>quot ( word dfa -- quot ) : word>quot ( word dfa -- quot )
[ transitions>> at ] [ transitions>> at ]
@ -67,11 +75,12 @@ IN: regexp.compiler
: dfa>word ( dfa -- word ) : dfa>word ( dfa -- word )
states>words [ states>code ] keep start-state>> ; states>words [ states>code ] keep start-state>> ;
: check-string ( string -- string ) : check-sequence ( string -- string )
dup string? [ "String required" throw ] unless ; ! Make this configurable
dup sequence? [ "String required" throw ] unless ;
: run-regexp ( start-index string word -- ? ) : run-regexp ( start-index string word -- ? )
{ [ f ] [ >fixnum ] [ check-string ] [ execute ] } spread ; inline { [ f ] [ >fixnum ] [ check-sequence ] [ execute ] } spread ; inline
: dfa>quotation ( dfa -- quot ) : dfa>quotation ( dfa -- quot )
dfa>word '[ _ run-regexp ] ; dfa>word '[ _ run-regexp ] ;

View File

@ -8,9 +8,6 @@ IN: regexp.dfa
: find-delta ( states transition nfa -- new-states ) : find-delta ( states transition nfa -- new-states )
transitions>> '[ _ swap _ at at ] gather sift ; transitions>> '[ _ swap _ at at ] gather sift ;
TUPLE: condition question yes no ;
C: <condition> condition
:: epsilon-loop ( state table nfa question -- ) :: epsilon-loop ( state table nfa question -- )
state table at :> old-value state table at :> old-value
old-value question 2array <or-class> :> new-question old-value question 2array <or-class> :> new-question
@ -27,53 +24,12 @@ C: <condition> condition
] assoc-each ] assoc-each
] unless ; ] unless ;
GENERIC# replace-question 2 ( class from to -- new-class )
M: object replace-question
[ [ = ] keep ] dip swap ? ;
: replace-compound ( class from to -- seq )
[ seq>> ] 2dip '[ _ _ replace-question ] map ;
M: and-class replace-question
replace-compound <and-class> ;
M: or-class replace-question
replace-compound <or-class> ;
: answer ( table question answer -- new-table )
'[ _ _ replace-question ] assoc-map
[ nip ] assoc-filter ;
DEFER: make-condition
: (make-condition) ( table questions question -- condition )
[ 2nip ]
[ swap [ t answer ] dip make-condition ]
[ swap [ f answer ] dip make-condition ] 3tri
<condition> ;
: make-condition ( table questions -- condition )
[ keys ] [ unclip (make-condition) ] if-empty ;
GENERIC: class>questions ( class -- questions )
: compound-questions ( class -- questions ) seq>> [ class>questions ] gather ;
M: or-class class>questions compound-questions ;
M: and-class class>questions compound-questions ;
M: object class>questions 1array ;
: table>condition ( table -- condition )
! This is wrong, since actually an arbitrary and-class or or-class can be used
dup
values <or-class> class>questions t swap remove
make-condition ;
: epsilon-table ( states nfa -- table ) : epsilon-table ( states nfa -- table )
[ H{ } clone tuck ] dip [ H{ } clone tuck ] dip
'[ _ _ t epsilon-loop ] each ; '[ _ _ t epsilon-loop ] each ;
: find-epsilon-closure ( states nfa -- dfa-state ) : find-epsilon-closure ( states nfa -- dfa-state )
epsilon-table table>condition ; epsilon-table [ swap ] assoc-map table>condition ;
: find-closure ( states transition nfa -- new-states ) : find-closure ( states transition nfa -- new-states )
[ find-delta ] keep find-epsilon-closure ; [ find-delta ] keep find-epsilon-closure ;

View File

@ -2,13 +2,13 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: kernel sequences regexp.transition-tables fry assocs USING: kernel sequences regexp.transition-tables fry assocs
accessors locals math sorting arrays sets hashtables regexp.dfa accessors locals math sorting arrays sets hashtables regexp.dfa
combinators.short-circuit ; combinators.short-circuit regexp.classes ;
IN: regexp.minimize IN: regexp.minimize
: number-transitions ( transitions numbering -- new-transitions ) : number-transitions ( transitions numbering -- new-transitions )
dup '[ dup '[
[ _ at ] [ _ at ]
[ [ _ at ] assoc-map ] bi* [ [ [ _ at ] condition-map ] assoc-map ] bi*
] assoc-map ; ] assoc-map ;
: table>state-numbers ( table -- assoc ) : table>state-numbers ( table -- assoc )
@ -29,6 +29,9 @@ IN: regexp.minimize
dup table>state-numbers dup table>state-numbers
[ number-transitions ] rewrite-transitions ; [ number-transitions ] rewrite-transitions ;
: no-conditions? ( state transition-table -- ? )
transitions>> at values [ condition? ] any? not ;
: initially-same? ( s1 s2 transition-table -- ? ) : initially-same? ( s1 s2 transition-table -- ? )
{ {
[ drop <= ] [ drop <= ]
@ -39,7 +42,8 @@ IN: regexp.minimize
:: initialize-partitions ( transition-table -- partitions ) :: initialize-partitions ( transition-table -- partitions )
! Partition table is sorted-array => ? ! Partition table is sorted-array => ?
H{ } clone :> out H{ } clone :> out
transition-table transitions>> keys :> states transition-table transitions>> keys
[ transition-table no-conditions? ] filter :> states
states [| s1 | states [| s1 |
states [| s2 | states [| s2 |
s1 s2 transition-table initially-same? s1 s2 transition-table initially-same?

View File

@ -1,5 +1,5 @@
USING: regexp tools.test kernel sequences regexp.parser regexp.private USING: regexp tools.test kernel sequences regexp.parser regexp.private
regexp.traversal eval strings multiline accessors regexp.matchers ; eval strings multiline accessors regexp.matchers ;
IN: regexp-tests IN: regexp-tests
\ <regexp> must-infer \ <regexp> must-infer

View File

@ -3,7 +3,7 @@
USING: accessors combinators kernel math sequences strings sets USING: accessors combinators kernel math sequences strings sets
assocs prettyprint.backend prettyprint.custom make lexer assocs prettyprint.backend prettyprint.custom make lexer
namespaces parser arrays fry locals regexp.minimize namespaces parser arrays fry locals regexp.minimize
regexp.parser regexp.nfa regexp.dfa regexp.traversal regexp.parser regexp.nfa regexp.dfa
regexp.transition-tables splitting sorting regexp.ast regexp.transition-tables splitting sorting regexp.ast
regexp.negation regexp.matchers regexp.compiler ; regexp.negation regexp.matchers regexp.compiler ;
IN: regexp IN: regexp
@ -12,16 +12,16 @@ TUPLE: regexp
{ raw read-only } { raw read-only }
{ parse-tree read-only } { parse-tree read-only }
{ options read-only } { options read-only }
dfa reverse-dfa dfa-quot ; dfa reverse-dfa ;
: make-regexp ( string ast -- regexp ) : make-regexp ( string ast -- regexp )
f f <options> f f f regexp boa ; foldable f f <options> f f regexp boa ; foldable
! Foldable because, when the dfa slot is set, ! Foldable because, when the dfa slot is set,
! it'll be set to the same thing regardless of who sets it ! it'll be set to the same thing regardless of who sets it
: <optioned-regexp> ( string options -- regexp ) : <optioned-regexp> ( string options -- regexp )
[ dup parse-regexp ] [ string>options ] bi* [ dup parse-regexp ] [ string>options ] bi*
f f f regexp boa ; f f regexp boa ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ; : <regexp> ( string -- regexp ) "" <optioned-regexp> ;
@ -34,26 +34,25 @@ C: <reverse-matcher> reverse-matcher
[ parse-tree>> ] [ options>> ] bi <with-options> ; [ parse-tree>> ] [ options>> ] bi <with-options> ;
: compile-regexp ( regexp -- regexp ) : compile-regexp ( regexp -- regexp )
dup '[ [ _ get-ast ast>dfa ] unless* ] change-dfa ; dup '[ [ _ get-ast ast>dfa dfa>quotation ] unless* ] change-dfa ;
: compile-dfa-quot ( regexp -- regexp )
dup '[ [ _ compile-regexp dfa>> dfa>quotation ] unless* ] change-dfa-quot ;
: <reversed-option> ( ast -- reversed ) : <reversed-option> ( ast -- reversed )
"r" string>options <with-options> ; "r" string>options <with-options> ;
: compile-reverse ( regexp -- regexp ) : compile-reverse ( regexp -- regexp )
dup '[ [ _ get-ast <reversed-option> ast>dfa ] unless* ] change-reverse-dfa ; dup '[
[
_ get-ast <reversed-option>
ast>dfa dfa>quotation
] unless*
] change-reverse-dfa ;
M: regexp match-index-from ( string regexp -- index/f ) M: regexp match-index-from ( string regexp -- index/f )
dup dfa-quot>> compile-regexp dfa-quot>> <quot-matcher> match-index-from ;
[ <quot-matcher> ]
[ compile-regexp dfa>> <dfa-matcher> ] ?if
match-index-from ;
M: reverse-matcher match-index-from ( string regexp -- index/f ) M: reverse-matcher match-index-from ( string regexp -- index/f )
[ <reversed> ] [ regexp>> compile-reverse reverse-dfa>> ] bi* [ <reversed> ] [ regexp>> compile-reverse reverse-dfa>> ] bi*
<dfa-traverser> do-match match-index>> ; <quot-matcher> match-index-from ;
: find-regexp-syntax ( string -- prefix suffix ) : find-regexp-syntax ( string -- prefix suffix )
{ {