Lookaround and anchors work! (still need to fix some bugs)

db4
Daniel Ehrenberg 2009-03-05 16:34:04 -06:00
parent 39011fd062
commit a487ed0f32
12 changed files with 230 additions and 134 deletions

View File

@ -30,6 +30,7 @@ IN: regexp.classes.tests
[ T{ or-class { seq { 3 2 } } } ] [ { 2 3 } <or-class> 1 <not-class> 2array <and-class> ] unit-test [ T{ or-class { seq { 3 2 } } } ] [ { 2 3 } <or-class> 1 <not-class> 2array <and-class> ] unit-test
[ f ] [ t <not-class> ] unit-test [ f ] [ t <not-class> ] unit-test
[ t ] [ f <not-class> ] unit-test [ t ] [ f <not-class> ] unit-test
[ f ] [ 1 <not-class> 1 t replace-question ] unit-test
! Making classes into nested conditionals ! Making classes into nested conditionals
@ -43,7 +44,7 @@ IN: regexp.classes.tests
SYMBOL: foo SYMBOL: foo
SYMBOL: bar SYMBOL: bar
[ T{ condition f T{ primitive-class f bar } T{ condition f T{ primitive-class f foo } { 1 2 3 } { 1 3 } } T{ condition f T{ primitive-class f foo } { 1 2 } { 1 } } } ] [ { { 1 t } { 2 T{ primitive-class f foo } } { 3 T{ primitive-class f bar } } } table>condition ] unit-test [ T{ condition f T{ primitive-class f bar } T{ condition f T{ primitive-class f foo } { 1 3 2 } { 1 3 } } T{ condition f T{ primitive-class f foo } { 1 2 } { 1 } } } ] [ { { 1 t } { 3 T{ primitive-class f bar } } { 2 T{ primitive-class f foo } } } table>condition ] unit-test
[ t ] [ foo <primitive-class> dup t replace-question ] unit-test [ t ] [ foo <primitive-class> dup t replace-question ] unit-test
[ f ] [ foo <primitive-class> dup f replace-question ] unit-test [ f ] [ foo <primitive-class> dup f replace-question ] unit-test

View File

@ -12,7 +12,7 @@ ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class terminator-class word-boundary-class ; unmatchable-class terminator-class word-boundary-class ;
SINGLETONS: beginning-of-input ^ end-of-input $ ; SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ;
TUPLE: range from to ; TUPLE: range from to ;
C: <range> range C: <range> range
@ -233,7 +233,7 @@ M: or-class replace-question
replace-compound <or-class> ; replace-compound <or-class> ;
M: not-class replace-question M: not-class replace-question
class>> replace-question <not-class> ; [ class>> ] 2dip replace-question <not-class> ;
: answer ( table question answer -- new-table ) : answer ( table question answer -- new-table )
'[ _ _ replace-question ] assoc-map '[ _ _ replace-question ] assoc-map
@ -258,7 +258,7 @@ M: not-class class>questions class>> class>questions ;
M: object class>questions 1array ; M: object class>questions 1array ;
: table>questions ( table -- questions ) : table>questions ( table -- questions )
values <and-class> class>questions t swap remove ; values [ class>questions ] gather >array t swap remove ;
: table>condition ( table -- condition ) : table>condition ( table -- condition )
! input table is state => class ! input table is state => class
@ -269,3 +269,12 @@ M: object class>questions 1array ;
[ [ question>> ] [ yes>> ] [ no>> ] tri ] dip [ [ question>> ] [ yes>> ] [ no>> ] tri ] dip
'[ _ condition-map ] bi@ <condition> '[ _ condition-map ] bi@ <condition>
] [ call ] if ; inline recursive ] [ call ] if ; inline recursive
: condition-states ( condition -- states )
dup condition? [
[ yes>> ] [ no>> ] bi
[ condition-states ] bi@ append prune
] [ 1array ] if ;
: condition-at ( condition assoc -- new-condition )
'[ _ at ] condition-map ;

View File

@ -3,27 +3,76 @@
USING: regexp.classes kernel sequences regexp.negation USING: regexp.classes kernel sequences regexp.negation
quotations regexp.minimize assocs fry math locals combinators quotations regexp.minimize assocs fry math locals combinators
accessors words compiler.units kernel.private strings accessors words compiler.units kernel.private strings
sequences.private arrays regexp.matchers call ; sequences.private arrays regexp.matchers call namespaces
regexp.transition-tables combinators.short-circuit ;
IN: regexp.compiler IN: regexp.compiler
: literals>cases ( literal-transitions -- case-body ) GENERIC: question>quot ( question -- quot )
[ 1quotation ] assoc-map ;
<PRIVATE
SYMBOL: shortest?
SYMBOL: backwards?
M: t question>quot drop [ 2drop t ] ;
M: beginning-of-input question>quot
drop [ drop zero? ] ;
M: end-of-input question>quot
drop [ length = ] ;
M: end-of-file question>quot
drop [
{
[ length swap - 2 <= ]
[ swap tail { "\n" "\r\n" "\r" "" } member? ]
} 2&&
[ [ nip [ length ] keep ] when ] keep
] ;
M: $ question>quot
drop [ { [ length = ] [ ?nth "\r\n" member? ] } 2|| ] ;
M: ^ question>quot
drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ;
! Maybe the condition>quot things can be combined, given a suitable method
! for question>quot on classes, but maybe that'd make stack shuffling annoying
: execution-quot ( next-state -- quot )
! The conditions here are for lookaround and anchors, etc
dup condition? [
[ question>> question>quot ] [ yes>> ] [ no>> ] tri
[ execution-quot ] bi@
'[ 2dup @ _ _ if ]
] [
! There shouldn't be a condition like this!
dup sequence?
[ [ [ 2drop ] ] [ first '[ _ execute ] ] if-empty ]
[ '[ _ execute ] ] if
] if ;
TUPLE: box contents ;
C: <box> box
: condition>quot ( condition -- quot ) : condition>quot ( condition -- quot )
! Conditions here are for different classes
dup condition? [ dup condition? [
[ question>> ] [ yes>> ] [ no>> ] tri [ question>> ] [ yes>> ] [ no>> ] tri
[ condition>quot ] bi@ [ condition>quot ] bi@
'[ dup _ class-member? _ _ if ] '[ dup _ class-member? _ _ if ]
] [ ] [
[ [ 3drop ] ] [ '[ drop _ execute ] ] if-empty contents>>
[ [ 3drop ] ] [ execution-quot '[ drop @ ] ] if-empty
] if ; ] if ;
: new-non-literals>dispatch ( non-literal-transitions -- quot )
table>condition condition>quot ;
: non-literals>dispatch ( non-literal-transitions -- quot ) : non-literals>dispatch ( non-literal-transitions -- quot )
[ [ '[ dup _ class-member? ] ] [ '[ drop _ execute ] ] bi* ] assoc-map [ swap ] assoc-map ! we want state => predicate, and get the opposite as input
[ 3drop ] suffix '[ _ cond ] ; table>condition [ <box> ] condition-map condition>quot ;
: literals>cases ( literal-transitions -- case-body )
[ execution-quot ] assoc-map ;
: expand-one-or ( or-class transition -- alist ) : expand-one-or ( or-class transition -- alist )
[ seq>> ] dip '[ _ 2array ] map ; [ seq>> ] dip '[ _ 2array ] map ;
@ -38,17 +87,22 @@ IN: regexp.compiler
>alist expand-or [ first integer? ] partition >alist expand-or [ first integer? ] partition
[ literals>cases ] [ non-literals>dispatch ] bi* ; [ literals>cases ] [ non-literals>dispatch ] bi* ;
:: step ( last-match index str case-body final? -- last-index/f ) :: step ( last-match index str quot final? direction -- last-index/f )
final? index last-match ? final? index last-match ?
index str bounds-check? [ index str bounds-check? [
index 1+ str index direction + str
index str nth-unsafe index str nth-unsafe
case-body case quot call
] when ; inline ] when ; inline
: direction ( -- n )
backwards? get -1 1 ? ;
: transitions>quot ( transitions final-state? -- quot ) : transitions>quot ( transitions final-state? -- quot )
[ split-literals suffix ] dip dup shortest? get and [ 2drop [ drop nip ] ] [
'[ { array-capacity sequence } declare _ _ step ] ; [ split-literals swap case>quot ] dip direction
'[ { array-capacity string } declare _ _ _ step ]
] if ;
: word>quot ( word dfa -- quot ) : word>quot ( word dfa -- quot )
[ transitions>> at ] [ transitions>> at ]
@ -64,30 +118,37 @@ IN: regexp.compiler
] each ] each
] with-compilation-unit ; ] with-compilation-unit ;
: transitions-at ( transitions assoc -- new-transitions )
dup '[
[ _ at ]
[ [ _ at ] assoc-map ] bi*
] assoc-map ;
: states>words ( dfa -- words dfa ) : states>words ( dfa -- words dfa )
dup transitions>> keys [ gensym ] H{ } map>assoc dup transitions>> keys [ gensym ] H{ } map>assoc
[ [ transitions-at ] rewrite-transitions ] [ transitions-at ]
[ values ] [ values ]
bi swap ; bi swap ;
: dfa>word ( dfa -- word ) : dfa>word ( dfa -- word )
states>words [ states>code ] keep start-state>> ; states>words [ states>code ] keep start-state>> ;
: check-sequence ( string -- string ) : check-string ( string -- string )
! Make this configurable ! Make this configurable
dup sequence? [ "String required" throw ] unless ; dup string? [ "String required" throw ] unless ;
: run-regexp ( start-index string word -- ? ) : setup-regexp ( start-index string -- f start-index string )
{ [ f ] [ >fixnum ] [ check-sequence ] [ execute ] } spread ; inline [ f ] [ >fixnum ] [ check-string ] tri* ; inline
PRIVATE>
! The quotation returned is ( start-index string -- i/f )
: dfa>quotation ( dfa -- quot ) : dfa>quotation ( dfa -- quot )
dfa>word '[ _ run-regexp ] ; dfa>word execution-quot '[ setup-regexp @ ] ;
: dfa>shortest-quotation ( dfa -- quot )
t shortest? [ dfa>quotation ] with-variable ;
: dfa>reverse-quotation ( dfa -- quot )
t backwards? [ dfa>quotation ] with-variable ;
: dfa>reverse-shortest-quotation ( dfa -- quot )
t backwards? [ dfa>shortest-quotation ] with-variable ;
TUPLE: quot-matcher quot ; TUPLE: quot-matcher quot ;
C: <quot-matcher> quot-matcher C: <quot-matcher> quot-matcher

View File

@ -39,21 +39,26 @@ IN: regexp.dfa
: find-transitions ( dfa-state nfa -- next-dfa-state ) : find-transitions ( dfa-state nfa -- next-dfa-state )
transitions>> transitions>>
'[ _ at keys ] gather '[ _ at keys [ condition-states ] map concat ] gather
epsilon swap remove ; [ tagged-epsilon? not ] filter ;
: add-todo-state ( state visited-states new-states -- ) : add-todo-state ( state visited-states new-states -- )
3dup drop key? [ 3drop ] [ 3dup drop key? [ 3drop ] [
[ conjoin ] [ push ] bi-curry* bi [ conjoin ] [ push ] bi-curry* bi
] if ; ] if ;
: add-todo-states ( state/condition visited-states new-states -- )
[ condition-states ] 2dip
'[ _ _ add-todo-state ] each ;
:: new-transitions ( nfa dfa new-states visited-states -- nfa dfa ) :: new-transitions ( nfa dfa new-states visited-states -- nfa dfa )
new-states [ nfa dfa ] [ new-states [ nfa dfa ] [
pop :> state pop :> state
state dfa transitions>> maybe-initialize-key
state nfa find-transitions state nfa find-transitions
[| trans | [| trans |
state trans nfa find-closure :> new-state state trans nfa find-closure :> new-state
new-state visited-states new-states add-todo-state new-state visited-states new-states add-todo-states
state new-state trans dfa set-transition state new-state trans dfa set-transition
] each ] each
nfa dfa new-states visited-states new-transitions nfa dfa new-states visited-states new-transitions
@ -73,7 +78,7 @@ IN: regexp.dfa
: construct-dfa ( nfa -- dfa ) : construct-dfa ( nfa -- dfa )
dup initialize-dfa dup initialize-dfa
dup start-state>> 1vector dup start-state>> condition-states >vector
H{ } clone H{ } clone
new-transitions new-transitions
[ set-final-states ] keep ; [ set-final-states ] keep ;

View File

@ -42,6 +42,3 @@ TUPLE: parts in out ;
] preserving-epsilon ] preserving-epsilon
] assoc-map ] assoc-map
] change-transitions ; ] change-transitions ;
: nfa>dfa ( nfa -- dfa )
disambiguate construct-dfa minimize ;

View File

@ -5,29 +5,11 @@ accessors locals math sorting arrays sets hashtables regexp.dfa
combinators.short-circuit regexp.classes ; combinators.short-circuit regexp.classes ;
IN: regexp.minimize IN: regexp.minimize
: number-transitions ( transitions numbering -- new-transitions )
dup '[
[ _ at ]
[ [ [ _ at ] condition-map ] assoc-map ] bi*
] assoc-map ;
: table>state-numbers ( table -- assoc ) : table>state-numbers ( table -- assoc )
transitions>> keys <enum> [ swap ] H{ } assoc-map-as ; transitions>> keys <enum> [ swap ] H{ } assoc-map-as ;
: map-set ( assoc quot -- new-assoc )
'[ drop @ dup ] assoc-map ; inline
: rewrite-transitions ( transition-table assoc quot -- transition-table )
[
[ clone ] dip
[ '[ _ at ] change-start-state ]
[ '[ [ _ at ] map-set ] change-final-states ]
[ ] tri
] dip '[ _ @ ] change-transitions ; inline
: number-states ( table -- newtable ) : number-states ( table -- newtable )
dup table>state-numbers dup table>state-numbers transitions-at ;
[ number-transitions ] rewrite-transitions ;
: no-conditions? ( state transition-table -- ? ) : no-conditions? ( state transition-table -- ? )
transitions>> at values [ condition? ] any? not ; transitions>> at values [ condition? ] any? not ;
@ -103,4 +85,4 @@ IN: regexp.minimize
[ combine-transitions ] rewrite-transitions ; [ combine-transitions ] rewrite-transitions ;
: minimize ( table -- minimal-table ) : minimize ( table -- minimal-table )
clone number-states combine-states ; clone number-states ; ! combine-states ;

View File

@ -2,11 +2,12 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: regexp.nfa regexp.disambiguate kernel sequences USING: regexp.nfa regexp.disambiguate kernel sequences
assocs regexp.classes hashtables accessors fry vectors assocs regexp.classes hashtables accessors fry vectors
regexp.ast regexp.transition-tables regexp.minimize namespaces ; regexp.ast regexp.transition-tables regexp.minimize
regexp.dfa namespaces ;
IN: regexp.negation IN: regexp.negation
: ast>dfa ( parse-tree -- minimal-dfa ) : ast>dfa ( parse-tree -- minimal-dfa )
construct-nfa nfa>dfa ; construct-nfa disambiguate construct-dfa minimize ;
CONSTANT: fail-state -1 CONSTANT: fail-state -1
@ -33,15 +34,9 @@ CONSTANT: fail-state -1
[ add-fail-state ] change-transitions [ add-fail-state ] change-transitions
dup inverse-final-states >>final-states ; dup inverse-final-states >>final-states ;
: renumber-transitions ( transitions numbering -- new-transitions )
dup '[
[ _ at ]
[ [ [ _ at ] map ] assoc-map ] bi*
] assoc-map ;
: renumber-states ( transition-table -- transition-table ) : renumber-states ( transition-table -- transition-table )
dup transitions>> keys [ next-state ] H{ } map>assoc dup transitions>> keys [ next-state ] H{ } map>assoc
[ renumber-transitions ] rewrite-transitions ; transitions-at ;
: box-transitions ( transition-table -- transition-table ) : box-transitions ( transition-table -- transition-table )
[ [ [ 1vector ] assoc-map ] assoc-map ] change-transitions ; [ [ [ 1vector ] assoc-map ] assoc-map ] change-transitions ;

View File

@ -56,9 +56,16 @@ M:: star nfa-node ( node -- start end )
s2 s3 ; s2 s3 ;
GENERIC: modify-epsilon ( tag -- newtag ) GENERIC: modify-epsilon ( tag -- newtag )
! Potential off-by-one errors when lookaround nested in lookbehind
M: object modify-epsilon ; M: object modify-epsilon ;
M: $ modify-epsilon
multiline option? [ drop end-of-input ] unless ;
M: ^ modify-epsilon
multiline option? [ drop beginning-of-input ] unless ;
M: tagged-epsilon nfa-node M: tagged-epsilon nfa-node
clone [ modify-epsilon ] change-tag add-simple-entry ; clone [ modify-epsilon ] change-tag add-simple-entry ;

View File

@ -54,6 +54,7 @@ ERROR: bad-class name ;
{ CHAR: D [ digit-class <primitive-class> <not-class> ] } { CHAR: D [ digit-class <primitive-class> <not-class> ] }
{ CHAR: z [ end-of-input <tagged-epsilon> ] } { CHAR: z [ end-of-input <tagged-epsilon> ] }
{ CHAR: Z [ end-of-file <tagged-epsilon> ] }
{ CHAR: A [ beginning-of-input <tagged-epsilon> ] } { CHAR: A [ beginning-of-input <tagged-epsilon> ] }
[ ] [ ]
} case ; } case ;

View File

@ -45,9 +45,9 @@ IN: regexp-tests
! Dotall mode -- when on, . matches newlines. ! Dotall mode -- when on, . matches newlines.
! Off by default. ! Off by default.
[ f ] [ "\n" "." <regexp> matches? ] unit-test [ f ] [ "\n" "." <regexp> matches? ] unit-test
! [ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test [ t ] [ "\n" "(?s:.)" <regexp> matches? ] unit-test
[ t ] [ "\n" R/ ./s matches? ] unit-test [ t ] [ "\n" R/ ./s matches? ] unit-test
! [ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test [ f ] [ "\n\n" "(?s:.)." <regexp> matches? ] unit-test
[ f ] [ "" ".+" <regexp> matches? ] unit-test [ f ] [ "" ".+" <regexp> matches? ] unit-test
[ t ] [ "a" ".+" <regexp> matches? ] unit-test [ t ] [ "a" ".+" <regexp> matches? ] unit-test
@ -221,17 +221,15 @@ IN: regexp-tests
[ t ] [ "c" R/ [A-Z]/i matches? ] unit-test [ t ] [ "c" R/ [A-Z]/i matches? ] unit-test
[ f ] [ "3" R/ [A-Z]/i matches? ] unit-test [ f ] [ "3" R/ [A-Z]/i matches? ] unit-test
/* [ t ] [ "a" "(?i:a)" <regexp> matches? ] unit-test
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test [ t ] [ "a" "(?i:a)" <regexp> matches? ] unit-test
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test [ t ] [ "A" "(?i:a)" <regexp> matches? ] unit-test
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test [ t ] [ "A" "(?i:a)" <regexp> matches? ] unit-test
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test [ t ] [ "a" R/ (?-i:a)/i matches? ] unit-test
[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test [ t ] [ "a" R/ (?-i:a)/i matches? ] unit-test
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test [ f ] [ "A" R/ (?-i:a)/i matches? ] unit-test
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test [ f ] [ "A" R/ (?-i:a)/i matches? ] unit-test
*/
[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test [ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
[ t ] [ "A" R/ [a-z]/i matches? ] unit-test [ t ] [ "A" R/ [a-z]/i matches? ] unit-test
@ -242,8 +240,8 @@ IN: regexp-tests
[ t ] [ "abc" reverse R/ abc/r matches? ] unit-test [ t ] [ "abc" reverse R/ abc/r matches? ] unit-test
[ t ] [ "abc" reverse R/ a[bB][cC]/r matches? ] unit-test [ t ] [ "abc" reverse R/ a[bB][cC]/r matches? ] unit-test
[ t ] [ "xabc" R/ abc/ <reverse-matcher> match-index-head >boolean ] unit-test [ t ] [ 3 "xabc" R/ abc/ <reverse-matcher> match-index-from >boolean ] unit-test
[ t ] [ "xabc" R/ a[bB][cC]/ <reverse-matcher> match-index-head >boolean ] unit-test [ t ] [ 3 "xabc" R/ a[bB][cC]/ <reverse-matcher> match-index-from >boolean ] unit-test
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
@ -276,10 +274,6 @@ IN: regexp-tests
[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> match-head >string ] unit-test [ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" <regexp> match-head >string ] unit-test
! [ t ] [ "a:b" ".+:?" <regexp> matches? ] unit-test
! [ 1 ] [ "hello" ".+?" <regexp> match length ] unit-test
[ { "1" "2" "3" "4" } ] [ { "1" "2" "3" "4" } ]
[ "1ABC2DEF3GHI4" R/ [A-Z]+/ re-split [ >string ] map ] unit-test [ "1ABC2DEF3GHI4" R/ [A-Z]+/ re-split [ >string ] map ] unit-test
@ -304,18 +298,16 @@ IN: regexp-tests
[ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test [ "-- title --" ] [ "== title ==" R/ =/ "-" re-replace ] unit-test
/*
[ f ] [ "ab" "a(?!b)" <regexp> match-head ] unit-test [ f ] [ "ab" "a(?!b)" <regexp> match-head ] unit-test
[ "a" ] [ "ac" "a(?!b)" <regexp> match-head >string ] unit-test [ "a" ] [ "ac" "a(?!b)" <regexp> match-head >string ] unit-test
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> match-head >string ] unit-test [ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> match-head >string ] unit-test
[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> match-head >string ] unit-test [ "a" ] [ "ba" "(?<=b)(?<=b)a" <regexp> match-head >string ] unit-test
[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> match-head >string ] unit-test [ "a" ] [ "cab" "(?<=c)a(?=b)" <regexp> match-head >string ] unit-test
[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-index-head ] unit-test [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-index-head ] unit-test
[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-index-head ] unit-test [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-index-head ] unit-test
*/
! Bug in parsing word ! Bug in parsing word
[ t ] [ "a" R' a' matches? ] unit-test [ t ] [ "a" R' a' matches? ] unit-test
@ -349,56 +341,70 @@ IN: regexp-tests
[ f ] [ "foo" <regexp> dfa>> >boolean ] unit-test [ f ] [ "foo" <regexp> dfa>> >boolean ] unit-test
[ t ] [ R/ foo/ dfa>> >boolean ] unit-test [ t ] [ R/ foo/ dfa>> >boolean ] unit-test
! [ t ] [ "a" R/ ^a/ matches? ] unit-test [ t ] [ "a" R/ ^a/ matches? ] unit-test
! [ f ] [ "\na" R/ ^a/ matches? ] unit-test [ f ] [ "\na" R/ ^a/ matches? ] unit-test
! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
! [ f ] [ "\ra" R/ ^a/ matches? ] unit-test [ f ] [ "\ra" R/ ^a/ matches? ] unit-test
! [ t ] [ "a" R/ a$/ matches? ] unit-test [ 1 ] [ "a" R/ ^a/ count-matches ] unit-test
! [ f ] [ "a\n" R/ a$/ matches? ] unit-test [ 0 ] [ "\na" R/ ^a/ count-matches ] unit-test
! [ f ] [ "a\r" R/ a$/ matches? ] unit-test [ 0 ] [ "\r\na" R/ ^a/ count-matches ] unit-test
! [ f ] [ "a\r\n" R/ a$/ matches? ] unit-test [ 0 ] [ "\ra" R/ ^a/ count-matches ] unit-test
! [ t ] [ "a" R/ a$|b$/ matches? ] unit-test [ t ] [ "a" R/ a$/ matches? ] unit-test
! [ t ] [ "b" R/ a$|b$/ matches? ] unit-test [ f ] [ "a\n" R/ a$/ matches? ] unit-test
! [ t ] [ "ab" R/ a$|b$/ matches? ] unit-test [ f ] [ "a\r" R/ a$/ matches? ] unit-test
! [ t ] [ "ba" R/ ba$|b$/ matches? ] unit-test [ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
! [ t ] [ "a" R/ \Aa/ matches? ] unit-test [ 1 ] [ "a" R/ a$/ count-matches ] unit-test
! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test [ 0 ] [ "a\n" R/ a$/ count-matches ] unit-test
! [ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test [ 0 ] [ "a\r" R/ a$/ count-matches ] unit-test
! [ f ] [ "\ra" R/ \Aa/ matches? ] unit-test [ 0 ] [ "a\r\n" R/ a$/ count-matches ] unit-test
! [ t ] [ "a" R/ \Aa/m matches? ] unit-test [ t ] [ "a" R/ a$|b$/ matches? ] unit-test
! [ f ] [ "\na" R/ \Aaa/m matches? ] unit-test [ t ] [ "b" R/ a$|b$/ matches? ] unit-test
! [ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test [ f ] [ "ab" R/ a$|b$/ matches? ] unit-test
! [ f ] [ "\ra" R/ \Aa/m matches? ] unit-test [ t ] [ "ba" R/ ba$|b$/ matches? ] unit-test
! [ t ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test [ t ] [ "a" R/ \Aa/ matches? ] unit-test
[ f ] [ "\na" R/ \Aaa/ matches? ] unit-test
[ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test
[ f ] [ "\ra" R/ \Aa/ matches? ] unit-test
! [ t ] [ "a" R/ \Aa\z/m matches? ] unit-test [ t ] [ "a" R/ \Aa/m matches? ] unit-test
! [ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test [ f ] [ "\na" R/ \Aaa/m matches? ] unit-test
[ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test
[ f ] [ "\ra" R/ \Aa/m matches? ] unit-test
[ 0 ] [ "\ra" R/ \Aa/m count-matches ] unit-test
! [ t ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test [ f ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test
! [ t ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test [ 1 ] [ "\r\n\n\n\nam" R/ ^am/m count-matches ] unit-test
! [ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test [ t ] [ "a" R/ \Aa\z/m matches? ] unit-test
! [ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test [ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test
! [ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test
! [ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test
! [ t ] [ "a" R/ ^a/m matches? ] unit-test [ t ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test
! [ t ] [ "\na" R/ ^a/m matches? ] unit-test [ t ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
! [ t ] [ "a" R/ a$/m matches? ] unit-test [ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test
! [ t ] [ "a\n" R/ a$/m matches? ] unit-test [ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test
! [ t ] [ "a\r" R/ a$/m matches? ] unit-test [ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test
! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test [ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test
! [ f ] [ "foobxr" "foo\\z" <regexp> match-index-head ] unit-test [ t ] [ "a" R/ ^a/m matches? ] unit-test
! [ 3 ] [ "foo" "foo\\z" <regexp> match-index-head ] unit-test [ f ] [ "\na" R/ ^a/m matches? ] unit-test
[ 1 ] [ "\na" R/ ^a/m count-matches ] unit-test
[ 1 ] [ "\r\na" R/ ^a/m count-matches ] unit-test
[ 1 ] [ "\ra" R/ ^a/m count-matches ] unit-test
[ t ] [ "a" R/ a$/m matches? ] unit-test
[ f ] [ "a\n" R/ a$/m matches? ] unit-test
[ 1 ] [ "a\n" R/ a$/m count-matches ] unit-test
[ 1 ] [ "a\r" R/ a$/m count-matches ] unit-test
[ 1 ] [ "a\r\n" R/ a$/m count-matches ] unit-test
[ f ] [ "foobxr" "foo\\z" <regexp> match-index-head ] unit-test
[ 3 ] [ "foo" "foo\\z" <regexp> match-index-head ] unit-test
! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test ! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test ! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test

View File

@ -3,7 +3,7 @@
USING: accessors combinators kernel math sequences strings sets USING: accessors combinators kernel math sequences strings sets
assocs prettyprint.backend prettyprint.custom make lexer assocs prettyprint.backend prettyprint.custom make lexer
namespaces parser arrays fry locals regexp.minimize namespaces parser arrays fry locals regexp.minimize
regexp.parser regexp.nfa regexp.dfa regexp.parser regexp.nfa regexp.dfa regexp.classes
regexp.transition-tables splitting sorting regexp.ast regexp.transition-tables splitting sorting regexp.ast
regexp.negation regexp.matchers regexp.compiler ; regexp.negation regexp.matchers regexp.compiler ;
IN: regexp IN: regexp
@ -27,6 +27,7 @@ TUPLE: regexp
TUPLE: reverse-matcher regexp ; TUPLE: reverse-matcher regexp ;
C: <reverse-matcher> reverse-matcher C: <reverse-matcher> reverse-matcher
! Reverse matchers won't work properly with most combinators, for now
<PRIVATE <PRIVATE
@ -39,21 +40,31 @@ C: <reverse-matcher> reverse-matcher
: <reversed-option> ( ast -- reversed ) : <reversed-option> ( ast -- reversed )
"r" string>options <with-options> ; "r" string>options <with-options> ;
M: lookahead question>quot ! Returns ( index string -- ? )
term>> ast>dfa dfa>shortest-quotation ;
M: lookbehind question>quot ! Returns ( index string -- ? )
term>> <reversed-option>
ast>dfa dfa>reverse-shortest-quotation
[ [ 1- ] dip ] prepose ;
: compile-reverse ( regexp -- regexp ) : compile-reverse ( regexp -- regexp )
dup '[ dup '[
[ [
_ get-ast <reversed-option> _ get-ast <reversed-option>
ast>dfa dfa>quotation ast>dfa dfa>reverse-quotation
] unless* ] unless*
] change-reverse-dfa ; ] change-reverse-dfa ;
M: regexp match-index-from ( string regexp -- index/f ) M: regexp match-index-from
compile-regexp dfa>> <quot-matcher> match-index-from ; compile-regexp dfa>> <quot-matcher> match-index-from ;
M: reverse-matcher match-index-from ( string regexp -- index/f ) M: reverse-matcher match-index-from
[ <reversed> ] [ regexp>> compile-reverse reverse-dfa>> ] bi* regexp>> compile-reverse reverse-dfa>>
<quot-matcher> match-index-from ; <quot-matcher> match-index-from ;
! The following two should do some caching
: find-regexp-syntax ( string -- prefix suffix ) : find-regexp-syntax ( string -- prefix suffix )
{ {
{ "R/ " "/" } { "R/ " "/" }

View File

@ -1,7 +1,7 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs fry hashtables kernel sequences USING: accessors arrays assocs fry hashtables kernel sequences
vectors locals ; vectors locals regexp.classes ;
IN: regexp.transition-tables IN: regexp.transition-tables
TUPLE: transition-table transitions start-state final-states ; TUPLE: transition-table transitions start-state final-states ;
@ -12,10 +12,11 @@ TUPLE: transition-table transitions start-state final-states ;
H{ } clone >>final-states ; H{ } clone >>final-states ;
: maybe-initialize-key ( key hashtable -- ) : maybe-initialize-key ( key hashtable -- )
! Why do we have to do this?
2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ; 2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ;
:: (set-transition) ( from to obj hash -- ) :: (set-transition) ( from to obj hash -- )
to hash maybe-initialize-key to condition? [ to hash maybe-initialize-key ] unless
from hash at from hash at
[ [ to obj ] dip set-at ] [ [ to obj ] dip set-at ]
[ to obj associate from hash set-at ] if* ; [ to obj associate from hash set-at ] if* ;
@ -31,3 +32,23 @@ TUPLE: transition-table transitions start-state final-states ;
: add-transition ( from to obj transition-table -- ) : add-transition ( from to obj transition-table -- )
transitions>> (add-transition) ; transitions>> (add-transition) ;
: map-set ( assoc quot -- new-assoc )
'[ drop @ dup ] assoc-map ; inline
: rewrite-transitions ( transition-table assoc quot -- transition-table )
[
[ clone ] dip
[ '[ _ condition-at ] change-start-state ]
[ '[ [ _ at ] map-set ] change-final-states ]
[ ] tri
] dip '[ _ @ ] change-transitions ; inline
: number-transitions ( transitions numbering -- new-transitions )
dup '[
[ _ at ]
[ [ _ condition-at ] assoc-map ] bi*
] assoc-map ;
: transitions-at ( transitions numbering -- transitions )
[ number-transitions ] rewrite-transitions ;