Various regexp cleanups, and compiler from regexp to quotations
parent
9a015f56ac
commit
85432bd267
|
@ -0,0 +1,65 @@
|
||||||
|
! Copyright (C) 2009 Daniel Ehrenberg.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: regexp regexp.private regexp.classes kernel sequences regexp.negation
|
||||||
|
quotations regexp.minimize assocs fry math locals combinators
|
||||||
|
accessors words compiler.units ;
|
||||||
|
IN: regexp.compiler
|
||||||
|
|
||||||
|
: literals>cases ( literal-transitions -- case-body )
|
||||||
|
[ 1quotation ] assoc-map ;
|
||||||
|
|
||||||
|
: non-literals>dispatch ( non-literal-transitions -- quot )
|
||||||
|
[ [ '[ dup _ class-member? ] ] [ 1quotation ] bi* ] assoc-map
|
||||||
|
[ 3drop f ] suffix '[ _ cond ] ;
|
||||||
|
|
||||||
|
: split-literals ( transitions -- case default )
|
||||||
|
! Convert disjunction of literals to literals. Also maybe small ranges.
|
||||||
|
>alist [ first integer? ] partition
|
||||||
|
[ literals>cases ] [ non-literals>dispatch ] bi* ;
|
||||||
|
|
||||||
|
USING: kernel.private strings sequences.private ;
|
||||||
|
|
||||||
|
:: step ( index str case-body final? -- match? )
|
||||||
|
index str bounds-check? [
|
||||||
|
index 1+ str
|
||||||
|
index str nth-unsafe
|
||||||
|
case-body case
|
||||||
|
] [ final? ] if ; inline
|
||||||
|
|
||||||
|
: transitions>quot ( transitions final-state? -- quot )
|
||||||
|
[ split-literals suffix ] dip
|
||||||
|
'[ { array-capacity string } declare _ _ step ] ;
|
||||||
|
|
||||||
|
: word>quot ( word dfa -- quot )
|
||||||
|
[ transitions>> at ]
|
||||||
|
[ final-states>> key? ] 2bi
|
||||||
|
transitions>quot ;
|
||||||
|
|
||||||
|
: states>code ( words dfa -- )
|
||||||
|
'[
|
||||||
|
[
|
||||||
|
dup _ word>quot
|
||||||
|
(( index string -- ? )) define-declared
|
||||||
|
] each
|
||||||
|
] with-compilation-unit ;
|
||||||
|
|
||||||
|
: transitions-at ( transitions assoc -- new-transitions )
|
||||||
|
dup '[
|
||||||
|
[ _ at ]
|
||||||
|
[ [ _ at ] assoc-map ] bi*
|
||||||
|
] assoc-map ;
|
||||||
|
|
||||||
|
: states>words ( dfa -- words dfa )
|
||||||
|
dup transitions>> keys [ gensym ] H{ } map>assoc
|
||||||
|
[ [ transitions-at ] rewrite-transitions ]
|
||||||
|
[ values ]
|
||||||
|
bi swap ;
|
||||||
|
|
||||||
|
: dfa>word ( dfa -- word )
|
||||||
|
states>words [ states>code ] keep start-state>> ;
|
||||||
|
|
||||||
|
: run-regexp ( string word -- ? )
|
||||||
|
[ 0 ] 2dip execute ; inline
|
||||||
|
|
||||||
|
: regexp>quotation ( regexp -- quot )
|
||||||
|
compile-regexp dfa>> dfa>word '[ _ run-regexp ] ;
|
|
@ -49,7 +49,7 @@ IN: regexp.dfa
|
||||||
[| trans |
|
[| trans |
|
||||||
state trans nfa find-closure :> new-state
|
state trans nfa find-closure :> new-state
|
||||||
new-state visited-states new-states add-todo-state
|
new-state visited-states new-states add-todo-state
|
||||||
state new-state trans transition make-transition dfa add-transition
|
state new-state trans dfa add-transition
|
||||||
] each
|
] each
|
||||||
nfa dfa new-states visited-states new-transitions
|
nfa dfa new-states visited-states new-transitions
|
||||||
] if-empty ;
|
] if-empty ;
|
||||||
|
|
|
@ -48,7 +48,7 @@ CONSTANT: fail-state -1
|
||||||
|
|
||||||
: unify-final-state ( transition-table -- transition-table )
|
: unify-final-state ( transition-table -- transition-table )
|
||||||
dup [ final-states>> keys ] keep
|
dup [ final-states>> keys ] keep
|
||||||
'[ -2 epsilon <literal-transition> _ add-transition ] each
|
'[ -2 epsilon _ add-transition ] each
|
||||||
H{ { -2 -2 } } >>final-states ;
|
H{ { -2 -2 } } >>final-states ;
|
||||||
|
|
||||||
: adjoin-dfa ( transition-table -- start end )
|
: adjoin-dfa ( transition-table -- start end )
|
||||||
|
|
|
@ -51,12 +51,12 @@ SYMBOL: nfa-table
|
||||||
|
|
||||||
GENERIC: nfa-node ( node -- start-state end-state )
|
GENERIC: nfa-node ( node -- start-state end-state )
|
||||||
|
|
||||||
: add-simple-entry ( obj class -- start-state end-state )
|
: add-simple-entry ( obj -- start-state end-state )
|
||||||
[ next-state next-state 2dup ] 2dip
|
[ next-state next-state 2dup ] dip
|
||||||
make-transition nfa-table get add-transition ;
|
nfa-table get add-transition ;
|
||||||
|
|
||||||
: epsilon-transition ( source target -- )
|
: epsilon-transition ( source target -- )
|
||||||
epsilon <literal-transition> nfa-table get add-transition ;
|
epsilon nfa-table get add-transition ;
|
||||||
|
|
||||||
M:: star nfa-node ( node -- start end )
|
M:: star nfa-node ( node -- start end )
|
||||||
node term>> nfa-node :> s1 :> s0
|
node term>> nfa-node :> s1 :> s0
|
||||||
|
@ -69,7 +69,7 @@ M:: star nfa-node ( node -- start end )
|
||||||
s2 s3 ;
|
s2 s3 ;
|
||||||
|
|
||||||
M: tagged-epsilon nfa-node
|
M: tagged-epsilon nfa-node
|
||||||
literal-transition add-simple-entry ;
|
add-simple-entry ;
|
||||||
|
|
||||||
M: concatenation nfa-node ( node -- start end )
|
M: concatenation nfa-node ( node -- start end )
|
||||||
[ first>> ] [ second>> ] bi
|
[ first>> ] [ second>> ] bi
|
||||||
|
@ -103,9 +103,7 @@ M: integer modify-class
|
||||||
] when ;
|
] when ;
|
||||||
|
|
||||||
M: integer nfa-node ( node -- start end )
|
M: integer nfa-node ( node -- start end )
|
||||||
modify-class dup class?
|
modify-class add-simple-entry ;
|
||||||
class-transition literal-transition ?
|
|
||||||
add-simple-entry ;
|
|
||||||
|
|
||||||
M: primitive-class modify-class
|
M: primitive-class modify-class
|
||||||
class>> modify-class <primitive-class> ;
|
class>> modify-class <primitive-class> ;
|
||||||
|
@ -141,7 +139,7 @@ M: range modify-class
|
||||||
] when ;
|
] when ;
|
||||||
|
|
||||||
M: class nfa-node
|
M: class nfa-node
|
||||||
modify-class class-transition add-simple-entry ;
|
modify-class add-simple-entry ;
|
||||||
|
|
||||||
M: with-options nfa-node ( node -- start end )
|
M: with-options nfa-node ( node -- start end )
|
||||||
dup options>> [ tree>> nfa-node ] using-options ;
|
dup options>> [ tree>> nfa-node ] using-options ;
|
||||||
|
|
|
@ -240,7 +240,9 @@ IN: regexp-tests
|
||||||
|
|
||||||
[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
|
[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
|
||||||
[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
|
[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
|
||||||
! [ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/ matches? ] unit-test ! FIXME
|
|
||||||
|
[ t ] [ "xabc" R/ abc/ match-from-end >boolean ] unit-test
|
||||||
|
[ t ] [ "xabc" R/ a[bB][cC]/ match-from-end >boolean ] unit-test
|
||||||
|
|
||||||
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
||||||
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
||||||
|
|
|
@ -12,38 +12,48 @@ TUPLE: regexp
|
||||||
{ raw read-only }
|
{ raw read-only }
|
||||||
{ parse-tree read-only }
|
{ parse-tree read-only }
|
||||||
{ options read-only }
|
{ options read-only }
|
||||||
dfa ;
|
dfa reverse-dfa ;
|
||||||
|
|
||||||
: make-regexp ( string ast -- regexp )
|
: make-regexp ( string ast -- regexp )
|
||||||
f f <options> f regexp boa ; foldable
|
f f <options> f f regexp boa ; foldable
|
||||||
! Foldable because, when the dfa slot is set,
|
! Foldable because, when the dfa slot is set,
|
||||||
! it'll be set to the same thing regardless of who sets it
|
! it'll be set to the same thing regardless of who sets it
|
||||||
|
|
||||||
: <optioned-regexp> ( string options -- regexp )
|
: <optioned-regexp> ( string options -- regexp )
|
||||||
[ dup parse-regexp ] [ string>options ] bi*
|
[ dup parse-regexp ] [ string>options ] bi*
|
||||||
f regexp boa ;
|
f f regexp boa ;
|
||||||
|
|
||||||
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
|
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
||||||
|
: get-ast ( regexp -- ast )
|
||||||
|
[ parse-tree>> ] [ options>> ] bi <with-options> ;
|
||||||
|
|
||||||
: compile-regexp ( regexp -- regexp )
|
: compile-regexp ( regexp -- regexp )
|
||||||
dup dfa>> [
|
dup '[ [ _ get-ast ast>dfa ] unless* ] change-dfa ;
|
||||||
dup
|
|
||||||
[ parse-tree>> ]
|
: <reversed-option> ( ast -- reversed )
|
||||||
[ options>> ] bi
|
"r" string>options <with-options> ;
|
||||||
<with-options> ast>dfa
|
|
||||||
>>dfa
|
: compile-reverse ( regexp -- regexp )
|
||||||
] unless ;
|
dup '[ [ _ get-ast <reversed-option> ast>dfa ] unless* ] change-reverse-dfa ;
|
||||||
|
|
||||||
: (match) ( string regexp -- dfa-traverser )
|
: (match) ( string regexp -- dfa-traverser )
|
||||||
compile-regexp dfa>> <dfa-traverser> do-match ; inline
|
compile-regexp dfa>> <dfa-traverser> do-match ;
|
||||||
|
|
||||||
|
: (match-reversed) ( string regexp -- dfa-traverser )
|
||||||
|
[ <reversed> ] [ compile-reverse reverse-dfa>> ] bi*
|
||||||
|
<dfa-traverser> do-match ;
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
: match ( string regexp -- slice/f )
|
: match ( string regexp -- slice/f )
|
||||||
(match) return-match ;
|
(match) return-match ;
|
||||||
|
|
||||||
|
: match-from-end ( string regexp -- slice/f )
|
||||||
|
(match-reversed) return-match ;
|
||||||
|
|
||||||
: matches? ( string regexp -- ? )
|
: matches? ( string regexp -- ? )
|
||||||
dupd match
|
dupd match
|
||||||
[ [ length ] bi@ = ] [ drop f ] if* ;
|
[ [ length ] bi@ = ] [ drop f ] if* ;
|
||||||
|
@ -109,11 +119,18 @@ PRIVATE>
|
||||||
{ "R| " "|" }
|
{ "R| " "|" }
|
||||||
} swap [ subseq? not nip ] curry assoc-find drop ;
|
} swap [ subseq? not nip ] curry assoc-find drop ;
|
||||||
|
|
||||||
|
: take-until ( end lexer -- string )
|
||||||
|
dup skip-blank [
|
||||||
|
[ index-from ] 2keep
|
||||||
|
[ swapd subseq ]
|
||||||
|
[ 2drop 1+ ] 3bi
|
||||||
|
] change-lexer-column ;
|
||||||
|
|
||||||
|
: parse-noblank-token ( lexer -- str/f )
|
||||||
|
dup still-parsing-line? [ (parse-token) ] [ drop f ] if ;
|
||||||
|
|
||||||
: parsing-regexp ( accum end -- accum )
|
: parsing-regexp ( accum end -- accum )
|
||||||
lexer get dup skip-blank
|
lexer get [ take-until ] [ parse-noblank-token ] bi
|
||||||
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
|
|
||||||
lexer get dup still-parsing-line?
|
|
||||||
[ (parse-token) ] [ drop f ] if
|
|
||||||
<optioned-regexp> compile-regexp parsed ;
|
<optioned-regexp> compile-regexp parsed ;
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
|
@ -1,32 +1,9 @@
|
||||||
! Copyright (C) 2008 Doug Coleman.
|
! Copyright (C) 2008 Doug Coleman.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors arrays assocs fry hashtables kernel sequences
|
USING: accessors arrays assocs fry hashtables kernel sequences
|
||||||
vectors ;
|
vectors locals ;
|
||||||
IN: regexp.transition-tables
|
IN: regexp.transition-tables
|
||||||
|
|
||||||
TUPLE: transition from to obj ;
|
|
||||||
TUPLE: literal-transition < transition ;
|
|
||||||
TUPLE: class-transition < transition ;
|
|
||||||
TUPLE: default-transition < transition ;
|
|
||||||
|
|
||||||
TUPLE: literal obj ;
|
|
||||||
TUPLE: class obj ;
|
|
||||||
TUPLE: default ;
|
|
||||||
: make-transition ( from to obj class -- obj )
|
|
||||||
new
|
|
||||||
swap >>obj
|
|
||||||
swap >>to
|
|
||||||
swap >>from ;
|
|
||||||
|
|
||||||
: <literal-transition> ( from to obj -- transition )
|
|
||||||
literal-transition make-transition ;
|
|
||||||
|
|
||||||
: <class-transition> ( from to obj -- transition )
|
|
||||||
class-transition make-transition ;
|
|
||||||
|
|
||||||
: <default-transition> ( from to -- transition )
|
|
||||||
t default-transition make-transition ;
|
|
||||||
|
|
||||||
TUPLE: transition-table transitions start-state final-states ;
|
TUPLE: transition-table transitions start-state final-states ;
|
||||||
|
|
||||||
: <transition-table> ( -- transition-table )
|
: <transition-table> ( -- transition-table )
|
||||||
|
@ -37,12 +14,11 @@ TUPLE: transition-table transitions start-state final-states ;
|
||||||
: maybe-initialize-key ( key hashtable -- )
|
: maybe-initialize-key ( key hashtable -- )
|
||||||
2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ;
|
2dup key? [ 2drop ] [ [ H{ } clone ] 2dip set-at ] if ;
|
||||||
|
|
||||||
: set-transition ( transition hash -- )
|
:: set-transition ( from to obj hash -- )
|
||||||
#! set the state as a key
|
to hash maybe-initialize-key
|
||||||
2dup [ to>> ] dip maybe-initialize-key
|
from hash at
|
||||||
[ [ to>> ] [ obj>> ] [ from>> ] tri ] dip
|
[ [ to obj ] dip push-at ]
|
||||||
2dup at* [ 2nip push-at ]
|
[ to 1vector obj associate from hash set-at ] if* ;
|
||||||
[ drop [ H{ } clone [ push-at ] keep ] 2dip set-at ] if ;
|
|
||||||
|
|
||||||
: add-transition ( transition transition-table -- )
|
: add-transition ( from to obj transition-table -- )
|
||||||
transitions>> set-transition ;
|
transitions>> set-transition ;
|
||||||
|
|
|
@ -9,7 +9,6 @@ TUPLE: dfa-traverser
|
||||||
dfa-table
|
dfa-table
|
||||||
current-state
|
current-state
|
||||||
text
|
text
|
||||||
match-failed?
|
|
||||||
start-index current-index
|
start-index current-index
|
||||||
matches ;
|
matches ;
|
||||||
|
|
||||||
|
@ -25,9 +24,6 @@ TUPLE: dfa-traverser
|
||||||
[ current-state>> ]
|
[ current-state>> ]
|
||||||
[ dfa-table>> final-states>> ] bi key? ;
|
[ dfa-table>> final-states>> ] bi key? ;
|
||||||
|
|
||||||
: beginning-of-text? ( dfa-traverser -- ? )
|
|
||||||
current-index>> 0 <= ; inline
|
|
||||||
|
|
||||||
: end-of-text? ( dfa-traverser -- ? )
|
: end-of-text? ( dfa-traverser -- ? )
|
||||||
[ current-index>> ] [ text>> length ] bi >= ; inline
|
[ current-index>> ] [ text>> length ] bi >= ; inline
|
||||||
|
|
||||||
|
@ -35,7 +31,6 @@ TUPLE: dfa-traverser
|
||||||
{
|
{
|
||||||
[ current-state>> not ]
|
[ current-state>> not ]
|
||||||
[ end-of-text? ]
|
[ end-of-text? ]
|
||||||
[ match-failed?>> ]
|
|
||||||
} 1|| ;
|
} 1|| ;
|
||||||
|
|
||||||
: save-final-state ( dfa-straverser -- )
|
: save-final-state ( dfa-straverser -- )
|
||||||
|
@ -59,7 +54,8 @@ TUPLE: dfa-traverser
|
||||||
1 text-character ;
|
1 text-character ;
|
||||||
|
|
||||||
: increment-state ( dfa-traverser state -- dfa-traverser )
|
: increment-state ( dfa-traverser state -- dfa-traverser )
|
||||||
[ [ 1 + ] change-current-index ] dip >>current-state ;
|
>>current-state
|
||||||
|
[ 1 + ] change-current-index ;
|
||||||
|
|
||||||
: match-literal ( transition from-state table -- to-state/f )
|
: match-literal ( transition from-state table -- to-state/f )
|
||||||
transitions>> at at ;
|
transitions>> at at ;
|
||||||
|
@ -69,11 +65,8 @@ TUPLE: dfa-traverser
|
||||||
swap '[ drop _ swap class-member? ] assoc-find spin ?
|
swap '[ drop _ swap class-member? ] assoc-find spin ?
|
||||||
] [ drop ] if ;
|
] [ drop ] if ;
|
||||||
|
|
||||||
: match-default ( transition from-state table -- to-state/f )
|
|
||||||
[ drop ] 2dip transitions>> at t swap at ;
|
|
||||||
|
|
||||||
: match-transition ( obj from-state dfa -- to-state/f )
|
: match-transition ( obj from-state dfa -- to-state/f )
|
||||||
{ [ match-literal ] [ match-class ] [ match-default ] } 3|| ;
|
{ [ match-literal ] [ match-class ] } 3|| ;
|
||||||
|
|
||||||
: setup-match ( match -- obj state dfa-table )
|
: setup-match ( match -- obj state dfa-table )
|
||||||
[ [ current-index>> ] [ text>> ] bi nth ]
|
[ [ current-index>> ] [ text>> ] bi nth ]
|
||||||
|
@ -90,6 +83,6 @@ TUPLE: dfa-traverser
|
||||||
dup matches>>
|
dup matches>>
|
||||||
[ drop f ]
|
[ drop f ]
|
||||||
[
|
[
|
||||||
[ [ text>> ] [ start-index>> ] bi ]
|
[ [ start-index>> ] [ text>> ] bi ]
|
||||||
[ peek ] bi* rot <slice>
|
[ peek ] bi* swap <slice>
|
||||||
] if-empty ;
|
] if-empty ;
|
||||||
|
|
Loading…
Reference in New Issue