Regexp compiler used from literals
parent
85432bd267
commit
af2d380a7f
|
@ -1,34 +1,43 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: regexp regexp.private regexp.classes kernel sequences regexp.negation
|
||||
USING: regexp.classes kernel sequences regexp.negation
|
||||
quotations regexp.minimize assocs fry math locals combinators
|
||||
accessors words compiler.units ;
|
||||
accessors words compiler.units kernel.private strings
|
||||
sequences.private arrays regexp.matchers call ;
|
||||
IN: regexp.compiler
|
||||
|
||||
! Wrap every value of the literal-transitions assoc in a one-element
! quotation (1quotation); keys are left untouched. The result is what
! the stack effect calls case-body.
: literals>cases ( literal-transitions -- case-body )
|
||||
[ 1quotation ] assoc-map ;
|
||||
|
||||
: non-literals>dispatch ( non-literal-transitions -- quot )
|
||||
[ [ '[ dup _ class-member? ] ] [ 1quotation ] bi* ] assoc-map
|
||||
[ 3drop f ] suffix '[ _ cond ] ;
|
||||
[ [ '[ dup _ class-member? ] ] [ '[ drop _ execute ] ] bi* ] assoc-map
|
||||
[ 3drop ] suffix '[ _ cond ] ;
|
||||
|
||||
! Pair every element of the or-class's seq slot with the same transition,
! producing one { element transition } entry per element.
: expand-one-or ( or-class transition -- alist )
|
||||
[ seq>> ] dip '[ _ 2array ] map ;
|
||||
|
||||
! Flatten or-class keys in an alist: an entry whose key satisfies or-class?
! is replaced by the entries from expand-one-or; any other entry is kept
! as a single { key value } pair. The per-entry results are concatenated.
: expand-or ( alist -- new-alist )
|
||||
[
|
||||
first2 over or-class?
|
||||
[ expand-one-or ] [ 2array 1array ] if
|
||||
] map concat ;
|
||||
|
||||
: split-literals ( transitions -- case default )
|
||||
! Expand a disjunction (or-class) of literals into individual literal transitions. TODO: maybe do the same for small ranges.
|
||||
>alist [ first integer? ] partition
|
||||
>alist expand-or [ first integer? ] partition
|
||||
[ literals>cases ] [ non-literals>dispatch ] bi* ;
|
||||
|
||||
USING: kernel.private strings sequences.private ;
|
||||
|
||||
:: step ( index str case-body final? -- match? )
|
||||
:: step ( last-match index str case-body final? -- last-index/f )
|
||||
final? index last-match ?
|
||||
index str bounds-check? [
|
||||
index 1+ str
|
||||
index str nth-unsafe
|
||||
case-body case
|
||||
] [ final? ] if ; inline
|
||||
] when ; inline
|
||||
|
||||
: transitions>quot ( transitions final-state? -- quot )
|
||||
[ split-literals suffix ] dip
|
||||
'[ { array-capacity string } declare _ _ step ] ;
|
||||
'[ _ _ step ] ;
|
||||
! '[ { array-capacity string } declare _ _ step ] ;
|
||||
|
||||
: word>quot ( word dfa -- quot )
|
||||
[ transitions>> at ]
|
||||
|
@ -39,7 +48,8 @@ USING: kernel.private strings sequences.private ;
|
|||
'[
|
||||
[
|
||||
dup _ word>quot
|
||||
(( index string -- ? )) define-declared
|
||||
(( last-match index string -- ? ))
|
||||
define-declared
|
||||
] each
|
||||
] with-compilation-unit ;
|
||||
|
||||
|
@ -59,7 +69,13 @@ USING: kernel.private strings sequences.private ;
|
|||
states>words [ states>code ] keep start-state>> ;
|
||||
|
||||
: run-regexp ( string word -- ? )
|
||||
[ 0 ] 2dip execute ; inline
|
||||
[ f 0 ] 2dip execute ; inline
|
||||
|
||||
: regexp>quotation ( regexp -- quot )
|
||||
compile-regexp dfa>> dfa>word '[ _ run-regexp ] ;
|
||||
! Turn the dfa into a word with dfa>word and return a quotation that
! passes that word to run-regexp.
: dfa>quotation ( dfa -- quot )
|
||||
dfa>word '[ _ run-regexp ] ;
|
||||
|
||||
! A matcher that wraps a quotation with effect ( string -- i/f ).
TUPLE: quot-matcher quot ;
|
||||
C: <quot-matcher> quot-matcher
|
||||
|
||||
! match-index for a quot-matcher just calls the stored quotation on the string.
M: quot-matcher match-index
|
||||
quot>> call( string -- i/f ) ;
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
! Copyright (C) 2008, 2009 Daniel Ehrenberg, Doug Coleman.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: kernel sequences math splitting make fry ;
|
||||
IN: regexp.matchers
|
||||
|
||||
! For now, a matcher is any object with a match-index method; match and
|
||||
! the other words in this vocabulary are all built on top of it.
|
||||
|
||||
! matcher protocol:
|
||||
! Outputs an index into string, or f when there is no match. The match
! word below uses this index as the end of a head-slice of string.
GENERIC: match-index ( string matcher -- index/f )
|
||||
|
||||
! Run match-index on string; on success output the head-slice of string
! ending at that index, otherwise drop the string and output f.
: match ( string matcher -- slice/f )
|
||||
dupd match-index [ head-slice ] [ drop f ] if* ;
|
||||
|
||||
! True only when match-index succeeds and the index it returns equals the
! length of string, i.e. the match covers the whole string.
: matches? ( string matcher -- ? )
|
||||
dupd match-index
|
||||
[ swap length = ] [ drop f ] if* ;
|
||||
|
||||
! Length of the slice returned by match, or f when match returns f.
: match-head ( string matcher -- end/f ) match [ length ] [ f ] if* ;
|
||||
|
||||
! Try to match at offset m of string.
! If m is greater than the string length, outputs f t (nothing left to try).
! Otherwise runs match-head on the tail-slice starting at m and outputs its
! result (a length or f) together with f for finished?.
: match-at ( string m matcher -- n/f finished? )
|
||||
[
|
||||
2dup swap length > [ 2drop f f ] [ tail-slice t ] if
|
||||
] dip swap [ match-head f ] [ 2drop f t ] if ;
|
||||
|
||||
! Search forward from offset m for the first position where match-at succeeds.
! On success outputs the start offset and start + match length.
! If match-at reports finished?, outputs f f.
! Otherwise recurses with m incremented by one.
: match-range ( string m matcher -- a/f b/f )
|
||||
3dup match-at over [
|
||||
drop nip rot drop dupd +
|
||||
] [
|
||||
[ 3drop drop f f ] [ drop [ 1+ ] dip match-range ] if
|
||||
] if ;
|
||||
|
||||
! Run match-range from offset 0 and output the matched range as a slice of
! string, or f when match-range finds nothing.
: first-match ( string matcher -- slice/f )
|
||||
dupd 0 swap match-range rot over [ <slice> ] [ 3drop f ] if ;
|
||||
|
||||
! Cut string around its first match. With a match, split1-slice splits the
! string on the matched text: start is the part before it, end the part after.
! Without a match, start is the whole string (converted with "" like) and
! end is f.
: re-cut ( string matcher -- end/f start )
|
||||
dupd first-match
|
||||
[ split1-slice swap ] [ "" like f swap ] if* ;
|
||||
|
||||
<PRIVATE
|
||||
|
||||
! Recursive helper for re-split; must run inside a make scope. While string
! is not f, re-cut it, add the leading piece to the accumulator with `,` and
! continue with the remainder.
: (re-split) ( string matcher -- )
|
||||
over [ [ re-cut , ] keep (re-split) ] [ 2drop ] if ;
|
||||
|
||||
PRIVATE>
|
||||
|
||||
! Collect the pieces emitted by (re-split) into an array.
: re-split ( string matcher -- seq )
|
||||
[ (re-split) ] { } make ;
|
||||
|
||||
! Split string with re-split, then join the pieces using replacement as
! the separator.
: re-replace ( string matcher replacement -- result )
|
||||
[ re-split ] dip join ;
|
||||
|
||||
! Find the first match in string. With a match, outputs the part of string
! after the matched text (via split1-slice) and the match slice itself.
! Without a match, outputs f f.
: next-match ( string matcher -- end/f match/f )
|
||||
dupd first-match dup
|
||||
[ [ split1-slice nip ] keep ] [ 2drop f f ] if ;
|
||||
|
||||
! Apply next-match repeatedly, each time to the remainder left by the
! previous call, for as long as that remainder is not f. The collected
! matches are passed through harvest, which removes empty entries.
: all-matches ( string matcher -- seq )
|
||||
[ dup ] swap '[ _ next-match ] [ ] produce nip harvest ;
|
||||
|
||||
! Number of elements in the sequence returned by all-matches.
: count-matches ( string matcher -- n )
|
||||
all-matches length ;
|
|
@ -1,6 +1,7 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: tools.test regexp.minimize assocs regexp accessors regexp.transition-tables ;
|
||||
USING: tools.test regexp.minimize assocs regexp regexp.syntax
|
||||
accessors regexp.transition-tables ;
|
||||
IN: regexp.minimize.tests
|
||||
|
||||
[ t ] [ 1 2 H{ { { 1 2 } t } } same-partition? ] unit-test
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: kernel strings help.markup help.syntax ;
|
||||
USING: kernel strings help.markup help.syntax regexp.matchers ;
|
||||
IN: regexp
|
||||
|
||||
ABOUT: "regexp"
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
USING: regexp tools.test kernel sequences regexp.parser
|
||||
regexp.traversal eval strings multiline accessors ;
|
||||
regexp.traversal eval strings multiline accessors regexp.matchers ;
|
||||
IN: regexp-tests
|
||||
|
||||
\ <regexp> must-infer
|
||||
|
@ -241,8 +241,8 @@ IN: regexp-tests
|
|||
[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
|
||||
[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
|
||||
|
||||
[ t ] [ "xabc" R/ abc/ match-from-end >boolean ] unit-test
|
||||
[ t ] [ "xabc" R/ a[bB][cC]/ match-from-end >boolean ] unit-test
|
||||
[ t ] [ "xabc" R/ abc/ <reverse-matcher> match >boolean ] unit-test
|
||||
[ t ] [ "xabc" R/ a[bB][cC]/ <reverse-matcher> match >boolean ] unit-test
|
||||
|
||||
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
||||
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
||||
|
|
|
@ -5,26 +5,29 @@ assocs prettyprint.backend prettyprint.custom make lexer
|
|||
namespaces parser arrays fry locals regexp.minimize
|
||||
regexp.parser regexp.nfa regexp.dfa regexp.traversal
|
||||
regexp.transition-tables splitting sorting regexp.ast
|
||||
regexp.negation ;
|
||||
regexp.negation regexp.matchers regexp.compiler ;
|
||||
IN: regexp
|
||||
|
||||
TUPLE: regexp
|
||||
{ raw read-only }
|
||||
{ parse-tree read-only }
|
||||
{ options read-only }
|
||||
dfa reverse-dfa ;
|
||||
dfa reverse-dfa dfa-quot ;
|
||||
|
||||
: make-regexp ( string ast -- regexp )
|
||||
f f <options> f f regexp boa ; foldable
|
||||
f f <options> f f f regexp boa ; foldable
|
||||
! Foldable because, when the dfa slot is set,
|
||||
! it'll be set to the same thing regardless of who sets it
|
||||
|
||||
: <optioned-regexp> ( string options -- regexp )
|
||||
[ dup parse-regexp ] [ string>options ] bi*
|
||||
f f regexp boa ;
|
||||
f f f regexp boa ;
|
||||
|
||||
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
|
||||
|
||||
TUPLE: reverse-matcher regexp ;
|
||||
C: <reverse-matcher> reverse-matcher
|
||||
|
||||
<PRIVATE
|
||||
|
||||
: get-ast ( regexp -- ast )
|
||||
|
@ -33,76 +36,24 @@ TUPLE: regexp
|
|||
: compile-regexp ( regexp -- regexp )
|
||||
dup '[ [ _ get-ast ast>dfa ] unless* ] change-dfa ;
|
||||
|
||||
! Fill the dfa-quot slot if it is not already set: compile the regexp,
! take its dfa and convert it with dfa>quotation. An existing value is kept.
: compile-dfa-quot ( regexp -- regexp )
|
||||
dup '[ [ _ compile-regexp dfa>> dfa>quotation ] unless* ] change-dfa-quot ;
|
||||
|
||||
: <reversed-option> ( ast -- reversed )
|
||||
"r" string>options <with-options> ;
|
||||
|
||||
: compile-reverse ( regexp -- regexp )
|
||||
dup '[ [ _ get-ast <reversed-option> ast>dfa ] unless* ] change-reverse-dfa ;
|
||||
|
||||
: (match) ( string regexp -- dfa-traverser )
|
||||
compile-regexp dfa>> <dfa-traverser> do-match ;
|
||||
! Pick a matcher for this regexp and delegate to its match-index:
! a quot-matcher when the dfa-quot slot is set, otherwise a dfa-matcher
! built from the (lazily compiled) dfa.
M: regexp match-index ( string regexp -- index/f )
|
||||
dup dfa-quot>>
|
||||
[ <quot-matcher> ]
|
||||
[ compile-regexp dfa>> <dfa-matcher> ] ?if
|
||||
match-index ;
|
||||
|
||||
: (match-reversed) ( string regexp -- dfa-traverser )
|
||||
[ <reversed> ] [ compile-reverse reverse-dfa>> ] bi*
|
||||
<dfa-traverser> do-match ;
|
||||
|
||||
PRIVATE>
|
||||
|
||||
: match ( string regexp -- slice/f )
|
||||
(match) return-match ;
|
||||
|
||||
: match-from-end ( string regexp -- slice/f )
|
||||
(match-reversed) return-match ;
|
||||
|
||||
: matches? ( string regexp -- ? )
|
||||
dupd match
|
||||
[ [ length ] bi@ = ] [ drop f ] if* ;
|
||||
|
||||
: match-head ( string regexp -- end/f ) match [ length ] [ f ] if* ;
|
||||
|
||||
: match-at ( string m regexp -- n/f finished? )
|
||||
[
|
||||
2dup swap length > [ 2drop f f ] [ tail-slice t ] if
|
||||
] dip swap [ match-head f ] [ 2drop f t ] if ;
|
||||
|
||||
: match-range ( string m regexp -- a/f b/f )
|
||||
3dup match-at over [
|
||||
drop nip rot drop dupd +
|
||||
] [
|
||||
[ 3drop drop f f ] [ drop [ 1+ ] dip match-range ] if
|
||||
] if ;
|
||||
|
||||
: first-match ( string regexp -- slice/f )
|
||||
dupd 0 swap match-range rot over [ <slice> ] [ 3drop f ] if ;
|
||||
|
||||
: re-cut ( string regexp -- end/f start )
|
||||
dupd first-match
|
||||
[ split1-slice swap ] [ "" like f swap ] if* ;
|
||||
|
||||
<PRIVATE
|
||||
|
||||
: (re-split) ( string regexp -- )
|
||||
over [ [ re-cut , ] keep (re-split) ] [ 2drop ] if ;
|
||||
|
||||
PRIVATE>
|
||||
|
||||
: re-split ( string regexp -- seq )
|
||||
[ (re-split) ] { } make ;
|
||||
|
||||
: re-replace ( string regexp replacement -- result )
|
||||
[ re-split ] dip join ;
|
||||
|
||||
: next-match ( string regexp -- end/f match/f )
|
||||
dupd first-match dup
|
||||
[ [ split1-slice nip ] keep ] [ 2drop f f ] if ;
|
||||
|
||||
: all-matches ( string regexp -- seq )
|
||||
[ dup ] swap '[ _ next-match ] [ ] produce nip harvest ;
|
||||
|
||||
: count-matches ( string regexp -- n )
|
||||
all-matches length ;
|
||||
|
||||
<PRIVATE
|
||||
! Reverse the string, run the wrapped regexp's reverse-dfa (compiled on
! demand by compile-reverse) over it with a dfa-traverser, and output the
! traverser's match-index slot.
M: reverse-matcher match-index ( string regexp -- index/f )
|
||||
[ <reversed> ] [ regexp>> compile-reverse reverse-dfa>> ] bi*
|
||||
<dfa-traverser> do-match match-index>> ;
|
||||
|
||||
: find-regexp-syntax ( string -- prefix suffix )
|
||||
{
|
||||
|
@ -131,7 +82,7 @@ PRIVATE>
|
|||
|
||||
: parsing-regexp ( accum end -- accum )
|
||||
lexer get [ take-until ] [ parse-noblank-token ] bi
|
||||
<optioned-regexp> compile-regexp parsed ;
|
||||
<optioned-regexp> compile-dfa-quot parsed ;
|
||||
|
||||
PRIVATE>
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
! Copyright (C) 2008 Doug Coleman.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: accessors assocs combinators kernel math
|
||||
quotations sequences regexp.classes fry arrays
|
||||
quotations sequences regexp.classes fry arrays regexp.matchers
|
||||
combinators.short-circuit prettyprint regexp.nfa ;
|
||||
IN: regexp.traversal
|
||||
|
||||
|
@ -9,16 +9,14 @@ TUPLE: dfa-traverser
|
|||
dfa-table
|
||||
current-state
|
||||
text
|
||||
start-index current-index
|
||||
matches ;
|
||||
current-index
|
||||
match-index ;
|
||||
|
||||
: <dfa-traverser> ( text dfa -- match )
|
||||
dfa-traverser new
|
||||
swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
|
||||
swap >>text
|
||||
0 >>start-index
|
||||
0 >>current-index
|
||||
V{ } clone >>matches ;
|
||||
0 >>current-index ;
|
||||
|
||||
: final-state? ( dfa-traverser -- ? )
|
||||
[ current-state>> ]
|
||||
|
@ -33,25 +31,11 @@ TUPLE: dfa-traverser
|
|||
[ end-of-text? ]
|
||||
} 1|| ;
|
||||
|
||||
: save-final-state ( dfa-straverser -- )
|
||||
[ current-index>> ] [ matches>> ] bi push ;
|
||||
: save-final-state ( dfa-traverser -- dfa-traverser )
|
||||
dup current-index>> >>match-index ;
|
||||
|
||||
: match-done? ( dfa-traverser -- ? )
|
||||
dup final-state? [
|
||||
dup save-final-state
|
||||
] when text-finished? ;
|
||||
|
||||
: text-character ( dfa-traverser n -- ch )
|
||||
[ text>> ] swap '[ current-index>> _ + ] bi nth ;
|
||||
|
||||
: previous-text-character ( dfa-traverser -- ch )
|
||||
-1 text-character ;
|
||||
|
||||
: current-text-character ( dfa-traverser -- ch )
|
||||
0 text-character ;
|
||||
|
||||
: next-text-character ( dfa-traverser -- ch )
|
||||
1 text-character ;
|
||||
dup final-state? [ save-final-state ] when text-finished? ;
|
||||
|
||||
: increment-state ( dfa-traverser state -- dfa-traverser )
|
||||
>>current-state
|
||||
|
@ -79,10 +63,7 @@ TUPLE: dfa-traverser
|
|||
[ increment-state do-match ] when*
|
||||
] unless ;
|
||||
|
||||
: return-match ( dfa-traverser -- slice/f )
|
||||
dup matches>>
|
||||
[ drop f ]
|
||||
[
|
||||
[ [ start-index>> ] [ text>> ] bi ]
|
||||
[ peek ] bi* swap <slice>
|
||||
] if-empty ;
|
||||
! A matcher that wraps a dfa and runs it through a dfa-traverser.
TUPLE: dfa-matcher dfa ;
|
||||
C: <dfa-matcher> dfa-matcher
|
||||
! Build a traverser over the string, run do-match, and output the
! match-index slot it leaves behind.
M: dfa-matcher match-index
|
||||
dfa>> <dfa-traverser> do-match match-index>> ;
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
! Copyright (C) 2008 Slava Pestov.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: accessors regexp prettyprint io io.encodings.ascii
|
||||
io.files kernel sequences assocs namespaces ;
|
||||
USING: accessors regexp.matchers prettyprint io io.encodings.ascii
|
||||
io.files kernel sequences assocs namespaces regexp ;
|
||||
IN: benchmark.regex-dna
|
||||
|
||||
! Based on http://shootout.alioth.debian.org/gp4/benchmark.php?test=regexdna&lang=ruby&id=1
|
||||
|
|
Loading…
Reference in New Issue