From af2d380a7ffd38cf27b8e16c690b7d12bcb61a9f Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Thu, 26 Feb 2009 18:06:57 -0600 Subject: [PATCH] Regexp compiler used from literals --- basis/regexp/compiler/compiler.factor | 46 +++++++---- basis/regexp/matchers/matchers.factor | 61 +++++++++++++++ basis/regexp/minimize/minimize-tests.factor | 3 +- basis/regexp/regexp-docs.factor | 2 +- basis/regexp/regexp-tests.factor | 6 +- basis/regexp/regexp.factor | 87 +++++---------------- basis/regexp/traversal/traversal.factor | 41 +++------- extra/benchmark/regex-dna/regex-dna.factor | 4 +- 8 files changed, 130 insertions(+), 120 deletions(-) create mode 100644 basis/regexp/matchers/matchers.factor diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor index a322eb2387..fa3e67d1f9 100644 --- a/basis/regexp/compiler/compiler.factor +++ b/basis/regexp/compiler/compiler.factor @@ -1,34 +1,43 @@ ! Copyright (C) 2009 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: regexp regexp.private regexp.classes kernel sequences regexp.negation +USING: regexp.classes kernel sequences regexp.negation quotations regexp.minimize assocs fry math locals combinators -accessors words compiler.units ; +accessors words compiler.units kernel.private strings +sequences.private arrays regexp.matchers call ; IN: regexp.compiler : literals>cases ( literal-transitions -- case-body ) [ 1quotation ] assoc-map ; : non-literals>dispatch ( non-literal-transitions -- quot ) - [ [ '[ dup _ class-member? ] ] [ 1quotation ] bi* ] assoc-map - [ 3drop f ] suffix '[ _ cond ] ; + [ [ '[ dup _ class-member? ] ] [ '[ drop _ execute ] ] bi* ] assoc-map + [ 3drop ] suffix '[ _ cond ] ; + +: expand-one-or ( or-class transition -- alist ) + [ seq>> ] dip '[ _ 2array ] map ; + +: expand-or ( alist -- new-alist ) + [ + first2 over or-class? + [ expand-one-or ] [ 2array 1array ] if + ] map concat ; : split-literals ( transitions -- case default ) - ! Convert disjunction of literals to literals. Also maybe small ranges. - >alist [ first integer? ] partition + >alist expand-or [ first integer? ] partition [ literals>cases ] [ non-literals>dispatch ] bi* ; -USING: kernel.private strings sequences.private ; - -:: step ( index str case-body final? -- match? ) +:: step ( last-match index str case-body final? -- last-index/f ) + final? index last-match ? index str bounds-check? [ index 1+ str index str nth-unsafe case-body case - ] [ final? ] if ; inline + ] when ; inline : transitions>quot ( transitions final-state? -- quot ) [ split-literals suffix ] dip - '[ { array-capacity string } declare _ _ step ] ; + '[ _ _ step ] ; + ! '[ { array-capacity string } declare _ _ step ] ; : word>quot ( word dfa -- quot ) [ transitions>> at ] @@ -39,7 +48,8 @@ USING: kernel.private strings sequences.private ; '[ [ dup _ word>quot - (( index string -- ? )) define-declared + (( last-match index string -- ? )) + define-declared ] each ] with-compilation-unit ; @@ -59,7 +69,13 @@ USING: kernel.private strings sequences.private ; states>words [ states>code ] keep start-state>> ; : run-regexp ( string word -- ? ) - [ 0 ] 2dip execute ; inline + [ f 0 ] 2dip execute ; inline -: regexp>quotation ( regexp -- quot ) - compile-regexp dfa>> dfa>word '[ _ run-regexp ] ; +: dfa>quotation ( dfa -- quot ) + dfa>word '[ _ run-regexp ] ; + +TUPLE: quot-matcher quot ; +C: quot-matcher + +M: quot-matcher match-index + quot>> call( string -- i/f ) ; diff --git a/basis/regexp/matchers/matchers.factor b/basis/regexp/matchers/matchers.factor new file mode 100644 index 0000000000..7ac1edf58c --- /dev/null +++ b/basis/regexp/matchers/matchers.factor @@ -0,0 +1,61 @@ +! Copyright (C) 2008, 2009 Daniel Ehrenberg, Doug Coleman. +! See http://factorcode.org/license.txt for BSD license. +USING: kernel sequences math splitting make fry ; +IN: regexp.matchers + +! For now, a matcher is just something with a method to do the +! equivalent of match. + +! matcher protocol: +GENERIC: match-index ( string matcher -- index/f ) + +: match ( string matcher -- slice/f ) + dupd match-index [ head-slice ] [ drop f ] if* ; + +: matches? ( string matcher -- ? ) + dupd match-index + [ swap length = ] [ drop f ] if* ; + +: match-head ( string matcher -- end/f ) match [ length ] [ f ] if* ; + +: match-at ( string m matcher -- n/f finished? ) + [ + 2dup swap length > [ 2drop f f ] [ tail-slice t ] if + ] dip swap [ match-head f ] [ 2drop f t ] if ; + +: match-range ( string m matcher -- a/f b/f ) + 3dup match-at over [ + drop nip rot drop dupd + + ] [ + [ 3drop drop f f ] [ drop [ 1+ ] dip match-range ] if + ] if ; + +: first-match ( string matcher -- slice/f ) + dupd 0 swap match-range rot over [ ] [ 3drop f ] if ; + +: re-cut ( string matcher -- end/f start ) + dupd first-match + [ split1-slice swap ] [ "" like f swap ] if* ; + + + +: re-split ( string matcher -- seq ) + [ (re-split) ] { } make ; + +: re-replace ( string matcher replacement -- result ) + [ re-split ] dip join ; + +: next-match ( string matcher -- end/f match/f ) + dupd first-match dup + [ [ split1-slice nip ] keep ] [ 2drop f f ] if ; + +: all-matches ( string matcher -- seq ) + [ dup ] swap '[ _ next-match ] [ ] produce nip harvest ; + +: count-matches ( string matcher -- n ) + all-matches length ; diff --git a/basis/regexp/minimize/minimize-tests.factor b/basis/regexp/minimize/minimize-tests.factor index 78a90ca3ba..5781e74634 100644 --- a/basis/regexp/minimize/minimize-tests.factor +++ b/basis/regexp/minimize/minimize-tests.factor @@ -1,6 +1,7 @@ ! Copyright (C) 2009 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: tools.test regexp.minimize assocs regexp accessors regexp.transition-tables ; +USING: tools.test regexp.minimize assocs regexp regexp.syntax +accessors regexp.transition-tables ; IN: regexp.minimize.tests [ t ] [ 1 2 H{ { { 1 2 } t } } same-partition? ] unit-test diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor index eeae9f8ea6..4a77f14561 100644 --- a/basis/regexp/regexp-docs.factor +++ b/basis/regexp/regexp-docs.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: kernel strings help.markup help.syntax ; +USING: kernel strings help.markup help.syntax regexp.matchers ; IN: regexp ABOUT: "regexp" diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 71df08285f..cbc582b295 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -1,5 +1,5 @@ USING: regexp tools.test kernel sequences regexp.parser -regexp.traversal eval strings multiline accessors ; +regexp.traversal eval strings multiline accessors regexp.matchers ; IN: regexp-tests \ must-infer @@ -241,8 +241,8 @@ IN: regexp-tests [ t ] [ "abc" R/ abc/r matches? ] unit-test [ t ] [ "abc" R/ a[bB][cC]/r matches? ] unit-test -[ t ] [ "xabc" R/ abc/ match-from-end >boolean ] unit-test -[ t ] [ "xabc" R/ a[bB][cC]/ match-from-end >boolean ] unit-test +[ t ] [ "xabc" R/ abc/ match >boolean ] unit-test +[ t ] [ "xabc" R/ a[bB][cC]/ match >boolean ] unit-test [ t ] [ "s@f" "[a-z.-]@[a-z]" matches? ] unit-test [ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index e9cd5328e2..45660ad309 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -5,26 +5,29 @@ assocs prettyprint.backend prettyprint.custom make lexer namespaces parser arrays fry locals regexp.minimize regexp.parser regexp.nfa regexp.dfa regexp.traversal regexp.transition-tables splitting sorting regexp.ast -regexp.negation ; +regexp.negation regexp.matchers regexp.compiler ; IN: regexp TUPLE: regexp { raw read-only } { parse-tree read-only } { options read-only } - dfa reverse-dfa ; + dfa reverse-dfa dfa-quot ; : make-regexp ( string ast -- regexp ) - f f f f regexp boa ; foldable + f f f f f regexp boa ; foldable ! Foldable because, when the dfa slot is set, ! it'll be set to the same thing regardless of who sets it : ( string options -- regexp ) [ dup parse-regexp ] [ string>options ] bi* - f f regexp boa ; + f f f regexp boa ; : ( string -- regexp ) "" ; +TUPLE: reverse-matcher regexp ; +C: reverse-matcher + dfa ] unless* ] change-dfa ; +: compile-dfa-quot ( regexp -- regexp ) + dup '[ [ _ compile-regexp dfa>> dfa>quotation ] unless* ] change-dfa-quot ; + : ( ast -- reversed ) "r" string>options ; : compile-reverse ( regexp -- regexp ) dup '[ [ _ get-ast ast>dfa ] unless* ] change-reverse-dfa ; -: (match) ( string regexp -- dfa-traverser ) - compile-regexp dfa>> do-match ; +M: regexp match-index ( string regexp -- index/f ) + dup dfa-quot>> + [ ] + [ compile-regexp dfa>> ] ?if + match-index ; -: (match-reversed) ( string regexp -- dfa-traverser ) - [ ] [ compile-reverse reverse-dfa>> ] bi* - do-match ; - -PRIVATE> - -: match ( string regexp -- slice/f ) - (match) return-match ; - -: match-from-end ( string regexp -- slice/f ) - (match-reversed) return-match ; - -: matches? ( string regexp -- ? ) - dupd match - [ [ length ] bi@ = ] [ drop f ] if* ; - -: match-head ( string regexp -- end/f ) match [ length ] [ f ] if* ; - -: match-at ( string m regexp -- n/f finished? ) - [ - 2dup swap length > [ 2drop f f ] [ tail-slice t ] if - ] dip swap [ match-head f ] [ 2drop f t ] if ; - -: match-range ( string m regexp -- a/f b/f ) - 3dup match-at over [ - drop nip rot drop dupd + - ] [ - [ 3drop drop f f ] [ drop [ 1+ ] dip match-range ] if - ] if ; - -: first-match ( string regexp -- slice/f ) - dupd 0 swap match-range rot over [ ] [ 3drop f ] if ; - -: re-cut ( string regexp -- end/f start ) - dupd first-match - [ split1-slice swap ] [ "" like f swap ] if* ; - - - -: re-split ( string regexp -- seq ) - [ (re-split) ] { } make ; - -: re-replace ( string regexp replacement -- result ) - [ re-split ] dip join ; - -: next-match ( string regexp -- end/f match/f ) - dupd first-match dup - [ [ split1-slice nip ] keep ] [ 2drop f f ] if ; - -: all-matches ( string regexp -- seq ) - [ dup ] swap '[ _ next-match ] [ ] produce nip harvest ; - -: count-matches ( string regexp -- n ) - all-matches length ; - - ] [ regexp>> compile-reverse reverse-dfa>> ] bi* + do-match match-index>> ; : find-regexp-syntax ( string -- prefix suffix ) { @@ -131,7 +82,7 @@ PRIVATE> : parsing-regexp ( accum end -- accum ) lexer get [ take-until ] [ parse-noblank-token ] bi - compile-regexp parsed ; + compile-dfa-quot parsed ; PRIVATE> diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index 7a0d83051b..e215cde416 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors assocs combinators kernel math -quotations sequences regexp.classes fry arrays +quotations sequences regexp.classes fry arrays regexp.matchers combinators.short-circuit prettyprint regexp.nfa ; IN: regexp.traversal @@ -9,16 +9,14 @@ TUPLE: dfa-traverser dfa-table current-state text - start-index current-index - matches ; + current-index + match-index ; : ( text dfa -- match ) dfa-traverser new swap [ start-state>> >>current-state ] [ >>dfa-table ] bi swap >>text - 0 >>start-index - 0 >>current-index - V{ } clone >>matches ; + 0 >>current-index ; : final-state? ( dfa-traverser -- ? ) [ current-state>> ] @@ -33,25 +31,11 @@ TUPLE: dfa-traverser [ end-of-text? ] } 1|| ; -: save-final-state ( dfa-straverser -- ) - [ current-index>> ] [ matches>> ] bi push ; +: save-final-state ( dfa-traverser -- dfa-traverser ) + dup current-index>> >>match-index ; : match-done? ( dfa-traverser -- ? ) - dup final-state? [ - dup save-final-state - ] when text-finished? ; - -: text-character ( dfa-traverser n -- ch ) - [ text>> ] swap '[ current-index>> _ + ] bi nth ; - -: previous-text-character ( dfa-traverser -- ch ) - -1 text-character ; - -: current-text-character ( dfa-traverser -- ch ) - 0 text-character ; - -: next-text-character ( dfa-traverser -- ch ) - 1 text-character ; + dup final-state? [ save-final-state ] when text-finished? ; : increment-state ( dfa-traverser state -- dfa-traverser ) >>current-state @@ -79,10 +63,7 @@ TUPLE: dfa-traverser [ increment-state do-match ] when* ] unless ; -: return-match ( dfa-traverser -- slice/f ) - dup matches>> - [ drop f ] - [ - [ [ start-index>> ] [ text>> ] bi ] - [ peek ] bi* swap - ] if-empty ; +TUPLE: dfa-matcher dfa ; +C: dfa-matcher +M: dfa-matcher match-index + dfa>> do-match match-index>> ; diff --git a/extra/benchmark/regex-dna/regex-dna.factor b/extra/benchmark/regex-dna/regex-dna.factor index 8c0aee596d..29cb0b7357 100644 --- a/extra/benchmark/regex-dna/regex-dna.factor +++ b/extra/benchmark/regex-dna/regex-dna.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors regexp prettyprint io io.encodings.ascii -io.files kernel sequences assocs namespaces ; +USING: accessors regexp.matchers prettyprint io io.encodings.ascii +io.files kernel sequences assocs namespaces regexp ; IN: benchmark.regex-dna ! Based on http://shootout.alioth.debian.org/gp4/benchmark.php?test=regexdna&lang=ruby&id=1