From af2d380a7ffd38cf27b8e16c690b7d12bcb61a9f Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <littledan@Macintosh-122.local>
Date: Thu, 26 Feb 2009 18:06:57 -0600
Subject: [PATCH] Regexp compiler used from literals

---
 basis/regexp/compiler/compiler.factor       | 46 +++++++----
 basis/regexp/matchers/matchers.factor       | 61 +++++++++++++++
 basis/regexp/minimize/minimize-tests.factor |  3 +-
 basis/regexp/regexp-docs.factor             |  2 +-
 basis/regexp/regexp-tests.factor            |  6 +-
 basis/regexp/regexp.factor                  | 87 +++++----------------
 basis/regexp/traversal/traversal.factor     | 41 +++-------
 extra/benchmark/regex-dna/regex-dna.factor  |  4 +-
 8 files changed, 130 insertions(+), 120 deletions(-)
 create mode 100644 basis/regexp/matchers/matchers.factor
diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor
index a322eb2387..fa3e67d1f9 100644
--- a/basis/regexp/compiler/compiler.factor
+++ b/basis/regexp/compiler/compiler.factor
@@ -1,34 +1,43 @@
 ! Copyright (C) 2009 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: regexp regexp.private regexp.classes kernel sequences regexp.negation
+USING: regexp.classes kernel sequences regexp.negation
 quotations regexp.minimize assocs fry math locals combinators
-accessors words compiler.units ;
+accessors words compiler.units kernel.private strings
+sequences.private arrays regexp.matchers call ;
 IN: regexp.compiler
 
 : literals>cases ( literal-transitions -- case-body )
     [ 1quotation ] assoc-map ;
 
 : non-literals>dispatch ( non-literal-transitions -- quot )
-    [ [ '[ dup _ class-member? ] ] [ 1quotation ] bi* ] assoc-map
-    [ 3drop f ] suffix '[ _ cond ] ;
+    [ [ '[ dup _ class-member? ] ] [ '[ drop _ execute ] ] bi* ] assoc-map
+    [ 3drop ] suffix '[ _ cond ] ;
+
+: expand-one-or ( or-class transition -- alist )
+    [ seq>> ] dip '[ _ 2array ] map ;
+
+: expand-or ( alist -- new-alist )
+    [
+        first2 over or-class?
+        [ expand-one-or ] [ 2array 1array ] if
+    ] map concat ;
 
 : split-literals ( transitions -- case default )
-    ! Convert disjunction of literals to literals. Also maybe small ranges.
-    >alist [ first integer? ] partition
+    >alist expand-or [ first integer? ] partition
     [ literals>cases ] [ non-literals>dispatch ] bi* ;
 
-USING: kernel.private strings sequences.private ;
-
-:: step ( index str case-body final? -- match? )
+:: step ( last-match index str case-body final? -- last-index/f )
+    final? index last-match ?
     index str bounds-check? [
         index 1+ str
         index str nth-unsafe
         case-body case
-    ] [ final? ] if ; inline
+    ] when ; inline
 
 : transitions>quot ( transitions final-state? -- quot )
     [ split-literals suffix ] dip
-    '[ { array-capacity string } declare _ _ step ] ;
+    '[ _ _ step ] ;
+    ! '[ { array-capacity string } declare _ _ step ] ;
 
 : word>quot ( word dfa -- quot )
     [ transitions>> at ]
@@ -39,7 +48,8 @@ USING: kernel.private strings sequences.private ;
     '[
         [
             dup _ word>quot
-            (( index string -- ? )) define-declared
+            (( last-match index string -- ? ))
+            define-declared
         ] each
     ] with-compilation-unit ;
 
@@ -59,7 +69,13 @@ USING: kernel.private strings sequences.private ;
     states>words [ states>code ] keep start-state>> ;
 
 : run-regexp ( string word -- ? )
-    [ 0 ] 2dip execute ; inline
+    [ f 0 ] 2dip execute ; inline
 
-: regexp>quotation ( regexp -- quot )
-    compile-regexp dfa>> dfa>word '[ _ run-regexp ] ;
+: dfa>quotation ( dfa -- quot )
+    dfa>word '[ _ run-regexp ] ;
+
+TUPLE: quot-matcher quot ;
+C: <quot-matcher> quot-matcher
+
+M: quot-matcher match-index
+    quot>> call( string -- i/f ) ;
diff --git a/basis/regexp/matchers/matchers.factor b/basis/regexp/matchers/matchers.factor
new file mode 100644
index 0000000000..7ac1edf58c
--- /dev/null
+++ b/basis/regexp/matchers/matchers.factor
@@ -0,0 +1,61 @@
+! Copyright (C) 2008, 2009 Daniel Ehrenberg, Doug Coleman.
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel sequences math splitting make fry ;
+IN: regexp.matchers
+
+! For now, a matcher is just something with a method to do the
+! equivalent of match.
+
+! matcher protocol:
+GENERIC: match-index ( string matcher -- index/f )
+
+: match ( string matcher -- slice/f )
+    dupd match-index [ head-slice ] [ drop f ] if* ;
+
+: matches? ( string matcher -- ? )
+    dupd match-index
+    [ swap length = ] [ drop f ] if* ;
+
+: match-head ( string matcher -- end/f ) match [ length ] [ f ] if* ;
+
+: match-at ( string m matcher -- n/f finished? )
+    [
+        2dup swap length > [ 2drop f f ] [ tail-slice t ] if
+    ] dip swap [ match-head f ] [ 2drop f t ] if ;
+
+: match-range ( string m matcher -- a/f b/f )
+    3dup match-at over [
+        drop nip rot drop dupd +
+    ] [
+        [ 3drop drop f f ] [ drop [ 1+ ] dip match-range ] if
+    ] if ;
+
+: first-match ( string matcher -- slice/f )
+    dupd 0 swap match-range rot over [ <slice> ] [ 3drop f ] if ;
+
+: re-cut ( string matcher -- end/f start )
+    dupd first-match
+    [ split1-slice swap ] [ "" like f swap ] if* ;
+
+<PRIVATE
+
+: (re-split) ( string matcher -- )
+    over [ [ re-cut , ] keep (re-split) ] [ 2drop ] if ;
+
+PRIVATE>
+
+: re-split ( string matcher -- seq )
+    [ (re-split) ] { } make ;
+
+: re-replace ( string matcher replacement -- result )
+    [ re-split ] dip join ;
+
+: next-match ( string matcher -- end/f match/f )
+    dupd first-match dup
+    [ [ split1-slice nip ] keep ] [ 2drop f f ] if ;
+
+: all-matches ( string matcher -- seq )
+    [ dup ] swap '[ _ next-match ] [ ] produce nip harvest ;
+
+: count-matches ( string matcher -- n )
+    all-matches length ;
diff --git a/basis/regexp/minimize/minimize-tests.factor b/basis/regexp/minimize/minimize-tests.factor
index 78a90ca3ba..5781e74634 100644
--- a/basis/regexp/minimize/minimize-tests.factor
+++ b/basis/regexp/minimize/minimize-tests.factor
@@ -1,6 +1,7 @@
 ! Copyright (C) 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
-USING: tools.test regexp.minimize assocs regexp accessors regexp.transition-tables ;
+USING: tools.test regexp.minimize assocs regexp regexp.syntax
+accessors regexp.transition-tables ;
 IN: regexp.minimize.tests
 
 [ t ] [ 1 2 H{ { { 1 2 } t } } same-partition? ] unit-test
diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor
index eeae9f8ea6..4a77f14561 100644
--- a/basis/regexp/regexp-docs.factor
+++ b/basis/regexp/regexp-docs.factor
@@ -1,6 +1,6 @@
 ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel strings help.markup help.syntax ;
+USING: kernel strings help.markup help.syntax regexp.matchers ;
 IN: regexp
 
 ABOUT: "regexp"
diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor
index 71df08285f..cbc582b295 100644
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@@ -1,5 +1,5 @@
 USING: regexp tools.test kernel sequences regexp.parser
-regexp.traversal eval strings multiline accessors ;
+regexp.traversal eval strings multiline accessors regexp.matchers ;
 IN: regexp-tests
 
 \ <regexp> must-infer
@@ -241,8 +241,8 @@ IN: regexp-tests
 [ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
 [ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
 
-[ t ] [ "xabc" R/ abc/ match-from-end >boolean ] unit-test
-[ t ] [ "xabc" R/ a[bB][cC]/ match-from-end >boolean ] unit-test
+[ t ] [ "xabc" R/ abc/ <reverse-matcher> match >boolean ] unit-test
+[ t ] [ "xabc" R/ a[bB][cC]/ <reverse-matcher> match >boolean ] unit-test
 
 [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
 [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor
index e9cd5328e2..45660ad309 100644
--- a/basis/regexp/regexp.factor
+++ b/basis/regexp/regexp.factor
@@ -5,26 +5,29 @@ assocs prettyprint.backend prettyprint.custom make lexer
 namespaces parser arrays fry locals regexp.minimize
 regexp.parser regexp.nfa regexp.dfa regexp.traversal
 regexp.transition-tables splitting sorting regexp.ast
-regexp.negation ;
+regexp.negation regexp.matchers regexp.compiler ;
 IN: regexp
 
 TUPLE: regexp
     { raw read-only }
     { parse-tree read-only }
     { options read-only }
-    dfa reverse-dfa ;
+    dfa reverse-dfa dfa-quot ;
 
 : make-regexp ( string ast -- regexp )
-    f f <options> f f regexp boa ; foldable
+    f f <options> f f f regexp boa ; foldable
     ! Foldable because, when the dfa slot is set,
     ! it'll be set to the same thing regardless of who sets it
 
 : <optioned-regexp> ( string options -- regexp )
     [ dup parse-regexp ] [ string>options ] bi*
-    f f regexp boa ;
+    f f f regexp boa ;
 
 : <regexp> ( string -- regexp ) "" <optioned-regexp> ;
 
+TUPLE: reverse-matcher regexp ;
+C: <reverse-matcher> reverse-matcher
+
 <PRIVATE
 
 : get-ast ( regexp -- ast )
@@ -33,76 +36,24 @@ TUPLE: regexp
 : compile-regexp ( regexp -- regexp )
     dup '[ [ _ get-ast ast>dfa ] unless* ] change-dfa ;
 
+: compile-dfa-quot ( regexp -- regexp )
+    dup '[ [ _ compile-regexp dfa>> dfa>quotation ] unless* ] change-dfa-quot ;
+
 : <reversed-option> ( ast -- reversed )
     "r" string>options <with-options> ;
 
 : compile-reverse ( regexp -- regexp )
     dup '[ [ _ get-ast <reversed-option> ast>dfa ] unless* ] change-reverse-dfa ;
 
-: (match) ( string regexp -- dfa-traverser )
-    compile-regexp dfa>> <dfa-traverser> do-match ;
+M: regexp match-index ( string regexp -- index/f )
+    dup dfa-quot>>
+    [ <quot-matcher> ]
+    [ compile-regexp dfa>> <dfa-matcher> ] ?if
+    match-index ;
 
-: (match-reversed) ( string regexp -- dfa-traverser )
-    [ <reversed> ] [ compile-reverse reverse-dfa>> ] bi*
-    <dfa-traverser> do-match ;
-
-PRIVATE>
-
-: match ( string regexp -- slice/f )
-    (match) return-match ;
-
-: match-from-end ( string regexp -- slice/f )
-    (match-reversed) return-match ;
-
-: matches? ( string regexp -- ? )
-    dupd match
-    [ [ length ] bi@ = ] [ drop f ] if* ;
-
-: match-head ( string regexp -- end/f ) match [ length ] [ f ] if* ;
-
-: match-at ( string m regexp -- n/f finished? )
-    [
-        2dup swap length > [ 2drop f f ] [ tail-slice t ] if
-    ] dip swap [ match-head f ] [ 2drop f t ] if ;
-
-: match-range ( string m regexp -- a/f b/f )
-    3dup match-at over [
-        drop nip rot drop dupd +
-    ] [
-        [ 3drop drop f f ] [ drop [ 1+ ] dip match-range ] if
-    ] if ;
-
-: first-match ( string regexp -- slice/f )
-    dupd 0 swap match-range rot over [ <slice> ] [ 3drop f ] if ;
-
-: re-cut ( string regexp -- end/f start )
-    dupd first-match
-    [ split1-slice swap ] [ "" like f swap ] if* ;
-
-<PRIVATE
-
-: (re-split) ( string regexp -- )
-    over [ [ re-cut , ] keep (re-split) ] [ 2drop ] if ;
-
-PRIVATE>
-
-: re-split ( string regexp -- seq )
-    [ (re-split) ] { } make ;
-
-: re-replace ( string regexp replacement -- result )
-    [ re-split ] dip join ;
-
-: next-match ( string regexp -- end/f match/f )
-    dupd first-match dup
-    [ [ split1-slice nip ] keep ] [ 2drop f f ] if ;
-
-: all-matches ( string regexp -- seq )
-    [ dup ] swap '[ _ next-match ] [ ] produce nip harvest ;
-
-: count-matches ( string regexp -- n )
-    all-matches length ;
-
-<PRIVATE
+M: reverse-matcher match-index ( string regexp -- index/f )
+    [ <reversed> ] [ regexp>> compile-reverse reverse-dfa>> ] bi*
+    <dfa-traverser> do-match match-index>> ;
 
 : find-regexp-syntax ( string -- prefix suffix )
     {
@@ -131,7 +82,7 @@ PRIVATE>
 
 : parsing-regexp ( accum end -- accum )
     lexer get [ take-until ] [ parse-noblank-token ] bi
-    <optioned-regexp> compile-regexp parsed ;
+    <optioned-regexp> compile-dfa-quot parsed ;
 
 PRIVATE>
 
diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor
index 7a0d83051b..e215cde416 100644
--- a/basis/regexp/traversal/traversal.factor
+++ b/basis/regexp/traversal/traversal.factor
@@ -1,7 +1,7 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors assocs combinators kernel math
-quotations sequences regexp.classes fry arrays
+quotations sequences regexp.classes fry arrays regexp.matchers
 combinators.short-circuit prettyprint regexp.nfa ;
 IN: regexp.traversal
 
@@ -9,16 +9,14 @@ TUPLE: dfa-traverser
     dfa-table
     current-state
     text
-    start-index current-index
-    matches ;
+    current-index
+    match-index ;
 
 : <dfa-traverser> ( text dfa -- match )
     dfa-traverser new
         swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
         swap >>text
-        0 >>start-index
-        0 >>current-index
-        V{ } clone >>matches ;
+        0 >>current-index ;
 
 : final-state? ( dfa-traverser -- ? )
     [ current-state>> ]
@@ -33,25 +31,11 @@ TUPLE: dfa-traverser
         [ end-of-text? ]
     } 1|| ;
 
-: save-final-state ( dfa-straverser -- )
-    [ current-index>> ] [ matches>> ] bi push ;
+: save-final-state ( dfa-traverser -- dfa-traverser )
+    dup current-index>> >>match-index ;
 
 : match-done? ( dfa-traverser -- ? )
-    dup final-state? [
-        dup save-final-state
-    ] when text-finished? ;
-
-: text-character ( dfa-traverser n -- ch )
-    [ text>> ] swap '[ current-index>> _ + ] bi nth ;
-
-: previous-text-character ( dfa-traverser -- ch )
-    -1 text-character ;
-
-: current-text-character ( dfa-traverser -- ch )
-    0 text-character ;
-
-: next-text-character ( dfa-traverser -- ch )
-    1 text-character ;
+    dup final-state? [ save-final-state ] when text-finished? ;
 
 : increment-state ( dfa-traverser state -- dfa-traverser )
     >>current-state
@@ -79,10 +63,7 @@ TUPLE: dfa-traverser
         [ increment-state do-match ] when*
     ] unless ;
 
-: return-match ( dfa-traverser -- slice/f )
-    dup matches>>
-    [ drop f ]
-    [
-        [ [ start-index>> ] [ text>> ] bi ]
-        [ peek ] bi* swap <slice>
-    ] if-empty ;
+TUPLE: dfa-matcher dfa ;
+C: <dfa-matcher> dfa-matcher
+M: dfa-matcher match-index 
+    dfa>> <dfa-traverser> do-match match-index>> ;
diff --git a/extra/benchmark/regex-dna/regex-dna.factor b/extra/benchmark/regex-dna/regex-dna.factor
index 8c0aee596d..29cb0b7357 100644
--- a/extra/benchmark/regex-dna/regex-dna.factor
+++ b/extra/benchmark/regex-dna/regex-dna.factor
@@ -1,7 +1,7 @@
 ! Copyright (C) 2008 Slava Pestov.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors regexp prettyprint io io.encodings.ascii
-io.files kernel sequences assocs namespaces ;
+USING: accessors regexp.matchers prettyprint io io.encodings.ascii
+io.files kernel sequences assocs namespaces regexp ;
 IN: benchmark.regex-dna
 
 ! Based on http://shootout.alioth.debian.org/gp4/benchmark.php?test=regexdna&lang=ruby&id=1