Almost done with regexp cleanup

2009-02-18 12:27:07 -06:00 · 2009-02-18 12:27:07 -06:00 · b8845cb87e
parent 242cfb5c19
commit b8845cb87e
13 changed files with 271 additions and 319 deletions
--- a/basis/regexp/ast/ast.factor
+++ b/basis/regexp/ast/ast.factor
@ -0,0 +1,53 @@
 ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: kernel arrays accessors fry sequences ;
 FROM: math.ranges => [a,b] ;
 IN: regexp.ast
 TUPLE: primitive-class class ;
 C: <primitive-class> primitive-class
 TUPLE: negation term ;
 C: <negation> negation
 TUPLE: from-to n m ;
 C: <from-to> from-to
 TUPLE: at-least n ;
 C: <at-least> at-least
 TUPLE: concatenation seq ;
 C: <concatenation> concatenation
 TUPLE: alternation seq ;
 C: <alternation> alternation
 TUPLE: star term ;
 C: <star> star
 TUPLE: with-options tree options ;
 C: <with-options> with-options
 TUPLE: options on off ;
 C: <options> options
 SINGLETONS: unix-lines dotall multiline comments case-insensitive
 unicode-case reversed-regexp ;
 : <maybe> ( term -- term' )
    f <concatenation> 2array <alternation> ;
 : <plus> ( term -- term' )
    dup <star> 2array <concatenation> ;
 : repetition ( n term -- term' )
    <array> <concatenation> ;
 GENERIC: <times> ( term times -- term' )
 M: at-least <times>
    n>> swap [ repetition ] [ <star> ] bi 2array <concatenation> ;
 M: from-to <times>
    [ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map <alternation> ;
 : char-class ( ranges ? -- term )
    [ <alternation> ] dip [ <negation> ] when ;
--- a/basis/regexp/backend/backend.factor
+++ b/basis/regexp/backend/backend.factor
@ -1,27 +0,0 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors hashtables kernel math vectors ;
 IN: regexp.backend
 TUPLE: regexp
    raw
    { options hashtable }
    stack
    parse-tree
    nfa-table
    dfa-table
    minimized-table
    matchers
    { nfa-traversal-flags hashtable }
    { dfa-traversal-flags hashtable }
    { state integer }
    { new-states vector }
    { visited-states hashtable } ;
 : reset-regexp ( regexp -- regexp )
    0 >>state
    V{ } clone >>stack
    V{ } clone >>new-states
    H{ } clone >>visited-states ;
 SYMBOL: current-regexp
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@ -1,9 +1,31 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors kernel math math.order words regexp.utils
+USING: accessors kernel math math.order words
-ascii unicode.categories combinators.short-circuit ;
+ascii unicode.categories combinators.short-circuit sequences ;
 IN: regexp.classes
 : punct? ( ch -- ? )
    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
 : c-identifier-char? ( ch -- ? )
    { [ alpha? ] [ CHAR: _ = ] } 1|| ;
 : java-blank? ( ch -- ? )
    {
        CHAR: \s CHAR: \t CHAR: \n
        HEX: b HEX: 7 CHAR: \r
    } member? ;
 : java-printable? ( ch -- ? )
    [ [ alpha? ] [ punct? ] ] 1|| ;
 : hex-digit? ( ch -- ? )
    {
        [ CHAR: A CHAR: F between? ]
        [ CHAR: a CHAR: f between? ]
        [ CHAR: 0 CHAR: 9 between? ]
    } 1|| ;
 SINGLETONS: any-char any-char-no-nl
 letter-class LETTER-class Letter-class digit-class
 alpha-class non-newline-blank-class
@ -14,8 +36,8 @@ unmatchable-class terminator-class word-boundary-class ;
 SINGLETONS: beginning-of-input beginning-of-line
 end-of-input end-of-line ;
-MIXIN: node
+TUPLE: range from to ;
-TUPLE: character-class-range from to ; INSTANCE: character-class-range node
+C: <range> range
 GENERIC: class-member? ( obj class -- ? )
@ -23,7 +45,7 @@ M: t class-member? ( obj class -- ? ) 2drop f ;
 M: integer class-member? ( obj class -- ? ) 2drop f ;
-M: character-class-range class-member? ( obj class -- ? )
+M: range class-member? ( obj class -- ? )
    [ from>> ] [ to>> ] bi between? ;
 M: any-char class-member? ( obj class -- ? )
--- a/basis/regexp/dfa/dfa.factor
+++ b/basis/regexp/dfa/dfa.factor
@ -2,83 +2,74 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays assocs combinators fry kernel locals
 math math.order regexp.nfa regexp.transition-tables sequences
-sets sorting vectors regexp.utils sequences.deep ;
+sets sorting vectors sequences.deep ;
 USING: io prettyprint threads ;
 IN: regexp.dfa
-: find-delta ( states transition regexp -- new-states )
+: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
-    nfa-table>> transitions>>
+    [ [ dup slip ] dip pick over call ] dip dupd =
-    rot [ swap at at ] with with gather sift ;
+    [ 3drop ] [ (while-changes) ] if ; inline recursive
-: (find-epsilon-closure) ( states regexp -- new-states )
+: while-changes ( obj quot pred -- obj' )
    3dup nip call (while-changes) ; inline
 : find-delta ( states transition nfa -- new-states )
    transitions>> '[ _ swap _ at at ] gather sift ;
 : (find-epsilon-closure) ( states nfa -- new-states )
    eps swap find-delta ;
-: find-epsilon-closure ( states regexp -- new-states )
+: find-epsilon-closure ( states nfa -- new-states )
    '[ dup _ (find-epsilon-closure) union ] [ length ] while-changes
    natural-sort ;
-: find-closure ( states transition regexp -- new-states )
+: find-closure ( states transition nfa -- new-states )
-    [ find-delta ] 2keep nip find-epsilon-closure ;
+    [ find-delta ] keep find-epsilon-closure ;
-: find-start-state ( regexp -- state )
+: find-start-state ( nfa -- state )
-    [ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ;
+    [ start-state>> 1vector ] keep find-epsilon-closure ;
-: find-transitions ( seq1 regexp -- seq2 )
+: find-transitions ( dfa-state nfa -- next-dfa-state )
-    nfa-table>> transitions>>
+    transitions>>
-    [ at keys ] curry gather
+    '[ _ at keys ] gather
    eps swap remove ;
-: add-todo-state ( state regexp -- )
+: add-todo-state ( state visited-states new-states -- )
-    2dup visited-states>> key? [
+    3dup drop key? [ 3drop ] [
-        2drop
+        [ conjoin ] [ push ] bi-curry* bi
    ] [
        [ visited-states>> conjoin ]
        [ new-states>> push ] 2bi
    ] if ;
-: new-transitions ( regexp -- )
+:: new-transitions ( nfa dfa new-states visited-states -- nfa dfa )
-    dup new-states>> [
+    new-states [ nfa dfa ] [
-        drop
+        new-states pop :> state
-    ] [
+        state nfa-table find-transitions
-        dupd pop dup pick find-transitions rot
+        [| trans |
-        [
+            state trans nfa find-closure :> new-state
-            [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep
+            state visited-states new-state add-todo-state
-            [ swapd transition make-transition ] dip
+            state new-state trans transition make-transition dfa add-transition
-            dfa-table>> add-transition 
+        ] each
-        ] curry with each
+        nfa dfa new-states visited-states new-transitions
        new-transitions
    ] if-empty ;
 : states ( hashtable -- array )
    [ keys ]
    [ values [ values concat ] map concat append ] bi ;
-: set-final-states ( regexp -- )
+: set-final-states ( nfa dfa -- )
-    dup
+    [
-    [ nfa-table>> final-states>> keys ]
+        [ final-states>> keys ]
-    [ dfa-table>> transitions>> states ] bi
+        [ transitions>> states ] bi*
-    [ intersects? ] with filter
+        [ intersects? ] with filter
-
+    ] [ final-states>> ] bi
    swap dfa-table>> final-states>>
    [ conjoin ] curry each ;
-: set-initial-state ( regexp -- )
+: initialize-dfa ( nfa -- dfa )
-    dup
+    <transition-table>
-    [ dfa-table>> ] [ find-start-state ] bi
+        swap find-start-state >>start-state ;
    [ >>start-state drop ] keep
    1vector >>new-states drop ;
-: set-traversal-flags ( regexp -- )
+: construct-dfa ( nfa -- dfa )
-    dup
+    dup initialize-dfa
-    [ nfa-traversal-flags>> ]
+    dup start-state>> 1vector
-    [ dfa-table>> transitions>> keys ] bi
+    H{ } clone
-    [ [ nip ] [ [ swap at ] with map concat ] 2bi ] with H{ } map>assoc
+    new-transitions
-    >>dfa-traversal-flags drop ;
+    [ set-final-states ] keep ;
 : construct-dfa ( regexp -- )
    {
        [ set-initial-state ]
        [ new-transitions ]
        [ set-final-states ]
        [ set-traversal-flags ]
    } cleave ;
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@ -1,9 +1,10 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs grouping kernel regexp.backend
+USING: accessors arrays assocs grouping kernel
-locals math namespaces regexp.parser sequences fry quotations
+locals math namespaces sequences fry quotations
-math.order math.ranges vectors unicode.categories regexp.utils
+math.order math.ranges vectors unicode.categories
-regexp.transition-tables words sets regexp.classes unicode.case.private ;
+regexp.transition-tables words sets 
 unicode.case.private regexp.ast regexp.classes ;
 ! This uses unicode.case.private for ch>upper and ch>lower
 ! but case-insensitive matching should be done by case-folding everything
 ! before processing starts
@ -13,34 +14,49 @@ ERROR: feature-is-broken feature ;
 SYMBOL: negated?
 : negate ( -- )
    negated? [ not ] change ;
 SINGLETON: eps
-: options ( -- obj ) current-regexp get options>> ;
+SYMBOL: option-stack
-: option? ( obj -- ? ) options key? ;
+SYMBOL: combine-stack
-: option-on ( obj -- ) options conjoin ;
+SYMBOL: state
-: option-off ( obj -- ) options delete-at ;
+: next-state ( -- state )
    state [ get ] [ inc ] bi ;
-: next-state ( regexp -- state )
+SYMBOL: nfa-table
    [ state>> ] [ [ 1+ ] change-state drop ] bi ;
-: set-start-state ( regexp -- )
+: set-each ( keys value hashtable -- )
-    dup stack>> [
+    '[ _ swap _ set-at ] each ;
-        drop
+
-    ] [
+: options>hash ( options -- hashtable )
-        [ nfa-table>> ] [ pop first ] bi* >>start-state drop
+    H{ } clone [
-    ] if-empty ;
+        [ [ on>> t ] dip set-each ]
        [ [ off>> f ] dip set-each ] 2bi
    ] keep ;
 : using-options ( options quot -- )
    [ options>hash option-stack [ ?push ] change ] dip
    call option-stack get pop* ; inline
 : option? ( obj -- ? )
    option-stack get assoc-stack ;
 : set-start-state ( -- nfa-table )
    nfa-table get
        combine-stack get pop first >>start-state ;
 GENERIC: nfa-node ( node -- )
 :: add-simple-entry ( obj class -- )
-    [let* | regexp [ current-regexp get ]
+    [let* | s0 [ next-state ]
-            s0 [ regexp next-state ]
+            s1 [ next-state ]
-            s1 [ regexp next-state ]
+            stack [ combine-stack get ]
-            stack [ regexp stack>> ]
+            table [ nfa-table get ] |
            table [ regexp nfa-table>> ] |
        negated? get [
            s0 f obj class make-transition table add-transition
            s0 s1 <default-transition> table add-transition
@ -51,9 +67,8 @@ GENERIC: nfa-node ( node -- )
        t s1 table final-states>> set-at ] ;
 :: concatenate-nodes ( -- )
-    [let* | regexp [ current-regexp get ]
+    [let* | stack [ combine-stack get ]
-            stack [ regexp stack>> ]
+            table [ nfa-table get ]
            table [ regexp nfa-table>> ]
            s2 [ stack peek first ]
            s3 [ stack pop second ]
            s0 [ stack peek first ]
@ -63,15 +78,14 @@ GENERIC: nfa-node ( node -- )
        s0 s3 2array stack push ] ;
 :: alternate-nodes ( -- )
-    [let* | regexp [ current-regexp get ]
+    [let* | stack [ combine-stack get ]
-            stack [ regexp stack>> ]
+            table [ nfa-table get ]
            table [ regexp nfa-table>> ]
            s2 [ stack peek first ]
            s3 [ stack pop second ]
            s0 [ stack peek first ]
            s1 [ stack pop second ]
-            s4 [ regexp next-state ]
+            s4 [ next-state ]
-            s5 [ regexp next-state ] |
+            s5 [ next-state ] |
        s4 s0 eps <literal-transition> table add-transition
        s4 s2 eps <literal-transition> table add-transition
        s1 s5 eps <literal-transition> table add-transition
@ -83,13 +97,12 @@ GENERIC: nfa-node ( node -- )
 M: star nfa-node ( node -- )
    term>> nfa-node
-    [let* | regexp [ current-regexp get ]
+    [let* | stack [ combine-stack get ]
            stack [ regexp stack>> ]
            s0 [ stack peek first ]
            s1 [ stack pop second ]
-            s2 [ regexp next-state ]
+            s2 [ next-state ]
-            s3 [ regexp next-state ]
+            s3 [ next-state ]
-            table [ regexp nfa-table>> ] |
+            table [ nfa-table get ] |
        s1 table final-states>> delete-at
        t s3 table final-states>> set-at
        s1 s0 eps <literal-transition> table add-transition
@ -99,58 +112,53 @@ M: star nfa-node ( node -- )
        s2 s3 2array stack push ] ;
 M: concatenation nfa-node ( node -- )
-    seq>>
+    seq>> [ eps literal-transition add-simple-entry ] [
-    reversed-regexp option? [ <reversed> ] when
+        reversed-regexp option? [ <reversed> ] when
-    [ [ nfa-node ] each ]
+        [ [ nfa-node ] each ]
-    [ length 1- [ concatenate-nodes ] times ] bi ;
+        [ length 1- [ concatenate-nodes ] times ] bi
    ] if-empty ;
 M: alternation nfa-node ( node -- )
    seq>>
    [ [ nfa-node ] each ]
    [ length 1- [ alternate-nodes ] times ] bi ;
-M: constant nfa-node ( node -- )
+M: integer nfa-node ( node -- )
    case-insensitive option? [
-        dup char>> [ ch>lower ] [ ch>upper ] bi
+        dup [ ch>lower ] [ ch>upper ] bi
        2dup = [
            2drop
-            char>> literal-transition add-simple-entry
+            literal-transition add-simple-entry
        ] [
            [ literal-transition add-simple-entry ] bi@
            alternate-nodes drop
        ] if
    ] [
-        char>> literal-transition add-simple-entry
+        literal-transition add-simple-entry
    ] if ;
-M: word nfa-node ( node -- ) class-transition add-simple-entry ;
+M: primitive-class nfa-node ( node -- )
    class>> dup
    { letter-class LETTER-class } member? case-insensitive option? and
    [ drop Letter-class ] when
    class-transition add-simple-entry ;
 M: any-char nfa-node ( node -- )
    [ dotall option? ] dip any-char-no-nl ?
    class-transition add-simple-entry ;
-M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
+M: negation nfa-node ( node -- )
    negate term>> nfa-node negate ;
-M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
+M: range nfa-node ( node -- )
 : choose-letter-class ( node -- node' )
    case-insensitive option? Letter-class rot ? ;
 M: letter-class nfa-node ( node -- )
    choose-letter-class class-transition add-simple-entry ;
 M: LETTER-class nfa-node ( node -- )
    choose-letter-class class-transition add-simple-entry ;
 M: character-class-range nfa-node ( node -- )
    case-insensitive option? [
        ! This should be implemented for Unicode by case-folding
        ! the input and all strings in the regexp.
        dup [ from>> ] [ to>> ] bi
        2dup [ Letter? ] bi@ and [
            rot drop
-            [ [ ch>lower ] bi@ character-class-range boa ]
+            [ [ ch>lower ] bi@ <range> ]
-            [ [ ch>upper ] bi@ character-class-range boa ] 2bi 
+            [ [ ch>upper ] bi@ <range> ] 2bi 
            [ class-transition add-simple-entry ] bi@
            alternate-nodes
        ] [
@ -161,14 +169,15 @@ M: character-class-range nfa-node ( node -- )
        class-transition add-simple-entry
    ] if ;
-M: option nfa-node ( node -- )
+M: with-options nfa-node ( node -- )
-    [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
+    dup options>> [ tree>> nfa-node ] using-options ;
    eps literal-transition add-simple-entry ;
-: construct-nfa ( regexp -- )
+: construct-nfa ( ast -- nfa-table )
    [
-        reset-regexp
+        negated? off
-        [ current-regexp set ]
+        V{ } clone combine-stack set
-        [ parse-tree>> nfa-node ]
+        0 state set
-        [ set-start-state ] tri
+        <transition-table> clone nfa-table set
        nfa-node
        set-start-state
    ] with-scope ;
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@ -1,28 +1,9 @@
 ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: peg.ebnf kernel math.parser sequences assocs arrays
+USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
-combinators regexp.classes strings splitting peg locals ;
+combinators regexp.classes strings splitting peg locals accessors
 regexp.ast ;
 IN: regexp.parser
 TUPLE: range from to ;
 TUPLE: char-class ranges ;
 TUPLE: primitive-class class ;
 TUPLE: not-char-class ranges ;
 TUPLE: not-primitive-class class ;
 TUPLE: from-to n m ;
 TUPLE: at-least n ;
 TUPLE: up-to n ;
 TUPLE: exactly n ;
 TUPLE: times expression times ;
 TUPLE: concatenation seq ;
 TUPLE: alternation seq ;
 TUPLE: maybe term ;
 TUPLE: star term ;
 TUPLE: plus term ;
 TUPLE: with-options tree options ;
 TUPLE: ast ^? $? tree ;
 SINGLETON: any-char
 : allowed-char? ( ch -- ? )
    ".()|[*+?" member? not ;
@ -64,21 +45,16 @@ ERROR: bad-class name ;
        { CHAR: e [ HEX: 1b ] }
        { CHAR: \\ [ CHAR: \\ ] }
-        { CHAR: w [ c-identifier-class primitive-class boa ] }
+        { CHAR: w [ c-identifier-class <primitive-class> ] }
-        { CHAR: W [ c-identifier-class not-primitive-class boa ] }
+        { CHAR: W [ c-identifier-class <primitive-class> <negation> ] }
-        { CHAR: s [ java-blank-class primitive-class boa ] }
+        { CHAR: s [ java-blank-class <primitive-class> ] }
-        { CHAR: S [ java-blank-class not-primitive-class boa ] }
+        { CHAR: S [ java-blank-class <primitive-class> <negation> ] }
-        { CHAR: d [ digit-class primitive-class boa ] }
+        { CHAR: d [ digit-class <primitive-class> ] }
-        { CHAR: D [ digit-class not-primitive-class boa ] }
+        { CHAR: D [ digit-class <primitive-class> <negation> ] }
        [ ]
    } case ;
 TUPLE: options on off ;
 SINGLETONS: unix-lines dotall multiline comments case-insensitive
 unicode-case reversed-regexp ;
 : options-assoc ( -- assoc )
    H{
        { CHAR: i case-insensitive }
@ -98,19 +74,30 @@ unicode-case reversed-regexp ;
    options-assoc value-at ;
 : parse-options ( on off -- options )
-    [ [ ch>option ] map ] bi@ options boa ;
+    [ [ ch>option ] { } map-as ] bi@ <options> ;
-! TODO: make range syntax better (negation, and, etc),
+: string>options ( string -- options )
-!       add syntax for various parenthized things,
+    "-" split1 parse-options ;
 : options>string ( options -- string )
    [ on>> ] [ off>> ] bi
    [ [ option>ch ] map ] bi@
    [ "-" swap 3append ] unless-empty
    "" like ;
 ! TODO: add syntax for various parenthized things,
 !       add greedy and nongreedy forms of matching
 ! (once it's all implemented)
-EBNF: (parse-regexp)
+EBNF: parse-regexp
 CharacterInBracket = !("}") Character
-Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-class boa ]]
+QuotedCharacter = !("\\E") .
-       | "P{" CharacterInBracket*:s "}" => [[ s >string name>class not-primitive-class boa ]]
+
 Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]]
       | "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <negation> ]]
       | "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
       | "u" Character:a Character:b Character:c Character:d
            => [[ { a b c d } hex> ensure-number ]]
       | "x" Character:a Character:b
@ -119,30 +106,30 @@ Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-cla
            => [[ { a b c } oct> ensure-number ]]
       | . => [[ lookup-escape ]]
-Character = "\\" Escape:e => [[ e ]]
+EscapeSequence = "\\" Escape:e => [[ e ]]
          | . ?[ allowed-char? ]?
-AnyRangeCharacter = Character | "["
+Character = EscapeSequence | . ?[ allowed-char? ]?
 AnyRangeCharacter = EscapeSequence | .
 RangeCharacter = !("]") AnyRangeCharacter
-Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]
+Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
      | RangeCharacter
-StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]
+StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
           | AnyRangeCharacter
 Ranges = StartRange:s Range*:r => [[ r s prefix ]]
-CharClass = "^" Ranges:e => [[ e not-char-class boa ]]
+CharClass = "^"?:n Ranges:e => [[ e n char-class ]]
          | Ranges:e => [[ e char-class boa ]]
 Options = [idmsux]*
 Parenthized = "?:" Alternation:a => [[ a ]]
            | "?" Options:on "-"? Options:off ":" Alternation:a
-                => [[ a on off parse-options with-options boa ]]
+                => [[ a on off parse-options <with-options> ]]
-            | "?#" [^)]* => [[ ignore ]]
+            | "?#" [^)]* => [[ f ]]
            | Alternation
 Element = "(" Parenthized:p ")" => [[ p ]]
@ -152,32 +139,24 @@ Element = "(" Parenthized:p ")" => [[ p ]]
 Number = (!(","|"}").)* => [[ string>number ensure-number ]]
-Times = "," Number:n "}" => [[ n up-to boa ]]
+Times = "," Number:n "}" => [[ 0 n <from-to> ]]
-      | Number:n ",}" => [[ n at-least boa ]]
+      | Number:n ",}" => [[ n <at-least> ]]
-      | Number:n "}" => [[ n exactly boa ]]
+      | Number:n "}" => [[ n n <from-to> ]]
      | "}" => [[ bad-number ]]
-      | Number:n "," Number:m "}" => [[ n m from-to boa ]]
+      | Number:n "," Number:m "}" => [[ n m <from-to> ]]
-Repeated = Element:e "{" Times:t => [[ e t times boa ]]
+Repeated = Element:e "{" Times:t => [[ e t <times> ]]
-         | Element:e "?" => [[ e maybe boa ]]
+         | Element:e "?" => [[ e <maybe> ]]
-         | Element:e "*" => [[ e star boa ]]
+         | Element:e "*" => [[ e <star> ]]
-         | Element:e "+" => [[ e plus boa ]]
+         | Element:e "+" => [[ e <plus> ]]
         | Element
-Concatenation = Repeated*:r => [[ r concatenation boa ]]
+Concatenation = Repeated*:r => [[ r sift <concatenation> ]]
 Alternation = Concatenation:c ("|" Concatenation)*:a
-                => [[ a empty? [ c ] [ a values c prefix alternation boa ] if ]]
+                => [[ a empty? [ c ] [ a values c prefix <alternation> ] if ]]
 End = !(.)
 Main = Alternation End
 ;EBNF
 : parse-regexp ( string -- regexp )
    ! Hack because I want $ allowable in regexps,
    ! but with special behavior at the end
    ! This fails if the regexp is stupid, though...
    dup first CHAR: ^ = tuck [ rest ] when
    dup peek CHAR: $ = tuck [ but-last ] when
    (parse-regexp) ast boa ;
--- a/basis/regexp/regexp-docs.factor
+++ b/basis/regexp/regexp-docs.factor
@ -1,6 +1,6 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel strings help.markup help.syntax regexp.backend ;
+USING: kernel strings help.markup help.syntax ;
 IN: regexp
 HELP: <regexp>
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@ -189,8 +189,8 @@ IN: regexp-tests
 [ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test
 [ t ] [ "x" "\\x78" <regexp> matches? ] unit-test
 [ f ] [ "y" "\\x78" <regexp> matches? ] unit-test
-[ t ] [ "x" "\\u000078" <regexp> matches? ] unit-test
+[ t ] [ "x" "\\u0078" <regexp> matches? ] unit-test
-[ f ] [ "y" "\\u000078" <regexp> matches? ] unit-test
+[ f ] [ "y" "\\u0078" <regexp> matches? ] unit-test
 [ t ] [ "ab" "a+b" <regexp> matches? ] unit-test
 [ f ] [ "b" "a+b" <regexp> matches? ] unit-test
@ -317,16 +317,6 @@ IN: regexp-tests
 ! Bug in parsing word
 [ t ] [ "a" R' a' matches? ] unit-test
 ! Convert to lowercase until E
 [ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
 [ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
 ! Convert to uppercase until E
 [ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
 [ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
 ! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
 ! [ t ] [ "a" R/ ^a/ matches? ] unit-test
 ! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
 ! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
@ -370,10 +360,10 @@ IN: regexp-tests
 ! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
 ! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
-! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
+! [ t ] [ "a" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\n" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\r" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test
 ! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
 ! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
--- a/basis/regexp/regexp.factor
+++ b/basis/regexp/regexp.factor
@ -2,33 +2,15 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors combinators kernel math sequences strings sets
 assocs prettyprint.backend prettyprint.custom make lexer
-namespaces parser arrays fry regexp.backend regexp.utils
+namespaces parser arrays fry locals
 regexp.parser regexp.nfa regexp.dfa regexp.traversal
-regexp.transition-tables splitting sorting ;
+regexp.transition-tables splitting sorting regexp.ast ;
 IN: regexp
-: default-regexp ( string -- regexp )
+TUPLE: regexp raw options parse-tree dfa ;
    regexp new
        swap >>raw
        <transition-table> >>nfa-table
        <transition-table> >>dfa-table
        <transition-table> >>minimized-table
        H{ } clone >>nfa-traversal-flags
        H{ } clone >>dfa-traversal-flags
        H{ } clone >>options
        H{ } clone >>matchers
        reset-regexp ;
 : construct-regexp ( regexp -- regexp' )
    {
        [ dup raw>> parse-regexp >>parse-tree drop ]
        [ construct-nfa ]
        [ construct-dfa ]
        [ ]
    } cleave ;
 : (match) ( string regexp -- dfa-traverser )
-    <dfa-traverser> do-match ; inline
+    dfa>> <dfa-traverser> do-match ; inline
 : match ( string regexp -- slice/f )
    (match) return-match ;
@ -94,17 +76,17 @@ IN: regexp
        { "R| "  "|"  }
    } swap [ subseq? not nip ] curry assoc-find drop ;
 : string>options ( string -- options )
    [ ch>option dup ] H{ } map>assoc ;
 : options>string ( options -- string )
    keys [ option>ch ] map natural-sort >string ;
 PRIVATE>
-: <optioned-regexp> ( string option-string -- regexp )
+:: <optioned-regexp> ( string options -- regexp )
-    [ default-regexp ] [ string>options ] bi* >>options
+    string parse-regexp :> tree
-    construct-regexp ;
+    options parse-options :> opt
    tree opt <with-options> :> ast
    regexp new
        string >>raw
        opt >>options
        tree >>parse-tree
        tree opt <with-options> construct-nfa construct-dfa >>dfa ;
 : <regexp> ( string -- regexp ) "" <optioned-regexp> ;
--- a/basis/regexp/transition-tables/transition-tables.factor
+++ b/basis/regexp/transition-tables/transition-tables.factor
@ -1,7 +1,7 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays assocs fry hashtables kernel sequences
-vectors regexp.utils ;
+vectors ;
 IN: regexp.transition-tables
 TUPLE: transition from to obj ;
--- a/basis/regexp/traversal/traversal.factor
+++ b/basis/regexp/traversal/traversal.factor
@ -1,8 +1,8 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors assocs combinators kernel math
-quotations sequences regexp.parser regexp.classes fry arrays
+quotations sequences regexp.classes fry arrays
-combinators.short-circuit regexp.utils prettyprint regexp.nfa ;
+combinators.short-circuit prettyprint regexp.nfa ;
 IN: regexp.traversal
 TUPLE: dfa-traverser
@ -13,8 +13,7 @@ TUPLE: dfa-traverser
    start-index current-index
    matches ;
-: <dfa-traverser> ( text regexp -- match )
+: <dfa-traverser> ( text dfa -- match )
    dfa-table>>
    dfa-traverser new
        swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
        swap >>text
--- a/basis/regexp/utils/utils-tests.factor
+++ b/basis/regexp/utils/utils-tests.factor
@ -1,4 +0,0 @@
 USING: regexp.utils tools.test ;
 IN: regexp.utils.tests
 [ [ ] [ ] while-changes ] must-infer
--- a/basis/regexp/utils/utils.factor
+++ b/basis/regexp/utils/utils.factor
@ -1,42 +0,0 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays assocs io kernel math math.order
 namespaces regexp.backend sequences unicode.categories
 math.ranges fry combinators.short-circuit vectors ;
 IN: regexp.utils
 : (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
    [ [ dup slip ] dip pick over call ] dip dupd =
    [ 3drop ] [ (while-changes) ] if ; inline recursive
 : while-changes ( obj quot pred -- obj' )
    pick over call (while-changes) ; inline
 ERROR: bad-octal number ;
 ERROR: bad-hex number ;
 : check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
 : check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ;
 : decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
 : hex-digit? ( n -- ? )
    {
        [ decimal-digit? ]
        [ CHAR: a CHAR: f between? ]
        [ CHAR: A CHAR: F between? ]
    } 1|| ;
 : punct? ( n -- ? )
    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
 : c-identifier-char? ( ch -- ? )
    { [ alpha? ] [ CHAR: _ = ] } 1|| ;
 : java-blank? ( n -- ? )
    {
        CHAR: \s CHAR: \t CHAR: \n
        HEX: b HEX: 7 CHAR: \r
    } member? ;
 : java-printable? ( n -- ? )
    [ [ alpha? ] [ punct? ] ] 1|| ;