Almost done with regexp cleanup

2009-02-18 12:27:07 -06:00 · 2009-02-18 12:27:07 -06:00 · b8845cb87e
parent 242cfb5c19
commit b8845cb87e
13 changed files with 271 additions and 319 deletions
--- a/basis/regexp/ast/ast.factor
+++ b/basis/regexp/ast/ast.factor
@ -0,0 +1,53 @@
+! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel arrays accessors fry sequences ;
+FROM: math.ranges => [a,b] ;
+IN: regexp.ast
+
+TUPLE: primitive-class class ;
+C: <primitive-class> primitive-class
+
+TUPLE: negation term ;
+C: <negation> negation
+
+TUPLE: from-to n m ;
+C: <from-to> from-to
+
+TUPLE: at-least n ;
+C: <at-least> at-least
+
+TUPLE: concatenation seq ;
+C: <concatenation> concatenation
+
+TUPLE: alternation seq ;
+C: <alternation> alternation
+
+TUPLE: star term ;
+C: <star> star
+
+TUPLE: with-options tree options ;
+C: <with-options> with-options
+
+TUPLE: options on off ;
+C: <options> options
+
+SINGLETONS: unix-lines dotall multiline comments case-insensitive
+unicode-case reversed-regexp ;
+
+: <maybe> ( term -- term' )
+    f <concatenation> 2array <alternation> ;
+
+: <plus> ( term -- term' )
+    dup <star> 2array <concatenation> ;
+
+: repetition ( n term -- term' )
+    <array> <concatenation> ;
+
+GENERIC: <times> ( term times -- term' )
+M: at-least <times>
+    n>> swap [ repetition ] [ <star> ] bi 2array <concatenation> ;
+M: from-to <times>
+    [ n>> ] [ m>> ] bi [a,b] swap '[ _ repetition ] map <alternation> ;
+
+: char-class ( ranges ? -- term )
+    [ <alternation> ] dip [ <negation> ] when ;
--- a/basis/regexp/backend/backend.factor
+++ b/basis/regexp/backend/backend.factor
@ -1,27 +0,0 @@
-! Copyright (C) 2008 Doug Coleman.
-! See http://factorcode.org/license.txt for BSD license.
-USING: accessors hashtables kernel math vectors ;
-IN: regexp.backend
-
-TUPLE: regexp
-    raw
-    { options hashtable }
-    stack
-    parse-tree
-    nfa-table
-    dfa-table
-    minimized-table
-    matchers
-    { nfa-traversal-flags hashtable }
-    { dfa-traversal-flags hashtable }
-    { state integer }
-    { new-states vector }
-    { visited-states hashtable } ;
-
-: reset-regexp ( regexp -- regexp )
-    0 >>state
-    V{ } clone >>stack
-    V{ } clone >>new-states
-    H{ } clone >>visited-states ;
-
-SYMBOL: current-regexp
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@ -1,9 +1,31 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors kernel math math.order words regexp.utils
-ascii unicode.categories combinators.short-circuit ;
+USING: accessors kernel math math.order words
+ascii unicode.categories combinators.short-circuit sequences ;
 IN: regexp.classes

+: punct? ( ch -- ? )
+    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
+
+: c-identifier-char? ( ch -- ? )
+    { [ alpha? ] [ CHAR: _ = ] } 1|| ;
+
+: java-blank? ( ch -- ? )
+    {
+        CHAR: \s CHAR: \t CHAR: \n
+        HEX: b HEX: 7 CHAR: \r
+    } member? ;
+
+: java-printable? ( ch -- ? )
+    [ [ alpha? ] [ punct? ] ] 1|| ;
+
+: hex-digit? ( ch -- ? )
+    {
+        [ CHAR: A CHAR: F between? ]
+        [ CHAR: a CHAR: f between? ]
+        [ CHAR: 0 CHAR: 9 between? ]
+    } 1|| ;
+
 SINGLETONS: any-char any-char-no-nl
 letter-class LETTER-class Letter-class digit-class
 alpha-class non-newline-blank-class
@ -14,8 +36,8 @@ unmatchable-class terminator-class word-boundary-class ;
 SINGLETONS: beginning-of-input beginning-of-line
 end-of-input end-of-line ;

-MIXIN: node
-TUPLE: character-class-range from to ; INSTANCE: character-class-range node
+TUPLE: range from to ;
+C: <range> range

 GENERIC: class-member? ( obj class -- ? )

@ -23,7 +45,7 @@ M: t class-member? ( obj class -- ? ) 2drop f ;

 M: integer class-member? ( obj class -- ? ) 2drop f ;

-M: character-class-range class-member? ( obj class -- ? )
+M: range class-member? ( obj class -- ? )
    [ from>> ] [ to>> ] bi between? ;

 M: any-char class-member? ( obj class -- ? )
--- a/basis/regexp/dfa/dfa.factor
+++ b/basis/regexp/dfa/dfa.factor
@ -2,83 +2,74 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays assocs combinators fry kernel locals
 math math.order regexp.nfa regexp.transition-tables sequences
-sets sorting vectors regexp.utils sequences.deep ;
+sets sorting vectors sequences.deep ;
 USING: io prettyprint threads ;
 IN: regexp.dfa

-: find-delta ( states transition regexp -- new-states )
-    nfa-table>> transitions>>
-    rot [ swap at at ] with with gather sift ;
+: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
+    [ [ dup slip ] dip pick over call ] dip dupd =
+    [ 3drop ] [ (while-changes) ] if ; inline recursive

-: (find-epsilon-closure) ( states regexp -- new-states )
+: while-changes ( obj quot pred -- obj' )
+    3dup nip call (while-changes) ; inline
+
+: find-delta ( states transition nfa -- new-states )
+    transitions>> '[ _ swap _ at at ] gather sift ;
+
+: (find-epsilon-closure) ( states nfa -- new-states )
    eps swap find-delta ;

-: find-epsilon-closure ( states regexp -- new-states )
+: find-epsilon-closure ( states nfa -- new-states )
    '[ dup _ (find-epsilon-closure) union ] [ length ] while-changes
    natural-sort ;

-: find-closure ( states transition regexp -- new-states )
-    [ find-delta ] 2keep nip find-epsilon-closure ;
+: find-closure ( states transition nfa -- new-states )
+    [ find-delta ] keep find-epsilon-closure ;

-: find-start-state ( regexp -- state )
-    [ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ;
+: find-start-state ( nfa -- state )
+    [ start-state>> 1vector ] keep find-epsilon-closure ;

-: find-transitions ( seq1 regexp -- seq2 )
-    nfa-table>> transitions>>
-    [ at keys ] curry gather
+: find-transitions ( dfa-state nfa -- next-dfa-state )
+    transitions>>
+    '[ _ at keys ] gather
    eps swap remove ;

-: add-todo-state ( state regexp -- )
-    2dup visited-states>> key? [
-        2drop
-    ] [
-        [ visited-states>> conjoin ]
-        [ new-states>> push ] 2bi
+: add-todo-state ( state visited-states new-states -- )
+    3dup drop key? [ 3drop ] [
+        [ conjoin ] [ push ] bi-curry* bi
    ] if ;

-: new-transitions ( regexp -- )
-    dup new-states>> [
-        drop
-    ] [
-        dupd pop dup pick find-transitions rot
-        [
-            [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep
-            [ swapd transition make-transition ] dip
-            dfa-table>> add-transition 
-        ] curry with each
-        new-transitions
+:: new-transitions ( nfa dfa new-states visited-states -- nfa dfa )
+    new-states [ nfa dfa ] [
+        new-states pop :> state
+        state nfa-table find-transitions
+        [| trans |
+            state trans nfa find-closure :> new-state
+            state visited-states new-state add-todo-state
+            state new-state trans transition make-transition dfa add-transition
+        ] each
+        nfa dfa new-states visited-states new-transitions
    ] if-empty ;

 : states ( hashtable -- array )
    [ keys ]
    [ values [ values concat ] map concat append ] bi ;

-: set-final-states ( regexp -- )
-    dup
-    [ nfa-table>> final-states>> keys ]
-    [ dfa-table>> transitions>> states ] bi
-    [ intersects? ] with filter
-
-    swap dfa-table>> final-states>>
+: set-final-states ( nfa dfa -- )
+    [
+        [ final-states>> keys ]
+        [ transitions>> states ] bi*
+        [ intersects? ] with filter
+    ] [ final-states>> ] bi
    [ conjoin ] curry each ;

-: set-initial-state ( regexp -- )
-    dup
-    [ dfa-table>> ] [ find-start-state ] bi
-    [ >>start-state drop ] keep
-    1vector >>new-states drop ;
+: initialize-dfa ( nfa -- dfa )
+    <transition-table>
+        swap find-start-state >>start-state ;

-: set-traversal-flags ( regexp -- )
-    dup
-    [ nfa-traversal-flags>> ]
-    [ dfa-table>> transitions>> keys ] bi
-    [ [ nip ] [ [ swap at ] with map concat ] 2bi ] with H{ } map>assoc
-    >>dfa-traversal-flags drop ;
-
-: construct-dfa ( regexp -- )
-    {
-        [ set-initial-state ]
-        [ new-transitions ]
-        [ set-final-states ]
-        [ set-traversal-flags ]
-    } cleave ;
+: construct-dfa ( nfa -- dfa )
+    dup initialize-dfa
+    dup start-state>> 1vector
+    H{ } clone
+    new-transitions
+    [ set-final-states ] keep ;
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@ -1,9 +1,10 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs grouping kernel regexp.backend
-locals math namespaces regexp.parser sequences fry quotations
-math.order math.ranges vectors unicode.categories regexp.utils
-regexp.transition-tables words sets regexp.classes unicode.case.private ;
+USING: accessors arrays assocs grouping kernel
+locals math namespaces sequences fry quotations
+math.order math.ranges vectors unicode.categories
+regexp.transition-tables words sets 
+unicode.case.private regexp.ast regexp.classes ;
 ! This uses unicode.case.private for ch>upper and ch>lower
 ! but case-insensitive matching should be done by case-folding everything
 ! before processing starts
@ -13,34 +14,49 @@ ERROR: feature-is-broken feature ;

 SYMBOL: negated?

+: negate ( -- )
+    negated? [ not ] change ;
+
 SINGLETON: eps

-: options ( -- obj ) current-regexp get options>> ;
+SYMBOL: option-stack

-: option? ( obj -- ? ) options key? ;
+SYMBOL: combine-stack

-: option-on ( obj -- ) options conjoin ;
+SYMBOL: state

-: option-off ( obj -- ) options delete-at ;
+: next-state ( -- state )
+    state [ get ] [ inc ] bi ;

-: next-state ( regexp -- state )
-    [ state>> ] [ [ 1+ ] change-state drop ] bi ;
+SYMBOL: nfa-table

-: set-start-state ( regexp -- )
-    dup stack>> [
-        drop
-    ] [
-        [ nfa-table>> ] [ pop first ] bi* >>start-state drop
-    ] if-empty ;
+: set-each ( keys value hashtable -- )
+    '[ _ swap _ set-at ] each ;
+
+: options>hash ( options -- hashtable )
+    H{ } clone [
+        [ [ on>> t ] dip set-each ]
+        [ [ off>> f ] dip set-each ] 2bi
+    ] keep ;
+
+: using-options ( options quot -- )
+    [ options>hash option-stack [ ?push ] change ] dip
+    call option-stack get pop* ; inline
+
+: option? ( obj -- ? )
+    option-stack get assoc-stack ;
+
+: set-start-state ( -- nfa-table )
+    nfa-table get
+        combine-stack get pop first >>start-state ;

 GENERIC: nfa-node ( node -- )

 :: add-simple-entry ( obj class -- )
-    [let* | regexp [ current-regexp get ]
-            s0 [ regexp next-state ]
-            s1 [ regexp next-state ]
-            stack [ regexp stack>> ]
-            table [ regexp nfa-table>> ] |
+    [let* | s0 [ next-state ]
+            s1 [ next-state ]
+            stack [ combine-stack get ]
+            table [ nfa-table get ] |
        negated? get [
            s0 f obj class make-transition table add-transition
            s0 s1 <default-transition> table add-transition
@ -51,9 +67,8 @@ GENERIC: nfa-node ( node -- )
        t s1 table final-states>> set-at ] ;

 :: concatenate-nodes ( -- )
-    [let* | regexp [ current-regexp get ]
-            stack [ regexp stack>> ]
-            table [ regexp nfa-table>> ]
+    [let* | stack [ combine-stack get ]
+            table [ nfa-table get ]
            s2 [ stack peek first ]
            s3 [ stack pop second ]
            s0 [ stack peek first ]
@ -63,15 +78,14 @@ GENERIC: nfa-node ( node -- )
        s0 s3 2array stack push ] ;

 :: alternate-nodes ( -- )
-    [let* | regexp [ current-regexp get ]
-            stack [ regexp stack>> ]
-            table [ regexp nfa-table>> ]
+    [let* | stack [ combine-stack get ]
+            table [ nfa-table get ]
            s2 [ stack peek first ]
            s3 [ stack pop second ]
            s0 [ stack peek first ]
            s1 [ stack pop second ]
-            s4 [ regexp next-state ]
-            s5 [ regexp next-state ] |
+            s4 [ next-state ]
+            s5 [ next-state ] |
        s4 s0 eps <literal-transition> table add-transition
        s4 s2 eps <literal-transition> table add-transition
        s1 s5 eps <literal-transition> table add-transition
@ -83,13 +97,12 @@ GENERIC: nfa-node ( node -- )

 M: star nfa-node ( node -- )
    term>> nfa-node
-    [let* | regexp [ current-regexp get ]
-            stack [ regexp stack>> ]
+    [let* | stack [ combine-stack get ]
            s0 [ stack peek first ]
            s1 [ stack pop second ]
-            s2 [ regexp next-state ]
-            s3 [ regexp next-state ]
-            table [ regexp nfa-table>> ] |
+            s2 [ next-state ]
+            s3 [ next-state ]
+            table [ nfa-table get ] |
        s1 table final-states>> delete-at
        t s3 table final-states>> set-at
        s1 s0 eps <literal-transition> table add-transition
@ -99,58 +112,53 @@ M: star nfa-node ( node -- )
        s2 s3 2array stack push ] ;

 M: concatenation nfa-node ( node -- )
-    seq>>
-    reversed-regexp option? [ <reversed> ] when
-    [ [ nfa-node ] each ]
-    [ length 1- [ concatenate-nodes ] times ] bi ;
+    seq>> [ eps literal-transition add-simple-entry ] [
+        reversed-regexp option? [ <reversed> ] when
+        [ [ nfa-node ] each ]
+        [ length 1- [ concatenate-nodes ] times ] bi
+    ] if-empty ;

 M: alternation nfa-node ( node -- )
    seq>>
    [ [ nfa-node ] each ]
    [ length 1- [ alternate-nodes ] times ] bi ;

-M: constant nfa-node ( node -- )
+M: integer nfa-node ( node -- )
    case-insensitive option? [
-        dup char>> [ ch>lower ] [ ch>upper ] bi
+        dup [ ch>lower ] [ ch>upper ] bi
        2dup = [
            2drop
-            char>> literal-transition add-simple-entry
+            literal-transition add-simple-entry
        ] [
            [ literal-transition add-simple-entry ] bi@
            alternate-nodes drop
        ] if
    ] [
-        char>> literal-transition add-simple-entry
+        literal-transition add-simple-entry
    ] if ;

-M: word nfa-node ( node -- ) class-transition add-simple-entry ;
+M: primitive-class nfa-node ( node -- )
+    class>> dup
+    { letter-class LETTER-class } member? case-insensitive option? and
+    [ drop Letter-class ] when
+    class-transition add-simple-entry ;

 M: any-char nfa-node ( node -- )
    [ dotall option? ] dip any-char-no-nl ?
    class-transition add-simple-entry ;

-M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
+M: negation nfa-node ( node -- )
+    negate term>> nfa-node negate ;

-M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
-
-: choose-letter-class ( node -- node' )
-    case-insensitive option? Letter-class rot ? ;
-
-M: letter-class nfa-node ( node -- )
-    choose-letter-class class-transition add-simple-entry ;
-
-M: LETTER-class nfa-node ( node -- )
-    choose-letter-class class-transition add-simple-entry ;
-
-M: character-class-range nfa-node ( node -- )
+M: range nfa-node ( node -- )
    case-insensitive option? [
        ! This should be implemented for Unicode by case-folding
        ! the input and all strings in the regexp.
        dup [ from>> ] [ to>> ] bi
        2dup [ Letter? ] bi@ and [
            rot drop
-            [ [ ch>lower ] bi@ character-class-range boa ]
-            [ [ ch>upper ] bi@ character-class-range boa ] 2bi 
+            [ [ ch>lower ] bi@ <range> ]
+            [ [ ch>upper ] bi@ <range> ] 2bi 
            [ class-transition add-simple-entry ] bi@
            alternate-nodes
        ] [
@ -161,14 +169,15 @@ M: character-class-range nfa-node ( node -- )
        class-transition add-simple-entry
    ] if ;

-M: option nfa-node ( node -- )
-    [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
-    eps literal-transition add-simple-entry ;
+M: with-options nfa-node ( node -- )
+    dup options>> [ tree>> nfa-node ] using-options ;

-: construct-nfa ( regexp -- )
+: construct-nfa ( ast -- nfa-table )
    [
-        reset-regexp
-        [ current-regexp set ]
-        [ parse-tree>> nfa-node ]
-        [ set-start-state ] tri
+        negated? off
+        V{ } clone combine-stack set
+        0 state set
+        <transition-table> clone nfa-table set
+        nfa-node
+        set-start-state
    ] with-scope ;
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@ -1,28 +1,9 @@
 ! Copyright (C) 2008, 2009 Doug Coleman, Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: peg.ebnf kernel math.parser sequences assocs arrays
-combinators regexp.classes strings splitting peg locals ;
+USING: peg.ebnf kernel math.parser sequences assocs arrays fry math
+combinators regexp.classes strings splitting peg locals accessors
+regexp.ast ;
 IN: regexp.parser
-
-TUPLE: range from to ;
-TUPLE: char-class ranges ;
-TUPLE: primitive-class class ;
-TUPLE: not-char-class ranges ;
-TUPLE: not-primitive-class class ;
-TUPLE: from-to n m ;
-TUPLE: at-least n ;
-TUPLE: up-to n ;
-TUPLE: exactly n ;
-TUPLE: times expression times ;
-TUPLE: concatenation seq ;
-TUPLE: alternation seq ;
-TUPLE: maybe term ;
-TUPLE: star term ;
-TUPLE: plus term ;
-TUPLE: with-options tree options ;
-TUPLE: ast ^? $? tree ;
-SINGLETON: any-char
-
 : allowed-char? ( ch -- ? )
    ".()|[*+?" member? not ;

@ -64,21 +45,16 @@ ERROR: bad-class name ;
        { CHAR: e [ HEX: 1b ] }
        { CHAR: \\ [ CHAR: \\ ] }

-        { CHAR: w [ c-identifier-class primitive-class boa ] }
-        { CHAR: W [ c-identifier-class not-primitive-class boa ] }
-        { CHAR: s [ java-blank-class primitive-class boa ] }
-        { CHAR: S [ java-blank-class not-primitive-class boa ] }
-        { CHAR: d [ digit-class primitive-class boa ] }
-        { CHAR: D [ digit-class not-primitive-class boa ] }
+        { CHAR: w [ c-identifier-class <primitive-class> ] }
+        { CHAR: W [ c-identifier-class <primitive-class> <negation> ] }
+        { CHAR: s [ java-blank-class <primitive-class> ] }
+        { CHAR: S [ java-blank-class <primitive-class> <negation> ] }
+        { CHAR: d [ digit-class <primitive-class> ] }
+        { CHAR: D [ digit-class <primitive-class> <negation> ] }

        [ ]
    } case ;

-TUPLE: options on off ;
-
-SINGLETONS: unix-lines dotall multiline comments case-insensitive
-unicode-case reversed-regexp ;
-
 : options-assoc ( -- assoc )
    H{
        { CHAR: i case-insensitive }
@ -98,19 +74,30 @@ unicode-case reversed-regexp ;
    options-assoc value-at ;

 : parse-options ( on off -- options )
-    [ [ ch>option ] map ] bi@ options boa ;
+    [ [ ch>option ] { } map-as ] bi@ <options> ;

-! TODO: make range syntax better (negation, and, etc),
-!       add syntax for various parenthized things,
+: string>options ( string -- options )
+    "-" split1 parse-options ;
+ 
+: options>string ( options -- string )
+    [ on>> ] [ off>> ] bi
+    [ [ option>ch ] map ] bi@
+    [ "-" swap 3append ] unless-empty
+    "" like ;
+
+! TODO: add syntax for various parenthized things,
 !       add greedy and nongreedy forms of matching
 ! (once it's all implemented)

-EBNF: (parse-regexp)
+EBNF: parse-regexp

 CharacterInBracket = !("}") Character

-Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-class boa ]]
-       | "P{" CharacterInBracket*:s "}" => [[ s >string name>class not-primitive-class boa ]]
+QuotedCharacter = !("\\E") .
+
+Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> ]]
+       | "P{" CharacterInBracket*:s "}" => [[ s >string name>class <primitive-class> <negation> ]]
+       | "Q" QuotedCharacter*:s "\\E" => [[ s <concatenation> ]]
       | "u" Character:a Character:b Character:c Character:d
            => [[ { a b c d } hex> ensure-number ]]
       | "x" Character:a Character:b
@ -119,30 +106,30 @@ Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class primitive-cla
            => [[ { a b c } oct> ensure-number ]]
       | . => [[ lookup-escape ]]

-Character = "\\" Escape:e => [[ e ]]
-          | . ?[ allowed-char? ]?
+EscapeSequence = "\\" Escape:e => [[ e ]]

-AnyRangeCharacter = Character | "["
+Character = EscapeSequence | . ?[ allowed-char? ]?
+
+AnyRangeCharacter = EscapeSequence | .

 RangeCharacter = !("]") AnyRangeCharacter

-Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]
+Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
      | RangeCharacter

-StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b range boa ]]
+StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range> ]]
           | AnyRangeCharacter

 Ranges = StartRange:s Range*:r => [[ r s prefix ]]

-CharClass = "^" Ranges:e => [[ e not-char-class boa ]]
-          | Ranges:e => [[ e char-class boa ]]
+CharClass = "^"?:n Ranges:e => [[ e n char-class ]]

 Options = [idmsux]*

 Parenthized = "?:" Alternation:a => [[ a ]]
            | "?" Options:on "-"? Options:off ":" Alternation:a
-                => [[ a on off parse-options with-options boa ]]
-            | "?#" [^)]* => [[ ignore ]]
+                => [[ a on off parse-options <with-options> ]]
+            | "?#" [^)]* => [[ f ]]
            | Alternation

 Element = "(" Parenthized:p ")" => [[ p ]]
@ -152,32 +139,24 @@ Element = "(" Parenthized:p ")" => [[ p ]]

 Number = (!(","|"}").)* => [[ string>number ensure-number ]]

-Times = "," Number:n "}" => [[ n up-to boa ]]
-      | Number:n ",}" => [[ n at-least boa ]]
-      | Number:n "}" => [[ n exactly boa ]]
+Times = "," Number:n "}" => [[ 0 n <from-to> ]]
+      | Number:n ",}" => [[ n <at-least> ]]
+      | Number:n "}" => [[ n n <from-to> ]]
      | "}" => [[ bad-number ]]
-      | Number:n "," Number:m "}" => [[ n m from-to boa ]]
+      | Number:n "," Number:m "}" => [[ n m <from-to> ]]

-Repeated = Element:e "{" Times:t => [[ e t times boa ]]
-         | Element:e "?" => [[ e maybe boa ]]
-         | Element:e "*" => [[ e star boa ]]
-         | Element:e "+" => [[ e plus boa ]]
+Repeated = Element:e "{" Times:t => [[ e t <times> ]]
+         | Element:e "?" => [[ e <maybe> ]]
+         | Element:e "*" => [[ e <star> ]]
+         | Element:e "+" => [[ e <plus> ]]
         | Element

-Concatenation = Repeated*:r => [[ r concatenation boa ]]
+Concatenation = Repeated*:r => [[ r sift <concatenation> ]]

 Alternation = Concatenation:c ("|" Concatenation)*:a
-                => [[ a empty? [ c ] [ a values c prefix alternation boa ] if ]]
+                => [[ a empty? [ c ] [ a values c prefix <alternation> ] if ]]

 End = !(.)

 Main = Alternation End
 ;EBNF
-
-: parse-regexp ( string -- regexp )
-    ! Hack because I want $ allowable in regexps,
-    ! but with special behavior at the end
-    ! This fails if the regexp is stupid, though...
-    dup first CHAR: ^ = tuck [ rest ] when
-    dup peek CHAR: $ = tuck [ but-last ] when
-    (parse-regexp) ast boa ;
--- a/basis/regexp/regexp-docs.factor
+++ b/basis/regexp/regexp-docs.factor
@ -1,6 +1,6 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel strings help.markup help.syntax regexp.backend ;
+USING: kernel strings help.markup help.syntax ;
 IN: regexp

 HELP: <regexp>
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@ -189,8 +189,8 @@ IN: regexp-tests
 [ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test
 [ t ] [ "x" "\\x78" <regexp> matches? ] unit-test
 [ f ] [ "y" "\\x78" <regexp> matches? ] unit-test
-[ t ] [ "x" "\\u000078" <regexp> matches? ] unit-test
-[ f ] [ "y" "\\u000078" <regexp> matches? ] unit-test
+[ t ] [ "x" "\\u0078" <regexp> matches? ] unit-test
+[ f ] [ "y" "\\u0078" <regexp> matches? ] unit-test

 [ t ] [ "ab" "a+b" <regexp> matches? ] unit-test
 [ f ] [ "b" "a+b" <regexp> matches? ] unit-test
@ -317,16 +317,6 @@ IN: regexp-tests
 ! Bug in parsing word
 [ t ] [ "a" R' a' matches? ] unit-test

-! Convert to lowercase until E
-[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
-[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
-
-! Convert to uppercase until E
-[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
-[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
-
-! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
-
 ! [ t ] [ "a" R/ ^a/ matches? ] unit-test
 ! [ f ] [ "\na" R/ ^a/ matches? ] unit-test
 ! [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
@ -370,10 +360,10 @@ IN: regexp-tests
 ! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
 ! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test

-! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
-! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test
+! [ t ] [ "a" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\n" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\r" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\r\n" R/ a$/m matches? ] unit-test

 ! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
 ! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
--- a/basis/regexp/regexp.factor
+++ b/basis/regexp/regexp.factor
@ -2,33 +2,15 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors combinators kernel math sequences strings sets
 assocs prettyprint.backend prettyprint.custom make lexer
-namespaces parser arrays fry regexp.backend regexp.utils
+namespaces parser arrays fry locals
 regexp.parser regexp.nfa regexp.dfa regexp.traversal
-regexp.transition-tables splitting sorting ;
+regexp.transition-tables splitting sorting regexp.ast ;
 IN: regexp

-: default-regexp ( string -- regexp )
-    regexp new
-        swap >>raw
-        <transition-table> >>nfa-table
-        <transition-table> >>dfa-table
-        <transition-table> >>minimized-table
-        H{ } clone >>nfa-traversal-flags
-        H{ } clone >>dfa-traversal-flags
-        H{ } clone >>options
-        H{ } clone >>matchers
-        reset-regexp ;
-
-: construct-regexp ( regexp -- regexp' )
-    {
-        [ dup raw>> parse-regexp >>parse-tree drop ]
-        [ construct-nfa ]
-        [ construct-dfa ]
-        [ ]
-    } cleave ;
+TUPLE: regexp raw options parse-tree dfa ;

 : (match) ( string regexp -- dfa-traverser )
-    <dfa-traverser> do-match ; inline
+    dfa>> <dfa-traverser> do-match ; inline

 : match ( string regexp -- slice/f )
    (match) return-match ;
@ -94,17 +76,17 @@ IN: regexp
        { "R| "  "|"  }
    } swap [ subseq? not nip ] curry assoc-find drop ;

-: string>options ( string -- options )
-    [ ch>option dup ] H{ } map>assoc ;
-
-: options>string ( options -- string )
-    keys [ option>ch ] map natural-sort >string ;
-
 PRIVATE>

-: <optioned-regexp> ( string option-string -- regexp )
-    [ default-regexp ] [ string>options ] bi* >>options
-    construct-regexp ;
+:: <optioned-regexp> ( string options -- regexp )
+    string parse-regexp :> tree
+    options parse-options :> opt
+    tree opt <with-options> :> ast
+    regexp new
+        string >>raw
+        opt >>options
+        tree >>parse-tree
+        tree opt <with-options> construct-nfa construct-dfa >>dfa ;

 : <regexp> ( string -- regexp ) "" <optioned-regexp> ;

--- a/basis/regexp/transition-tables/transition-tables.factor
+++ b/basis/regexp/transition-tables/transition-tables.factor
@ -1,7 +1,7 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays assocs fry hashtables kernel sequences
-vectors regexp.utils ;
+vectors ;
 IN: regexp.transition-tables

 TUPLE: transition from to obj ;
--- a/basis/regexp/traversal/traversal.factor
+++ b/basis/regexp/traversal/traversal.factor
@ -1,8 +1,8 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors assocs combinators kernel math
-quotations sequences regexp.parser regexp.classes fry arrays
-combinators.short-circuit regexp.utils prettyprint regexp.nfa ;
+quotations sequences regexp.classes fry arrays
+combinators.short-circuit prettyprint regexp.nfa ;
 IN: regexp.traversal

 TUPLE: dfa-traverser
@ -13,8 +13,7 @@ TUPLE: dfa-traverser
    start-index current-index
    matches ;

-: <dfa-traverser> ( text regexp -- match )
-    dfa-table>>
+: <dfa-traverser> ( text dfa -- match )
    dfa-traverser new
        swap [ start-state>> >>current-state ] [ >>dfa-table ] bi
        swap >>text
--- a/basis/regexp/utils/utils-tests.factor
+++ b/basis/regexp/utils/utils-tests.factor
@ -1,4 +0,0 @@
-USING: regexp.utils tools.test ;
-IN: regexp.utils.tests
-
-[ [ ] [ ] while-changes ] must-infer
--- a/basis/regexp/utils/utils.factor
+++ b/basis/regexp/utils/utils.factor
@ -1,42 +0,0 @@
-! Copyright (C) 2008 Doug Coleman.
-! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays assocs io kernel math math.order
-namespaces regexp.backend sequences unicode.categories
-math.ranges fry combinators.short-circuit vectors ;
-IN: regexp.utils
-
-: (while-changes) ( obj quot: ( obj -- obj' ) pred: ( obj -- <=> ) pred-ret -- obj )
-    [ [ dup slip ] dip pick over call ] dip dupd =
-    [ 3drop ] [ (while-changes) ] if ; inline recursive
-
-: while-changes ( obj quot pred -- obj' )
-    pick over call (while-changes) ; inline
-
-ERROR: bad-octal number ;
-ERROR: bad-hex number ;
-: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
-: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ;
-
-: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
-
-: hex-digit? ( n -- ? )
-    {
-        [ decimal-digit? ]
-        [ CHAR: a CHAR: f between? ]
-        [ CHAR: A CHAR: F between? ]
-    } 1|| ;
-
-: punct? ( n -- ? )
-    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
-
-: c-identifier-char? ( ch -- ? )
-    { [ alpha? ] [ CHAR: _ = ] } 1|| ;
-
-: java-blank? ( n -- ? )
-    {
-        CHAR: \s CHAR: \t CHAR: \n
-        HEX: b HEX: 7 CHAR: \r
-    } member? ;
-
-: java-printable? ( n -- ? )
-    [ [ alpha? ] [ punct? ] ] 1|| ;