Merge branch 'master' of git://factorcode.org/git/factor

2008-11-24 00:30:09 -06:00 · 2008-11-24 00:30:09 -06:00 · d3d57c800b
parent 87bc7d8c8b 636d8ac58e
commit d3d57c800b
9 changed files with 310 additions and 160 deletions
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@ -7,6 +7,7 @@ IN: regexp.classes
 GENERIC: class-member? ( obj class -- ? )

 M: word class-member? ( obj class -- ? ) 2drop f ;
+
 M: integer class-member? ( obj class -- ? ) 2drop f ;

 M: character-class-range class-member? ( obj class -- ? )
@ -60,3 +61,12 @@ M: java-blank-class class-member? ( obj class -- ? )

 M: unmatchable-class class-member? ( obj class -- ? )
    2drop f ;
+
+M: terminator-class class-member? ( obj class -- ? )
+    drop {
+        [ CHAR: \r = ]
+        [ CHAR: \n = ]
+        [ CHAR: \u000085 = ]
+        [ CHAR: \u002028 = ]
+        [ CHAR: \u002029 = ]
+    } 1|| ;
--- a/basis/regexp/dfa/dfa.factor
+++ b/basis/regexp/dfa/dfa.factor
@ -43,7 +43,8 @@ IN: regexp.dfa
        dupd pop dup pick find-transitions rot
        [
            [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep
-            >r swapd transition make-transition r> dfa-table>> add-transition 
+            [ swapd transition make-transition ] dip
+            dfa-table>> add-transition 
        ] curry with each
        new-transitions
    ] if-empty ;
--- a/basis/regexp/nfa/nfa.factor
+++ b/basis/regexp/nfa/nfa.factor
@ -18,6 +18,9 @@ SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
 SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
 SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
 SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
+SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
+SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
+SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag

 : next-state ( regexp -- state )
    [ state>> ] [ [ 1+ ] change-state drop ] bi ;
@ -135,7 +138,21 @@ M: non-capture-group nfa-node ( node -- )
 M: reluctant-kleene-star nfa-node ( node -- )
    term>> <kleene-star> nfa-node ;

-!
+
+: add-epsilon-flag ( flag -- )
+    eps literal-transition add-simple-entry add-traversal-flag ;
+
+M: beginning-of-line nfa-node ( node -- )
+    drop beginning-of-line add-epsilon-flag ;
+
+M: end-of-line nfa-node ( node -- )
+    drop end-of-line add-epsilon-flag ;
+
+M: beginning-of-input nfa-node ( node -- )
+    drop beginning-of-input add-epsilon-flag ;
+
+M: end-of-input nfa-node ( node -- )
+    drop end-of-input add-epsilon-flag ;

 M: negation nfa-node ( node -- )
    negation-mode inc
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@ -4,7 +4,7 @@ USING: accessors arrays assocs combinators io io.streams.string
 kernel math math.parser namespaces qualified sets
 quotations sequences splitting symbols vectors math.order
 unicode.categories strings regexp.backend regexp.utils
-unicode.case words ;
+unicode.case words locals ;
 IN: regexp.parser

 FROM: math.ranges => [a,b] ;
@ -44,18 +44,21 @@ TUPLE: character-class-range from to ; INSTANCE: character-class-range node
 SINGLETON: epsilon INSTANCE: epsilon node
 SINGLETON: any-char INSTANCE: any-char node
 SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node
-SINGLETON: front-anchor INSTANCE: front-anchor node
-SINGLETON: back-anchor INSTANCE: back-anchor node
+SINGLETON: beginning-of-input INSTANCE: beginning-of-input node
+SINGLETON: end-of-input INSTANCE: end-of-input node
+SINGLETON: beginning-of-line INSTANCE: beginning-of-line node
+SINGLETON: end-of-line INSTANCE: end-of-line node

 TUPLE: option-on option ; INSTANCE: option-on node
 TUPLE: option-off option ; INSTANCE: option-off node
-SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ;
+SINGLETONS: unix-lines dotall multiline comments case-insensitive
+unicode-case reversed-regexp ;

 SINGLETONS: letter-class LETTER-class Letter-class digit-class
 alpha-class non-newline-blank-class
 ascii-class punctuation-class java-printable-class blank-class
 control-character-class hex-digit-class java-blank-class c-identifier-class
-unmatchable-class ;
+terminator-class unmatchable-class word-boundary-class ;

 SINGLETONS: beginning-of-group end-of-group
 beginning-of-character-class end-of-character-class
@ -225,26 +228,12 @@ ERROR: invalid-range a b ;

 : handle-left-brace ( -- )
    parse-repetition
-    >r 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ r>
+    [ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip
    [
        2dup and [ from-m-to-n ]
        [ [ nip at-most-n ] [ at-least-n ] if* ] if
    ] [ drop 0 max exactly-n ] if ;

-SINGLETON: beginning-of-input
-SINGLETON: end-of-input
-
-: newlines ( -- obj1 obj2 obj3 )
-    CHAR: \r <constant>
-    CHAR: \n <constant>
-    2dup 2array <concatenation> ;
-
-: beginning-of-line ( -- obj )
-    beginning-of-input newlines 4array <alternation> lookbehind boa ;
-
-: end-of-line ( -- obj )
-    end-of-input newlines 4array <alternation> lookahead boa ;
-
 : handle-front-anchor ( -- )
    get-multiline beginning-of-line beginning-of-input ? push-stack ;

@ -281,13 +270,26 @@ ERROR: expected-posix-class ;
 : parse-control-character ( -- n ) read1 ;

 ERROR: bad-escaped-literals seq ;
-: parse-escaped-literals ( -- obj )
-    "\\E" read-until [ bad-escaped-literals ] unless
+
+: parse-til-E ( -- obj )
+    "\\E" read-until [ bad-escaped-literals ] unless ;
+    
+:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj )
+    parse-til-E
    drop1
    [ epsilon ] [
-        [ <constant> ] V{ } map-as
+        [ quot call <constant> ] V{ } map-as
        first|concatenation
-    ] if-empty ;
+    ] if-empty ; inline
+
+: parse-escaped-literals ( -- obj )
+    [ ] (parse-escaped-literals) ;
+
+: lower-case-literals ( -- obj )
+    [ ch>lower ] (parse-escaped-literals) ;
+
+: upper-case-literals ( -- obj )
+    [ ch>upper ] (parse-escaped-literals) ;

 : parse-escaped ( -- obj )
    read1
@ -299,12 +301,12 @@ ERROR: bad-escaped-literals seq ;
        { CHAR: a [ HEX: 7 <constant> ] }
        { CHAR: e [ HEX: 1b <constant> ] }

-        { CHAR: d [ digit-class ] }
-        { CHAR: D [ digit-class <negation> ] }
-        { CHAR: s [ java-blank-class ] }
-        { CHAR: S [ java-blank-class <negation> ] }
        { CHAR: w [ c-identifier-class ] }
        { CHAR: W [ c-identifier-class <negation> ] }
+        { CHAR: s [ java-blank-class ] }
+        { CHAR: S [ java-blank-class <negation> ] }
+        { CHAR: d [ digit-class ] }
+        { CHAR: D [ digit-class <negation> ] }

        { CHAR: p [ parse-posix-class ] }
        { CHAR: P [ parse-posix-class <negation> ] }
@ -313,13 +315,19 @@ ERROR: bad-escaped-literals seq ;
        { CHAR: 0 [ parse-octal <constant> ] }
        { CHAR: c [ parse-control-character ] }

-        ! { CHAR: b [ handle-word-boundary ] }
-        ! { CHAR: B [ handle-word-boundary <negation> ] }
-        ! { CHAR: A [ handle-beginning-of-input ] }
-        ! { CHAR: G [ end of previous match ] }
-        ! { CHAR: Z [ handle-end-of-input ] }
-        ! { CHAR: z [ handle-end-of-input ] } ! except for terminator
+        { CHAR: Q [ parse-escaped-literals ] }

+        ! { CHAR: b [ word-boundary-class ] }
+        ! { CHAR: B [ word-boundary-class <negation> ] }
+        ! { CHAR: A [ handle-beginning-of-input ] }
+        ! { CHAR: z [ handle-end-of-input ] }
+
+        ! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator
+
+        ! m//g mode
+        ! { CHAR: G [ end of previous match ] }
+
+        ! Group capture
        ! { CHAR: 1 [ CHAR: 1 <constant> ] }
        ! { CHAR: 2 [ CHAR: 2 <constant> ] }
        ! { CHAR: 3 [ CHAR: 3 <constant> ] }
@ -330,7 +338,11 @@ ERROR: bad-escaped-literals seq ;
        ! { CHAR: 8 [ CHAR: 8 <constant> ] }
        ! { CHAR: 9 [ CHAR: 9 <constant> ] }

-        { CHAR: Q [ parse-escaped-literals ] }
+        ! Perl extensions
+        ! can't do \l and \u because \u is already a 4-hex
+        { CHAR: L [ lower-case-literals ] }
+        { CHAR: U [ upper-case-literals ] }
+
        [ <constant> ]
    } case ;

--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@ -45,6 +45,7 @@ IN: regexp-tests
 ! Off by default.
 [ f ] [ "\n" "." <regexp> matches? ] unit-test
 [ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
+[ t ] [ "\n" R/ ./s matches? ] unit-test
 [ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test

 [ f ] [ "" ".+" <regexp> matches? ] unit-test
@ -210,34 +211,34 @@ IN: regexp-tests
 [ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test
 [ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test

-[ t ] [ "aaa" "AAA" <iregexp> matches? ] unit-test
-[ f ] [ "aax" "AAA" <iregexp> matches? ] unit-test
-[ t ] [ "aaa" "A*" <iregexp> matches? ] unit-test
-[ f ] [ "aaba" "A*" <iregexp> matches? ] unit-test
-[ t ] [ "b" "[AB]" <iregexp> matches? ] unit-test
-[ f ] [ "c" "[AB]" <iregexp> matches? ] unit-test
-[ t ] [ "c" "[A-Z]" <iregexp> matches? ] unit-test
-[ f ] [ "3" "[A-Z]" <iregexp> matches? ] unit-test
+[ t ] [ "aaa" R/ AAA/i matches? ] unit-test
+[ f ] [ "aax" R/ AAA/i matches? ] unit-test
+[ t ] [ "aaa" R/ A*/i matches? ] unit-test
+[ f ] [ "aaba" R/ A*/i matches? ] unit-test
+[ t ] [ "b" R/ [AB]/i matches? ] unit-test
+[ f ] [ "c" R/ [AB]/i matches? ] unit-test
+[ t ] [ "c" R/ [A-Z]/i matches? ] unit-test
+[ f ] [ "3" R/ [A-Z]/i matches? ] unit-test

 [ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
 [ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
 [ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
 [ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test

-[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
-[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
-[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
-[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
+[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
+[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
+[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
+[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test

 [ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
-[ t ] [ "A" "[a-z]" <iregexp> matches? ] unit-test
+[ t ] [ "A" R/ [a-z]/i matches? ] unit-test

 [ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
-[ t ] [ "A" "\\p{Lower}" <iregexp> matches? ] unit-test
+[ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test

-[ t ] [ "abc" <reversed> "abc" <rregexp> matches? ] unit-test
-[ t ] [ "abc" <reversed> "a[bB][cC]" <rregexp> matches? ] unit-test
-[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" <rregexp> matches? ] unit-test
+[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
+[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
+[ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/r matches? ] unit-test

 [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
 [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
@ -253,7 +254,7 @@ IN: regexp-tests

 [ ] [ "(\\$[\\p{XDigit}]|[\\p{Digit}])" <regexp> drop ] unit-test

-! Comment
+! Comment inside a regular expression
 [ t ] [ "ac" "a(?#boo)c" <regexp> matches? ] unit-test

 [ ] [ "USING: regexp kernel ; R' -{3}[+]{1,6}(?:!!)?\\s' drop" eval ] unit-test
@ -283,32 +284,90 @@ IN: regexp-tests
 [ { "ABC" "DEF" "GHI" } ]
 [ "1ABC2DEF3GHI4" R/ [A-Z]+/ all-matches [ >string ] map ] unit-test

+[ 3 ]
+[ "1ABC2DEF3GHI4" R/ [A-Z]+/ count-matches ] unit-test
+
+[ 0 ]
+[ "123" R/ [A-Z]+/ count-matches ] unit-test
+
 [ "1.2.3.4" ]
 [ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test

 [ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test
+[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
+[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
+[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
 [ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test
 [ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test
 [ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test

-! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
-
-! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
-! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
-! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
-! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
-! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
-! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
-
-! [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
-! [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
-
 [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
 [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test

+! Bug in parsing word
+[ t ] [ "a" R' a' matches?  ] unit-test
+
+! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
+
+[ t ] [ "a" R/ ^a/ matches? ] unit-test
+[ f ] [ "\na" R/ ^a/ matches? ] unit-test
+[ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
+[ f ] [ "\ra" R/ ^a/ matches? ] unit-test
+
+[ t ] [ "a" R/ a$/ matches? ] unit-test
+[ f ] [ "a\n" R/ a$/ matches? ] unit-test
+[ f ] [ "a\r" R/ a$/ matches? ] unit-test
+[ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
+
+! [ t ] [ "a" R/ \Aa/ matches? ] unit-test
+! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test
+! [ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test
+! [ f ] [ "\ra" R/ \Aa/ matches? ] unit-test
+
+! [ t ] [ "a" R/ \Aa/m matches? ] unit-test
+! [ f ] [ "\na" R/ \Aaa/m matches? ] unit-test
+! [ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test
+! [ f ] [ "\ra" R/ \Aa/m matches? ] unit-test
+
+! [ t ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test
+
+! [ t ] [ "a" R/ \Aa\z/m matches? ] unit-test
+! [ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test
+
+! [ t ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test
+! [ t ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test
+
+! [ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test
+! [ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test
+! [ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test
+! [ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test
+
+! [ t ] [ "a" R/ ^a/m matches? ] unit-test
+! [ t ] [ "\na" R/ ^a/m matches? ] unit-test
+! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
+! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
+
+! Convert to lowercase until E
+[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
+[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
+
+! Convert to uppercase until E
+[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
+[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
+
+! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
+! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test
+
 ! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
 ! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test

+! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
+! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
+! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
+! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
+
 ! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test
 ! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
 ! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
@ -323,39 +382,29 @@ IN: regexp-tests
 ! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
 ! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test

+! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
+! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
+! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
+! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
+! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
+! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test

-! Bug in parsing word
-! [ t ] [ "a" R' a' matches?  ] unit-test
+! "ab" "a(?=b*)" <regexp> match
+! "abbbbbc" "a(?=b*c)" <regexp> match
+! "ab" "a(?=b*)" <regexp> match

-! clear "a(?=b*)" <regexp> "ab" over match
-! clear "a(?=b*c)" <regexp> "abbbbbc" over match
-! clear "a(?=b*)" <regexp> "ab" over match
+! "baz" "(az)(?<=b)" <regexp> first-match
+! "cbaz" "a(?<=b*)" <regexp> first-match
+! "baz" "a(?<=b)" <regexp> first-match

-! clear "^a" <regexp> "a" over match
-! clear "^a" <regexp> "\na" over match
-! clear "^a" <regexp> "\r\na" over match
-! clear "^a" <regexp> "\ra" over match
+! "baz" "a(?<!b)" <regexp> first-match
+! "caz" "a(?<!b)" <regexp> first-match

-! clear "a$" <regexp> "a" over match
-! clear "a$" <regexp> "a\n" over match
-! clear "a$" <regexp> "a\r" over match
-! clear "a$" <regexp> "a\r\n" over match
-
-! "(az)(?<=b)" <regexp> "baz" over first-match
-! "a(?<=b*)" <regexp> "cbaz" over first-match
-! "a(?<=b)" <regexp> "baz" over first-match
-
-! "a(?<!b)" <regexp> "baz" over first-match
-! "a(?<!b)" <regexp> "caz" over first-match
-
-! "a(?=bcdefg)bcd" <regexp> "abcdefg" over first-match
-! "a(?#bcdefg)bcd" <regexp> "abcdefg" over first-match
-! "a(?:bcdefg)" <regexp> "abcdefg" over first-match
-
-[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
-
-! "a(?<=b)" <regexp> "caba" over first-match
+! "abcdefg" "a(?=bcdefg)bcd" <regexp> first-match
+! "abcdefg" "a(?#bcdefg)bcd" <regexp> first-match
+! "abcdefg" "a(?:bcdefg)" <regexp> first-match

+! "caba" "a(?<=b)" <regexp> first-match

 ! capture group 1: "aaaa"  2: ""
 ! "aaaa" "(a*)(a*)" <regexp> match*
--- a/basis/regexp/regexp.factor
+++ b/basis/regexp/regexp.factor
@ -1,9 +1,10 @@
 ! Copyright (C) 2008 Doug Coleman.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors combinators kernel math sequences
+USING: accessors combinators kernel math sequences strings
 sets assocs prettyprint.backend make lexer namespaces parser
 arrays fry regexp.backend regexp.utils regexp.parser regexp.nfa
-regexp.dfa regexp.traversal regexp.transition-tables splitting ;
+regexp.dfa regexp.traversal regexp.transition-tables splitting
+sorting ;
 IN: regexp

 : default-regexp ( string -- regexp )
@ -73,42 +74,9 @@ IN: regexp
    [ dup ] swap '[ _ next-match ] [ ] produce nip harvest ;

 : count-matches ( string regexp -- n )
-    all-matches length 1- ;
+    all-matches length ;

-: initial-option ( regexp option -- regexp' )
-    over options>> conjoin ;
-
-: <regexp> ( string -- regexp )
-    default-regexp construct-regexp ;
-
-: <iregexp> ( string -- regexp )
-    default-regexp
-    case-insensitive initial-option
-    construct-regexp ;
-
-: <rregexp> ( string -- regexp )
-    default-regexp
-    reversed-regexp initial-option
-    construct-regexp ;
-
-: parsing-regexp ( accum end -- accum )
-    lexer get dup skip-blank
-    [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
-    lexer get dup still-parsing-line?
-    [ (parse-token) ] [ drop f ] if
-    "i" = [ <iregexp> ] [ <regexp> ] if parsed ;
-
-: R! CHAR: ! parsing-regexp ; parsing
-: R" CHAR: " parsing-regexp ; parsing
-: R# CHAR: # parsing-regexp ; parsing
-: R' CHAR: ' parsing-regexp ; parsing
-: R( CHAR: ) parsing-regexp ; parsing
-: R/ CHAR: / parsing-regexp ; parsing
-: R@ CHAR: @ parsing-regexp ; parsing
-: R[ CHAR: ] parsing-regexp ; parsing
-: R` CHAR: ` parsing-regexp ; parsing
-: R{ CHAR: } parsing-regexp ; parsing
-: R| CHAR: | parsing-regexp ; parsing
+<PRIVATE

 : find-regexp-syntax ( string -- prefix suffix )
    {
@ -125,14 +93,67 @@ IN: regexp
        { "R| "  "|"  }
    } swap [ subseq? not nip ] curry assoc-find drop ;

-: option? ( option regexp -- ? )
-    options>> key? ;
+ERROR: unknown-regexp-option option ;
+
+: option>ch ( option -- string )
+    {
+        { case-insensitive [ CHAR: i ] }
+        { multiline [ CHAR: m ] }
+        { reversed-regexp [ CHAR: r ] }
+        { dotall [ CHAR: s ] }
+        [ unknown-regexp-option ]
+    } case ;
+
+: ch>option ( ch -- option )
+    {
+        { CHAR: i [ case-insensitive ] }
+        { CHAR: m [ multiline ] }
+        { CHAR: r [ reversed-regexp ] }
+        { CHAR: s [ dotall ] }
+        [ unknown-regexp-option ]
+    } case ;
+
+: string>options ( string -- options )
+    [ ch>option dup ] H{ } map>assoc ;
+
+: options>string ( options -- string )
+    keys [ option>ch ] map natural-sort >string ;
+
+PRIVATE>
+
+: <optioned-regexp> ( string option-string -- regexp )
+    [ default-regexp ] [ string>options ] bi* >>options
+    construct-regexp ;
+
+: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
+
+<PRIVATE
+
+: parsing-regexp ( accum end -- accum )
+    lexer get dup skip-blank
+    [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
+    lexer get dup still-parsing-line?
+    [ (parse-token) ] [ drop f ] if
+    <optioned-regexp> parsed ;
+
+PRIVATE>
+
+: R! CHAR: ! parsing-regexp ; parsing
+: R" CHAR: " parsing-regexp ; parsing
+: R# CHAR: # parsing-regexp ; parsing
+: R' CHAR: ' parsing-regexp ; parsing
+: R( CHAR: ) parsing-regexp ; parsing
+: R/ CHAR: / parsing-regexp ; parsing
+: R@ CHAR: @ parsing-regexp ; parsing
+: R[ CHAR: ] parsing-regexp ; parsing
+: R` CHAR: ` parsing-regexp ; parsing
+: R{ CHAR: } parsing-regexp ; parsing
+: R| CHAR: | parsing-regexp ; parsing

 M: regexp pprint*
    [
        [
-            dup raw>>
-            dup find-regexp-syntax swap % swap % %
-            case-insensitive swap option? [ "i" % ] when
+            [ raw>> dup find-regexp-syntax swap % swap % % ]
+            [ options>> options>string % ] bi
        ] "" make
    ] keep present-text ;
--- a/basis/regexp/transition-tables/transition-tables.factor
+++ b/basis/regexp/transition-tables/transition-tables.factor
@ -40,7 +40,7 @@ TUPLE: transition-table transitions start-state final-states ;
    2dup [ to>> ] dip maybe-initialize-key
    [ [ to>> ] [ obj>> ] [ from>> ] tri ] dip
    2dup at* [ 2nip insert-at ]
-    [ drop >r >r H{ } clone [ insert-at ] keep r> r> set-at ] if ;
+    [ drop [ H{ } clone [ insert-at ] keep ] 2dip set-at ] if ;

 : add-transition ( transition transition-table -- )
    transitions>> set-transition ;
--- a/basis/regexp/traversal/traversal.factor
+++ b/basis/regexp/traversal/traversal.factor
@ -17,6 +17,7 @@ TUPLE: dfa-traverser
    capture-group-index
    last-state current-state
    text
+    match-failed?
    start-index current-index
    matches ;

@ -37,14 +38,20 @@ TUPLE: dfa-traverser
        H{ } clone >>captured-groups ;

 : final-state? ( dfa-traverser -- ? )
-    [ current-state>> ] [ dfa-table>> final-states>> ] bi
-    key? ;
+    [ current-state>> ]
+    [ dfa-table>> final-states>> ] bi key? ;
+
+: beginning-of-text? ( dfa-traverser -- ? )
+    current-index>> 0 <= ; inline
+
+: end-of-text? ( dfa-traverser -- ? )
+    [ current-index>> ] [ text>> length ] bi >= ; inline

 : text-finished? ( dfa-traverser -- ? )
    {
        [ current-state>> empty? ]
-        [ [ current-index>> ] [ text>> length ] bi >= ]
-        ! [ current-index>> 0 < ]
+        [ end-of-text? ]
+        [ match-failed?>> ]
    } 1|| ;

 : save-final-state ( dfa-straverser -- )
@ -55,8 +62,47 @@ TUPLE: dfa-traverser
        dup save-final-state
    ] when text-finished? ;

+: previous-text-character ( dfa-traverser -- ch )
+    [ text>> ] [ current-index>> 1- ] bi nth ;
+
+: current-text-character ( dfa-traverser -- ch )
+    [ text>> ] [ current-index>> ] bi nth ;
+
+: next-text-character ( dfa-traverser -- ch )
+    [ text>> ] [ current-index>> 1+ ] bi nth ;
+
 GENERIC: flag-action ( dfa-traverser flag -- )

+
+M: beginning-of-input flag-action ( dfa-traverser flag -- )
+    drop
+    dup beginning-of-text? [ t >>match-failed? ] unless drop ;
+
+M: end-of-input flag-action ( dfa-traverser flag -- )
+    drop
+    dup end-of-text? [ t >>match-failed? ] unless drop ;
+
+M: beginning-of-line flag-action ( dfa-traverser flag -- )
+    drop
+    dup {
+        [ beginning-of-text? ]
+        [ previous-text-character terminator-class class-member? ]
+    } 1|| [ t >>match-failed? ] unless drop ;
+
+M: end-of-line flag-action ( dfa-traverser flag -- )
+    drop
+    dup {
+        [ end-of-text? ]
+        [ next-text-character terminator-class class-member? ]
+    } 1|| [ t >>match-failed? ] unless drop ;
+
+M: word-boundary flag-action ( dfa-traverser flag -- )
+    drop
+    dup {
+        [ end-of-text? ]
+        [ current-text-character terminator-class class-member? ]
+    } 1|| [ t >>match-failed? ] unless drop ;
+
 M: lookahead-on flag-action ( dfa-traverser flag -- )
    drop
    lookahead-counters>> 0 swap push ;
@ -110,11 +156,7 @@ M: capture-group-off flag-action ( dfa-traverser flag -- )
        [ [ 1+ ] change-current-index ]
        [ [ 1- ] change-current-index ] if
        dup current-state>> >>last-state
-    ] dip
-    first >>current-state ;
-
-: match-failed ( dfa-traverser -- dfa-traverser )
-    V{ } clone >>matches ;
+    ] [ first ] bi* >>current-state ;

 : match-literal ( transition from-state table -- to-state/f )
    transitions>> at at ;
@ -131,11 +173,9 @@ M: capture-group-off flag-action ( dfa-traverser flag -- )
    { [ match-literal ] [ match-class ] [ match-default ] } 3|| ;

 : setup-match ( match -- obj state dfa-table )
-    {
-        [ current-index>> ] [ text>> ]
-        [ current-state>> ] [ dfa-table>> ]
-    } cleave
-    [ nth ] 2dip ;
+    [ [ current-index>> ] [ text>> ] bi nth ]
+    [ current-state>> ]
+    [ dfa-table>> ] tri ;

 : do-match ( dfa-traverser -- dfa-traverser )
    dup process-flags
--- a/extra/benchmark/regex-dna/regex-dna.factor
+++ b/extra/benchmark/regex-dna/regex-dna.factor
@ -11,14 +11,14 @@ IN: benchmark.regex-dna

 : count-patterns ( string -- )
    {
-        R/ agggtaaa|tttaccct/i,
-        R/ [cgt]gggtaaa|tttaccc[acg]/i,
-        R/ a[act]ggtaaa|tttacc[agt]t/i,
-        R/ ag[act]gtaaa|tttac[agt]ct/i,
-        R/ agg[act]taaa|ttta[agt]cct/i,
-        R/ aggg[acg]aaa|ttt[cgt]ccct/i,
-        R/ agggt[cgt]aa|tt[acg]accct/i,
-        R/ agggta[cgt]a|t[acg]taccct/i,
+        R/ agggtaaa|tttaccct/i
+        R/ [cgt]gggtaaa|tttaccc[acg]/i
+        R/ a[act]ggtaaa|tttacc[agt]t/i
+        R/ ag[act]gtaaa|tttac[agt]ct/i
+        R/ agg[act]taaa|ttta[agt]cct/i
+        R/ aggg[acg]aaa|ttt[cgt]ccct/i
+        R/ agggt[cgt]aa|tt[acg]accct/i
+        R/ agggta[cgt]a|t[acg]taccct/i
        R/ agggtaa[cgt]|[acg]ttaccct/i
    } [
        [ raw>> write bl ]