Merge branch 'master' of git://factorcode.org/git/factor
						commit
						d3d57c800b
					
				|  | @ -7,6 +7,7 @@ IN: regexp.classes | ||||||
| GENERIC: class-member? ( obj class -- ? ) | GENERIC: class-member? ( obj class -- ? ) | ||||||
| 
 | 
 | ||||||
| M: word class-member? ( obj class -- ? ) 2drop f ; | M: word class-member? ( obj class -- ? ) 2drop f ; | ||||||
|  | 
 | ||||||
| M: integer class-member? ( obj class -- ? ) 2drop f ; | M: integer class-member? ( obj class -- ? ) 2drop f ; | ||||||
| 
 | 
 | ||||||
| M: character-class-range class-member? ( obj class -- ? ) | M: character-class-range class-member? ( obj class -- ? ) | ||||||
|  | @ -60,3 +61,12 @@ M: java-blank-class class-member? ( obj class -- ? ) | ||||||
| 
 | 
 | ||||||
| M: unmatchable-class class-member? ( obj class -- ? ) | M: unmatchable-class class-member? ( obj class -- ? ) | ||||||
|     2drop f ; |     2drop f ; | ||||||
|  | 
 | ||||||
|  | M: terminator-class class-member? ( obj class -- ? ) | ||||||
|  |     drop { | ||||||
|  |         [ CHAR: \r = ] | ||||||
|  |         [ CHAR: \n = ] | ||||||
|  |         [ CHAR: \u000085 = ] | ||||||
|  |         [ CHAR: \u002028 = ] | ||||||
|  |         [ CHAR: \u002029 = ] | ||||||
|  |     } 1|| ; | ||||||
|  |  | ||||||
|  | @ -43,7 +43,8 @@ IN: regexp.dfa | ||||||
|         dupd pop dup pick find-transitions rot |         dupd pop dup pick find-transitions rot | ||||||
|         [ |         [ | ||||||
|             [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep |             [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep | ||||||
|             >r swapd transition make-transition r> dfa-table>> add-transition  |             [ swapd transition make-transition ] dip | ||||||
|  |             dfa-table>> add-transition  | ||||||
|         ] curry with each |         ] curry with each | ||||||
|         new-transitions |         new-transitions | ||||||
|     ] if-empty ; |     ] if-empty ; | ||||||
|  |  | ||||||
|  | @ -18,6 +18,9 @@ SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag | ||||||
| SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag | SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag | ||||||
| SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag | SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag | ||||||
| SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag | SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag | ||||||
|  | SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag | ||||||
|  | SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag | ||||||
|  | SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag | ||||||
| 
 | 
 | ||||||
| : next-state ( regexp -- state ) | : next-state ( regexp -- state ) | ||||||
|     [ state>> ] [ [ 1+ ] change-state drop ] bi ; |     [ state>> ] [ [ 1+ ] change-state drop ] bi ; | ||||||
|  | @ -135,7 +138,21 @@ M: non-capture-group nfa-node ( node -- ) | ||||||
| M: reluctant-kleene-star nfa-node ( node -- ) | M: reluctant-kleene-star nfa-node ( node -- ) | ||||||
|     term>> <kleene-star> nfa-node ; |     term>> <kleene-star> nfa-node ; | ||||||
| 
 | 
 | ||||||
| ! | 
 | ||||||
|  | : add-epsilon-flag ( flag -- ) | ||||||
|  |     eps literal-transition add-simple-entry add-traversal-flag ; | ||||||
|  | 
 | ||||||
|  | M: beginning-of-line nfa-node ( node -- ) | ||||||
|  |     drop beginning-of-line add-epsilon-flag ; | ||||||
|  | 
 | ||||||
|  | M: end-of-line nfa-node ( node -- ) | ||||||
|  |     drop end-of-line add-epsilon-flag ; | ||||||
|  | 
 | ||||||
|  | M: beginning-of-input nfa-node ( node -- ) | ||||||
|  |     drop beginning-of-input add-epsilon-flag ; | ||||||
|  | 
 | ||||||
|  | M: end-of-input nfa-node ( node -- ) | ||||||
|  |     drop end-of-input add-epsilon-flag ; | ||||||
| 
 | 
 | ||||||
| M: negation nfa-node ( node -- ) | M: negation nfa-node ( node -- ) | ||||||
|     negation-mode inc |     negation-mode inc | ||||||
|  |  | ||||||
|  | @ -4,7 +4,7 @@ USING: accessors arrays assocs combinators io io.streams.string | ||||||
| kernel math math.parser namespaces qualified sets | kernel math math.parser namespaces qualified sets | ||||||
| quotations sequences splitting symbols vectors math.order | quotations sequences splitting symbols vectors math.order | ||||||
| unicode.categories strings regexp.backend regexp.utils | unicode.categories strings regexp.backend regexp.utils | ||||||
| unicode.case words ; | unicode.case words locals ; | ||||||
| IN: regexp.parser | IN: regexp.parser | ||||||
| 
 | 
 | ||||||
| FROM: math.ranges => [a,b] ; | FROM: math.ranges => [a,b] ; | ||||||
|  | @ -44,18 +44,21 @@ TUPLE: character-class-range from to ; INSTANCE: character-class-range node | ||||||
| SINGLETON: epsilon INSTANCE: epsilon node | SINGLETON: epsilon INSTANCE: epsilon node | ||||||
| SINGLETON: any-char INSTANCE: any-char node | SINGLETON: any-char INSTANCE: any-char node | ||||||
| SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node | SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node | ||||||
| SINGLETON: front-anchor INSTANCE: front-anchor node | SINGLETON: beginning-of-input INSTANCE: beginning-of-input node | ||||||
| SINGLETON: back-anchor INSTANCE: back-anchor node | SINGLETON: end-of-input INSTANCE: end-of-input node | ||||||
|  | SINGLETON: beginning-of-line INSTANCE: beginning-of-line node | ||||||
|  | SINGLETON: end-of-line INSTANCE: end-of-line node | ||||||
| 
 | 
 | ||||||
| TUPLE: option-on option ; INSTANCE: option-on node | TUPLE: option-on option ; INSTANCE: option-on node | ||||||
| TUPLE: option-off option ; INSTANCE: option-off node | TUPLE: option-off option ; INSTANCE: option-off node | ||||||
| SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ; | SINGLETONS: unix-lines dotall multiline comments case-insensitive | ||||||
|  | unicode-case reversed-regexp ; | ||||||
| 
 | 
 | ||||||
| SINGLETONS: letter-class LETTER-class Letter-class digit-class | SINGLETONS: letter-class LETTER-class Letter-class digit-class | ||||||
| alpha-class non-newline-blank-class | alpha-class non-newline-blank-class | ||||||
| ascii-class punctuation-class java-printable-class blank-class | ascii-class punctuation-class java-printable-class blank-class | ||||||
| control-character-class hex-digit-class java-blank-class c-identifier-class | control-character-class hex-digit-class java-blank-class c-identifier-class | ||||||
| unmatchable-class ; | terminator-class unmatchable-class word-boundary-class ; | ||||||
| 
 | 
 | ||||||
| SINGLETONS: beginning-of-group end-of-group | SINGLETONS: beginning-of-group end-of-group | ||||||
| beginning-of-character-class end-of-character-class | beginning-of-character-class end-of-character-class | ||||||
|  | @ -225,26 +228,12 @@ ERROR: invalid-range a b ; | ||||||
| 
 | 
 | ||||||
| : handle-left-brace ( -- ) | : handle-left-brace ( -- ) | ||||||
|     parse-repetition |     parse-repetition | ||||||
|     >r 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ r> |     [ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip | ||||||
|     [ |     [ | ||||||
|         2dup and [ from-m-to-n ] |         2dup and [ from-m-to-n ] | ||||||
|         [ [ nip at-most-n ] [ at-least-n ] if* ] if |         [ [ nip at-most-n ] [ at-least-n ] if* ] if | ||||||
|     ] [ drop 0 max exactly-n ] if ; |     ] [ drop 0 max exactly-n ] if ; | ||||||
| 
 | 
 | ||||||
| SINGLETON: beginning-of-input |  | ||||||
| SINGLETON: end-of-input |  | ||||||
| 
 |  | ||||||
| : newlines ( -- obj1 obj2 obj3 ) |  | ||||||
|     CHAR: \r <constant> |  | ||||||
|     CHAR: \n <constant> |  | ||||||
|     2dup 2array <concatenation> ; |  | ||||||
| 
 |  | ||||||
| : beginning-of-line ( -- obj ) |  | ||||||
|     beginning-of-input newlines 4array <alternation> lookbehind boa ; |  | ||||||
| 
 |  | ||||||
| : end-of-line ( -- obj ) |  | ||||||
|     end-of-input newlines 4array <alternation> lookahead boa ; |  | ||||||
| 
 |  | ||||||
| : handle-front-anchor ( -- ) | : handle-front-anchor ( -- ) | ||||||
|     get-multiline beginning-of-line beginning-of-input ? push-stack ; |     get-multiline beginning-of-line beginning-of-input ? push-stack ; | ||||||
| 
 | 
 | ||||||
|  | @ -281,13 +270,26 @@ ERROR: expected-posix-class ; | ||||||
| : parse-control-character ( -- n ) read1 ; | : parse-control-character ( -- n ) read1 ; | ||||||
| 
 | 
 | ||||||
| ERROR: bad-escaped-literals seq ; | ERROR: bad-escaped-literals seq ; | ||||||
| : parse-escaped-literals ( -- obj ) | 
 | ||||||
|     "\\E" read-until [ bad-escaped-literals ] unless | : parse-til-E ( -- obj ) | ||||||
|  |     "\\E" read-until [ bad-escaped-literals ] unless ; | ||||||
|  |      | ||||||
|  | :: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj ) | ||||||
|  |     parse-til-E | ||||||
|     drop1 |     drop1 | ||||||
|     [ epsilon ] [ |     [ epsilon ] [ | ||||||
|         [ <constant> ] V{ } map-as |         [ quot call <constant> ] V{ } map-as | ||||||
|         first|concatenation |         first|concatenation | ||||||
|     ] if-empty ; |     ] if-empty ; inline | ||||||
|  | 
 | ||||||
|  | : parse-escaped-literals ( -- obj ) | ||||||
|  |     [ ] (parse-escaped-literals) ; | ||||||
|  | 
 | ||||||
|  | : lower-case-literals ( -- obj ) | ||||||
|  |     [ ch>lower ] (parse-escaped-literals) ; | ||||||
|  | 
 | ||||||
|  | : upper-case-literals ( -- obj ) | ||||||
|  |     [ ch>upper ] (parse-escaped-literals) ; | ||||||
| 
 | 
 | ||||||
| : parse-escaped ( -- obj ) | : parse-escaped ( -- obj ) | ||||||
|     read1 |     read1 | ||||||
|  | @ -299,12 +301,12 @@ ERROR: bad-escaped-literals seq ; | ||||||
|         { CHAR: a [ HEX: 7 <constant> ] } |         { CHAR: a [ HEX: 7 <constant> ] } | ||||||
|         { CHAR: e [ HEX: 1b <constant> ] } |         { CHAR: e [ HEX: 1b <constant> ] } | ||||||
| 
 | 
 | ||||||
|         { CHAR: d [ digit-class ] } |  | ||||||
|         { CHAR: D [ digit-class <negation> ] } |  | ||||||
|         { CHAR: s [ java-blank-class ] } |  | ||||||
|         { CHAR: S [ java-blank-class <negation> ] } |  | ||||||
|         { CHAR: w [ c-identifier-class ] } |         { CHAR: w [ c-identifier-class ] } | ||||||
|         { CHAR: W [ c-identifier-class <negation> ] } |         { CHAR: W [ c-identifier-class <negation> ] } | ||||||
|  |         { CHAR: s [ java-blank-class ] } | ||||||
|  |         { CHAR: S [ java-blank-class <negation> ] } | ||||||
|  |         { CHAR: d [ digit-class ] } | ||||||
|  |         { CHAR: D [ digit-class <negation> ] } | ||||||
| 
 | 
 | ||||||
|         { CHAR: p [ parse-posix-class ] } |         { CHAR: p [ parse-posix-class ] } | ||||||
|         { CHAR: P [ parse-posix-class <negation> ] } |         { CHAR: P [ parse-posix-class <negation> ] } | ||||||
|  | @ -313,13 +315,19 @@ ERROR: bad-escaped-literals seq ; | ||||||
|         { CHAR: 0 [ parse-octal <constant> ] } |         { CHAR: 0 [ parse-octal <constant> ] } | ||||||
|         { CHAR: c [ parse-control-character ] } |         { CHAR: c [ parse-control-character ] } | ||||||
| 
 | 
 | ||||||
|         ! { CHAR: b [ handle-word-boundary ] } |         { CHAR: Q [ parse-escaped-literals ] } | ||||||
|         ! { CHAR: B [ handle-word-boundary <negation> ] } |  | ||||||
|         ! { CHAR: A [ handle-beginning-of-input ] } |  | ||||||
|         ! { CHAR: G [ end of previous match ] } |  | ||||||
|         ! { CHAR: Z [ handle-end-of-input ] } |  | ||||||
|         ! { CHAR: z [ handle-end-of-input ] } ! except for terminator |  | ||||||
| 
 | 
 | ||||||
|  |         ! { CHAR: b [ word-boundary-class ] } | ||||||
|  |         ! { CHAR: B [ word-boundary-class <negation> ] } | ||||||
|  |         ! { CHAR: A [ handle-beginning-of-input ] } | ||||||
|  |         ! { CHAR: z [ handle-end-of-input ] } | ||||||
|  | 
 | ||||||
|  |         ! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator | ||||||
|  | 
 | ||||||
|  |         ! m//g mode | ||||||
|  |         ! { CHAR: G [ end of previous match ] } | ||||||
|  | 
 | ||||||
|  |         ! Group capture | ||||||
|         ! { CHAR: 1 [ CHAR: 1 <constant> ] } |         ! { CHAR: 1 [ CHAR: 1 <constant> ] } | ||||||
|         ! { CHAR: 2 [ CHAR: 2 <constant> ] } |         ! { CHAR: 2 [ CHAR: 2 <constant> ] } | ||||||
|         ! { CHAR: 3 [ CHAR: 3 <constant> ] } |         ! { CHAR: 3 [ CHAR: 3 <constant> ] } | ||||||
|  | @ -330,7 +338,11 @@ ERROR: bad-escaped-literals seq ; | ||||||
|         ! { CHAR: 8 [ CHAR: 8 <constant> ] } |         ! { CHAR: 8 [ CHAR: 8 <constant> ] } | ||||||
|         ! { CHAR: 9 [ CHAR: 9 <constant> ] } |         ! { CHAR: 9 [ CHAR: 9 <constant> ] } | ||||||
| 
 | 
 | ||||||
|         { CHAR: Q [ parse-escaped-literals ] } |         ! Perl extensions | ||||||
|  |         ! can't do \l and \u because \u is already a 4-hex | ||||||
|  |         { CHAR: L [ lower-case-literals ] } | ||||||
|  |         { CHAR: U [ upper-case-literals ] } | ||||||
|  | 
 | ||||||
|         [ <constant> ] |         [ <constant> ] | ||||||
|     } case ; |     } case ; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -45,6 +45,7 @@ IN: regexp-tests | ||||||
| ! Off by default. | ! Off by default. | ||||||
| [ f ] [ "\n" "." <regexp> matches? ] unit-test | [ f ] [ "\n" "." <regexp> matches? ] unit-test | ||||||
| [ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test | [ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test | ||||||
|  | [ t ] [ "\n" R/ ./s matches? ] unit-test | ||||||
| [ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test | [ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test | ||||||
| 
 | 
 | ||||||
| [ f ] [ "" ".+" <regexp> matches? ] unit-test | [ f ] [ "" ".+" <regexp> matches? ] unit-test | ||||||
|  | @ -210,34 +211,34 @@ IN: regexp-tests | ||||||
| [ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test | [ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test | ||||||
| [ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test | [ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test | ||||||
| 
 | 
 | ||||||
| [ t ] [ "aaa" "AAA" <iregexp> matches? ] unit-test | [ t ] [ "aaa" R/ AAA/i matches? ] unit-test | ||||||
| [ f ] [ "aax" "AAA" <iregexp> matches? ] unit-test | [ f ] [ "aax" R/ AAA/i matches? ] unit-test | ||||||
| [ t ] [ "aaa" "A*" <iregexp> matches? ] unit-test | [ t ] [ "aaa" R/ A*/i matches? ] unit-test | ||||||
| [ f ] [ "aaba" "A*" <iregexp> matches? ] unit-test | [ f ] [ "aaba" R/ A*/i matches? ] unit-test | ||||||
| [ t ] [ "b" "[AB]" <iregexp> matches? ] unit-test | [ t ] [ "b" R/ [AB]/i matches? ] unit-test | ||||||
| [ f ] [ "c" "[AB]" <iregexp> matches? ] unit-test | [ f ] [ "c" R/ [AB]/i matches? ] unit-test | ||||||
| [ t ] [ "c" "[A-Z]" <iregexp> matches? ] unit-test | [ t ] [ "c" R/ [A-Z]/i matches? ] unit-test | ||||||
| [ f ] [ "3" "[A-Z]" <iregexp> matches? ] unit-test | [ f ] [ "3" R/ [A-Z]/i matches? ] unit-test | ||||||
| 
 | 
 | ||||||
| [ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test | [ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test | ||||||
| [ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test | [ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test | ||||||
| [ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test | [ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test | ||||||
| [ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test | [ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test | ||||||
| 
 | 
 | ||||||
| [ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test | [ t ] [ "a" R/ (?-i)a/i matches? ] unit-test | ||||||
| [ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test | [ t ] [ "a" R/ (?-i)a/i matches? ] unit-test | ||||||
| [ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test | [ f ] [ "A" R/ (?-i)a/i matches? ] unit-test | ||||||
| [ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test | [ f ] [ "A" R/ (?-i)a/i matches? ] unit-test | ||||||
| 
 | 
 | ||||||
| [ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test | [ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test | ||||||
| [ t ] [ "A" "[a-z]" <iregexp> matches? ] unit-test | [ t ] [ "A" R/ [a-z]/i matches? ] unit-test | ||||||
| 
 | 
 | ||||||
| [ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test | [ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test | ||||||
| [ t ] [ "A" "\\p{Lower}" <iregexp> matches? ] unit-test | [ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test | ||||||
| 
 | 
 | ||||||
| [ t ] [ "abc" <reversed> "abc" <rregexp> matches? ] unit-test | [ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test | ||||||
| [ t ] [ "abc" <reversed> "a[bB][cC]" <rregexp> matches? ] unit-test | [ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test | ||||||
| [ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" <rregexp> matches? ] unit-test | [ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/r matches? ] unit-test | ||||||
| 
 | 
 | ||||||
| [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test | [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test | ||||||
| [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test | [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test | ||||||
|  | @ -253,7 +254,7 @@ IN: regexp-tests | ||||||
| 
 | 
 | ||||||
| [ ] [ "(\\$[\\p{XDigit}]|[\\p{Digit}])" <regexp> drop ] unit-test | [ ] [ "(\\$[\\p{XDigit}]|[\\p{Digit}])" <regexp> drop ] unit-test | ||||||
| 
 | 
 | ||||||
| ! Comment | ! Comment inside a regular expression | ||||||
| [ t ] [ "ac" "a(?#boo)c" <regexp> matches? ] unit-test | [ t ] [ "ac" "a(?#boo)c" <regexp> matches? ] unit-test | ||||||
| 
 | 
 | ||||||
| [ ] [ "USING: regexp kernel ; R' -{3}[+]{1,6}(?:!!)?\\s' drop" eval ] unit-test | [ ] [ "USING: regexp kernel ; R' -{3}[+]{1,6}(?:!!)?\\s' drop" eval ] unit-test | ||||||
|  | @ -283,32 +284,90 @@ IN: regexp-tests | ||||||
| [ { "ABC" "DEF" "GHI" } ] | [ { "ABC" "DEF" "GHI" } ] | ||||||
| [ "1ABC2DEF3GHI4" R/ [A-Z]+/ all-matches [ >string ] map ] unit-test | [ "1ABC2DEF3GHI4" R/ [A-Z]+/ all-matches [ >string ] map ] unit-test | ||||||
| 
 | 
 | ||||||
|  | [ 3 ] | ||||||
|  | [ "1ABC2DEF3GHI4" R/ [A-Z]+/ count-matches ] unit-test | ||||||
|  | 
 | ||||||
|  | [ 0 ] | ||||||
|  | [ "123" R/ [A-Z]+/ count-matches ] unit-test | ||||||
|  | 
 | ||||||
| [ "1.2.3.4" ] | [ "1.2.3.4" ] | ||||||
| [ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test | [ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test | ||||||
| 
 | 
 | ||||||
| [ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test | [ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test | ||||||
|  | [ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test | ||||||
|  | [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test | ||||||
|  | [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test | ||||||
| [ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test | [ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test | ||||||
| [ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test | [ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test | ||||||
| [ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test | [ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test | ||||||
| 
 | 
 | ||||||
| ! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with |  | ||||||
| 
 |  | ||||||
| ! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test |  | ||||||
| ! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test |  | ||||||
| ! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test |  | ||||||
| ! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test |  | ||||||
| ! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test |  | ||||||
| ! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test |  | ||||||
| 
 |  | ||||||
| ! [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test |  | ||||||
| ! [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test |  | ||||||
| 
 |  | ||||||
| [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test | [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test | ||||||
| [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test | [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test | ||||||
| 
 | 
 | ||||||
|  | ! Bug in parsing word | ||||||
|  | [ t ] [ "a" R' a' matches?  ] unit-test | ||||||
|  | 
 | ||||||
|  | ! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with | ||||||
|  | 
 | ||||||
|  | [ t ] [ "a" R/ ^a/ matches? ] unit-test | ||||||
|  | [ f ] [ "\na" R/ ^a/ matches? ] unit-test | ||||||
|  | [ f ] [ "\r\na" R/ ^a/ matches? ] unit-test | ||||||
|  | [ f ] [ "\ra" R/ ^a/ matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | [ t ] [ "a" R/ a$/ matches? ] unit-test | ||||||
|  | [ f ] [ "a\n" R/ a$/ matches? ] unit-test | ||||||
|  | [ f ] [ "a\r" R/ a$/ matches? ] unit-test | ||||||
|  | [ f ] [ "a\r\n" R/ a$/ matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | ! [ t ] [ "a" R/ \Aa/ matches? ] unit-test | ||||||
|  | ! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test | ||||||
|  | ! [ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test | ||||||
|  | ! [ f ] [ "\ra" R/ \Aa/ matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | ! [ t ] [ "a" R/ \Aa/m matches? ] unit-test | ||||||
|  | ! [ f ] [ "\na" R/ \Aaa/m matches? ] unit-test | ||||||
|  | ! [ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test | ||||||
|  | ! [ f ] [ "\ra" R/ \Aa/m matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | ! [ t ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | ! [ t ] [ "a" R/ \Aa\z/m matches? ] unit-test | ||||||
|  | ! [ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | ! [ t ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test | ||||||
|  | ! [ t ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | ! [ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test | ||||||
|  | ! [ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test | ||||||
|  | ! [ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test | ||||||
|  | ! [ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | ! [ t ] [ "a" R/ ^a/m matches? ] unit-test | ||||||
|  | ! [ t ] [ "\na" R/ ^a/m matches? ] unit-test | ||||||
|  | ! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test | ||||||
|  | ! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | ! Convert to lowercase until E | ||||||
|  | [ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test | ||||||
|  | [ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | ! Convert to uppercase until E | ||||||
|  | [ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test | ||||||
|  | [ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test | ||||||
|  | 
 | ||||||
|  | ! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test | ||||||
|  | ! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test | ||||||
|  | ! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test | ||||||
|  | ! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test | ||||||
|  | 
 | ||||||
| ! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test | ! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test | ||||||
| ! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test | ! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test | ||||||
| 
 | 
 | ||||||
|  | ! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test | ||||||
|  | ! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test | ||||||
|  | ! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test | ||||||
|  | ! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test | ||||||
|  | 
 | ||||||
| ! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test | ! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test | ||||||
| ! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test | ! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test | ||||||
| ! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test | ! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test | ||||||
|  | @ -323,39 +382,29 @@ IN: regexp-tests | ||||||
| ! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test | ! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test | ||||||
| ! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test | ! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test | ||||||
| 
 | 
 | ||||||
|  | ! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test | ||||||
|  | ! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test | ||||||
|  | ! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test | ||||||
|  | ! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test | ||||||
|  | ! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test | ||||||
|  | ! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test | ||||||
| 
 | 
 | ||||||
| ! Bug in parsing word | ! "ab" "a(?=b*)" <regexp> match | ||||||
| ! [ t ] [ "a" R' a' matches?  ] unit-test | ! "abbbbbc" "a(?=b*c)" <regexp> match | ||||||
|  | ! "ab" "a(?=b*)" <regexp> match | ||||||
| 
 | 
 | ||||||
| ! clear "a(?=b*)" <regexp> "ab" over match | ! "baz" "(az)(?<=b)" <regexp> first-match | ||||||
| ! clear "a(?=b*c)" <regexp> "abbbbbc" over match | ! "cbaz" "a(?<=b*)" <regexp> first-match | ||||||
| ! clear "a(?=b*)" <regexp> "ab" over match | ! "baz" "a(?<=b)" <regexp> first-match | ||||||
| 
 | 
 | ||||||
| ! clear "^a" <regexp> "a" over match | ! "baz" "a(?<!b)" <regexp> first-match | ||||||
| ! clear "^a" <regexp> "\na" over match | ! "caz" "a(?<!b)" <regexp> first-match | ||||||
| ! clear "^a" <regexp> "\r\na" over match |  | ||||||
| ! clear "^a" <regexp> "\ra" over match |  | ||||||
| 
 | 
 | ||||||
| ! clear "a$" <regexp> "a" over match | ! "abcdefg" "a(?=bcdefg)bcd" <regexp> first-match | ||||||
| ! clear "a$" <regexp> "a\n" over match | ! "abcdefg" "a(?#bcdefg)bcd" <regexp> first-match | ||||||
| ! clear "a$" <regexp> "a\r" over match | ! "abcdefg" "a(?:bcdefg)" <regexp> first-match | ||||||
| ! clear "a$" <regexp> "a\r\n" over match |  | ||||||
| 
 |  | ||||||
| ! "(az)(?<=b)" <regexp> "baz" over first-match |  | ||||||
| ! "a(?<=b*)" <regexp> "cbaz" over first-match |  | ||||||
| ! "a(?<=b)" <regexp> "baz" over first-match |  | ||||||
| 
 |  | ||||||
| ! "a(?<!b)" <regexp> "baz" over first-match |  | ||||||
| ! "a(?<!b)" <regexp> "caz" over first-match |  | ||||||
| 
 |  | ||||||
| ! "a(?=bcdefg)bcd" <regexp> "abcdefg" over first-match |  | ||||||
| ! "a(?#bcdefg)bcd" <regexp> "abcdefg" over first-match |  | ||||||
| ! "a(?:bcdefg)" <regexp> "abcdefg" over first-match |  | ||||||
| 
 |  | ||||||
| [ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test |  | ||||||
| 
 |  | ||||||
| ! "a(?<=b)" <regexp> "caba" over first-match |  | ||||||
| 
 | 
 | ||||||
|  | ! "caba" "a(?<=b)" <regexp> first-match | ||||||
| 
 | 
 | ||||||
| ! capture group 1: "aaaa"  2: "" | ! capture group 1: "aaaa"  2: "" | ||||||
| ! "aaaa" "(a*)(a*)" <regexp> match* | ! "aaaa" "(a*)(a*)" <regexp> match* | ||||||
|  |  | ||||||
|  | @ -1,9 +1,10 @@ | ||||||
| ! Copyright (C) 2008 Doug Coleman. | ! Copyright (C) 2008 Doug Coleman. | ||||||
| ! See http://factorcode.org/license.txt for BSD license. | ! See http://factorcode.org/license.txt for BSD license. | ||||||
| USING: accessors combinators kernel math sequences | USING: accessors combinators kernel math sequences strings | ||||||
| sets assocs prettyprint.backend make lexer namespaces parser | sets assocs prettyprint.backend make lexer namespaces parser | ||||||
| arrays fry regexp.backend regexp.utils regexp.parser regexp.nfa | arrays fry regexp.backend regexp.utils regexp.parser regexp.nfa | ||||||
| regexp.dfa regexp.traversal regexp.transition-tables splitting ; | regexp.dfa regexp.traversal regexp.transition-tables splitting | ||||||
|  | sorting ; | ||||||
| IN: regexp | IN: regexp | ||||||
| 
 | 
 | ||||||
| : default-regexp ( string -- regexp ) | : default-regexp ( string -- regexp ) | ||||||
|  | @ -73,42 +74,9 @@ IN: regexp | ||||||
|     [ dup ] swap '[ _ next-match ] [ ] produce nip harvest ; |     [ dup ] swap '[ _ next-match ] [ ] produce nip harvest ; | ||||||
| 
 | 
 | ||||||
| : count-matches ( string regexp -- n ) | : count-matches ( string regexp -- n ) | ||||||
|     all-matches length 1- ; |     all-matches length ; | ||||||
| 
 | 
 | ||||||
| : initial-option ( regexp option -- regexp' ) | <PRIVATE | ||||||
|     over options>> conjoin ; |  | ||||||
| 
 |  | ||||||
| : <regexp> ( string -- regexp ) |  | ||||||
|     default-regexp construct-regexp ; |  | ||||||
| 
 |  | ||||||
| : <iregexp> ( string -- regexp ) |  | ||||||
|     default-regexp |  | ||||||
|     case-insensitive initial-option |  | ||||||
|     construct-regexp ; |  | ||||||
| 
 |  | ||||||
| : <rregexp> ( string -- regexp ) |  | ||||||
|     default-regexp |  | ||||||
|     reversed-regexp initial-option |  | ||||||
|     construct-regexp ; |  | ||||||
| 
 |  | ||||||
| : parsing-regexp ( accum end -- accum ) |  | ||||||
|     lexer get dup skip-blank |  | ||||||
|     [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column |  | ||||||
|     lexer get dup still-parsing-line? |  | ||||||
|     [ (parse-token) ] [ drop f ] if |  | ||||||
|     "i" = [ <iregexp> ] [ <regexp> ] if parsed ; |  | ||||||
| 
 |  | ||||||
| : R! CHAR: ! parsing-regexp ; parsing |  | ||||||
| : R" CHAR: " parsing-regexp ; parsing |  | ||||||
| : R# CHAR: # parsing-regexp ; parsing |  | ||||||
| : R' CHAR: ' parsing-regexp ; parsing |  | ||||||
| : R( CHAR: ) parsing-regexp ; parsing |  | ||||||
| : R/ CHAR: / parsing-regexp ; parsing |  | ||||||
| : R@ CHAR: @ parsing-regexp ; parsing |  | ||||||
| : R[ CHAR: ] parsing-regexp ; parsing |  | ||||||
| : R` CHAR: ` parsing-regexp ; parsing |  | ||||||
| : R{ CHAR: } parsing-regexp ; parsing |  | ||||||
| : R| CHAR: | parsing-regexp ; parsing |  | ||||||
| 
 | 
 | ||||||
| : find-regexp-syntax ( string -- prefix suffix ) | : find-regexp-syntax ( string -- prefix suffix ) | ||||||
|     { |     { | ||||||
|  | @ -125,14 +93,67 @@ IN: regexp | ||||||
|         { "R| "  "|"  } |         { "R| "  "|"  } | ||||||
|     } swap [ subseq? not nip ] curry assoc-find drop ; |     } swap [ subseq? not nip ] curry assoc-find drop ; | ||||||
| 
 | 
 | ||||||
| : option? ( option regexp -- ? ) | ERROR: unknown-regexp-option option ; | ||||||
|     options>> key? ; | 
 | ||||||
|  | : option>ch ( option -- string ) | ||||||
|  |     { | ||||||
|  |         { case-insensitive [ CHAR: i ] } | ||||||
|  |         { multiline [ CHAR: m ] } | ||||||
|  |         { reversed-regexp [ CHAR: r ] } | ||||||
|  |         { dotall [ CHAR: s ] } | ||||||
|  |         [ unknown-regexp-option ] | ||||||
|  |     } case ; | ||||||
|  | 
 | ||||||
|  | : ch>option ( ch -- option ) | ||||||
|  |     { | ||||||
|  |         { CHAR: i [ case-insensitive ] } | ||||||
|  |         { CHAR: m [ multiline ] } | ||||||
|  |         { CHAR: r [ reversed-regexp ] } | ||||||
|  |         { CHAR: s [ dotall ] } | ||||||
|  |         [ unknown-regexp-option ] | ||||||
|  |     } case ; | ||||||
|  | 
 | ||||||
|  | : string>options ( string -- options ) | ||||||
|  |     [ ch>option dup ] H{ } map>assoc ; | ||||||
|  | 
 | ||||||
|  | : options>string ( options -- string ) | ||||||
|  |     keys [ option>ch ] map natural-sort >string ; | ||||||
|  | 
 | ||||||
|  | PRIVATE> | ||||||
|  | 
 | ||||||
|  | : <optioned-regexp> ( string option-string -- regexp ) | ||||||
|  |     [ default-regexp ] [ string>options ] bi* >>options | ||||||
|  |     construct-regexp ; | ||||||
|  | 
 | ||||||
|  | : <regexp> ( string -- regexp ) "" <optioned-regexp> ; | ||||||
|  | 
 | ||||||
|  | <PRIVATE | ||||||
|  | 
 | ||||||
|  | : parsing-regexp ( accum end -- accum ) | ||||||
|  |     lexer get dup skip-blank | ||||||
|  |     [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column | ||||||
|  |     lexer get dup still-parsing-line? | ||||||
|  |     [ (parse-token) ] [ drop f ] if | ||||||
|  |     <optioned-regexp> parsed ; | ||||||
|  | 
 | ||||||
|  | PRIVATE> | ||||||
|  | 
 | ||||||
|  | : R! CHAR: ! parsing-regexp ; parsing | ||||||
|  | : R" CHAR: " parsing-regexp ; parsing | ||||||
|  | : R# CHAR: # parsing-regexp ; parsing | ||||||
|  | : R' CHAR: ' parsing-regexp ; parsing | ||||||
|  | : R( CHAR: ) parsing-regexp ; parsing | ||||||
|  | : R/ CHAR: / parsing-regexp ; parsing | ||||||
|  | : R@ CHAR: @ parsing-regexp ; parsing | ||||||
|  | : R[ CHAR: ] parsing-regexp ; parsing | ||||||
|  | : R` CHAR: ` parsing-regexp ; parsing | ||||||
|  | : R{ CHAR: } parsing-regexp ; parsing | ||||||
|  | : R| CHAR: | parsing-regexp ; parsing | ||||||
| 
 | 
 | ||||||
| M: regexp pprint* | M: regexp pprint* | ||||||
|     [ |     [ | ||||||
|         [ |         [ | ||||||
|             dup raw>> |             [ raw>> dup find-regexp-syntax swap % swap % % ] | ||||||
|             dup find-regexp-syntax swap % swap % % |             [ options>> options>string % ] bi | ||||||
|             case-insensitive swap option? [ "i" % ] when |  | ||||||
|         ] "" make |         ] "" make | ||||||
|     ] keep present-text ; |     ] keep present-text ; | ||||||
|  |  | ||||||
|  | @ -40,7 +40,7 @@ TUPLE: transition-table transitions start-state final-states ; | ||||||
|     2dup [ to>> ] dip maybe-initialize-key |     2dup [ to>> ] dip maybe-initialize-key | ||||||
|     [ [ to>> ] [ obj>> ] [ from>> ] tri ] dip |     [ [ to>> ] [ obj>> ] [ from>> ] tri ] dip | ||||||
|     2dup at* [ 2nip insert-at ] |     2dup at* [ 2nip insert-at ] | ||||||
|     [ drop >r >r H{ } clone [ insert-at ] keep r> r> set-at ] if ; |     [ drop [ H{ } clone [ insert-at ] keep ] 2dip set-at ] if ; | ||||||
| 
 | 
 | ||||||
| : add-transition ( transition transition-table -- ) | : add-transition ( transition transition-table -- ) | ||||||
|     transitions>> set-transition ; |     transitions>> set-transition ; | ||||||
|  |  | ||||||
|  | @ -17,6 +17,7 @@ TUPLE: dfa-traverser | ||||||
|     capture-group-index |     capture-group-index | ||||||
|     last-state current-state |     last-state current-state | ||||||
|     text |     text | ||||||
|  |     match-failed? | ||||||
|     start-index current-index |     start-index current-index | ||||||
|     matches ; |     matches ; | ||||||
| 
 | 
 | ||||||
|  | @ -37,14 +38,20 @@ TUPLE: dfa-traverser | ||||||
|         H{ } clone >>captured-groups ; |         H{ } clone >>captured-groups ; | ||||||
| 
 | 
 | ||||||
| : final-state? ( dfa-traverser -- ? ) | : final-state? ( dfa-traverser -- ? ) | ||||||
|     [ current-state>> ] [ dfa-table>> final-states>> ] bi |     [ current-state>> ] | ||||||
|     key? ; |     [ dfa-table>> final-states>> ] bi key? ; | ||||||
|  | 
 | ||||||
|  | : beginning-of-text? ( dfa-traverser -- ? ) | ||||||
|  |     current-index>> 0 <= ; inline | ||||||
|  | 
 | ||||||
|  | : end-of-text? ( dfa-traverser -- ? ) | ||||||
|  |     [ current-index>> ] [ text>> length ] bi >= ; inline | ||||||
| 
 | 
 | ||||||
| : text-finished? ( dfa-traverser -- ? ) | : text-finished? ( dfa-traverser -- ? ) | ||||||
|     { |     { | ||||||
|         [ current-state>> empty? ] |         [ current-state>> empty? ] | ||||||
|         [ [ current-index>> ] [ text>> length ] bi >= ] |         [ end-of-text? ] | ||||||
|         ! [ current-index>> 0 < ] |         [ match-failed?>> ] | ||||||
|     } 1|| ; |     } 1|| ; | ||||||
| 
 | 
 | ||||||
| : save-final-state ( dfa-straverser -- ) | : save-final-state ( dfa-straverser -- ) | ||||||
|  | @ -55,8 +62,47 @@ TUPLE: dfa-traverser | ||||||
|         dup save-final-state |         dup save-final-state | ||||||
|     ] when text-finished? ; |     ] when text-finished? ; | ||||||
| 
 | 
 | ||||||
|  | : previous-text-character ( dfa-traverser -- ch ) | ||||||
|  |     [ text>> ] [ current-index>> 1- ] bi nth ; | ||||||
|  | 
 | ||||||
|  | : current-text-character ( dfa-traverser -- ch ) | ||||||
|  |     [ text>> ] [ current-index>> ] bi nth ; | ||||||
|  | 
 | ||||||
|  | : next-text-character ( dfa-traverser -- ch ) | ||||||
|  |     [ text>> ] [ current-index>> 1+ ] bi nth ; | ||||||
|  | 
 | ||||||
| GENERIC: flag-action ( dfa-traverser flag -- ) | GENERIC: flag-action ( dfa-traverser flag -- ) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | M: beginning-of-input flag-action ( dfa-traverser flag -- ) | ||||||
|  |     drop | ||||||
|  |     dup beginning-of-text? [ t >>match-failed? ] unless drop ; | ||||||
|  | 
 | ||||||
|  | M: end-of-input flag-action ( dfa-traverser flag -- ) | ||||||
|  |     drop | ||||||
|  |     dup end-of-text? [ t >>match-failed? ] unless drop ; | ||||||
|  | 
 | ||||||
|  | M: beginning-of-line flag-action ( dfa-traverser flag -- ) | ||||||
|  |     drop | ||||||
|  |     dup { | ||||||
|  |         [ beginning-of-text? ] | ||||||
|  |         [ previous-text-character terminator-class class-member? ] | ||||||
|  |     } 1|| [ t >>match-failed? ] unless drop ; | ||||||
|  | 
 | ||||||
|  | M: end-of-line flag-action ( dfa-traverser flag -- ) | ||||||
|  |     drop | ||||||
|  |     dup { | ||||||
|  |         [ end-of-text? ] | ||||||
|  |         [ next-text-character terminator-class class-member? ] | ||||||
|  |     } 1|| [ t >>match-failed? ] unless drop ; | ||||||
|  | 
 | ||||||
|  | M: word-boundary flag-action ( dfa-traverser flag -- ) | ||||||
|  |     drop | ||||||
|  |     dup { | ||||||
|  |         [ end-of-text? ] | ||||||
|  |         [ current-text-character terminator-class class-member? ] | ||||||
|  |     } 1|| [ t >>match-failed? ] unless drop ; | ||||||
|  | 
 | ||||||
| M: lookahead-on flag-action ( dfa-traverser flag -- ) | M: lookahead-on flag-action ( dfa-traverser flag -- ) | ||||||
|     drop |     drop | ||||||
|     lookahead-counters>> 0 swap push ; |     lookahead-counters>> 0 swap push ; | ||||||
|  | @ -110,11 +156,7 @@ M: capture-group-off flag-action ( dfa-traverser flag -- ) | ||||||
|         [ [ 1+ ] change-current-index ] |         [ [ 1+ ] change-current-index ] | ||||||
|         [ [ 1- ] change-current-index ] if |         [ [ 1- ] change-current-index ] if | ||||||
|         dup current-state>> >>last-state |         dup current-state>> >>last-state | ||||||
|     ] dip |     ] [ first ] bi* >>current-state ; | ||||||
|     first >>current-state ; |  | ||||||
| 
 |  | ||||||
| : match-failed ( dfa-traverser -- dfa-traverser ) |  | ||||||
|     V{ } clone >>matches ; |  | ||||||
| 
 | 
 | ||||||
| : match-literal ( transition from-state table -- to-state/f ) | : match-literal ( transition from-state table -- to-state/f ) | ||||||
|     transitions>> at at ; |     transitions>> at at ; | ||||||
|  | @ -131,11 +173,9 @@ M: capture-group-off flag-action ( dfa-traverser flag -- ) | ||||||
|     { [ match-literal ] [ match-class ] [ match-default ] } 3|| ; |     { [ match-literal ] [ match-class ] [ match-default ] } 3|| ; | ||||||
| 
 | 
 | ||||||
| : setup-match ( match -- obj state dfa-table ) | : setup-match ( match -- obj state dfa-table ) | ||||||
|     { |     [ [ current-index>> ] [ text>> ] bi nth ] | ||||||
|         [ current-index>> ] [ text>> ] |     [ current-state>> ] | ||||||
|         [ current-state>> ] [ dfa-table>> ] |     [ dfa-table>> ] tri ; | ||||||
|     } cleave |  | ||||||
|     [ nth ] 2dip ; |  | ||||||
| 
 | 
 | ||||||
| : do-match ( dfa-traverser -- dfa-traverser ) | : do-match ( dfa-traverser -- dfa-traverser ) | ||||||
|     dup process-flags |     dup process-flags | ||||||
|  |  | ||||||
|  | @ -11,14 +11,14 @@ IN: benchmark.regex-dna | ||||||
| 
 | 
 | ||||||
| : count-patterns ( string -- ) | : count-patterns ( string -- ) | ||||||
|     { |     { | ||||||
|         R/ agggtaaa|tttaccct/i, |         R/ agggtaaa|tttaccct/i | ||||||
|         R/ [cgt]gggtaaa|tttaccc[acg]/i, |         R/ [cgt]gggtaaa|tttaccc[acg]/i | ||||||
|         R/ a[act]ggtaaa|tttacc[agt]t/i, |         R/ a[act]ggtaaa|tttacc[agt]t/i | ||||||
|         R/ ag[act]gtaaa|tttac[agt]ct/i, |         R/ ag[act]gtaaa|tttac[agt]ct/i | ||||||
|         R/ agg[act]taaa|ttta[agt]cct/i, |         R/ agg[act]taaa|ttta[agt]cct/i | ||||||
|         R/ aggg[acg]aaa|ttt[cgt]ccct/i, |         R/ aggg[acg]aaa|ttt[cgt]ccct/i | ||||||
|         R/ agggt[cgt]aa|tt[acg]accct/i, |         R/ agggt[cgt]aa|tt[acg]accct/i | ||||||
|         R/ agggta[cgt]a|t[acg]taccct/i, |         R/ agggta[cgt]a|t[acg]taccct/i | ||||||
|         R/ agggtaa[cgt]|[acg]ttaccct/i |         R/ agggtaa[cgt]|[acg]ttaccct/i | ||||||
|     } [ |     } [ | ||||||
|         [ raw>> write bl ] |         [ raw>> write bl ] | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue