diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 7b729b2e50..f143bebdf7 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -7,6 +7,7 @@ IN: regexp.classes GENERIC: class-member? ( obj class -- ? ) M: word class-member? ( obj class -- ? ) 2drop f ; + M: integer class-member? ( obj class -- ? ) 2drop f ; M: character-class-range class-member? ( obj class -- ? ) @@ -60,3 +61,12 @@ M: java-blank-class class-member? ( obj class -- ? ) M: unmatchable-class class-member? ( obj class -- ? ) 2drop f ; + +M: terminator-class class-member? ( obj class -- ? ) + drop { + [ CHAR: \r = ] + [ CHAR: \n = ] + [ CHAR: \u000085 = ] + [ CHAR: \u002028 = ] + [ CHAR: \u002029 = ] + } 1|| ; diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 72d0fe970b..50847d6ff9 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -18,6 +18,9 @@ SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag +SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag +SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag +SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag : next-state ( regexp -- state ) [ state>> ] [ [ 1+ ] change-state drop ] bi ; @@ -135,7 +138,21 @@ M: non-capture-group nfa-node ( node -- ) M: reluctant-kleene-star nfa-node ( node -- ) term>> nfa-node ; -! + +: add-epsilon-flag ( flag -- ) + eps literal-transition add-simple-entry add-traversal-flag ; + +M: beginning-of-line nfa-node ( node -- ) + drop beginning-of-line add-epsilon-flag ; + +M: end-of-line nfa-node ( node -- ) + drop end-of-line add-epsilon-flag ; + +M: beginning-of-input nfa-node ( node -- ) + drop beginning-of-input add-epsilon-flag ; + +M: end-of-input nfa-node ( node -- ) + drop end-of-input add-epsilon-flag ; M: negation nfa-node ( node -- ) negation-mode inc diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 1feba62f68..ea8aaffcd5 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -4,7 +4,7 @@ USING: accessors arrays assocs combinators io io.streams.string kernel math math.parser namespaces qualified sets quotations sequences splitting symbols vectors math.order unicode.categories strings regexp.backend regexp.utils -unicode.case words ; +unicode.case words locals ; IN: regexp.parser FROM: math.ranges => [a,b] ; @@ -44,18 +44,21 @@ TUPLE: character-class-range from to ; INSTANCE: character-class-range node SINGLETON: epsilon INSTANCE: epsilon node SINGLETON: any-char INSTANCE: any-char node SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node -SINGLETON: front-anchor INSTANCE: front-anchor node -SINGLETON: back-anchor INSTANCE: back-anchor node +SINGLETON: beginning-of-input INSTANCE: beginning-of-input node +SINGLETON: end-of-input INSTANCE: end-of-input node +SINGLETON: beginning-of-line INSTANCE: beginning-of-line node +SINGLETON: end-of-line INSTANCE: end-of-line node TUPLE: option-on option ; INSTANCE: option-on node TUPLE: option-off option ; INSTANCE: option-off node -SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ; +SINGLETONS: unix-lines dotall multiline comments case-insensitive +unicode-case reversed-regexp ; SINGLETONS: letter-class LETTER-class Letter-class digit-class alpha-class non-newline-blank-class ascii-class punctuation-class java-printable-class blank-class control-character-class hex-digit-class java-blank-class c-identifier-class -unmatchable-class ; +terminator-class unmatchable-class word-boundary-class ; SINGLETONS: beginning-of-group end-of-group beginning-of-character-class end-of-character-class @@ -231,20 +234,6 @@ ERROR: invalid-range a b ; [ [ nip at-most-n ] [ at-least-n ] if* ] if ] [ drop 0 max exactly-n ] if ; -SINGLETON: beginning-of-input -SINGLETON: end-of-input - -: newlines ( -- obj1 obj2 obj3 ) - CHAR: \r - CHAR: \n - 2dup 2array ; - -: beginning-of-line ( -- obj ) - beginning-of-input newlines 4array lookbehind boa ; - -: end-of-line ( -- obj ) - end-of-input newlines 4array lookahead boa ; - : handle-front-anchor ( -- ) get-multiline beginning-of-line beginning-of-input ? push-stack ; @@ -281,13 +270,26 @@ ERROR: expected-posix-class ; : parse-control-character ( -- n ) read1 ; ERROR: bad-escaped-literals seq ; -: parse-escaped-literals ( -- obj ) - "\\E" read-until [ bad-escaped-literals ] unless + +: parse-til-E ( -- obj ) + "\\E" read-until [ bad-escaped-literals ] unless ; + +:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj ) + parse-til-E drop1 [ epsilon ] [ - [ ] V{ } map-as + [ quot call ] V{ } map-as first|concatenation - ] if-empty ; + ] if-empty ; inline + +: parse-escaped-literals ( -- obj ) + [ ] (parse-escaped-literals) ; + +: lower-case-literals ( -- obj ) + [ ch>lower ] (parse-escaped-literals) ; + +: upper-case-literals ( -- obj ) + [ ch>upper ] (parse-escaped-literals) ; : parse-escaped ( -- obj ) read1 @@ -299,12 +301,12 @@ ERROR: bad-escaped-literals seq ; { CHAR: a [ HEX: 7 ] } { CHAR: e [ HEX: 1b ] } - { CHAR: d [ digit-class ] } - { CHAR: D [ digit-class ] } - { CHAR: s [ java-blank-class ] } - { CHAR: S [ java-blank-class ] } { CHAR: w [ c-identifier-class ] } { CHAR: W [ c-identifier-class ] } + { CHAR: s [ java-blank-class ] } + { CHAR: S [ java-blank-class ] } + { CHAR: d [ digit-class ] } + { CHAR: D [ digit-class ] } { CHAR: p [ parse-posix-class ] } { CHAR: P [ parse-posix-class ] } @@ -313,13 +315,19 @@ ERROR: bad-escaped-literals seq ; { CHAR: 0 [ parse-octal ] } { CHAR: c [ parse-control-character ] } - ! { CHAR: b [ handle-word-boundary ] } - ! { CHAR: B [ handle-word-boundary ] } - ! { CHAR: A [ handle-beginning-of-input ] } - ! { CHAR: G [ end of previous match ] } - ! { CHAR: Z [ handle-end-of-input ] } - ! { CHAR: z [ handle-end-of-input ] } ! except for terminator + { CHAR: Q [ parse-escaped-literals ] } + ! { CHAR: b [ word-boundary-class ] } + ! { CHAR: B [ word-boundary-class ] } + ! { CHAR: A [ handle-beginning-of-input ] } + ! { CHAR: z [ handle-end-of-input ] } + + ! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator + + ! m//g mode + ! { CHAR: G [ end of previous match ] } + + ! Group capture ! { CHAR: 1 [ CHAR: 1 ] } ! { CHAR: 2 [ CHAR: 2 ] } ! { CHAR: 3 [ CHAR: 3 ] } @@ -330,7 +338,11 @@ ERROR: bad-escaped-literals seq ; ! { CHAR: 8 [ CHAR: 8 ] } ! { CHAR: 9 [ CHAR: 9 ] } - { CHAR: Q [ parse-escaped-literals ] } + ! Perl extensions + ! can't do \l and \u because \u is already a 4-hex + { CHAR: L [ lower-case-literals ] } + { CHAR: U [ upper-case-literals ] } + [ ] } case ; diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index 291287f8c2..e4bab990a4 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -45,6 +45,7 @@ IN: regexp-tests ! Off by default. [ f ] [ "\n" "." matches? ] unit-test [ t ] [ "\n" "(?s)." matches? ] unit-test +[ t ] [ "\n" R/ ./s matches? ] unit-test [ f ] [ "\n\n" "(?s).(?-s)." matches? ] unit-test [ f ] [ "" ".+" matches? ] unit-test @@ -210,34 +211,34 @@ IN: regexp-tests [ 3 ] [ "aaacb" "a*" match-head ] unit-test [ 2 ] [ "aaacb" "aa?" match-head ] unit-test -[ t ] [ "aaa" "AAA" matches? ] unit-test -[ f ] [ "aax" "AAA" matches? ] unit-test -[ t ] [ "aaa" "A*" matches? ] unit-test -[ f ] [ "aaba" "A*" matches? ] unit-test -[ t ] [ "b" "[AB]" matches? ] unit-test -[ f ] [ "c" "[AB]" matches? ] unit-test -[ t ] [ "c" "[A-Z]" matches? ] unit-test -[ f ] [ "3" "[A-Z]" matches? ] unit-test +[ t ] [ "aaa" R/ AAA/i matches? ] unit-test +[ f ] [ "aax" R/ AAA/i matches? ] unit-test +[ t ] [ "aaa" R/ A*/i matches? ] unit-test +[ f ] [ "aaba" R/ A*/i matches? ] unit-test +[ t ] [ "b" R/ [AB]/i matches? ] unit-test +[ f ] [ "c" R/ [AB]/i matches? ] unit-test +[ t ] [ "c" R/ [A-Z]/i matches? ] unit-test +[ f ] [ "3" R/ [A-Z]/i matches? ] unit-test [ t ] [ "a" "(?i)a" matches? ] unit-test [ t ] [ "a" "(?i)a" matches? ] unit-test [ t ] [ "A" "(?i)a" matches? ] unit-test [ t ] [ "A" "(?i)a" matches? ] unit-test -[ t ] [ "a" "(?-i)a" matches? ] unit-test -[ t ] [ "a" "(?-i)a" matches? ] unit-test -[ f ] [ "A" "(?-i)a" matches? ] unit-test -[ f ] [ "A" "(?-i)a" matches? ] unit-test +[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test +[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test +[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test +[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test [ f ] [ "A" "[a-z]" matches? ] unit-test -[ t ] [ "A" "[a-z]" matches? ] unit-test +[ t ] [ "A" R/ [a-z]/i matches? ] unit-test [ f ] [ "A" "\\p{Lower}" matches? ] unit-test -[ t ] [ "A" "\\p{Lower}" matches? ] unit-test +[ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test -[ t ] [ "abc" "abc" matches? ] unit-test -[ t ] [ "abc" "a[bB][cC]" matches? ] unit-test -[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" matches? ] unit-test +[ t ] [ "abc" R/ abc/r matches? ] unit-test +[ t ] [ "abc" R/ a[bB][cC]/r matches? ] unit-test +[ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/r matches? ] unit-test [ t ] [ "s@f" "[a-z.-]@[a-z]" matches? ] unit-test [ f ] [ "a" "[a-z.-]@[a-z]" matches? ] unit-test @@ -293,6 +294,9 @@ IN: regexp-tests [ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test [ f ] [ "ab" "a(?!b)" first-match ] unit-test +[ "a" ] [ "ac" "a(?!b)" first-match >string ] unit-test +[ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test +[ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test [ "a" ] [ "ab" "a(?=b)(?=b)" first-match >string ] unit-test [ "a" ] [ "ba" "a(?<=b)(?<=b)" first-match >string ] unit-test [ "a" ] [ "cab" "a(?=b)(?<=c)" first-match >string ] unit-test @@ -305,19 +309,65 @@ IN: regexp-tests ! [ "{Lower}" ] [ invalid-range? ] must-fail-with -! [ 1 ] [ "aaacb" "a+?" match-head ] unit-test -! [ 1 ] [ "aaacb" "aa??" match-head ] unit-test -! [ f ] [ "aaaab" "a++ab" matches? ] unit-test -! [ t ] [ "aaacb" "a++cb" matches? ] unit-test -! [ 3 ] [ "aacb" "aa?c" match-head ] unit-test -! [ 3 ] [ "aacb" "aa??c" match-head ] unit-test +[ t ] [ "a" R/ ^a/ matches? ] unit-test +[ f ] [ "\na" R/ ^a/ matches? ] unit-test +[ f ] [ "\r\na" R/ ^a/ matches? ] unit-test +[ f ] [ "\ra" R/ ^a/ matches? ] unit-test -! [ t ] [ "fxxbar" "(?!foo).{3}bar" matches? ] unit-test -! [ f ] [ "foobar" "(?!foo).{3}bar" matches? ] unit-test +[ t ] [ "a" R/ a$/ matches? ] unit-test +[ f ] [ "a\n" R/ a$/ matches? ] unit-test +[ f ] [ "a\r" R/ a$/ matches? ] unit-test +[ f ] [ "a\r\n" R/ a$/ matches? ] unit-test + +! [ t ] [ "a" R/ \Aa/ matches? ] unit-test +! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test +! [ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test +! [ f ] [ "\ra" R/ \Aa/ matches? ] unit-test + +! [ t ] [ "a" R/ \Aa/m matches? ] unit-test +! [ f ] [ "\na" R/ \Aaa/m matches? ] unit-test +! [ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test +! [ f ] [ "\ra" R/ \Aa/m matches? ] unit-test + +! [ t ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test + +! [ t ] [ "a" R/ \Aa\z/m matches? ] unit-test +! [ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test + +! [ t ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test +! [ t ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test + +! [ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test +! [ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test +! [ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test +! [ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test + +! [ t ] [ "a" R/ ^a/m matches? ] unit-test +! [ t ] [ "\na" R/ ^a/m matches? ] unit-test +! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test +! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test + +! Convert to lowercase until E +[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test +[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test + +! Convert to uppercase until E +[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test +[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test + +! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test +! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test +! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test +! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test ! [ f ] [ "foobxr" "foo\\z" match-head ] unit-test ! [ 3 ] [ "foo" "foo\\z" match-head ] unit-test +! [ t ] [ "foo" "\\bfoo\\b" matches? ] unit-test +! [ t ] [ "afoob" "\\Bfoo\\B" matches? ] unit-test +! [ t ] [ "afoob" "\\bfoo\\b" matches? ] unit-test +! [ f ] [ "foo" "\\Bfoo\\B" matches? ] unit-test + ! [ 3 ] [ "foo bar" "foo\\b" match-head ] unit-test ! [ f ] [ "fooxbar" "foo\\b" matches? ] unit-test ! [ t ] [ "foo" "foo\\b" matches? ] unit-test @@ -332,34 +382,29 @@ IN: regexp-tests ! [ t ] [ "fooxbar" "foo\\Bxbar" matches? ] unit-test ! [ f ] [ "foo" "foo\\Bbar" matches? ] unit-test -! clear "a(?=b*)" "ab" over match -! clear "a(?=b*c)" "abbbbbc" over match -! clear "a(?=b*)" "ab" over match +! [ 1 ] [ "aaacb" "a+?" match-head ] unit-test +! [ 1 ] [ "aaacb" "aa??" match-head ] unit-test +! [ f ] [ "aaaab" "a++ab" matches? ] unit-test +! [ t ] [ "aaacb" "a++cb" matches? ] unit-test +! [ 3 ] [ "aacb" "aa?c" match-head ] unit-test +! [ 3 ] [ "aacb" "aa??c" match-head ] unit-test -! clear "^a" "a" over match -! clear "^a" "\na" over match -! clear "^a" "\r\na" over match -! clear "^a" "\ra" over match +! "ab" "a(?=b*)" match +! "abbbbbc" "a(?=b*c)" match +! "ab" "a(?=b*)" match -! clear "a$" "a" over match -! clear "a$" "a\n" over match -! clear "a$" "a\r" over match -! clear "a$" "a\r\n" over match +! "baz" "(az)(?<=b)" first-match +! "cbaz" "a(?<=b*)" first-match +! "baz" "a(?<=b)" first-match -! "(az)(?<=b)" "baz" over first-match -! "a(?<=b*)" "cbaz" over first-match -! "a(?<=b)" "baz" over first-match +! "baz" "a(? first-match +! "caz" "a(? first-match -! "a(? "baz" over first-match -! "a(? "caz" over first-match +! "abcdefg" "a(?=bcdefg)bcd" first-match +! "abcdefg" "a(?#bcdefg)bcd" first-match +! "abcdefg" "a(?:bcdefg)" first-match -! "a(?=bcdefg)bcd" "abcdefg" over first-match -! "a(?#bcdefg)bcd" "abcdefg" over first-match -! "a(?:bcdefg)" "abcdefg" over first-match - -[ "a" ] [ "ac" "a(?!b)" first-match >string ] unit-test - -! "a(?<=b)" "caba" over first-match +! "caba" "a(?<=b)" first-match ! capture group 1: "aaaa" 2: "" ! "aaaa" "(a*)(a*)" match* diff --git a/basis/regexp/regexp.factor b/basis/regexp/regexp.factor index 652d943090..e61d5692f4 100644 --- a/basis/regexp/regexp.factor +++ b/basis/regexp/regexp.factor @@ -1,9 +1,10 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors combinators kernel math sequences +USING: accessors combinators kernel math sequences strings sets assocs prettyprint.backend make lexer namespaces parser arrays fry regexp.backend regexp.utils regexp.parser regexp.nfa -regexp.dfa regexp.traversal regexp.transition-tables splitting ; +regexp.dfa regexp.traversal regexp.transition-tables splitting +sorting ; IN: regexp : default-regexp ( string -- regexp ) @@ -75,40 +76,7 @@ IN: regexp : count-matches ( string regexp -- n ) all-matches length ; -: initial-option ( regexp option -- regexp' ) - over options>> conjoin ; - -: ( string -- regexp ) - default-regexp construct-regexp ; - -: ( string -- regexp ) - default-regexp - case-insensitive initial-option - construct-regexp ; - -: ( string -- regexp ) - default-regexp - reversed-regexp initial-option - construct-regexp ; - -: parsing-regexp ( accum end -- accum ) - lexer get dup skip-blank - [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column - lexer get dup still-parsing-line? - [ (parse-token) ] [ drop f ] if - "i" = [ ] [ ] if parsed ; - -: R! CHAR: ! parsing-regexp ; parsing -: R" CHAR: " parsing-regexp ; parsing -: R# CHAR: # parsing-regexp ; parsing -: R' CHAR: ' parsing-regexp ; parsing -: R( CHAR: ) parsing-regexp ; parsing -: R/ CHAR: / parsing-regexp ; parsing -: R@ CHAR: @ parsing-regexp ; parsing -: R[ CHAR: ] parsing-regexp ; parsing -: R` CHAR: ` parsing-regexp ; parsing -: R{ CHAR: } parsing-regexp ; parsing -: R| CHAR: | parsing-regexp ; parsing +> key? ; +ERROR: unknown-regexp-option option ; + +: option>ch ( option -- string ) + { + { case-insensitive [ CHAR: i ] } + { multiline [ CHAR: m ] } + { reversed-regexp [ CHAR: r ] } + { dotall [ CHAR: s ] } + [ unknown-regexp-option ] + } case ; + +: ch>option ( ch -- option ) + { + { CHAR: i [ case-insensitive ] } + { CHAR: m [ multiline ] } + { CHAR: r [ reversed-regexp ] } + { CHAR: s [ dotall ] } + [ unknown-regexp-option ] + } case ; + +: string>options ( string -- options ) + [ ch>option dup ] H{ } map>assoc ; + +: options>string ( options -- string ) + keys [ option>ch ] map natural-sort >string ; + +PRIVATE> + +: ( string option-string -- regexp ) + [ default-regexp ] [ string>options ] bi* >>options + construct-regexp ; + +: ( string -- regexp ) "" ; + + parsed ; + +PRIVATE> + +: R! CHAR: ! parsing-regexp ; parsing +: R" CHAR: " parsing-regexp ; parsing +: R# CHAR: # parsing-regexp ; parsing +: R' CHAR: ' parsing-regexp ; parsing +: R( CHAR: ) parsing-regexp ; parsing +: R/ CHAR: / parsing-regexp ; parsing +: R@ CHAR: @ parsing-regexp ; parsing +: R[ CHAR: ] parsing-regexp ; parsing +: R` CHAR: ` parsing-regexp ; parsing +: R{ CHAR: } parsing-regexp ; parsing +: R| CHAR: | parsing-regexp ; parsing M: regexp pprint* [ [ - dup raw>> - dup find-regexp-syntax swap % swap % % - case-insensitive swap option? [ "i" % ] when + [ raw>> dup find-regexp-syntax swap % swap % % ] + [ options>> options>string % ] bi ] "" make ] keep present-text ; diff --git a/basis/regexp/traversal/traversal.factor b/basis/regexp/traversal/traversal.factor index 86d315ee2f..c880c11c53 100644 --- a/basis/regexp/traversal/traversal.factor +++ b/basis/regexp/traversal/traversal.factor @@ -17,6 +17,7 @@ TUPLE: dfa-traverser capture-group-index last-state current-state text + match-failed? start-index current-index matches ; @@ -37,14 +38,20 @@ TUPLE: dfa-traverser H{ } clone >>captured-groups ; : final-state? ( dfa-traverser -- ? ) - [ current-state>> ] [ dfa-table>> final-states>> ] bi - key? ; + [ current-state>> ] + [ dfa-table>> final-states>> ] bi key? ; + +: beginning-of-text? ( dfa-traverser -- ? ) + current-index>> 0 <= ; inline + +: end-of-text? ( dfa-traverser -- ? ) + [ current-index>> ] [ text>> length ] bi >= ; inline : text-finished? ( dfa-traverser -- ? ) { [ current-state>> empty? ] - [ [ current-index>> ] [ text>> length ] bi >= ] - ! [ current-index>> 0 < ] + [ end-of-text? ] + [ match-failed?>> ] } 1|| ; : save-final-state ( dfa-straverser -- ) @@ -55,8 +62,47 @@ TUPLE: dfa-traverser dup save-final-state ] when text-finished? ; +: previous-text-character ( dfa-traverser -- ch ) + [ text>> ] [ current-index>> 1- ] bi nth ; + +: current-text-character ( dfa-traverser -- ch ) + [ text>> ] [ current-index>> ] bi nth ; + +: next-text-character ( dfa-traverser -- ch ) + [ text>> ] [ current-index>> 1+ ] bi nth ; + GENERIC: flag-action ( dfa-traverser flag -- ) + +M: beginning-of-input flag-action ( dfa-traverser flag -- ) + drop + dup beginning-of-text? [ t >>match-failed? ] unless drop ; + +M: end-of-input flag-action ( dfa-traverser flag -- ) + drop + dup end-of-text? [ t >>match-failed? ] unless drop ; + +M: beginning-of-line flag-action ( dfa-traverser flag -- ) + drop + dup { + [ beginning-of-text? ] + [ previous-text-character terminator-class class-member? ] + } 1|| [ t >>match-failed? ] unless drop ; + +M: end-of-line flag-action ( dfa-traverser flag -- ) + drop + dup { + [ end-of-text? ] + [ next-text-character terminator-class class-member? ] + } 1|| [ t >>match-failed? ] unless drop ; + +M: word-boundary flag-action ( dfa-traverser flag -- ) + drop + dup { + [ end-of-text? ] + [ current-text-character terminator-class class-member? ] + } 1|| [ t >>match-failed? ] unless drop ; + M: lookahead-on flag-action ( dfa-traverser flag -- ) drop lookahead-counters>> 0 swap push ; @@ -110,11 +156,7 @@ M: capture-group-off flag-action ( dfa-traverser flag -- ) [ [ 1+ ] change-current-index ] [ [ 1- ] change-current-index ] if dup current-state>> >>last-state - ] dip - first >>current-state ; - -: match-failed ( dfa-traverser -- dfa-traverser ) - V{ } clone >>matches ; + ] [ first ] bi* >>current-state ; : match-literal ( transition from-state table -- to-state/f ) transitions>> at at ; @@ -131,11 +173,9 @@ M: capture-group-off flag-action ( dfa-traverser flag -- ) { [ match-literal ] [ match-class ] [ match-default ] } 3|| ; : setup-match ( match -- obj state dfa-table ) - { - [ current-index>> ] [ text>> ] - [ current-state>> ] [ dfa-table>> ] - } cleave - [ nth ] 2dip ; + [ [ current-index>> ] [ text>> ] bi nth ] + [ current-state>> ] + [ dfa-table>> ] tri ; : do-match ( dfa-traverser -- dfa-traverser ) dup process-flags