! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays assocs combinators kernel math sequences namespaces locals combinators.lib state-tables math.parser state-parser sets dlists unicode.categories math.order quotations shuffle math.ranges splitting symbols ; IN: regexp4 SYMBOLS: eps start-state final-state beginning-of-text end-of-text left-parenthesis alternation left-bracket caret dash ampersand semicolon ; SYMBOL: runtime-epsilon TUPLE: regexp raw parentheses-count bracket-count state stack nfa new-states dfa minimized-dfa dot-matches-newlines? character-sets capture-group captured-groups ; TUPLE: capture-group n range ; ERROR: parentheses-underflow ; ERROR: unbalanced-parentheses ; ERROR: unbalanced-brackets ; : push-stack ( regexp token -- ) swap stack>> push ; : push-all-stack ( regexp seq -- ) swap stack>> push-all ; : next-state ( regexp -- n ) [ 1+ ] change-state state>> ; : check-parentheses-underflow ( regexp -- ) parentheses-count>> 0 < [ parentheses-underflow ] when ; : check-unbalanced-parentheses ( regexp -- ) parentheses-count>> 0 > [ unbalanced-parentheses ] when ; :: (apply-alternation) ( stack regexp -- ) [let | s2 [ stack peek first ] s3 [ stack pop second ] s0 [ stack peek alternation = [ stack pop* ] when stack peek first ] s1 [ stack pop second ] s4 [ regexp next-state ] s5 [ regexp next-state ] table [ regexp nfa>> ] | s5 table add-row s4 eps s0 table add-entry s4 eps s2 table add-entry s1 eps s5 table add-entry s3 eps s5 table add-entry s1 table final-states>> delete-at s3 table final-states>> delete-at t s5 table final-states>> set-at s4 s5 2array stack push ] ; : apply-alternation ( regexp -- ) [ stack>> ] [ (apply-alternation) ] bi ; : apply-alternation? ( stack -- ? ) dup length dup 3 < [ 2drop f ] [ 2 - swap nth alternation = ] if ; :: (apply-concatenation) ( stack regexp -- ) [let* | s2 [ stack peek first ] s3 [ stack pop second ] s0 [ stack peek first ] s1 [ stack pop second ] table [ regexp nfa>> ] | s1 eps s2 table set-entry s1 table final-states>> delete-at s3 table add-row s0 s3 2array stack push ] ; : apply-concatenation ( regexp -- ) [ stack>> ] [ (apply-concatenation) ] bi ; : apply-concatenation? ( seq -- ? ) dup length dup 2 < [ 2drop f ] [ 2 - swap nth array? ] if ; : apply-loop ( seq regexp -- seq regexp ) over length 1 > [ 2dup over apply-alternation? [ (apply-alternation) ] [ (apply-concatenation) ] if apply-loop ] when ; : cut-stack ( n vector -- vector' vector ) tuck index cut reverse dup pop* ; : apply-til-last ( token regexp -- ) swap [ cut-stack ] change-stack >r reverse r> apply-loop stack>> push-all ; : concatenation-loop ( regexp -- ) dup stack>> dup apply-concatenation? [ over (apply-concatenation) concatenation-loop ] [ 2drop ] if ; :: apply-kleene-closure ( regexp -- ) [let* | stack [ regexp stack>> ] s0 [ stack peek first ] s1 [ stack pop second ] s2 [ regexp next-state ] s3 [ regexp next-state ] table [ regexp nfa>> ] | s1 table final-states>> delete-at t s3 table final-states>> set-at s3 table add-row s1 eps s0 table add-entry s2 eps s0 table add-entry s2 eps s3 table add-entry s1 eps s3 table add-entry s2 s3 2array stack push ] ; : add-numbers ( n obj -- obj ) 2dup [ number? ] bi@ and [ + ] [ dup sequence? [ [ + ] with map ] [ nip ] if ] if ; : increment-columns ( n assoc -- ) dup [ >r swap >r add-numbers r> r> set-at ] curry with* assoc-each ; :: copy-state-rows ( regexp range -- ) [let* | len [ range range-length ] offset [ regexp state>> range range-min - 1+ ] state [ regexp [ len + ] change-state ] | regexp nfa>> rows>> [ drop range member? ] assoc-filter [ [ offset + ] dip [ offset swap add-numbers ] assoc-map ] assoc-map regexp nfa>> [ assoc-union ] change-rows drop range [ range-min ] [ range-max ] bi [ offset + ] bi@ 2array regexp stack>> push ] ; : last-state ( regexp -- range ) stack>> peek first2 [a,b] ; : set-last-state-final ( ? regexp -- ) [ stack>> peek second ] [ nfa>> final-states>> ] bi set-at ; : apply-plus-closure ( regexp -- ) [ dup last-state copy-state-rows ] [ apply-kleene-closure ] [ apply-concatenation ] tri ; : apply-question-closure ( regexp -- ) [ stack>> peek first2 eps swap ] [ nfa>> add-entry ] bi ; : with0 ( obj n quot -- n quot' ) swapd curry ; inline : copy-state ( regexp state n -- ) [ copy-state-rows ] with0 with0 times ; :: (exactly-n) ( regexp state n -- ) regexp state n copy-state t regexp set-last-state-final ; : exactly-n ( regexp n -- ) >r dup last-state r> 1- (exactly-n) ; : exactly-n-concatenated ( regexp state n -- ) [ (exactly-n) ] 3keep nip 1- [ apply-concatenation ] with0 times ; :: at-least-n ( regexp n -- ) [let | state [ regexp stack>> pop first2 [a,b] ] | regexp state n copy-state state regexp stack>> push regexp apply-kleene-closure ] ; : pop-last ( regexp -- range ) stack>> pop first2 [a,b] ; :: at-most-n ( regexp n -- ) [let | state [ regexp pop-last ] | regexp state n [ 1+ exactly-n-concatenated ] with with each regexp n 1- [ apply-alternation ] with0 times regexp apply-question-closure ] ; :: from-m-to-n ( regexp m n -- ) [let | state [ regexp pop-last ] | regexp state m n [a,b] [ exactly-n-concatenated ] with with each regexp n m - [ apply-alternation ] with0 times ] ; : apply-brace-closure ( regexp from/f to/f comma? -- ) [ 2dup and [ from-m-to-n ] [ [ nip at-most-n ] [ at-least-n ] if* ] if ] [ drop exactly-n ] if ; :: make-nontoken-nfa ( regexp obj -- ) [let | s0 [ regexp next-state ] s1 [ regexp next-state ] stack [ regexp stack>> ] table [ regexp nfa>> ] | s0 obj s1 table set-entry s1 table add-row t s1 table final-states>> set-at s0 s1 2array stack push ] ; : set-start-state ( regexp -- ) dup stack>> dup empty? [ 2drop ] [ [ nfa>> ] [ pop first ] bi* >>start-state drop ] if ; : ascii? ( n -- ? ) 0 HEX: 7f between? ; : octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ; : decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ; : hex-digit? ( n -- ? ) dup decimal-digit? over CHAR: a CHAR: f between? or swap CHAR: A CHAR: F between? or ; : control-char? ( n -- ? ) dup 0 HEX: 1f between? swap HEX: 7f = or ; : punct? ( n -- ? ) "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ; : c-identifier-char? ( ch -- ? ) dup alpha? swap CHAR: _ = or ; : java-blank? ( n -- ? ) { CHAR: \s CHAR: \t CHAR: \n HEX: b HEX: 7 CHAR: \r } member? ; : java-printable? ( n -- ? ) dup alpha? swap punct? or ; ERROR: bad-character-class obj ; : parse-posix-class ( -- quot ) next CHAR: { expect [ get-char CHAR: } = ] take-until { { "Lower" [ [ letter? ] ] } { "Upper" [ [ LETTER? ] ] } { "ASCII" [ [ ascii? ] ] } { "Alpha" [ [ Letter? ] ] } { "Digit" [ [ digit? ] ] } { "Alnum" [ [ alpha? ] ] } { "Punct" [ [ punct? ] ] } { "Graph" [ [ java-printable? ] ] } { "Print" [ [ java-printable? ] ] } { "Blank" [ [ " \t" member? ] ] } { "Cntrl" [ [ control-char? ] ] } { "XDigit" [ [ hex-digit? ] ] } { "Space" [ [ java-blank? ] ] } ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss [ bad-character-class ] } case ; ERROR: bad-octal number ; : parse-octal ( regexp -- ) next get-char drop 3 take oct> dup 255 > [ bad-octal ] when make-nontoken-nfa ; ERROR: bad-hex number ; : parse-short-hex ( regexp -- ) next 2 take hex> dup number? [ bad-hex ] unless make-nontoken-nfa ; : parse-long-hex ( regexp -- ) next 4 take hex> dup number? [ bad-hex ] unless make-nontoken-nfa ; : parse-control-character ( regexp -- ) next get-char make-nontoken-nfa ; : parse-backreference ( regexp obj -- ) 2drop ; : dot-construction ( regexp -- ) [ CHAR: \n = not ] make-nontoken-nfa ; : front-anchor-construction ( regexp -- ) drop ; : back-anchor-construction ( regexp -- ) drop ; : parse-brace ( -- from/f to/f comma? ) next [ get-char CHAR: } = ] take-until "," split1 [ [ string>number ] bi@ ] keep >boolean ; : parse-escaped ( regexp -- ) next get-char { { CHAR: \ [ CHAR: \ make-nontoken-nfa ] } { CHAR: t [ CHAR: \t make-nontoken-nfa ] } { CHAR: n [ CHAR: \n make-nontoken-nfa ] } { CHAR: r [ CHAR: \r make-nontoken-nfa ] } { CHAR: f [ HEX: c make-nontoken-nfa ] } { CHAR: a [ HEX: 7 make-nontoken-nfa ] } { CHAR: e [ HEX: 1b make-nontoken-nfa ] } { CHAR: d [ [ digit? ] make-nontoken-nfa ] } { CHAR: D [ [ digit? not ] make-nontoken-nfa ] } { CHAR: s [ [ java-blank? ] make-nontoken-nfa ] } { CHAR: S [ [ java-blank? not ] make-nontoken-nfa ] } { CHAR: w [ [ c-identifier-char? ] make-nontoken-nfa ] } { CHAR: W [ [ c-identifier-char? not ] make-nontoken-nfa ] } { CHAR: p [ parse-posix-class make-nontoken-nfa ] } { CHAR: P [ parse-posix-class [ not ] compose make-nontoken-nfa ] } { CHAR: x [ parse-short-hex ] } { CHAR: u [ parse-long-hex ] } { CHAR: 0 [ parse-octal ] } { CHAR: c [ parse-control-character ] } ! { CHAR: Q [ quot til \E ] } ! { CHAR: E [ should be an error, parse this in the Q if exists ] } ! { CHAR: b [ ] } ! a word boundary ! { CHAR: B [ ] } ! a non-word boundary ! { CHAR: A [ ] } ! beginning of input ! { CHAR: G [ ] } ! end of previous match ! { CHAR: Z [ ] } ! end of input but for the final terminator, if any ! { CHAR: z [ ] } ! end of the input [ dup digit? [ parse-backreference ] [ make-nontoken-nfa ] if ] } case ; : make-character-set ( regexp -- ) stack>> throw ; : (parse-character-set) ( regexp -- ) [ next get-char { { CHAR: [ [ [ 1+ ] change-bracket-count left-bracket push-stack ] } { CHAR: ] [ [ 1- ] change-bracket-count left-bracket over stack>> cut-stack ] } { CHAR: - [ dash push-stack ] } { CHAR: & [ ampersand push-stack ] } { CHAR: : [ semicolon push-stack ] } { CHAR: \ [ parse-escaped ] } { f [ unbalanced-brackets ] } [ make-nontoken-nfa ] } case ] [ dup bracket-count>> 0 > [ (parse-character-set) ] [ make-character-set ] if ] bi ; : parse-character-set-first ( regexp -- ) get-next { { CHAR: ^ [ caret push-stack next ] } { CHAR: ] [ CHAR: ] make-nontoken-nfa next ] } [ 2drop ] } case ; : parse-character-set ( regexp -- ) [ parse-character-set-first ] [ (parse-character-set) ] bi ; ERROR: unsupported-token token ; : parse-token ( regexp token -- ) dup { { CHAR: ^ [ drop front-anchor-construction ] } { CHAR: $ [ drop back-anchor-construction ] } { CHAR: \ [ drop parse-escaped ] } { CHAR: | [ drop dup concatenation-loop alternation push-stack ] } { CHAR: ( [ drop [ 1+ ] change-parentheses-count left-parenthesis push-stack ] } { CHAR: ) [ drop [ 1- ] change-parentheses-count left-parenthesis apply-til-last ] } { CHAR: * [ drop apply-kleene-closure ] } { CHAR: + [ drop apply-plus-closure ] } { CHAR: ? [ drop apply-question-closure ] } { CHAR: { [ drop parse-brace apply-brace-closure ] } { CHAR: [ [ drop dup left-bracket push-stack [ 1+ ] change-bracket-count parse-character-set ] } ! { CHAR: } [ drop drop "brace" ] } ! { CHAR: ? [ drop ] } { CHAR: . [ drop dot-construction ] } { beginning-of-text [ push-stack ] } { end-of-text [ drop { [ check-unbalanced-parentheses ] [ concatenation-loop ] [ beginning-of-text apply-til-last ] [ set-start-state ] } cleave ] } [ drop make-nontoken-nfa ] } case ; : (parse-raw-regexp) ( regexp -- ) get-char [ dupd parse-token next (parse-raw-regexp) ] [ drop ] if* ; : parse-raw-regexp ( regexp -- ) [ beginning-of-text parse-token ] [ dup raw>> dup empty? [ 2drop ] [ [ (parse-raw-regexp) ] string-parse ] if ] [ end-of-text parse-token ] tri ; :: find-delta ( states obj table -- keys ) obj states [ table get-row at [ dup integer? [ 1array ] when unique ] [ H{ } ] if* ] with map H{ } clone [ assoc-union ] reduce keys ; :: (find-closure) ( states obj assoc table -- keys ) [let | size [ assoc assoc-size ] | assoc states unique assoc-union dup assoc-size size > [ obj states [ table get-row at* [ dup integer? [ 1array ] when obj rot table (find-closure) ] [ drop ] if ] with each ] when ] ; : find-closure ( states obj table -- states ) >r H{ } r> (find-closure) keys ; : find-epsilon-closure ( states table -- states ) >r eps H{ } r> (find-closure) keys ; : filter-special-transition ( vec -- vec' ) [ drop eps = not ] assoc-filter ; : initialize-subset-construction ( regexp -- ) >>dfa [ nfa>> [ start-state>> 1array ] keep find-epsilon-closure 1dlist ] [ swap >>new-states drop ] [ [ dfa>> ] [ nfa>> ] bi columns>> filter-special-transition >>columns drop ] tri ; :: (subset-construction) ( regexp -- ) [let* | nfa [ regexp nfa>> ] dfa [ regexp dfa>> ] new-states [ regexp new-states>> ] columns [ dfa columns>> keys ] | new-states dlist-empty? [ new-states pop-front dup dfa add-row columns [ 2dup nfa [ find-delta ] [ find-epsilon-closure ] bi dup [ dfa rows>> key? ] [ empty? ] bi or [ dup new-states push-back ] unless dup empty? [ 3drop ] [ dfa set-entry ] if ] with each regexp (subset-construction) ] unless ] ; : set-start/final-states ( regexp -- ) dup [ nfa>> start-state>> ] [ dfa>> rows>> keys [ member? ] with filter first ] bi >r dup dfa>> r> >>start-state drop dup [ nfa>> final-states>> ] [ dfa>> rows>> ] bi [ keys ] bi@ [ intersect empty? not ] with filter >r dfa>> r> >>final-states drop ; : subset-construction ( regexp -- ) [ initialize-subset-construction ] [ (subset-construction) ] [ set-start/final-states ] tri ; : ( raw -- obj ) regexp new swap >>raw 0 >>parentheses-count 0 >>bracket-count -1 >>state V{ } clone >>stack V{ } clone >>character-sets >>nfa dup [ parse-raw-regexp ] [ subset-construction ] bi ; TUPLE: dfa-traverser dfa last-state current-state text start-index current-index matches ; : ( text dfa -- match ) dfa>> dfa-traverser new swap [ start-state>> >>current-state ] keep >>dfa swap >>text 0 >>start-index 0 >>current-index V{ } clone >>matches ; : final-state? ( dfa-traverser -- ? ) [ current-state>> ] [ dfa>> final-states>> ] bi member? ; : text-finished? ( dfa-traverser -- ? ) [ current-index>> ] [ text>> length ] bi >= ; : save-final-state ( dfa-straverser -- ) [ current-index>> ] [ matches>> ] bi push ; : match-done? ( dfa-traverser -- ? ) dup final-state? [ dup save-final-state ] when text-finished? ; : increment-state ( dfa-traverser state -- dfa-traverser ) >r [ 1+ ] change-current-index dup current-state>> >>last-state r> >>current-state ; : match-transition ( obj hash -- state/f ) 2dup keys [ callable? ] filter predicates [ swap at nip ] [ at ] if* ; : do-match ( dfa-traverser -- dfa-traverser ) dup match-done? [ dup { [ current-index>> ] [ text>> ] [ current-state>> ] [ dfa>> rows>> ] } cleave at >r nth r> match-transition [ increment-state do-match ] when* ] unless ; : return-match ( dfa-traverser -- interval/f ) dup matches>> empty? [ drop f ] [ [ start-index>> ] [ matches>> peek ] bi 1 ] if ; : match ( string regexp -- pair ) do-match return-match ; : matches? ( string regexp -- ? ) dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ; ! character classes ! TUPLE: range-class from to ; ! TUPLE: or-class left right ; ! (?:a|b)* <- does not capture ! (a|b)*\1 <- group captured ! (?!abba) negative lookahead matches ababa but not abbaa ! (?=abba) positive lookahead matches abbaaa but not abaaa