add regular expressions library prematurely so i can work on another computer
							parent
							
								
									98dc245420
								
							
						
					
					
						commit
						fda6f77d4e
					
				| 
						 | 
				
			
			@ -0,0 +1,138 @@
 | 
			
		|||
USING: regexp4 tools.test kernel ;
 | 
			
		||||
IN: regexp4-tests
 | 
			
		||||
 | 
			
		||||
[ f ] [ "b" "a*" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "" "a*" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a*" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aaaaaaa" "a*"  <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "ab" "a*" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "abc" "abc" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a|b|c" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "b" "a|b|c" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "c" "a|b|c" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "c" "d|e|f" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "aa" "a|b|c" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "bb" "a|b|c" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "cc" "a|b|c" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "cc" "d|e|f" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "a+" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a+" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aa" "a+" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "" "a?" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a?" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aa" "a?" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "." <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "." <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "." "." <regexp> matches? ] unit-test
 | 
			
		||||
! [ f ] [ "\n" "." <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" ".+" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" ".+" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "ab" ".+" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
[ t ] [ "" "a|b*|c+|d?" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a|b*|c+|d?" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "c" "a|b*|c+|d?" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "cc" "a|b*|c+|d?" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "ccd" "a|b*|c+|d?" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "d" "a|b*|c+|d?" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "foo" "foo|bar" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "bar" "foo|bar" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "foobar" "foo|bar" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "(a)" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "(a)" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aa" "(a)" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aa" "(a*)" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "aababaaabbac" "(a|b)+" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "ababaaabba" "(a|b)+" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "a{1}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a{1}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aa" "a{1}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "a" "a{2,}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aaa" "a{2,}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aaaa" "a{2,}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aaaaa" "a{2,}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "" "a{,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a{,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aa" "a{,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aaa" "a{,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aaaa" "a{,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aaaaa" "a{,2}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "a{1,3}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a{1,3}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aa" "a{1,3}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aaa" "a{1,3}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aaaa" "a{1,3}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "[a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[abc]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "b" "[a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "d" "[abc]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "ab" "[abc]{1,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "abc" "[abc]{1,2}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "[^a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "a" "[^a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "a" "[^abc]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "b" "[^a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "d" "[^abc]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "ab" "[^abc]{1,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "abc" "[^abc]{1,2}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "]" "[]]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "]" "[^]]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
! [ "^" "[^]" <regexp> matches? ] must-fail
 | 
			
		||||
[ t ] [ "^" "[]^]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "]" "[]^]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "[" "[[]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "^" "[^^]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[^^]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "-" "[-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "a" "[-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "-" "[^-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[^-]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "-" "[-a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[-a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "-" "[a-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[a-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "b" "[a-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "-" "[^-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[^-]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "-" "[a-c]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "-" "[^a-c]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "b" "[a-c]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "b" "[^a-c]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "-" "[a-c-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "-" "[^a-c-]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "\\" "[\\\\]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "a" "[\\\\]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "\\" "[^\\\\]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[^\\\\]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
! ((A)(B(C)))
 | 
			
		||||
! 1.  ((A)(B(C)))
 | 
			
		||||
! 2. (A)
 | 
			
		||||
! 3. (B(C))
 | 
			
		||||
! 4. (C) 
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,547 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors arrays assocs combinators kernel math
 | 
			
		||||
sequences namespaces locals combinators.lib state-tables
 | 
			
		||||
math.parser state-parser sets dlists unicode.categories
 | 
			
		||||
math.order quotations shuffle math.ranges splitting
 | 
			
		||||
symbols ;
 | 
			
		||||
IN: regexp4
 | 
			
		||||
 | 
			
		||||
SYMBOLS: eps start-state final-state beginning-of-text
 | 
			
		||||
end-of-text left-paren right-paren alternation ;
 | 
			
		||||
 | 
			
		||||
SYMBOL: runtim-epsilon
 | 
			
		||||
 | 
			
		||||
TUPLE: regexp raw paren-count bracket-count
 | 
			
		||||
state stack nfa new-states dfa minimized-dfa
 | 
			
		||||
dot-matches-newlines? character-sets capture-group
 | 
			
		||||
captured-groups ;
 | 
			
		||||
 | 
			
		||||
TUPLE: capture-group n range ;
 | 
			
		||||
 | 
			
		||||
ERROR: paren-underflow ;
 | 
			
		||||
ERROR: unbalanced-paren ;
 | 
			
		||||
 | 
			
		||||
: push-stack ( regexp token -- ) swap stack>> push ;
 | 
			
		||||
: push-all-stack ( regexp seq -- ) swap stack>> push-all ;
 | 
			
		||||
: next-state ( regexp -- n ) [ 1+ ] change-state state>> ;
 | 
			
		||||
 | 
			
		||||
: check-paren-underflow ( regexp -- )
 | 
			
		||||
    paren-count>> 0 < [ paren-underflow ] when ;
 | 
			
		||||
 | 
			
		||||
: check-unbalanced-paren ( regexp -- )
 | 
			
		||||
    paren-count>> 0 > [ unbalanced-paren ] when ;
 | 
			
		||||
 | 
			
		||||
:: (apply-alternation) ( stack regexp -- )
 | 
			
		||||
    [let | s2 [ stack peek first ]
 | 
			
		||||
           s3 [ stack pop second ]
 | 
			
		||||
           s0 [ stack peek alternation = [ stack pop* ] when stack peek first ]
 | 
			
		||||
           s1 [ stack pop second ]
 | 
			
		||||
           s4 [ regexp next-state ]
 | 
			
		||||
           s5 [ regexp next-state ]
 | 
			
		||||
           table [ regexp nfa>> ] |
 | 
			
		||||
        s5 table add-row
 | 
			
		||||
        s4 eps s0 <entry> table add-entry
 | 
			
		||||
        s4 eps s2 <entry> table add-entry
 | 
			
		||||
        s1 eps s5 <entry> table add-entry
 | 
			
		||||
        s3 eps s5 <entry> table add-entry
 | 
			
		||||
        s1 table final-states>> delete-at
 | 
			
		||||
        s3 table final-states>> delete-at
 | 
			
		||||
        t s5 table final-states>> set-at
 | 
			
		||||
        s4 s5 2array stack push ] ;
 | 
			
		||||
 | 
			
		||||
: apply-alternation ( regexp -- )
 | 
			
		||||
    [ stack>> ] [ (apply-alternation) ] bi ;
 | 
			
		||||
 | 
			
		||||
: apply-alternation? ( stack -- ? )
 | 
			
		||||
    dup length dup 3 <
 | 
			
		||||
    [ 2drop f ] [ 2 - swap nth alternation = ] if ;
 | 
			
		||||
 | 
			
		||||
:: (apply-concatenation) ( stack regexp -- )
 | 
			
		||||
    [let* |
 | 
			
		||||
            s2 [ stack peek first ]
 | 
			
		||||
            s3 [ stack pop second ]
 | 
			
		||||
            s0 [ stack peek first ]
 | 
			
		||||
            s1 [ stack pop second ]
 | 
			
		||||
            table [ regexp nfa>> ] |
 | 
			
		||||
        s1 eps s2 <entry> table set-entry
 | 
			
		||||
        s1 table final-states>> delete-at
 | 
			
		||||
        s3 table add-row
 | 
			
		||||
        s0 s3 2array stack push ] ;
 | 
			
		||||
 | 
			
		||||
: apply-concatenation ( regexp -- )
 | 
			
		||||
    [ stack>> ] [ (apply-concatenation) ] bi ;
 | 
			
		||||
 | 
			
		||||
: apply-concatenation? ( seq -- ? )
 | 
			
		||||
    dup length dup 2 <
 | 
			
		||||
    [ 2drop f ] [ 2 - swap nth array? ] if ;
 | 
			
		||||
 | 
			
		||||
: apply-loop ( seq regexp -- seq regexp )
 | 
			
		||||
    over length 1 > [
 | 
			
		||||
        2dup over apply-alternation?
 | 
			
		||||
        [ (apply-alternation) ] [ (apply-concatenation) ] if apply-loop
 | 
			
		||||
    ] when ;
 | 
			
		||||
 | 
			
		||||
: apply-til-last ( token regexp -- )
 | 
			
		||||
    swap [
 | 
			
		||||
        <reversed> tuck index cut reverse dup pop*
 | 
			
		||||
    ] change-stack >r reverse r> apply-loop stack>> push-all ;
 | 
			
		||||
 | 
			
		||||
: concatenation-loop ( regexp -- )
 | 
			
		||||
    dup stack>> dup apply-concatenation?
 | 
			
		||||
    [ over (apply-concatenation) concatenation-loop ] [ 2drop ] if ;
 | 
			
		||||
 | 
			
		||||
:: apply-kleene-closure ( regexp -- )
 | 
			
		||||
    [let* | stack [ regexp stack>> ]
 | 
			
		||||
            s0 [ stack peek first ]
 | 
			
		||||
            s1 [ stack pop second ]
 | 
			
		||||
            s2 [ regexp next-state ]
 | 
			
		||||
            s3 [ regexp next-state ]
 | 
			
		||||
            table [ regexp nfa>> ] |
 | 
			
		||||
        s1 table final-states>> delete-at
 | 
			
		||||
        t s3 table final-states>> set-at
 | 
			
		||||
        s3 table add-row
 | 
			
		||||
        s1 eps s0 <entry> table add-entry
 | 
			
		||||
        s2 eps s0 <entry> table add-entry
 | 
			
		||||
        s2 eps s3 <entry> table add-entry
 | 
			
		||||
        s1 eps s3 <entry> table add-entry
 | 
			
		||||
        s2 s3 2array stack push ] ;
 | 
			
		||||
 | 
			
		||||
: add-numbers ( n obj -- obj )
 | 
			
		||||
    2dup [ number? ] bi@ and
 | 
			
		||||
    [ + ] [ dup sequence? [ [ + ] with map ] [ nip ] if ] if ;
 | 
			
		||||
 | 
			
		||||
: increment-columns ( n assoc -- )
 | 
			
		||||
    dup [ >r swap >r add-numbers r> r> set-at ] curry with* assoc-each ;
 | 
			
		||||
 | 
			
		||||
:: copy-state-rows ( regexp range -- )
 | 
			
		||||
    [let* | len [ range range-length ]
 | 
			
		||||
            offset [ regexp state>> range range-min - 1+ ]
 | 
			
		||||
            state [ regexp [ len + ] change-state ] |
 | 
			
		||||
        regexp nfa>> rows>>
 | 
			
		||||
        [ drop range member? ] assoc-filter
 | 
			
		||||
        [
 | 
			
		||||
            [ offset + ] dip
 | 
			
		||||
            [ offset swap add-numbers ] assoc-map
 | 
			
		||||
        ] assoc-map
 | 
			
		||||
        regexp nfa>> [ assoc-union ] change-rows drop
 | 
			
		||||
        range [ range-min ] [ range-max ] bi [ offset + ] bi@ 2array
 | 
			
		||||
        regexp stack>> push ] ;
 | 
			
		||||
 | 
			
		||||
: last-state ( regexp -- range )
 | 
			
		||||
    stack>> peek first2 [a,b] ;
 | 
			
		||||
 | 
			
		||||
: set-last-state-final ( ? regexp -- )
 | 
			
		||||
    [ stack>> peek second ] [ nfa>> final-states>> ] bi set-at ;
 | 
			
		||||
 | 
			
		||||
: apply-plus-closure ( regexp -- )
 | 
			
		||||
    [ dup last-state copy-state-rows ]
 | 
			
		||||
    [ apply-kleene-closure ]
 | 
			
		||||
    [ apply-concatenation ] tri ;
 | 
			
		||||
 | 
			
		||||
: apply-question-closure ( regexp -- )
 | 
			
		||||
    [ stack>> peek first2 eps swap <entry> ] [ nfa>> add-entry ] bi ;
 | 
			
		||||
 | 
			
		||||
: with0 ( obj n quot -- n quot' ) swapd curry ; inline
 | 
			
		||||
 | 
			
		||||
: copy-state ( regexp state n -- )
 | 
			
		||||
    [ copy-state-rows ] with0 with0 times ;
 | 
			
		||||
 | 
			
		||||
:: (exactly-n) ( regexp state n -- )
 | 
			
		||||
    regexp state n copy-state
 | 
			
		||||
    t regexp set-last-state-final ;
 | 
			
		||||
 | 
			
		||||
: exactly-n ( regexp n -- )
 | 
			
		||||
    >r dup last-state r> 1- (exactly-n) ;
 | 
			
		||||
 | 
			
		||||
: exactly-n-concatenated ( regexp state n -- )
 | 
			
		||||
    [ (exactly-n) ] 3keep
 | 
			
		||||
    nip 1- [ apply-concatenation ] with0 times ;
 | 
			
		||||
 | 
			
		||||
:: at-least-n ( regexp n -- )
 | 
			
		||||
    [let | state [ regexp stack>> pop first2 [a,b] ] |
 | 
			
		||||
        regexp state n copy-state
 | 
			
		||||
        state regexp stack>> push
 | 
			
		||||
        regexp apply-kleene-closure ] ; 
 | 
			
		||||
 | 
			
		||||
: pop-last ( regexp -- range )
 | 
			
		||||
    stack>> pop first2 [a,b] ;
 | 
			
		||||
 | 
			
		||||
:: at-most-n ( regexp n -- )
 | 
			
		||||
    [let | state [ regexp pop-last ] |
 | 
			
		||||
        regexp state n [ 1+ exactly-n-concatenated ] with with each
 | 
			
		||||
        regexp n 1- [ apply-alternation ] with0 times
 | 
			
		||||
        regexp apply-question-closure ] ;
 | 
			
		||||
 | 
			
		||||
:: from-m-to-n ( regexp m n -- )
 | 
			
		||||
    [let | state [ regexp pop-last ] |
 | 
			
		||||
        regexp state
 | 
			
		||||
        m n [a,b] [ exactly-n-concatenated ] with with each
 | 
			
		||||
        regexp n m - [ apply-alternation ] with0 times ] ;
 | 
			
		||||
 | 
			
		||||
: apply-brace-closure ( regexp from/f to/f comma? -- )
 | 
			
		||||
    [
 | 
			
		||||
        2dup and
 | 
			
		||||
        [ from-m-to-n ]
 | 
			
		||||
        [ [ nip at-most-n ] [ at-least-n ] if* ] if
 | 
			
		||||
    ] [ drop exactly-n ] if ;
 | 
			
		||||
 | 
			
		||||
:: make-nontoken-nfa ( regexp obj -- )
 | 
			
		||||
    [let | s0 [ regexp next-state ]
 | 
			
		||||
           s1 [ regexp next-state ]
 | 
			
		||||
           stack [ regexp stack>> ]
 | 
			
		||||
           table [ regexp nfa>> ] |
 | 
			
		||||
        s0 obj s1 <entry> table set-entry
 | 
			
		||||
        s1 table add-row
 | 
			
		||||
        t s1 table final-states>> set-at
 | 
			
		||||
        s0 s1 2array stack push ] ;
 | 
			
		||||
 | 
			
		||||
: set-start-state ( regexp -- )
 | 
			
		||||
    dup stack>> dup empty? [
 | 
			
		||||
        2drop
 | 
			
		||||
    ] [
 | 
			
		||||
        [ nfa>> ] [ pop first ] bi* >>start-state drop
 | 
			
		||||
    ] if ;
 | 
			
		||||
 | 
			
		||||
: ascii? ( n -- ? ) 0 HEX: 7f between? ;
 | 
			
		||||
: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ;
 | 
			
		||||
: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
 | 
			
		||||
 | 
			
		||||
: hex-digit? ( n -- ? )
 | 
			
		||||
    dup decimal-digit? 
 | 
			
		||||
    over CHAR: a CHAR: f between? or 
 | 
			
		||||
    swap CHAR: A CHAR: F between? or ;
 | 
			
		||||
 | 
			
		||||
: control-char? ( n -- ? )
 | 
			
		||||
    dup 0 HEX: 1f between? swap HEX: 7f = or ;
 | 
			
		||||
 | 
			
		||||
: punct? ( n -- ? )
 | 
			
		||||
    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
 | 
			
		||||
 | 
			
		||||
: c-identifier-char? ( ch -- ? ) 
 | 
			
		||||
    dup alpha? swap CHAR: _ = or ;
 | 
			
		||||
 | 
			
		||||
: java-blank? ( n -- ? )
 | 
			
		||||
    {   
 | 
			
		||||
        CHAR: \s CHAR: \t CHAR: \n
 | 
			
		||||
        HEX: b HEX: 7 CHAR: \r
 | 
			
		||||
    } member? ;
 | 
			
		||||
 | 
			
		||||
: java-printable? ( n -- ? )
 | 
			
		||||
    dup alpha? swap punct? or ;
 | 
			
		||||
 | 
			
		||||
ERROR: bad-character-class obj ;
 | 
			
		||||
 | 
			
		||||
: parse-posix-class ( -- quot )
 | 
			
		||||
    next
 | 
			
		||||
    CHAR: { expect
 | 
			
		||||
    [ get-char CHAR: } = ] take-until
 | 
			
		||||
    {
 | 
			
		||||
        { "Lower" [ [ letter? ] ] }
 | 
			
		||||
        { "Upper" [ [ LETTER? ] ] }
 | 
			
		||||
        { "ASCII" [ [ ascii? ] ] }
 | 
			
		||||
        { "Alpha" [ [ Letter? ] ] }
 | 
			
		||||
        { "Digit" [ [ digit? ] ] }
 | 
			
		||||
        { "Alnum" [ [ alpha? ] ] }
 | 
			
		||||
        { "Punct" [ [ punct? ] ] }
 | 
			
		||||
        { "Graph" [ [ java-printable? ] ] }
 | 
			
		||||
        { "Print" [ [ java-printable? ] ] }
 | 
			
		||||
        { "Blank" [ [ " \t" member? ] ] }
 | 
			
		||||
        { "Cntrl" [ [ control-char? ] ] }
 | 
			
		||||
        { "XDigit" [ [ hex-digit? ] ] }
 | 
			
		||||
        { "Space" [ [ java-blank? ] ] }
 | 
			
		||||
        ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss
 | 
			
		||||
        [ bad-character-class ]
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
ERROR: bad-octal number ;
 | 
			
		||||
 | 
			
		||||
: parse-octal ( regexp -- )
 | 
			
		||||
    next get-char drop
 | 
			
		||||
    3 take oct>
 | 
			
		||||
    dup 255 > [ bad-octal ] when
 | 
			
		||||
    make-nontoken-nfa ;
 | 
			
		||||
 | 
			
		||||
ERROR: bad-hex number ;
 | 
			
		||||
 | 
			
		||||
: parse-short-hex ( regexp -- )
 | 
			
		||||
    next 2 take hex>
 | 
			
		||||
    dup number? [ bad-hex ] unless
 | 
			
		||||
    make-nontoken-nfa ;
 | 
			
		||||
 | 
			
		||||
: parse-long-hex ( regexp -- )
 | 
			
		||||
    next 4 take hex>
 | 
			
		||||
    dup number? [ bad-hex ] unless
 | 
			
		||||
    make-nontoken-nfa ;
 | 
			
		||||
 | 
			
		||||
: parse-control-character ( regexp -- )
 | 
			
		||||
    next get-char make-nontoken-nfa ;
 | 
			
		||||
 | 
			
		||||
: parse-backreference ( regexp obj -- )
 | 
			
		||||
    2drop ;
 | 
			
		||||
 | 
			
		||||
: dot-construction ( regexp -- )
 | 
			
		||||
    [ CHAR: \n = not ] make-nontoken-nfa ;
 | 
			
		||||
 | 
			
		||||
: front-anchor-construction ( regexp -- )
 | 
			
		||||
    drop ;
 | 
			
		||||
 | 
			
		||||
: back-anchor-construction  ( regexp -- )
 | 
			
		||||
    drop ;
 | 
			
		||||
 | 
			
		||||
: parse-brace ( -- from/f to/f comma? )
 | 
			
		||||
    next
 | 
			
		||||
    [ get-char CHAR: } = ] take-until
 | 
			
		||||
    "," split1 [ [ string>number ] bi@ ] keep >boolean ;
 | 
			
		||||
 | 
			
		||||
: take-until-]
 | 
			
		||||
    [ get-char CHAR: ] = ] take-until ;
 | 
			
		||||
 | 
			
		||||
: make-character-set ( regexp str -- )
 | 
			
		||||
    dup
 | 
			
		||||
    [ length 1 > ] [ first CHAR: ^ = ] bi and
 | 
			
		||||
    [ rest t ] [ f ] if
 | 
			
		||||
    >r [ member? ] curry r>
 | 
			
		||||
    [ [ not ] compose ] when make-nontoken-nfa ;
 | 
			
		||||
 | 
			
		||||
: parse-escaped ( regexp -- )
 | 
			
		||||
    next get-char {
 | 
			
		||||
        { CHAR: \ [ CHAR: \ make-nontoken-nfa ] }
 | 
			
		||||
        { CHAR: t [ CHAR: \t make-nontoken-nfa ] }
 | 
			
		||||
        { CHAR: n [ CHAR: \n make-nontoken-nfa ] }
 | 
			
		||||
        { CHAR: r [ CHAR: \r make-nontoken-nfa ] }
 | 
			
		||||
        { CHAR: f [ HEX: c make-nontoken-nfa ] }
 | 
			
		||||
        { CHAR: a [ HEX: 7 make-nontoken-nfa ] }
 | 
			
		||||
        { CHAR: e [ HEX: 1b make-nontoken-nfa ] }
 | 
			
		||||
 | 
			
		||||
        { CHAR: d [ [ digit? ] make-nontoken-nfa ] }
 | 
			
		||||
        { CHAR: D [ [ digit? not ] make-nontoken-nfa ] }
 | 
			
		||||
        { CHAR: s [ [ java-blank? ] make-nontoken-nfa ] }
 | 
			
		||||
        { CHAR: S [ [ java-blank? not ] make-nontoken-nfa ] }
 | 
			
		||||
        { CHAR: w [ [ c-identifier-char? ] make-nontoken-nfa ] }
 | 
			
		||||
        { CHAR: W [ [ c-identifier-char? not ] make-nontoken-nfa ] }
 | 
			
		||||
 | 
			
		||||
        { CHAR: p [ parse-posix-class make-nontoken-nfa ] }
 | 
			
		||||
        { CHAR: P [ parse-posix-class [ not ] compose make-nontoken-nfa ] }
 | 
			
		||||
        { CHAR: x [ parse-short-hex ] }
 | 
			
		||||
        { CHAR: u [ parse-long-hex ] }
 | 
			
		||||
        { CHAR: 0 [ parse-octal ] }
 | 
			
		||||
        { CHAR: c [ parse-control-character ] }
 | 
			
		||||
 | 
			
		||||
        ! { CHAR: Q [ quot til \E ] }
 | 
			
		||||
        ! { CHAR: E [ should be an error, parse this in the Q if exists ] }
 | 
			
		||||
 | 
			
		||||
        ! { CHAR: b [ ] } ! a word boundary
 | 
			
		||||
        ! { CHAR: B [ ] } ! a non-word boundary
 | 
			
		||||
        ! { CHAR: A [ ] } ! beginning of input
 | 
			
		||||
        ! { CHAR: G [ ] } ! end of previous match
 | 
			
		||||
        ! { CHAR: Z [ ] } ! end of input but for the final terminator, if any
 | 
			
		||||
        ! { CHAR: z [ ] } ! end of the input
 | 
			
		||||
        [ dup digit? [ parse-backreference ] [ make-nontoken-nfa ] if ]
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
ERROR: unsupported-token token ;
 | 
			
		||||
: parse-token ( regexp token -- )
 | 
			
		||||
    dup {
 | 
			
		||||
        { CHAR: ^ [ drop front-anchor-construction ] }
 | 
			
		||||
        { CHAR: $ [ drop back-anchor-construction ] }
 | 
			
		||||
        { CHAR: \ [ drop parse-escaped ] }
 | 
			
		||||
        { CHAR: | [ drop dup concatenation-loop alternation push-stack ] }
 | 
			
		||||
        { CHAR: ( [ drop [ 1+ ] change-paren-count left-paren push-stack ] }
 | 
			
		||||
        { CHAR: ) [ drop [ 1- ] change-paren-count left-paren apply-til-last ] }
 | 
			
		||||
        { CHAR: * [ drop apply-kleene-closure ] }
 | 
			
		||||
        { CHAR: + [ drop apply-plus-closure ] }
 | 
			
		||||
        { CHAR: ? [ drop apply-question-closure ] }
 | 
			
		||||
        { CHAR: { [ drop parse-brace apply-brace-closure ] }
 | 
			
		||||
        ! { CHAR: [ [ drop parse-character-set ] }
 | 
			
		||||
        ! { CHAR: } [ drop drop "brace" ] }
 | 
			
		||||
        ! { CHAR: ? [ drop ] }
 | 
			
		||||
        { CHAR: . [ drop dot-construction ] }
 | 
			
		||||
        { beginning-of-text [ push-stack ] }
 | 
			
		||||
        { end-of-text [
 | 
			
		||||
            drop {
 | 
			
		||||
                [ check-unbalanced-paren ]
 | 
			
		||||
                [ concatenation-loop ]
 | 
			
		||||
                [ beginning-of-text apply-til-last ]
 | 
			
		||||
                [ set-start-state ]
 | 
			
		||||
            } cleave
 | 
			
		||||
        ] }
 | 
			
		||||
        [ drop make-nontoken-nfa ]
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
: (parse-raw-regexp) ( regexp -- )
 | 
			
		||||
    get-char [ dupd parse-token next (parse-raw-regexp) ] [ drop ] if* ;
 | 
			
		||||
 | 
			
		||||
: parse-raw-regexp ( regexp -- )
 | 
			
		||||
    [ beginning-of-text parse-token ]
 | 
			
		||||
    [
 | 
			
		||||
        dup raw>> dup empty? [
 | 
			
		||||
            2drop
 | 
			
		||||
        ] [
 | 
			
		||||
            [ (parse-raw-regexp) ] string-parse
 | 
			
		||||
        ] if
 | 
			
		||||
    ]
 | 
			
		||||
    [ end-of-text parse-token ] tri ;
 | 
			
		||||
 | 
			
		||||
:: find-delta ( states obj table -- keys )
 | 
			
		||||
    obj states [
 | 
			
		||||
        table get-row at
 | 
			
		||||
        [ dup integer? [ 1array ] when unique ] [ H{ } ] if*
 | 
			
		||||
    ] with map H{ } clone [ assoc-union ] reduce keys ;
 | 
			
		||||
 | 
			
		||||
:: (find-closure) ( states obj assoc table -- keys )
 | 
			
		||||
    [let | size [ assoc assoc-size ] |
 | 
			
		||||
        assoc states unique assoc-union
 | 
			
		||||
        dup assoc-size size > [
 | 
			
		||||
            obj states [
 | 
			
		||||
                table get-row at* [
 | 
			
		||||
                    dup integer? [ 1array ] when
 | 
			
		||||
                    obj rot table (find-closure)
 | 
			
		||||
                ] [
 | 
			
		||||
                    drop
 | 
			
		||||
                ] if
 | 
			
		||||
            ] with each
 | 
			
		||||
        ] when ] ;
 | 
			
		||||
 | 
			
		||||
: find-closure ( states obj table -- states )
 | 
			
		||||
    >r H{ } r> (find-closure) keys ;
 | 
			
		||||
 | 
			
		||||
: find-epsilon-closure ( states table -- states )
 | 
			
		||||
    >r eps H{ } r> (find-closure) keys ;
 | 
			
		||||
 | 
			
		||||
: filter-special-transition ( vec -- vec' )
 | 
			
		||||
    [ drop eps = not ] assoc-filter ;
 | 
			
		||||
 | 
			
		||||
: initialize-subset-construction ( regexp -- )
 | 
			
		||||
    <vector-table> >>dfa
 | 
			
		||||
    [
 | 
			
		||||
        nfa>> [ start-state>> 1array ] keep
 | 
			
		||||
        find-epsilon-closure 1dlist
 | 
			
		||||
    ] [
 | 
			
		||||
        swap >>new-states drop
 | 
			
		||||
    ] [
 | 
			
		||||
        [ dfa>> ] [ nfa>> ] bi
 | 
			
		||||
        columns>> filter-special-transition >>columns drop
 | 
			
		||||
    ] tri ;
 | 
			
		||||
 | 
			
		||||
:: (subset-construction) ( regexp -- )
 | 
			
		||||
    [let* | nfa [ regexp nfa>> ]
 | 
			
		||||
           dfa [ regexp dfa>> ]
 | 
			
		||||
           new-states [ regexp new-states>> ]
 | 
			
		||||
           columns [ dfa columns>> keys ] |
 | 
			
		||||
        
 | 
			
		||||
        new-states dlist-empty? [
 | 
			
		||||
            new-states pop-front
 | 
			
		||||
            dup dfa add-row
 | 
			
		||||
            columns [
 | 
			
		||||
                2dup nfa [ find-delta ] [ find-epsilon-closure ] bi
 | 
			
		||||
                dup [ dfa rows>> key? ] [ empty? ] bi or [
 | 
			
		||||
                    dup new-states push-back
 | 
			
		||||
                ] unless
 | 
			
		||||
                dup empty? [ 3drop ] [ <entry> dfa set-entry ] if
 | 
			
		||||
            ] with each
 | 
			
		||||
            regexp (subset-construction)
 | 
			
		||||
        ] unless ] ;
 | 
			
		||||
 | 
			
		||||
: set-start/final-states ( regexp -- )
 | 
			
		||||
    dup [ nfa>> start-state>> ]
 | 
			
		||||
    [ dfa>> rows>> keys [ member? ] with filter first ] bi
 | 
			
		||||
    >r dup dfa>> r> >>start-state drop
 | 
			
		||||
 | 
			
		||||
    dup [ nfa>> final-states>> ] [ dfa>> rows>> ] bi
 | 
			
		||||
    [ keys ] bi@
 | 
			
		||||
    [ intersect empty? not ] with filter
 | 
			
		||||
    >r dfa>> r> >>final-states drop ;
 | 
			
		||||
 | 
			
		||||
: subset-construction ( regexp -- )
 | 
			
		||||
    [ initialize-subset-construction ]
 | 
			
		||||
    [ (subset-construction) ]
 | 
			
		||||
    [ set-start/final-states ] tri ;
 | 
			
		||||
 | 
			
		||||
: <regexp> ( raw -- obj )
 | 
			
		||||
    regexp new
 | 
			
		||||
        swap >>raw
 | 
			
		||||
        0 >>paren-count
 | 
			
		||||
        -1 >>state
 | 
			
		||||
        V{ } clone >>stack 
 | 
			
		||||
        V{ } clone >>character-sets
 | 
			
		||||
        <vector-table> >>nfa
 | 
			
		||||
        dup [ parse-raw-regexp ] [ subset-construction ] bi ;
 | 
			
		||||
 | 
			
		||||
TUPLE: dfa-traverser
 | 
			
		||||
    dfa
 | 
			
		||||
    last-state current-state
 | 
			
		||||
    text
 | 
			
		||||
    start-index current-index
 | 
			
		||||
    matches ;
 | 
			
		||||
 | 
			
		||||
: <dfa-traverser> ( text dfa -- match )
 | 
			
		||||
    dfa>>
 | 
			
		||||
    dfa-traverser new
 | 
			
		||||
        swap [ start-state>> >>current-state ] keep
 | 
			
		||||
        >>dfa
 | 
			
		||||
        swap >>text
 | 
			
		||||
        0 >>start-index
 | 
			
		||||
        0 >>current-index
 | 
			
		||||
        V{ } clone >>matches ;
 | 
			
		||||
 | 
			
		||||
: final-state? ( dfa-traverser -- ? )
 | 
			
		||||
    [ current-state>> ] [ dfa>> final-states>> ] bi
 | 
			
		||||
    member? ;
 | 
			
		||||
 | 
			
		||||
: text-finished? ( dfa-traverser -- ? )
 | 
			
		||||
    [ current-index>> ] [ text>> length ] bi >= ;
 | 
			
		||||
 | 
			
		||||
: save-final-state ( dfa-straverser -- )
 | 
			
		||||
    [ current-index>> ] [ matches>> ] bi push ;
 | 
			
		||||
 | 
			
		||||
: match-done? ( dfa-traverser -- ? )
 | 
			
		||||
    dup final-state? [
 | 
			
		||||
        dup save-final-state
 | 
			
		||||
    ] when text-finished? ;
 | 
			
		||||
 | 
			
		||||
: increment-state ( dfa-traverser state -- dfa-traverser )
 | 
			
		||||
    >r [ 1+ ] change-current-index
 | 
			
		||||
    dup current-state>> >>last-state r>
 | 
			
		||||
    >>current-state ;
 | 
			
		||||
 | 
			
		||||
: match-transition ( obj hash -- state/f )
 | 
			
		||||
    2dup keys [ callable? ] filter predicates
 | 
			
		||||
    [ swap at nip ] [ at ] if* ;
 | 
			
		||||
 | 
			
		||||
: do-match ( dfa-traverser -- dfa-traverser )
 | 
			
		||||
    dup match-done? [
 | 
			
		||||
        dup {
 | 
			
		||||
            [ current-index>> ]
 | 
			
		||||
            [ text>> ]
 | 
			
		||||
            [ current-state>> ]
 | 
			
		||||
            [ dfa>> rows>> ]
 | 
			
		||||
        } cleave
 | 
			
		||||
        at >r nth r> match-transition [
 | 
			
		||||
            increment-state do-match
 | 
			
		||||
        ] when*
 | 
			
		||||
    ] unless ;
 | 
			
		||||
 | 
			
		||||
: return-match ( dfa-traverser -- interval/f )
 | 
			
		||||
    dup matches>> empty? [
 | 
			
		||||
        drop f
 | 
			
		||||
    ] [
 | 
			
		||||
        [ start-index>> ] [ matches>> peek ] bi 1 <range>
 | 
			
		||||
    ] if ;
 | 
			
		||||
 | 
			
		||||
: match ( string regexp -- pair )
 | 
			
		||||
    <dfa-traverser> do-match return-match ;
 | 
			
		||||
 | 
			
		||||
: matches? ( string regexp -- ? )
 | 
			
		||||
    dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
! character classes
 | 
			
		||||
! TUPLE: range-class from to ;
 | 
			
		||||
! TUPLE: or-class left right ;
 | 
			
		||||
 | 
			
		||||
! (?:a|b)*  <- does not capture
 | 
			
		||||
! (a|b)*\1  <- group captured
 | 
			
		||||
! (?!abba)  negative lookahead  matches ababa but not abbaa
 | 
			
		||||
! (?=abba)  positive lookahead  matches abbaaa but not abaaa
 | 
			
		||||
		Loading…
	
		Reference in New Issue