Merge branch 'master' of git://factorcode.org/git/factor
						commit
						1b34c3e586
					
				| 
						 | 
				
			
			@ -147,6 +147,7 @@ M: float >base
 | 
			
		|||
        { [ dup fp-nan? ] [ drop "0.0/0.0" ] }
 | 
			
		||||
        { [ dup 1.0/0.0 = ] [ drop "1.0/0.0" ] }
 | 
			
		||||
        { [ dup -1.0/0.0 = ] [ drop "-1.0/0.0" ] }
 | 
			
		||||
        { [ dup double>bits HEX: 8000000000000000 = ] [ drop "-0.0" ] }
 | 
			
		||||
        [ float>string fix-float ]
 | 
			
		||||
    } cond ;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -7,6 +7,7 @@ TUPLE: regexp
 | 
			
		|||
    raw
 | 
			
		||||
    { stack vector }
 | 
			
		||||
    parse-tree
 | 
			
		||||
    { options hashtable }
 | 
			
		||||
    nfa-table
 | 
			
		||||
    dfa-table
 | 
			
		||||
    minimized-table
 | 
			
		||||
| 
						 | 
				
			
			@ -18,6 +19,7 @@ TUPLE: regexp
 | 
			
		|||
    0 >>state
 | 
			
		||||
    V{ } clone >>stack
 | 
			
		||||
    V{ } clone >>new-states
 | 
			
		||||
    H{ } clone >>options
 | 
			
		||||
    H{ } clone >>visited-states ;
 | 
			
		||||
 | 
			
		||||
SYMBOL: current-regexp
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -21,6 +21,9 @@ M: letter-class class-member? ( obj class -- ? )
 | 
			
		|||
M: LETTER-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop LETTER? ;
 | 
			
		||||
 | 
			
		||||
M: Letter-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop Letter? ;
 | 
			
		||||
 | 
			
		||||
M: ascii-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop ascii? ;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -47,3 +50,6 @@ M: hex-digit-class class-member? ( obj class -- ? )
 | 
			
		|||
 | 
			
		||||
M: java-blank-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop java-blank? ;
 | 
			
		||||
 | 
			
		||||
M: unmatchable-class class-member? ( obj class -- ? )
 | 
			
		||||
    2drop f ;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,10 +1,10 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors arrays assocs combinators io io.streams.string
 | 
			
		||||
kernel math math.parser multi-methods namespaces qualified
 | 
			
		||||
kernel math math.parser multi-methods namespaces qualified sets
 | 
			
		||||
quotations sequences sequences.lib splitting symbols vectors
 | 
			
		||||
dlists math.order combinators.lib unicode.categories
 | 
			
		||||
sequences.lib regexp2.backend regexp2.utils ;
 | 
			
		||||
dlists math.order combinators.lib unicode.categories strings
 | 
			
		||||
sequences.lib regexp2.backend regexp2.utils unicode.case ;
 | 
			
		||||
IN: regexp2.parser
 | 
			
		||||
 | 
			
		||||
FROM: math.ranges => [a,b] ;
 | 
			
		||||
| 
						 | 
				
			
			@ -30,30 +30,41 @@ SINGLETON: back-anchor INSTANCE: back-anchor node
 | 
			
		|||
 | 
			
		||||
TUPLE: option-on option ; INSTANCE: option-on node
 | 
			
		||||
TUPLE: option-off option ; INSTANCE: option-off node
 | 
			
		||||
SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case ;
 | 
			
		||||
MIXIN: regexp-option
 | 
			
		||||
INSTANCE: unix-lines regexp-option
 | 
			
		||||
INSTANCE: dotall regexp-option
 | 
			
		||||
INSTANCE: multiline regexp-option
 | 
			
		||||
INSTANCE: comments regexp-option
 | 
			
		||||
INSTANCE: case-insensitive regexp-option
 | 
			
		||||
INSTANCE: unicode-case regexp-option
 | 
			
		||||
SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ;
 | 
			
		||||
 | 
			
		||||
SINGLETONS: letter-class LETTER-class Letter-class digit-class
 | 
			
		||||
alpha-class non-newline-blank-class
 | 
			
		||||
ascii-class punctuation-class java-printable-class blank-class
 | 
			
		||||
control-character-class hex-digit-class java-blank-class c-identifier-class ;
 | 
			
		||||
control-character-class hex-digit-class java-blank-class c-identifier-class
 | 
			
		||||
unmatchable-class ;
 | 
			
		||||
 | 
			
		||||
SINGLETONS: beginning-of-group end-of-group
 | 
			
		||||
beginning-of-character-class end-of-character-class
 | 
			
		||||
left-parenthesis pipe caret dash ;
 | 
			
		||||
 | 
			
		||||
: <constant> ( obj -- constant ) constant boa ;
 | 
			
		||||
: get-option ( option -- ? ) current-regexp get options>> at ;
 | 
			
		||||
: get-unix-lines ( -- ? ) unix-lines get-option ;
 | 
			
		||||
: get-dotall ( -- ? ) dotall get-option ;
 | 
			
		||||
: get-multiline ( -- ? ) multiline get-option ;
 | 
			
		||||
: get-comments ( -- ? ) comments get-option ;
 | 
			
		||||
: get-case-insensitive ( -- ? ) case-insensitive get-option ;
 | 
			
		||||
: get-unicode-case ( -- ? ) unicode-case get-option ;
 | 
			
		||||
: get-reversed-regexp ( -- ? ) reversed-regexp get-option ;
 | 
			
		||||
 | 
			
		||||
: <negation> ( obj -- negation ) negation boa ;
 | 
			
		||||
: <concatenation> ( seq -- concatenation ) >vector concatenation boa ;
 | 
			
		||||
: <concatenation> ( seq -- concatenation )
 | 
			
		||||
    >vector get-reversed-regexp [ reverse ] when
 | 
			
		||||
    concatenation boa ;
 | 
			
		||||
: <alternation> ( seq -- alternation ) >vector alternation boa ;
 | 
			
		||||
: <capture-group> ( obj -- capture-group ) capture-group boa ;
 | 
			
		||||
: <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
 | 
			
		||||
: <constant> ( obj -- constant )
 | 
			
		||||
    dup Letter? get-case-insensitive and [
 | 
			
		||||
        [ ch>lower constant boa ]
 | 
			
		||||
        [ ch>upper constant boa ] bi 2array <alternation>
 | 
			
		||||
    ] [
 | 
			
		||||
        constant boa
 | 
			
		||||
    ] if ;
 | 
			
		||||
 | 
			
		||||
: first|concatenation ( seq -- first/concatenation )
 | 
			
		||||
    dup length 1 = [ first ] [ <concatenation> ] if ;
 | 
			
		||||
| 
						 | 
				
			
			@ -61,6 +72,17 @@ left-parenthesis pipe caret dash ;
 | 
			
		|||
: first|alternation ( seq -- first/alternation )
 | 
			
		||||
    dup length 1 = [ first ] [ <alternation> ] if ;
 | 
			
		||||
 | 
			
		||||
: <character-class-range> ( from to -- obj )
 | 
			
		||||
    2dup [ Letter? ] bi@ or get-case-insensitive and [
 | 
			
		||||
        [ [ ch>lower ] bi@ character-class-range boa ]
 | 
			
		||||
        [ [ ch>upper ] bi@ character-class-range boa ] 2bi
 | 
			
		||||
        2array [ [ from>> ] [ to>> ] bi < ] filter
 | 
			
		||||
        [ unmatchable-class ] [ first|alternation ] if-empty
 | 
			
		||||
    ] [
 | 
			
		||||
        2dup <
 | 
			
		||||
        [ character-class-range boa ] [ 2drop unmatchable-class ] if
 | 
			
		||||
    ] if ;
 | 
			
		||||
 | 
			
		||||
ERROR: unmatched-parentheses ;
 | 
			
		||||
 | 
			
		||||
: make-positive-lookahead ( string -- )
 | 
			
		||||
| 
						 | 
				
			
			@ -90,24 +112,26 @@ ERROR: bad-option ch ;
 | 
			
		|||
        { CHAR: i [ case-insensitive ] }
 | 
			
		||||
        { CHAR: d [ unix-lines ] }
 | 
			
		||||
        { CHAR: m [ multiline ] }
 | 
			
		||||
        { CHAR: r [ reversed-regexp ] }
 | 
			
		||||
        { CHAR: s [ dotall ] }
 | 
			
		||||
        { CHAR: u [ unicode-case ] }
 | 
			
		||||
        { CHAR: x [ comments ] }
 | 
			
		||||
        [ bad-option ]
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
: option-on ( ch -- ) option \ option-on boa push-stack ;
 | 
			
		||||
: option-off ( ch -- ) option \ option-off boa push-stack ;
 | 
			
		||||
: toggle-option ( ch ? -- ) [ option-on ] [ option-off ] if ;
 | 
			
		||||
: option-on ( option -- ) current-regexp get options>> conjoin ;
 | 
			
		||||
: option-off ( option -- ) current-regexp get options>> delete-at ;
 | 
			
		||||
 | 
			
		||||
: toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ;
 | 
			
		||||
: (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
 | 
			
		||||
 | 
			
		||||
: parse-options ( string -- )
 | 
			
		||||
    "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
 | 
			
		||||
 | 
			
		||||
DEFER: (parse-regexp)
 | 
			
		||||
: parse-special-group-options ( options -- )
 | 
			
		||||
: parse-special-group ( -- )
 | 
			
		||||
    beginning-of-group push-stack
 | 
			
		||||
    parse-options (parse-regexp) pop-stack make-non-capturing-group ;
 | 
			
		||||
    (parse-regexp) pop-stack make-non-capturing-group ;
 | 
			
		||||
 | 
			
		||||
ERROR: bad-special-group string ;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -126,8 +150,13 @@ ERROR: bad-special-group string ;
 | 
			
		|||
        { [ dup CHAR: < = peek1 CHAR: ! = and ]
 | 
			
		||||
            [ drop read1 drop nested-parse-regexp pop-stack make-negative-lookbehind ] }
 | 
			
		||||
        [
 | 
			
		||||
            ":" read-until [ bad-special-group ] unless
 | 
			
		||||
            swap prefix parse-special-group-options
 | 
			
		||||
            ":)" read-until
 | 
			
		||||
            [ swap prefix ] dip
 | 
			
		||||
            {
 | 
			
		||||
                { CHAR: : [ parse-options parse-special-group ] }
 | 
			
		||||
                { CHAR: ) [ parse-options ] }
 | 
			
		||||
                [ drop bad-special-group ]
 | 
			
		||||
            } case
 | 
			
		||||
        ]
 | 
			
		||||
    } cond ;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -193,10 +222,10 @@ ERROR: expected-posix-class ;
 | 
			
		|||
    read1 CHAR: { = [ expected-posix-class ] unless
 | 
			
		||||
    "}" read-until [ bad-character-class ] unless
 | 
			
		||||
    {
 | 
			
		||||
        { "Lower" [ letter-class ] }
 | 
			
		||||
        { "Upper" [ LETTER-class ] }
 | 
			
		||||
        { "ASCII" [ ascii-class ] }
 | 
			
		||||
        { "Lower" [ get-case-insensitive Letter-class letter-class ? ] }
 | 
			
		||||
        { "Upper" [ get-case-insensitive Letter-class LETTER-class ? ] }
 | 
			
		||||
        { "Alpha" [ Letter-class ] }
 | 
			
		||||
        { "ASCII" [ ascii-class ] }
 | 
			
		||||
        { "Digit" [ digit-class ] }
 | 
			
		||||
        { "Alnum" [ alpha-class ] }
 | 
			
		||||
        { "Punct" [ punctuation-class ] }
 | 
			
		||||
| 
						 | 
				
			
			@ -250,6 +279,13 @@ ERROR: bad-escaped-literals seq ;
 | 
			
		|||
        { CHAR: 0 [ parse-octal <constant> ] }
 | 
			
		||||
        { CHAR: c [ parse-control-character ] }
 | 
			
		||||
 | 
			
		||||
        ! { CHAR: b [ handle-word-boundary ] }
 | 
			
		||||
        ! { CHAR: B [ handle-word-boundary <negation> ] }
 | 
			
		||||
        ! { CHAR: A [ handle-beginning-of-input ] }
 | 
			
		||||
        ! { CHAR: G [ end of previous match ] }
 | 
			
		||||
        ! { CHAR: Z [ handle-end-of-input ] }
 | 
			
		||||
        ! { CHAR: z [ handle-end-of-input ] } ! except for terminator
 | 
			
		||||
 | 
			
		||||
        { CHAR: Q [ parse-escaped-literals ] }
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -273,7 +309,7 @@ ERROR: bad-escaped-literals seq ;
 | 
			
		|||
    handle-dash handle-caret ;
 | 
			
		||||
 | 
			
		||||
: apply-dash ( -- )
 | 
			
		||||
    stack [ pop3 nip character-class-range boa ] keep push ;
 | 
			
		||||
    stack [ pop3 nip <character-class-range> ] keep push ;
 | 
			
		||||
 | 
			
		||||
: apply-dash? ( -- ? )
 | 
			
		||||
    stack dup length 3 >=
 | 
			
		||||
| 
						 | 
				
			
			@ -312,16 +348,9 @@ DEFER: handle-left-bracket
 | 
			
		|||
    beginning-of-character-class push-stack
 | 
			
		||||
    parse-character-class-first (parse-character-class) ;
 | 
			
		||||
 | 
			
		||||
ERROR: empty-regexp ;
 | 
			
		||||
: finish-regexp-parse ( stack -- obj )
 | 
			
		||||
    dup length {
 | 
			
		||||
        { 0 [ empty-regexp ] }
 | 
			
		||||
        { 1 [ first ] }
 | 
			
		||||
        [
 | 
			
		||||
            drop { pipe } split
 | 
			
		||||
            [ first|concatenation ] map first|alternation
 | 
			
		||||
        ]
 | 
			
		||||
    } case ;
 | 
			
		||||
    { pipe } split
 | 
			
		||||
    [ first|concatenation ] map first|alternation ;
 | 
			
		||||
 | 
			
		||||
: handle-right-parenthesis ( -- )
 | 
			
		||||
    stack beginning-of-group over last-index cut rest
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1,14 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: kernel strings help.markup help.syntax regexp2.backend ;
 | 
			
		||||
IN: regexp2
 | 
			
		||||
 | 
			
		||||
HELP: <regexp>
 | 
			
		||||
{ $values { "string" string } { "regexp" regexp } }
 | 
			
		||||
{ $description "Compiles a regular expression into a DFA and returns this object.  Regular expressions only have to be compiled once and can then be used multiple times to match input strings." } ;
 | 
			
		||||
 | 
			
		||||
HELP: <iregexp>
 | 
			
		||||
{ $values { "string" string } { "regexp" regexp } }
 | 
			
		||||
{ $description "Compiles a case-insensitive regular expression into a DFA and returns this object.  Otherwise, exactly the same as " { $link <regexp> } } ;
 | 
			
		||||
 | 
			
		||||
{ <regexp> <iregexp> } related-words
 | 
			
		||||
| 
						 | 
				
			
			@ -1,4 +1,5 @@
 | 
			
		|||
USING: regexp2 tools.test kernel regexp2.traversal ;
 | 
			
		||||
USING: regexp2 tools.test kernel sequences regexp2.parser
 | 
			
		||||
regexp2.traversal ;
 | 
			
		||||
IN: regexp2-tests
 | 
			
		||||
 | 
			
		||||
[ f ] [ "b" "a*" <regexp> matches? ] unit-test
 | 
			
		||||
| 
						 | 
				
			
			@ -151,7 +152,7 @@ IN: regexp2-tests
 | 
			
		|||
[ f ] [ "abc" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "ABC" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "a" "\\Q\\E" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "|*+" "\\Q|*+\\E" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "abc" "\\Q|*+\\E" <regexp> matches? ] unit-test
 | 
			
		||||
| 
						 | 
				
			
			@ -203,6 +204,8 @@ IN: regexp2-tests
 | 
			
		|||
    <regexp> drop
 | 
			
		||||
] unit-test
 | 
			
		||||
 | 
			
		||||
[ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
 | 
			
		||||
 | 
			
		||||
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -226,9 +229,29 @@ IN: regexp2-tests
 | 
			
		|||
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
 | 
			
		||||
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
! [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
 | 
			
		||||
! [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
 | 
			
		||||
! [ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "A" "[a-z]" <iregexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "A" "\\p{Lower}" <iregexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "abc" <reversed> "abc" <rregexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "abc" <reversed> "a[bB][cC]" <rregexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" <rregexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
! Bug in parsing word
 | 
			
		||||
! [ t ] [ "a" R' a' matches?  ] unit-test
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,7 +1,8 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors combinators kernel regexp2.backend regexp2.utils
 | 
			
		||||
regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal state-tables
 | 
			
		||||
USING: accessors combinators kernel math math.ranges
 | 
			
		||||
sequences regexp2.backend regexp2.utils memoize sets
 | 
			
		||||
regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal
 | 
			
		||||
regexp2.transition-tables ;
 | 
			
		||||
IN: regexp2
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -13,8 +14,7 @@ IN: regexp2
 | 
			
		|||
        <transition-table> >>minimized-table
 | 
			
		||||
        reset-regexp ;
 | 
			
		||||
 | 
			
		||||
: <regexp> ( string -- regexp )
 | 
			
		||||
    default-regexp
 | 
			
		||||
: construct-regexp ( regexp -- regexp' )
 | 
			
		||||
    {
 | 
			
		||||
        [ parse-regexp ]
 | 
			
		||||
        [ construct-nfa ]
 | 
			
		||||
| 
						 | 
				
			
			@ -22,6 +22,30 @@ IN: regexp2
 | 
			
		|||
        [ ]
 | 
			
		||||
    } cleave ;
 | 
			
		||||
 | 
			
		||||
: match ( string regexp -- pair )
 | 
			
		||||
    <dfa-traverser> do-match return-match ;
 | 
			
		||||
 | 
			
		||||
: matches? ( string regexp -- ? )
 | 
			
		||||
    dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ;
 | 
			
		||||
 | 
			
		||||
: match-head ( string regexp -- end ) match length>> 1- ;
 | 
			
		||||
 | 
			
		||||
: initial-option ( regexp option -- regexp' )
 | 
			
		||||
    over options>> conjoin ;
 | 
			
		||||
 | 
			
		||||
: <regexp> ( string -- regexp )
 | 
			
		||||
    default-regexp construct-regexp ;
 | 
			
		||||
 | 
			
		||||
: <iregexp> ( string -- regexp )
 | 
			
		||||
    default-regexp
 | 
			
		||||
    case-insensitive initial-option
 | 
			
		||||
    construct-regexp ;
 | 
			
		||||
 | 
			
		||||
: <rregexp> ( string -- regexp )
 | 
			
		||||
    default-regexp
 | 
			
		||||
    reversed-regexp initial-option
 | 
			
		||||
    construct-regexp ;
 | 
			
		||||
 | 
			
		||||
: R! CHAR: ! <regexp> ; parsing
 | 
			
		||||
: R" CHAR: " <regexp> ; parsing
 | 
			
		||||
: R# CHAR: # <regexp> ; parsing
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -78,11 +78,3 @@ TUPLE: dfa-traverser
 | 
			
		|||
    dup matches>>
 | 
			
		||||
    [ drop f ]
 | 
			
		||||
    [ [ start-index>> ] [ peek ] bi* 1 <range> ] if-empty ;
 | 
			
		||||
 | 
			
		||||
: match ( string regexp -- pair )
 | 
			
		||||
    <dfa-traverser> do-match return-match ;
 | 
			
		||||
 | 
			
		||||
: matches? ( string regexp -- ? )
 | 
			
		||||
    dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ;
 | 
			
		||||
 | 
			
		||||
: match-head ( string regexp -- end ) match length>> 1- ;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
Doug Coleman
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,25 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors hashtables kernel math state-tables vars vectors ;
 | 
			
		||||
IN: regexp2.backend
 | 
			
		||||
 | 
			
		||||
TUPLE: regexp
 | 
			
		||||
    raw
 | 
			
		||||
    { stack vector }
 | 
			
		||||
    parse-tree
 | 
			
		||||
    { options hashtable }
 | 
			
		||||
    nfa-table
 | 
			
		||||
    dfa-table
 | 
			
		||||
    minimized-table
 | 
			
		||||
    { state integer }
 | 
			
		||||
    { new-states vector }
 | 
			
		||||
    { visited-states hashtable } ;
 | 
			
		||||
 | 
			
		||||
: reset-regexp ( regexp -- regexp )
 | 
			
		||||
    0 >>state
 | 
			
		||||
    V{ } clone >>stack
 | 
			
		||||
    V{ } clone >>new-states
 | 
			
		||||
    H{ } clone >>options
 | 
			
		||||
    H{ } clone >>visited-states ;
 | 
			
		||||
 | 
			
		||||
SYMBOL: current-regexp
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,55 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors kernel math math.order symbols regexp2.parser
 | 
			
		||||
words regexp2.utils unicode.categories combinators.short-circuit ;
 | 
			
		||||
IN: regexp2.classes
 | 
			
		||||
 | 
			
		||||
GENERIC: class-member? ( obj class -- ? )
 | 
			
		||||
 | 
			
		||||
M: word class-member? ( obj class -- ? ) 2drop f ;
 | 
			
		||||
M: integer class-member? ( obj class -- ? ) 2drop f ;
 | 
			
		||||
 | 
			
		||||
M: character-class-range class-member? ( obj class -- ? )
 | 
			
		||||
    [ from>> ] [ to>> ] bi between? ;
 | 
			
		||||
 | 
			
		||||
M: any-char class-member? ( obj class -- ? )
 | 
			
		||||
    2drop t ;
 | 
			
		||||
    
 | 
			
		||||
M: letter-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop letter? ;
 | 
			
		||||
            
 | 
			
		||||
M: LETTER-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop LETTER? ;
 | 
			
		||||
 | 
			
		||||
M: Letter-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop Letter? ;
 | 
			
		||||
 | 
			
		||||
M: ascii-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop ascii? ;
 | 
			
		||||
 | 
			
		||||
M: digit-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop digit? ;
 | 
			
		||||
 | 
			
		||||
M: alpha-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop alpha? ;
 | 
			
		||||
 | 
			
		||||
M: punctuation-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop punct? ;
 | 
			
		||||
 | 
			
		||||
M: java-printable-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop java-printable? ;
 | 
			
		||||
 | 
			
		||||
M: non-newline-blank-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop { [ blank? ] [ CHAR: \n = not ] } 1&& ;
 | 
			
		||||
 | 
			
		||||
M: control-character-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop control-char? ;
 | 
			
		||||
 | 
			
		||||
M: hex-digit-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop hex-digit? ;
 | 
			
		||||
 | 
			
		||||
M: java-blank-class class-member? ( obj class -- ? )
 | 
			
		||||
    drop java-blank? ;
 | 
			
		||||
 | 
			
		||||
M: unmatchable-class class-member? ( obj class -- ? )
 | 
			
		||||
    2drop f ;
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,70 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors arrays assocs combinators fry kernel locals
 | 
			
		||||
math math.order regexp2.nfa regexp2.transition-tables sequences
 | 
			
		||||
sets sorting vectors regexp2.utils sequences.lib ;
 | 
			
		||||
USING: io prettyprint threads ;
 | 
			
		||||
IN: regexp2.dfa
 | 
			
		||||
 | 
			
		||||
: find-delta ( states transition regexp -- new-states )
 | 
			
		||||
    nfa-table>> transitions>>
 | 
			
		||||
    rot [ swap at at ] with with map sift concat prune ;
 | 
			
		||||
 | 
			
		||||
: (find-epsilon-closure) ( states regexp -- new-states )
 | 
			
		||||
    eps swap find-delta ;
 | 
			
		||||
 | 
			
		||||
: find-epsilon-closure ( states regexp -- new-states )
 | 
			
		||||
    '[ dup , (find-epsilon-closure) union ] [ length ] while-changes
 | 
			
		||||
    natural-sort ;
 | 
			
		||||
 | 
			
		||||
: find-closure ( states transition regexp -- new-states )
 | 
			
		||||
    [ find-delta ] 2keep nip find-epsilon-closure ;
 | 
			
		||||
 | 
			
		||||
: find-start-state ( regexp -- state )
 | 
			
		||||
    [ nfa-table>> start-state>> 1vector ] keep find-epsilon-closure ;
 | 
			
		||||
 | 
			
		||||
: find-transitions ( seq1 regexp -- seq2 )
 | 
			
		||||
    nfa-table>> transitions>>
 | 
			
		||||
    [ at keys ] curry map concat eps swap remove ;
 | 
			
		||||
 | 
			
		||||
: add-todo-state ( state regexp -- )
 | 
			
		||||
    2dup visited-states>> key? [
 | 
			
		||||
        2drop
 | 
			
		||||
    ] [
 | 
			
		||||
        [ visited-states>> conjoin ]
 | 
			
		||||
        [ new-states>> push ] 2bi
 | 
			
		||||
    ] if ;
 | 
			
		||||
 | 
			
		||||
: new-transitions ( regexp -- )
 | 
			
		||||
    dup new-states>> [
 | 
			
		||||
        drop
 | 
			
		||||
    ] [
 | 
			
		||||
        dupd pop dup pick find-transitions rot
 | 
			
		||||
        [
 | 
			
		||||
            [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep
 | 
			
		||||
            >r swapd transition boa r> dfa-table>> add-transition 
 | 
			
		||||
        ] curry with each
 | 
			
		||||
        new-transitions
 | 
			
		||||
    ] if-empty ;
 | 
			
		||||
 | 
			
		||||
: states ( hashtable -- array )
 | 
			
		||||
    [ keys ]
 | 
			
		||||
    [ values [ values concat ] map concat append ] bi ;
 | 
			
		||||
 | 
			
		||||
: set-final-states ( regexp -- )
 | 
			
		||||
    dup
 | 
			
		||||
    [ nfa-table>> final-states>> keys ]
 | 
			
		||||
    [ dfa-table>> transitions>> states ] bi
 | 
			
		||||
    [ intersect empty? not ] with filter
 | 
			
		||||
 | 
			
		||||
    swap dfa-table>> final-states>>
 | 
			
		||||
    [ conjoin ] curry each ;
 | 
			
		||||
 | 
			
		||||
: set-initial-state ( regexp -- )
 | 
			
		||||
    dup
 | 
			
		||||
    [ dfa-table>> ] [ find-start-state ] bi
 | 
			
		||||
    [ >>start-state drop ] keep
 | 
			
		||||
    1vector >>new-states drop ;
 | 
			
		||||
 | 
			
		||||
: construct-dfa ( regexp -- )
 | 
			
		||||
    [ set-initial-state ] [ new-transitions ] [ set-final-states ] tri ;
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,126 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors arrays assocs grouping kernel regexp2.backend
 | 
			
		||||
locals math namespaces regexp2.parser sequences state-tables fry
 | 
			
		||||
quotations math.order math.ranges vectors unicode.categories
 | 
			
		||||
regexp2.utils regexp2.transition-tables words sequences.lib ;
 | 
			
		||||
IN: regexp2.nfa
 | 
			
		||||
 | 
			
		||||
SYMBOL: negation-mode
 | 
			
		||||
: negated? ( -- ? ) negation-mode get 0 or odd? ; 
 | 
			
		||||
 | 
			
		||||
SINGLETON: eps
 | 
			
		||||
 | 
			
		||||
: next-state ( regexp -- state )
 | 
			
		||||
    [ state>> ] [ [ 1+ ] change-state drop ] bi ;
 | 
			
		||||
 | 
			
		||||
: set-start-state ( regexp -- )
 | 
			
		||||
    dup stack>> [
 | 
			
		||||
        drop
 | 
			
		||||
    ] [
 | 
			
		||||
        [ nfa-table>> ] [ pop first ] bi* >>start-state drop
 | 
			
		||||
    ] if-empty ;
 | 
			
		||||
 | 
			
		||||
GENERIC: nfa-node ( node -- )
 | 
			
		||||
 | 
			
		||||
:: add-simple-entry ( obj class -- )
 | 
			
		||||
    [let* | regexp [ current-regexp get ]
 | 
			
		||||
            s0 [ regexp next-state ]
 | 
			
		||||
            s1 [ regexp next-state ]
 | 
			
		||||
            stack [ regexp stack>> ]
 | 
			
		||||
            table [ regexp nfa-table>> ] |
 | 
			
		||||
        negated? [
 | 
			
		||||
            s0 f obj class boa table add-transition
 | 
			
		||||
            s0 s1 <default-transition> table add-transition
 | 
			
		||||
        ] [
 | 
			
		||||
            s0 s1 obj class boa table add-transition
 | 
			
		||||
        ] if
 | 
			
		||||
        s0 s1 2array stack push
 | 
			
		||||
        t s1 table final-states>> set-at ] ;
 | 
			
		||||
 | 
			
		||||
:: concatenate-nodes ( -- )
 | 
			
		||||
    [let* | regexp [ current-regexp get ]
 | 
			
		||||
            stack [ regexp stack>> ]
 | 
			
		||||
            table [ regexp nfa-table>> ]
 | 
			
		||||
            s2 [ stack peek first ]
 | 
			
		||||
            s3 [ stack pop second ]
 | 
			
		||||
            s0 [ stack peek first ]
 | 
			
		||||
            s1 [ stack pop second ] |
 | 
			
		||||
        s1 s2 eps <literal-transition> table add-transition
 | 
			
		||||
        s1 table final-states>> delete-at
 | 
			
		||||
        s0 s3 2array stack push ] ;
 | 
			
		||||
 | 
			
		||||
:: alternate-nodes ( -- )
 | 
			
		||||
    [let* | regexp [ current-regexp get ]
 | 
			
		||||
            stack [ regexp stack>> ]
 | 
			
		||||
            table [ regexp nfa-table>> ]
 | 
			
		||||
            s2 [ stack peek first ]
 | 
			
		||||
            s3 [ stack pop second ]
 | 
			
		||||
            s0 [ stack peek first ]
 | 
			
		||||
            s1 [ stack pop second ]
 | 
			
		||||
            s4 [ regexp next-state ]
 | 
			
		||||
            s5 [ regexp next-state ] |
 | 
			
		||||
        s4 s0 eps <literal-transition> table add-transition
 | 
			
		||||
        s4 s2 eps <literal-transition> table add-transition
 | 
			
		||||
        s1 s5 eps <literal-transition> table add-transition
 | 
			
		||||
        s3 s5 eps <literal-transition> table add-transition
 | 
			
		||||
        s1 table final-states>> delete-at
 | 
			
		||||
        s3 table final-states>> delete-at
 | 
			
		||||
        t s5 table final-states>> set-at
 | 
			
		||||
        s4 s5 2array stack push ] ;
 | 
			
		||||
 | 
			
		||||
M: kleene-star nfa-node ( node -- )
 | 
			
		||||
    term>> nfa-node
 | 
			
		||||
    [let* | regexp [ current-regexp get ]
 | 
			
		||||
            stack [ regexp stack>> ]
 | 
			
		||||
            s0 [ stack peek first ]
 | 
			
		||||
            s1 [ stack pop second ]
 | 
			
		||||
            s2 [ regexp next-state ]
 | 
			
		||||
            s3 [ regexp next-state ]
 | 
			
		||||
            table [ regexp nfa-table>> ] |
 | 
			
		||||
        s1 table final-states>> delete-at
 | 
			
		||||
        t s3 table final-states>> set-at
 | 
			
		||||
        s1 s0 eps <literal-transition> table add-transition
 | 
			
		||||
        s2 s0 eps <literal-transition> table add-transition
 | 
			
		||||
        s2 s3 eps <literal-transition> table add-transition
 | 
			
		||||
        s1 s3 eps <literal-transition> table add-transition
 | 
			
		||||
        s2 s3 2array stack push ] ;
 | 
			
		||||
 | 
			
		||||
M: concatenation nfa-node ( node -- )
 | 
			
		||||
    seq>>
 | 
			
		||||
    [ [ nfa-node ] each ]
 | 
			
		||||
    [ length 1- [ concatenate-nodes ] times ] bi ;
 | 
			
		||||
 | 
			
		||||
M: alternation nfa-node ( node -- )
 | 
			
		||||
    seq>>
 | 
			
		||||
    [ [ nfa-node ] each ]
 | 
			
		||||
    [ length 1- [ alternate-nodes ] times ] bi ;
 | 
			
		||||
 | 
			
		||||
M: constant nfa-node ( node -- )
 | 
			
		||||
    char>> literal-transition add-simple-entry ;
 | 
			
		||||
 | 
			
		||||
M: epsilon nfa-node ( node -- )
 | 
			
		||||
    drop eps literal-transition add-simple-entry ;
 | 
			
		||||
 | 
			
		||||
M: word nfa-node ( node -- )
 | 
			
		||||
    class-transition add-simple-entry ;
 | 
			
		||||
 | 
			
		||||
M: character-class-range nfa-node ( node -- )
 | 
			
		||||
    class-transition add-simple-entry ;
 | 
			
		||||
 | 
			
		||||
M: capture-group nfa-node ( node -- )
 | 
			
		||||
    term>> nfa-node ;
 | 
			
		||||
 | 
			
		||||
M: negation nfa-node ( node -- )
 | 
			
		||||
    negation-mode inc
 | 
			
		||||
    term>> nfa-node 
 | 
			
		||||
    negation-mode dec ;
 | 
			
		||||
 | 
			
		||||
: construct-nfa ( regexp -- )
 | 
			
		||||
    [
 | 
			
		||||
        reset-regexp
 | 
			
		||||
        negation-mode off
 | 
			
		||||
        [ current-regexp set ]
 | 
			
		||||
        [ parse-tree>> nfa-node ]
 | 
			
		||||
        [ set-start-state ] tri
 | 
			
		||||
    ] with-scope ;
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,33 @@
 | 
			
		|||
USING: kernel tools.test regexp2.backend regexp2 ;
 | 
			
		||||
IN: regexp2.parser
 | 
			
		||||
 | 
			
		||||
: test-regexp ( string -- )
 | 
			
		||||
    default-regexp parse-regexp ;
 | 
			
		||||
 | 
			
		||||
: test-regexp2 ( string -- regexp )
 | 
			
		||||
    default-regexp dup parse-regexp ;
 | 
			
		||||
 | 
			
		||||
[ "(" ] [ unmatched-parentheses? ] must-fail-with
 | 
			
		||||
 | 
			
		||||
[ ] [ "a|b" test-regexp ] unit-test
 | 
			
		||||
[ ] [ "a.b" test-regexp ] unit-test
 | 
			
		||||
[ ] [ "a|b|c" test-regexp ] unit-test
 | 
			
		||||
[ ] [ "abc|b" test-regexp ] unit-test
 | 
			
		||||
[ ] [ "a|bcd" test-regexp ] unit-test
 | 
			
		||||
[ ] [ "a|(b)" test-regexp ] unit-test
 | 
			
		||||
[ ] [ "(a)|b" test-regexp ] unit-test
 | 
			
		||||
[ ] [ "(a|b)" test-regexp ] unit-test
 | 
			
		||||
[ ] [ "((a)|(b))" test-regexp ] unit-test
 | 
			
		||||
 | 
			
		||||
[ ] [ "(?:a)" test-regexp ] unit-test
 | 
			
		||||
[ ] [ "(?i:a)" test-regexp ] unit-test
 | 
			
		||||
[ ] [ "(?-i:a)" test-regexp ] unit-test
 | 
			
		||||
[ "(?z:a)" test-regexp ] [ bad-option? ] must-fail-with
 | 
			
		||||
[ "(?-z:a)" test-regexp ] [ bad-option? ] must-fail-with
 | 
			
		||||
 | 
			
		||||
[ ] [ "(?=a)" test-regexp ] unit-test
 | 
			
		||||
 | 
			
		||||
[ ] [ "[abc]" test-regexp ] unit-test
 | 
			
		||||
[ ] [ "[a-c]" test-regexp ] unit-test
 | 
			
		||||
[ ] [ "[^a-c]" test-regexp ] unit-test
 | 
			
		||||
[ "[^]" test-regexp ] must-fail
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,391 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors arrays assocs combinators io io.streams.string
 | 
			
		||||
kernel math math.parser multi-methods namespaces qualified sets
 | 
			
		||||
quotations sequences sequences.lib splitting symbols vectors
 | 
			
		||||
dlists math.order combinators.lib unicode.categories strings
 | 
			
		||||
sequences.lib regexp2.backend regexp2.utils unicode.case ;
 | 
			
		||||
IN: regexp2.parser
 | 
			
		||||
 | 
			
		||||
FROM: math.ranges => [a,b] ;
 | 
			
		||||
 | 
			
		||||
MIXIN: node
 | 
			
		||||
TUPLE: concatenation seq ; INSTANCE: concatenation node
 | 
			
		||||
TUPLE: alternation seq ; INSTANCE: alternation node
 | 
			
		||||
TUPLE: kleene-star term ; INSTANCE: kleene-star node
 | 
			
		||||
TUPLE: question term ; INSTANCE: question node
 | 
			
		||||
TUPLE: negation term ; INSTANCE: negation node
 | 
			
		||||
TUPLE: constant char ; INSTANCE: constant node
 | 
			
		||||
TUPLE: range from to ; INSTANCE: range node
 | 
			
		||||
TUPLE: lookahead term ; INSTANCE: lookahead node
 | 
			
		||||
TUPLE: lookbehind term ; INSTANCE: lookbehind node
 | 
			
		||||
TUPLE: capture-group term ; INSTANCE: capture-group node
 | 
			
		||||
TUPLE: non-capture-group term ; INSTANCE: non-capture-group node
 | 
			
		||||
TUPLE: independent-group term ; INSTANCE: independent-group node
 | 
			
		||||
TUPLE: character-class-range from to ; INSTANCE: character-class-range node
 | 
			
		||||
SINGLETON: epsilon INSTANCE: epsilon node
 | 
			
		||||
SINGLETON: any-char INSTANCE: any-char node
 | 
			
		||||
SINGLETON: front-anchor INSTANCE: front-anchor node
 | 
			
		||||
SINGLETON: back-anchor INSTANCE: back-anchor node
 | 
			
		||||
 | 
			
		||||
TUPLE: option-on option ; INSTANCE: option-on node
 | 
			
		||||
TUPLE: option-off option ; INSTANCE: option-off node
 | 
			
		||||
SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ;
 | 
			
		||||
 | 
			
		||||
SINGLETONS: letter-class LETTER-class Letter-class digit-class
 | 
			
		||||
alpha-class non-newline-blank-class
 | 
			
		||||
ascii-class punctuation-class java-printable-class blank-class
 | 
			
		||||
control-character-class hex-digit-class java-blank-class c-identifier-class
 | 
			
		||||
unmatchable-class ;
 | 
			
		||||
 | 
			
		||||
SINGLETONS: beginning-of-group end-of-group
 | 
			
		||||
beginning-of-character-class end-of-character-class
 | 
			
		||||
left-parenthesis pipe caret dash ;
 | 
			
		||||
 | 
			
		||||
: get-option ( option -- ? ) current-regexp get options>> at ;
 | 
			
		||||
: get-unix-lines ( -- ? ) unix-lines get-option ;
 | 
			
		||||
: get-dotall ( -- ? ) dotall get-option ;
 | 
			
		||||
: get-multiline ( -- ? ) multiline get-option ;
 | 
			
		||||
: get-comments ( -- ? ) comments get-option ;
 | 
			
		||||
: get-case-insensitive ( -- ? ) case-insensitive get-option ;
 | 
			
		||||
: get-unicode-case ( -- ? ) unicode-case get-option ;
 | 
			
		||||
: get-reversed-regexp ( -- ? ) reversed-regexp get-option ;
 | 
			
		||||
 | 
			
		||||
: <negation> ( obj -- negation ) negation boa ;
 | 
			
		||||
: <concatenation> ( seq -- concatenation )
 | 
			
		||||
    >vector get-reversed-regexp [ reverse ] when
 | 
			
		||||
    concatenation boa ;
 | 
			
		||||
: <alternation> ( seq -- alternation ) >vector alternation boa ;
 | 
			
		||||
: <capture-group> ( obj -- capture-group ) capture-group boa ;
 | 
			
		||||
: <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
 | 
			
		||||
: <constant> ( obj -- constant )
 | 
			
		||||
    dup Letter? get-case-insensitive and [
 | 
			
		||||
        [ ch>lower constant boa ]
 | 
			
		||||
        [ ch>upper constant boa ] bi 2array <alternation>
 | 
			
		||||
    ] [
 | 
			
		||||
        constant boa
 | 
			
		||||
    ] if ;
 | 
			
		||||
 | 
			
		||||
: first|concatenation ( seq -- first/concatenation )
 | 
			
		||||
    dup length 1 = [ first ] [ <concatenation> ] if ;
 | 
			
		||||
 | 
			
		||||
: first|alternation ( seq -- first/alternation )
 | 
			
		||||
    dup length 1 = [ first ] [ <alternation> ] if ;
 | 
			
		||||
 | 
			
		||||
: <character-class-range> ( from to -- obj )
 | 
			
		||||
    2dup [ Letter? ] bi@ or get-case-insensitive and [
 | 
			
		||||
        [ [ ch>lower ] bi@ character-class-range boa ]
 | 
			
		||||
        [ [ ch>upper ] bi@ character-class-range boa ] 2bi
 | 
			
		||||
        2array [ [ from>> ] [ to>> ] bi < ] filter
 | 
			
		||||
        [ unmatchable-class ] [ first|alternation ] if-empty
 | 
			
		||||
    ] [
 | 
			
		||||
        2dup <
 | 
			
		||||
        [ character-class-range boa ] [ 2drop unmatchable-class ] if
 | 
			
		||||
    ] if ;
 | 
			
		||||
 | 
			
		||||
ERROR: unmatched-parentheses ;
 | 
			
		||||
 | 
			
		||||
: make-positive-lookahead ( string -- )
 | 
			
		||||
    lookahead boa push-stack ;
 | 
			
		||||
 | 
			
		||||
: make-negative-lookahead ( string -- )
 | 
			
		||||
    <negation> lookahead boa push-stack ;
 | 
			
		||||
 | 
			
		||||
: make-independent-group ( string -- )
 | 
			
		||||
    #! no backtracking
 | 
			
		||||
    independent-group boa push-stack ;
 | 
			
		||||
 | 
			
		||||
: make-positive-lookbehind ( string -- )
 | 
			
		||||
    lookbehind boa push-stack ;
 | 
			
		||||
 | 
			
		||||
: make-negative-lookbehind ( string -- )
 | 
			
		||||
    <negation> lookbehind boa push-stack ;
 | 
			
		||||
 | 
			
		||||
DEFER: nested-parse-regexp
 | 
			
		||||
: make-non-capturing-group ( string -- )
 | 
			
		||||
    non-capture-group boa push-stack ;
 | 
			
		||||
 | 
			
		||||
ERROR: bad-option ch ;
 | 
			
		||||
 | 
			
		||||
: option ( ch -- singleton )
 | 
			
		||||
    {
 | 
			
		||||
        { CHAR: i [ case-insensitive ] }
 | 
			
		||||
        { CHAR: d [ unix-lines ] }
 | 
			
		||||
        { CHAR: m [ multiline ] }
 | 
			
		||||
        { CHAR: r [ reversed-regexp ] }
 | 
			
		||||
        { CHAR: s [ dotall ] }
 | 
			
		||||
        { CHAR: u [ unicode-case ] }
 | 
			
		||||
        { CHAR: x [ comments ] }
 | 
			
		||||
        [ bad-option ]
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
: option-on ( option -- ) current-regexp get options>> conjoin ;
 | 
			
		||||
: option-off ( option -- ) current-regexp get options>> delete-at ;
 | 
			
		||||
 | 
			
		||||
: toggle-option ( ch ? -- ) [ option ] dip [ option-on ] [ option-off ] if ;
 | 
			
		||||
: (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
 | 
			
		||||
 | 
			
		||||
: parse-options ( string -- )
 | 
			
		||||
    "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
 | 
			
		||||
 | 
			
		||||
DEFER: (parse-regexp)
 | 
			
		||||
: parse-special-group ( -- )
 | 
			
		||||
    beginning-of-group push-stack
 | 
			
		||||
    (parse-regexp) pop-stack make-non-capturing-group ;
 | 
			
		||||
 | 
			
		||||
ERROR: bad-special-group string ;
 | 
			
		||||
 | 
			
		||||
: (parse-special-group) ( -- )
 | 
			
		||||
    read1 {
 | 
			
		||||
        { [ dup CHAR: : = ]
 | 
			
		||||
            [ drop nested-parse-regexp pop-stack make-non-capturing-group ] }
 | 
			
		||||
        { [ dup CHAR: = = ]
 | 
			
		||||
            [ drop nested-parse-regexp pop-stack make-positive-lookahead ] }
 | 
			
		||||
        { [ dup CHAR: = = ]
 | 
			
		||||
            [ drop nested-parse-regexp pop-stack make-negative-lookahead ] }
 | 
			
		||||
        { [ dup CHAR: > = ]
 | 
			
		||||
            [ drop nested-parse-regexp pop-stack make-independent-group ] }
 | 
			
		||||
        { [ dup CHAR: < = peek1 CHAR: = = and ]
 | 
			
		||||
            [ drop read1 drop nested-parse-regexp pop-stack make-positive-lookbehind ] }
 | 
			
		||||
        { [ dup CHAR: < = peek1 CHAR: ! = and ]
 | 
			
		||||
            [ drop read1 drop nested-parse-regexp pop-stack make-negative-lookbehind ] }
 | 
			
		||||
        [
 | 
			
		||||
            ":)" read-until
 | 
			
		||||
            [ swap prefix ] dip
 | 
			
		||||
            {
 | 
			
		||||
                { CHAR: : [ parse-options parse-special-group ] }
 | 
			
		||||
                { CHAR: ) [ parse-options ] }
 | 
			
		||||
                [ drop bad-special-group ]
 | 
			
		||||
            } case
 | 
			
		||||
        ]
 | 
			
		||||
    } cond ;
 | 
			
		||||
 | 
			
		||||
: handle-left-parenthesis ( -- )
 | 
			
		||||
    peek1 CHAR: ? =
 | 
			
		||||
    [ read1 drop (parse-special-group) ]
 | 
			
		||||
    [ nested-parse-regexp ] if ;
 | 
			
		||||
 | 
			
		||||
: handle-dot ( -- ) any-char push-stack ;
 | 
			
		||||
: handle-pipe ( -- ) pipe push-stack ;
 | 
			
		||||
: handle-star ( -- ) stack pop <kleene-star> push-stack ;
 | 
			
		||||
: handle-question ( -- )
 | 
			
		||||
    stack pop epsilon 2array <alternation> push-stack ;
 | 
			
		||||
: handle-plus ( -- )
 | 
			
		||||
    stack pop dup <kleene-star> 2array <concatenation> push-stack ;
 | 
			
		||||
 | 
			
		||||
ERROR: unmatched-brace ;
 | 
			
		||||
: parse-repetition ( -- start finish ? )
 | 
			
		||||
    "}" read-until [ unmatched-brace ] unless
 | 
			
		||||
    [ "," split1 [ string>number ] bi@ ]
 | 
			
		||||
    [ CHAR: , swap index >boolean ] bi ;
 | 
			
		||||
 | 
			
		||||
: replicate/concatenate ( n obj -- obj' )
 | 
			
		||||
    over zero? [ 2drop epsilon ]
 | 
			
		||||
    [ <repetition> first|concatenation ] if ;
 | 
			
		||||
 | 
			
		||||
: exactly-n ( n -- )
 | 
			
		||||
    stack pop replicate/concatenate push-stack ;
 | 
			
		||||
 | 
			
		||||
: at-least-n ( n -- )
 | 
			
		||||
    stack pop
 | 
			
		||||
    [ replicate/concatenate ] keep
 | 
			
		||||
    <kleene-star> 2array <concatenation> push-stack ;
 | 
			
		||||
 | 
			
		||||
: at-most-n ( n -- )
 | 
			
		||||
    1+
 | 
			
		||||
    stack pop
 | 
			
		||||
    [ replicate/concatenate ] curry map <alternation> push-stack ;
 | 
			
		||||
 | 
			
		||||
: from-m-to-n ( m n -- )
 | 
			
		||||
    [a,b]
 | 
			
		||||
    stack pop
 | 
			
		||||
    [ replicate/concatenate ] curry map
 | 
			
		||||
    <alternation> push-stack ;
 | 
			
		||||
 | 
			
		||||
ERROR: invalid-range a b ;
 | 
			
		||||
 | 
			
		||||
: handle-left-brace ( -- )
 | 
			
		||||
    parse-repetition
 | 
			
		||||
    >r 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ r>
 | 
			
		||||
    [
 | 
			
		||||
        2dup and [ from-m-to-n ]
 | 
			
		||||
        [ [ nip at-most-n ] [ at-least-n ] if* ] if
 | 
			
		||||
    ] [ drop 0 max exactly-n ] if ;
 | 
			
		||||
 | 
			
		||||
: handle-front-anchor ( -- ) front-anchor push-stack ;
 | 
			
		||||
: handle-back-anchor ( -- ) back-anchor push-stack ;
 | 
			
		||||
 | 
			
		||||
ERROR: bad-character-class obj ;
 | 
			
		||||
ERROR: expected-posix-class ;
 | 
			
		||||
 | 
			
		||||
: parse-posix-class ( -- obj )
 | 
			
		||||
    read1 CHAR: { = [ expected-posix-class ] unless
 | 
			
		||||
    "}" read-until [ bad-character-class ] unless
 | 
			
		||||
    {
 | 
			
		||||
        { "Lower" [ get-case-insensitive Letter-class letter-class ? ] }
 | 
			
		||||
        { "Upper" [ get-case-insensitive Letter-class LETTER-class ? ] }
 | 
			
		||||
        { "Alpha" [ Letter-class ] }
 | 
			
		||||
        { "ASCII" [ ascii-class ] }
 | 
			
		||||
        { "Digit" [ digit-class ] }
 | 
			
		||||
        { "Alnum" [ alpha-class ] }
 | 
			
		||||
        { "Punct" [ punctuation-class ] }
 | 
			
		||||
        { "Graph" [ java-printable-class ] }
 | 
			
		||||
        { "Print" [ java-printable-class ] }
 | 
			
		||||
        { "Blank" [ non-newline-blank-class ] }
 | 
			
		||||
        { "Cntrl" [ control-character-class ] }
 | 
			
		||||
        { "XDigit" [ hex-digit-class ] }
 | 
			
		||||
        { "Space" [ java-blank-class ] }
 | 
			
		||||
        ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss
 | 
			
		||||
        [ bad-character-class ]
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
: parse-octal ( -- n ) 3 read oct> check-octal ;
 | 
			
		||||
: parse-short-hex ( -- n ) 2 read hex> check-hex ;
 | 
			
		||||
: parse-long-hex ( -- n ) 6 read hex> check-hex ;
 | 
			
		||||
: parse-control-character ( -- n ) read1 ;
 | 
			
		||||
 | 
			
		||||
ERROR: bad-escaped-literals seq ;
 | 
			
		||||
: parse-escaped-literals ( -- obj )
 | 
			
		||||
    "\\E" read-until [ bad-escaped-literals ] unless
 | 
			
		||||
    read1 drop
 | 
			
		||||
    [ epsilon ] [
 | 
			
		||||
        [ <constant> ] V{ } map-as
 | 
			
		||||
        first|concatenation
 | 
			
		||||
    ] if-empty ;
 | 
			
		||||
 | 
			
		||||
: parse-escaped ( -- obj )
 | 
			
		||||
    read1
 | 
			
		||||
    {
 | 
			
		||||
        { CHAR: \ [ CHAR: \ <constant> ] }
 | 
			
		||||
        { CHAR: . [ CHAR: . <constant> ] }
 | 
			
		||||
        { CHAR: t [ CHAR: \t <constant> ] }
 | 
			
		||||
        { CHAR: n [ CHAR: \n <constant> ] }
 | 
			
		||||
        { CHAR: r [ CHAR: \r <constant> ] }
 | 
			
		||||
        { CHAR: f [ HEX: c <constant> ] }
 | 
			
		||||
        { CHAR: a [ HEX: 7 <constant> ] }
 | 
			
		||||
        { CHAR: e [ HEX: 1b <constant> ] }
 | 
			
		||||
 | 
			
		||||
        { CHAR: d [ digit-class ] }
 | 
			
		||||
        { CHAR: D [ digit-class <negation> ] }
 | 
			
		||||
        { CHAR: s [ java-blank-class ] }
 | 
			
		||||
        { CHAR: S [ java-blank-class <negation> ] }
 | 
			
		||||
        { CHAR: w [ c-identifier-class ] }
 | 
			
		||||
        { CHAR: W [ c-identifier-class <negation> ] }
 | 
			
		||||
 | 
			
		||||
        { CHAR: p [ parse-posix-class ] }
 | 
			
		||||
        { CHAR: P [ parse-posix-class <negation> ] }
 | 
			
		||||
        { CHAR: x [ parse-short-hex <constant> ] }
 | 
			
		||||
        { CHAR: u [ parse-long-hex <constant> ] }
 | 
			
		||||
        { CHAR: 0 [ parse-octal <constant> ] }
 | 
			
		||||
        { CHAR: c [ parse-control-character ] }
 | 
			
		||||
 | 
			
		||||
        ! { CHAR: b [ handle-word-boundary ] }
 | 
			
		||||
        ! { CHAR: B [ handle-word-boundary <negation> ] }
 | 
			
		||||
        ! { CHAR: A [ handle-beginning-of-input ] }
 | 
			
		||||
        ! { CHAR: G [ end of previous match ] }
 | 
			
		||||
        ! { CHAR: Z [ handle-end-of-input ] }
 | 
			
		||||
        ! { CHAR: z [ handle-end-of-input ] } ! except for terminator
 | 
			
		||||
 | 
			
		||||
        { CHAR: Q [ parse-escaped-literals ] }
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
: handle-escape ( -- ) parse-escaped push-stack ;
 | 
			
		||||
 | 
			
		||||
: handle-dash ( vector -- vector' )
 | 
			
		||||
    H{ { dash CHAR: - } } substitute ;
 | 
			
		||||
 | 
			
		||||
: character-class>alternation ( seq -- alternation )
 | 
			
		||||
    [ dup number? [ <constant> ] when ] map first|alternation ;
 | 
			
		||||
 | 
			
		||||
: handle-caret ( vector -- vector' )
 | 
			
		||||
    dup [ length 2 >= ] [ first caret eq? ] bi and [
 | 
			
		||||
        rest-slice character-class>alternation <negation>
 | 
			
		||||
    ] [
 | 
			
		||||
        character-class>alternation
 | 
			
		||||
    ] if ;
 | 
			
		||||
 | 
			
		||||
: make-character-class ( -- character-class )
 | 
			
		||||
    [ beginning-of-character-class swap cut-stack ] change-whole-stack
 | 
			
		||||
    handle-dash handle-caret ;
 | 
			
		||||
 | 
			
		||||
: apply-dash ( -- )
 | 
			
		||||
    stack [ pop3 nip <character-class-range> ] keep push ;
 | 
			
		||||
 | 
			
		||||
: apply-dash? ( -- ? )
 | 
			
		||||
    stack dup length 3 >=
 | 
			
		||||
    [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ;
 | 
			
		||||
 | 
			
		||||
ERROR: empty-negated-character-class ;
 | 
			
		||||
DEFER: handle-left-bracket
 | 
			
		||||
: (parse-character-class) ( -- )
 | 
			
		||||
    read1 [ empty-negated-character-class ] unless* {
 | 
			
		||||
        { CHAR: [ [ handle-left-bracket t ] }
 | 
			
		||||
        { CHAR: ] [ make-character-class push-stack f ] }
 | 
			
		||||
        { CHAR: - [ dash push-stack t ] }
 | 
			
		||||
        { CHAR: \ [ parse-escaped push-stack t ] }
 | 
			
		||||
        [ push-stack apply-dash? [ apply-dash ] when t ]
 | 
			
		||||
    } case
 | 
			
		||||
    [ (parse-character-class) ] when ;
 | 
			
		||||
 | 
			
		||||
: parse-character-class-second ( -- )
 | 
			
		||||
    read1 {
 | 
			
		||||
        { CHAR: [ [ CHAR: [ <constant> push-stack ] }
 | 
			
		||||
        { CHAR: ] [ CHAR: ] <constant> push-stack ] }
 | 
			
		||||
        { CHAR: - [ CHAR: - <constant> push-stack ] }
 | 
			
		||||
        [ push1 ]
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
: parse-character-class-first ( -- )
 | 
			
		||||
    read1 {
 | 
			
		||||
        { CHAR: ^ [ caret push-stack parse-character-class-second ] }
 | 
			
		||||
        { CHAR: [ [ CHAR: [ <constant> push-stack ] }
 | 
			
		||||
        { CHAR: ] [ CHAR: ] <constant> push-stack ] }
 | 
			
		||||
        { CHAR: - [ CHAR: - <constant> push-stack ] }
 | 
			
		||||
        [ push1 ]
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
: handle-left-bracket ( -- )
 | 
			
		||||
    beginning-of-character-class push-stack
 | 
			
		||||
    parse-character-class-first (parse-character-class) ;
 | 
			
		||||
 | 
			
		||||
: finish-regexp-parse ( stack -- obj )
 | 
			
		||||
    { pipe } split
 | 
			
		||||
    [ first|concatenation ] map first|alternation ;
 | 
			
		||||
 | 
			
		||||
: handle-right-parenthesis ( -- )
 | 
			
		||||
    stack beginning-of-group over last-index cut rest
 | 
			
		||||
    [ current-regexp get swap >>stack drop ]
 | 
			
		||||
    [ finish-regexp-parse <capture-group> push-stack ] bi* ;
 | 
			
		||||
 | 
			
		||||
: nested-parse-regexp ( -- )
 | 
			
		||||
    beginning-of-group push-stack (parse-regexp) ;
 | 
			
		||||
 | 
			
		||||
: ((parse-regexp)) ( token -- )
 | 
			
		||||
    {
 | 
			
		||||
        { CHAR: . [ handle-dot ] }
 | 
			
		||||
        { CHAR: ( [ handle-left-parenthesis ] }
 | 
			
		||||
        { CHAR: ) [ handle-right-parenthesis ] }
 | 
			
		||||
        { CHAR: | [ handle-pipe ] }
 | 
			
		||||
        { CHAR: ? [ handle-question ] }
 | 
			
		||||
        { CHAR: * [ handle-star ] }
 | 
			
		||||
        { CHAR: + [ handle-plus ] }
 | 
			
		||||
        { CHAR: { [ handle-left-brace ] }
 | 
			
		||||
        { CHAR: [ [ handle-left-bracket ] }
 | 
			
		||||
        { CHAR: ^ [ handle-front-anchor ] }
 | 
			
		||||
        { CHAR: $ [ handle-back-anchor ] }
 | 
			
		||||
        { CHAR: \ [ handle-escape ] }
 | 
			
		||||
        [ <constant> push-stack ]
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
: (parse-regexp) ( -- )
 | 
			
		||||
    read1 [ ((parse-regexp)) (parse-regexp) ] when* ;
 | 
			
		||||
 | 
			
		||||
: parse-regexp ( regexp -- )
 | 
			
		||||
    dup current-regexp [
 | 
			
		||||
        raw>> [
 | 
			
		||||
            <string-reader> [ (parse-regexp) ] with-input-stream
 | 
			
		||||
        ] unless-empty
 | 
			
		||||
        current-regexp get
 | 
			
		||||
        stack finish-regexp-parse
 | 
			
		||||
            >>parse-tree drop
 | 
			
		||||
    ] with-variable ;
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,14 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: kernel strings help.markup help.syntax regexp2.backend ;
 | 
			
		||||
IN: regexp2
 | 
			
		||||
 | 
			
		||||
HELP: <regexp>
 | 
			
		||||
{ $values { "string" string } { "regexp" regexp } }
 | 
			
		||||
{ $description "Compiles a regular expression into a DFA and returns this object.  Regular expressions only have to be compiled once and can then be used multiple times to match input strings." } ;
 | 
			
		||||
 | 
			
		||||
HELP: <iregexp>
 | 
			
		||||
{ $values { "string" string } { "regexp" regexp } }
 | 
			
		||||
{ $description "Compiles a case-insensitive regular expression into a DFA and returns this object.  Otherwise, exactly the same as " { $link <regexp> } } ;
 | 
			
		||||
 | 
			
		||||
{ <regexp> <iregexp> } related-words
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,263 @@
 | 
			
		|||
USING: regexp2 tools.test kernel sequences regexp2.parser
 | 
			
		||||
regexp2.traversal ;
 | 
			
		||||
IN: regexp2-tests
 | 
			
		||||
 | 
			
		||||
[ f ] [ "b" "a*" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "" "a*" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a*" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aaaaaaa" "a*"  <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "ab" "a*" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "abc" "abc" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a|b|c" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "b" "a|b|c" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "c" "a|b|c" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "c" "d|e|f" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "aa" "a|b|c" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "bb" "a|b|c" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "cc" "a|b|c" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "cc" "d|e|f" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "a+" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a+" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aa" "a+" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "" "a?" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a?" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aa" "a?" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "." <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "." <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "." "." <regexp> matches? ] unit-test
 | 
			
		||||
! [ f ] [ "\n" "." <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" ".+" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" ".+" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "ab" ".+" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
[ t ] [ "" "a|b*|c+|d?" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a|b*|c+|d?" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "c" "a|b*|c+|d?" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "cc" "a|b*|c+|d?" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "ccd" "a|b*|c+|d?" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "d" "a|b*|c+|d?" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "foo" "foo|bar" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "bar" "foo|bar" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "foobar" "foo|bar" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "(a)" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "(a)" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aa" "(a)" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aa" "(a*)" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "aababaaabbac" "(a|b)+" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "ababaaabba" "(a|b)+" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "a{1}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a{1}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aa" "a{1}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "a" "a{2,}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aaa" "a{2,}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aaaa" "a{2,}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aaaaa" "a{2,}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "" "a{,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a{,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aa" "a{,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aaa" "a{,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aaaa" "a{,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aaaaa" "a{,2}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "a{1,3}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "a{1,3}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aa" "a{1,3}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aaa" "a{1,3}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aaaa" "a{1,3}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "[a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[abc]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "b" "[a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "d" "[abc]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "ab" "[abc]{1,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "abc" "[abc]{1,2}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "[^a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "a" "[^a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "a" "[^abc]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "b" "[^a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "d" "[^abc]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "ab" "[^abc]{1,2}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "abc" "[^abc]{1,2}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "]" "[]]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "]" "[^]]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[^]]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ "^" "[^]" <regexp> matches? ] must-fail
 | 
			
		||||
[ t ] [ "^" "[]^]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "]" "[]^]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "[" "[[]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "^" "[^^]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[^^]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "-" "[-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "a" "[-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "-" "[^-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[^-]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "-" "[-a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[-a]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "-" "[a-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[a-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "b" "[a-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "-" "[^-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[^-]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "-" "[a-c]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "-" "[^a-c]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "b" "[a-c]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "b" "[^a-c]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "-" "[a-c-]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "-" "[^a-c-]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "\\" "[\\\\]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "a" "[\\\\]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "\\" "[^\\\\]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[^\\\\]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "0" "[\\d]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "a" "[\\d]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "0" "[^\\d]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[^\\d]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "1000" "\\d{4,6}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "1000" "[0-9]{4,6}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "abc" "\\p{Lower}{3}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "ABC" "\\p{Lower}{3}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "ABC" "\\p{Upper}{3}" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "abc" "\\p{Upper}{3}" <regexp> matches? ] unit-test
 | 
			
		||||
! 
 | 
			
		||||
[ f ] [ "abc" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "ABC" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "a" "\\Q\\E" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "|*+" "\\Q|*+\\E" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "abc" "\\Q|*+\\E" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "s" "\\Qs\\E" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "S" "\\0123" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "x" "\\x78" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "y" "\\x78" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "x" "\\u000078" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "y" "\\u000078" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "ab" "a+b" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "b" "a+b" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aab" "a+b" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "abb" "a+b" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "abbbb" "ab*" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "ab*" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "abab" "ab*" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "x" "\\." <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "." "\\." <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "aaaab" "a+ab" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aaaxb" "a+ab" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aaacb" "a+cb" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test
 | 
			
		||||
[ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
 | 
			
		||||
[ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test
 | 
			
		||||
[ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
 | 
			
		||||
[ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
 | 
			
		||||
[ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
 | 
			
		||||
 | 
			
		||||
! [ t ] [ "aaa" "AAA" t <regexp> matches? ] unit-test
 | 
			
		||||
! [ f ] [ "aax" "AAA" t <regexp> matches? ] unit-test
 | 
			
		||||
! [ t ] [ "aaa" "A*" t <regexp> matches? ] unit-test
 | 
			
		||||
! [ f ] [ "aaba" "A*" t <regexp> matches? ] unit-test
 | 
			
		||||
! [ t ] [ "b" "[AB]" t <regexp> matches? ] unit-test
 | 
			
		||||
! [ f ] [ "c" "[AB]" t <regexp> matches? ] unit-test
 | 
			
		||||
! [ t ] [ "c" "[A-Z]" t <regexp> matches? ] unit-test
 | 
			
		||||
! [ f ] [ "3" "[A-Z]" t <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ ] [
 | 
			
		||||
    "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))"
 | 
			
		||||
    <regexp> drop
 | 
			
		||||
] unit-test
 | 
			
		||||
 | 
			
		||||
[ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
 | 
			
		||||
 | 
			
		||||
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
! [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
 | 
			
		||||
! [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
 | 
			
		||||
 | 
			
		||||
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
 | 
			
		||||
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
 | 
			
		||||
 | 
			
		||||
! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test
 | 
			
		||||
! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
 | 
			
		||||
! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
 | 
			
		||||
! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
 | 
			
		||||
! [ f ] [ "fooxbar" "foo\\bxbar" <regexp> matches? ] unit-test
 | 
			
		||||
! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test
 | 
			
		||||
! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-head ] unit-test
 | 
			
		||||
! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test
 | 
			
		||||
! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
 | 
			
		||||
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
 | 
			
		||||
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
 | 
			
		||||
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "A" "[a-z]" <iregexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "A" "\\p{Lower}" <iregexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ "abc" <reversed> "abc" <rregexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "abc" <reversed> "a[bB][cC]" <rregexp> matches? ] unit-test
 | 
			
		||||
[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" <rregexp> matches? ] unit-test
 | 
			
		||||
 | 
			
		||||
! Bug in parsing word
 | 
			
		||||
! [ t ] [ "a" R' a' matches?  ] unit-test
 | 
			
		||||
 | 
			
		||||
! ((A)(B(C)))
 | 
			
		||||
! 1.  ((A)(B(C)))
 | 
			
		||||
! 2. (A)
 | 
			
		||||
! 3. (B(C))
 | 
			
		||||
! 4. (C) 
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,59 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors combinators kernel math math.ranges
 | 
			
		||||
sequences regexp2.backend regexp2.utils memoize sets
 | 
			
		||||
regexp2.parser regexp2.nfa regexp2.dfa regexp2.traversal
 | 
			
		||||
regexp2.transition-tables ;
 | 
			
		||||
IN: regexp2
 | 
			
		||||
 | 
			
		||||
: default-regexp ( string -- regexp )
 | 
			
		||||
    regexp new
 | 
			
		||||
        swap >>raw
 | 
			
		||||
        <transition-table> >>nfa-table
 | 
			
		||||
        <transition-table> >>dfa-table
 | 
			
		||||
        <transition-table> >>minimized-table
 | 
			
		||||
        reset-regexp ;
 | 
			
		||||
 | 
			
		||||
: construct-regexp ( regexp -- regexp' )
 | 
			
		||||
    {
 | 
			
		||||
        [ parse-regexp ]
 | 
			
		||||
        [ construct-nfa ]
 | 
			
		||||
        [ construct-dfa ]
 | 
			
		||||
        [ ]
 | 
			
		||||
    } cleave ;
 | 
			
		||||
 | 
			
		||||
: match ( string regexp -- pair )
 | 
			
		||||
    <dfa-traverser> do-match return-match ;
 | 
			
		||||
 | 
			
		||||
: matches? ( string regexp -- ? )
 | 
			
		||||
    dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ;
 | 
			
		||||
 | 
			
		||||
: match-head ( string regexp -- end ) match length>> 1- ;
 | 
			
		||||
 | 
			
		||||
: initial-option ( regexp option -- regexp' )
 | 
			
		||||
    over options>> conjoin ;
 | 
			
		||||
 | 
			
		||||
: <regexp> ( string -- regexp )
 | 
			
		||||
    default-regexp construct-regexp ;
 | 
			
		||||
 | 
			
		||||
: <iregexp> ( string -- regexp )
 | 
			
		||||
    default-regexp
 | 
			
		||||
    case-insensitive initial-option
 | 
			
		||||
    construct-regexp ;
 | 
			
		||||
 | 
			
		||||
: <rregexp> ( string -- regexp )
 | 
			
		||||
    default-regexp
 | 
			
		||||
    reversed-regexp initial-option
 | 
			
		||||
    construct-regexp ;
 | 
			
		||||
 | 
			
		||||
: R! CHAR: ! <regexp> ; parsing
 | 
			
		||||
: R" CHAR: " <regexp> ; parsing
 | 
			
		||||
: R# CHAR: # <regexp> ; parsing
 | 
			
		||||
: R' CHAR: ' <regexp> ; parsing
 | 
			
		||||
: R( CHAR: ) <regexp> ; parsing
 | 
			
		||||
: R/ CHAR: / <regexp> ; parsing
 | 
			
		||||
: R@ CHAR: @ <regexp> ; parsing
 | 
			
		||||
: R[ CHAR: ] <regexp> ; parsing
 | 
			
		||||
: R` CHAR: ` <regexp> ; parsing
 | 
			
		||||
: R{ CHAR: } <regexp> ; parsing
 | 
			
		||||
: R| CHAR: | <regexp> ; parsing
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
Regular expressions
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,2 @@
 | 
			
		|||
parsing
 | 
			
		||||
text
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,44 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors arrays assocs fry hashtables kernel sequences
 | 
			
		||||
vectors ;
 | 
			
		||||
IN: regexp2.transition-tables
 | 
			
		||||
 | 
			
		||||
: insert-at ( value key hash -- )
 | 
			
		||||
    2dup at* [
 | 
			
		||||
        2nip push
 | 
			
		||||
    ] [
 | 
			
		||||
        drop >r >r dup vector? [ 1vector ] unless r> r> set-at
 | 
			
		||||
    ] if ;
 | 
			
		||||
 | 
			
		||||
: ?insert-at ( value key hash/f -- hash )
 | 
			
		||||
    [ H{ } clone ] unless* [ insert-at ] keep ;
 | 
			
		||||
 | 
			
		||||
TUPLE: transition from to obj ;
 | 
			
		||||
TUPLE: literal-transition < transition ;
 | 
			
		||||
TUPLE: class-transition < transition ;
 | 
			
		||||
TUPLE: default-transition < transition ;
 | 
			
		||||
 | 
			
		||||
TUPLE: literal obj ;
 | 
			
		||||
TUPLE: class obj ;
 | 
			
		||||
TUPLE: default ;
 | 
			
		||||
: <literal-transition> ( from to obj -- transition ) literal-transition boa ;
 | 
			
		||||
: <class-transition> ( from to obj -- transition ) class-transition boa ;
 | 
			
		||||
: <default-transition> ( from to -- transition ) t default-transition boa ;
 | 
			
		||||
 | 
			
		||||
TUPLE: transition-table transitions
 | 
			
		||||
    literals classes defaults
 | 
			
		||||
    start-state final-states ;
 | 
			
		||||
 | 
			
		||||
: <transition-table> ( -- transition-table )
 | 
			
		||||
    transition-table new
 | 
			
		||||
        H{ } clone >>transitions
 | 
			
		||||
        H{ } clone >>final-states ;
 | 
			
		||||
 | 
			
		||||
: set-transition ( transition hash -- )
 | 
			
		||||
    >r [ to>> ] [ obj>> ] [ from>> ] tri r>
 | 
			
		||||
    2dup at* [ 2nip insert-at ]
 | 
			
		||||
    [ drop >r >r H{ } clone [ insert-at ] keep r> r> set-at ] if ;
 | 
			
		||||
 | 
			
		||||
: add-transition ( transition transition-table -- )
 | 
			
		||||
    transitions>> set-transition ;
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,80 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors assocs combinators combinators.lib kernel
 | 
			
		||||
math math.ranges quotations sequences regexp2.parser
 | 
			
		||||
regexp2.classes combinators.short-circuit assocs.lib
 | 
			
		||||
sequences.lib ;
 | 
			
		||||
IN: regexp2.traversal
 | 
			
		||||
 | 
			
		||||
TUPLE: dfa-traverser
 | 
			
		||||
    dfa-table
 | 
			
		||||
    last-state current-state
 | 
			
		||||
    text
 | 
			
		||||
    start-index current-index
 | 
			
		||||
    matches ;
 | 
			
		||||
 | 
			
		||||
: <dfa-traverser> ( text regexp -- match )
 | 
			
		||||
    dfa-table>>
 | 
			
		||||
    dfa-traverser new
 | 
			
		||||
        swap [ start-state>> >>current-state ] keep
 | 
			
		||||
        >>dfa-table
 | 
			
		||||
        swap >>text
 | 
			
		||||
        0 >>start-index
 | 
			
		||||
        0 >>current-index
 | 
			
		||||
        V{ } clone >>matches ;
 | 
			
		||||
 | 
			
		||||
: final-state? ( dfa-traverser -- ? )
 | 
			
		||||
    [ current-state>> ] [ dfa-table>> final-states>> ] bi
 | 
			
		||||
    key? ;
 | 
			
		||||
 | 
			
		||||
: text-finished? ( dfa-traverser -- ? )
 | 
			
		||||
    [ current-index>> ] [ text>> length ] bi >= ;
 | 
			
		||||
 | 
			
		||||
: save-final-state ( dfa-straverser -- )
 | 
			
		||||
    [ current-index>> ] [ matches>> ] bi push ;
 | 
			
		||||
 | 
			
		||||
: match-done? ( dfa-traverser -- ? )
 | 
			
		||||
    dup final-state? [
 | 
			
		||||
        dup save-final-state
 | 
			
		||||
    ] when text-finished? ;
 | 
			
		||||
 | 
			
		||||
: increment-state ( dfa-traverser state -- dfa-traverser )
 | 
			
		||||
    >r [ 1+ ] change-current-index
 | 
			
		||||
    dup current-state>> >>last-state r>
 | 
			
		||||
    first >>current-state ;
 | 
			
		||||
 | 
			
		||||
: match-failed ( dfa-traverser -- dfa-traverser )
 | 
			
		||||
    V{ } clone >>matches ;
 | 
			
		||||
 | 
			
		||||
: match-literal ( transition from-state table -- to-state/f )
 | 
			
		||||
    transitions>> [ at ] [ 2drop f ] if-at ;
 | 
			
		||||
 | 
			
		||||
: assoc-with ( param assoc quot -- assoc curry )
 | 
			
		||||
    swapd [ [ -rot ] dip call ] 2curry ; inline
 | 
			
		||||
 | 
			
		||||
: match-class ( transition from-state table -- to-state/f )
 | 
			
		||||
    transitions>> at* [
 | 
			
		||||
        [ drop class-member? ] assoc-with assoc-find [ nip ] [ drop ] if
 | 
			
		||||
    ] [ drop ] if ;
 | 
			
		||||
 | 
			
		||||
: match-default ( transition from-state table -- to-state/f )
 | 
			
		||||
    [ nip ] dip transitions>>
 | 
			
		||||
    [ t swap [ drop f ] unless-at ] [ drop f ] if-at ;
 | 
			
		||||
 | 
			
		||||
: match-transition ( obj from-state dfa -- to-state/f )
 | 
			
		||||
    { [ match-literal ] [ match-class ] [ match-default ] } 3|| ;
 | 
			
		||||
 | 
			
		||||
: setup-match ( match -- obj state dfa-table )
 | 
			
		||||
    { current-index>> text>> current-state>> dfa-table>> } get-slots
 | 
			
		||||
    [ nth ] 2dip ;
 | 
			
		||||
 | 
			
		||||
: do-match ( dfa-traverser -- dfa-traverser )
 | 
			
		||||
    dup match-done? [
 | 
			
		||||
        dup setup-match match-transition
 | 
			
		||||
        [ increment-state do-match ] when*
 | 
			
		||||
    ] unless ;
 | 
			
		||||
 | 
			
		||||
: return-match ( dfa-traverser -- interval/f )
 | 
			
		||||
    dup matches>>
 | 
			
		||||
    [ drop f ]
 | 
			
		||||
    [ [ start-index>> ] [ peek ] bi* 1 <range> ] if-empty ;
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,69 @@
 | 
			
		|||
! Copyright (C) 2008 Doug Coleman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors arrays assocs combinators.lib io kernel
 | 
			
		||||
math math.order namespaces regexp2.backend sequences
 | 
			
		||||
sequences.lib unicode.categories math.ranges fry
 | 
			
		||||
combinators.short-circuit ;
 | 
			
		||||
IN: regexp2.utils
 | 
			
		||||
 | 
			
		||||
: (while-changes) ( obj quot pred pred-ret -- obj )
 | 
			
		||||
    ! quot: ( obj -- obj' )
 | 
			
		||||
    ! pred: ( obj -- <=> )
 | 
			
		||||
    >r >r dup slip r> pick over call r> dupd =
 | 
			
		||||
    [ 3drop ] [ (while-changes) ] if ; inline
 | 
			
		||||
 | 
			
		||||
: while-changes ( obj quot pred -- obj' )
 | 
			
		||||
    pick over call (while-changes) ; inline
 | 
			
		||||
 | 
			
		||||
: last-state ( regexp -- range ) stack>> peek first2 [a,b] ;
 | 
			
		||||
: push1 ( obj -- ) input-stream get stream>> push ;
 | 
			
		||||
: peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
 | 
			
		||||
: pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
 | 
			
		||||
 | 
			
		||||
: stack ( -- obj ) current-regexp get stack>> ;
 | 
			
		||||
: change-whole-stack ( quot -- )
 | 
			
		||||
    current-regexp get
 | 
			
		||||
    [ stack>> swap call ] keep (>>stack) ; inline
 | 
			
		||||
: push-stack ( obj -- ) stack push ;
 | 
			
		||||
: pop-stack ( -- obj ) stack pop ;
 | 
			
		||||
: cut-out ( vector n -- vector' vector ) cut rest ;
 | 
			
		||||
ERROR: cut-stack-error ;
 | 
			
		||||
: cut-stack ( obj vector -- vector' vector )
 | 
			
		||||
    tuck last-index [ cut-stack-error ] unless* cut-out swap ;
 | 
			
		||||
 | 
			
		||||
ERROR: bad-octal number ;
 | 
			
		||||
ERROR: bad-hex number ;
 | 
			
		||||
: check-octal ( octal -- octal ) dup 255 > [ bad-octal ] when ;
 | 
			
		||||
: check-hex ( hex -- hex ) dup number? [ bad-hex ] unless ;
 | 
			
		||||
 | 
			
		||||
: ascii? ( n -- ? ) 0 HEX: 7f between? ;
 | 
			
		||||
: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ;
 | 
			
		||||
: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ;
 | 
			
		||||
 | 
			
		||||
: hex-digit? ( n -- ? )
 | 
			
		||||
    [
 | 
			
		||||
        [ decimal-digit? ]
 | 
			
		||||
        [ CHAR: a CHAR: f between? ]
 | 
			
		||||
        [ CHAR: A CHAR: F between? ]
 | 
			
		||||
    ] 1|| ;
 | 
			
		||||
 | 
			
		||||
: control-char? ( n -- ? )
 | 
			
		||||
    [
 | 
			
		||||
        [ 0 HEX: 1f between? ]
 | 
			
		||||
        [ HEX: 7f = ]
 | 
			
		||||
    ] 1|| ;
 | 
			
		||||
 | 
			
		||||
: punct? ( n -- ? )
 | 
			
		||||
    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
 | 
			
		||||
 | 
			
		||||
: c-identifier-char? ( ch -- ? )
 | 
			
		||||
    [ [ alpha? ] [ CHAR: _ = ] ] 1|| ;
 | 
			
		||||
 | 
			
		||||
: java-blank? ( n -- ? )
 | 
			
		||||
    {
 | 
			
		||||
        CHAR: \s CHAR: \t CHAR: \n
 | 
			
		||||
        HEX: b HEX: 7 CHAR: \r
 | 
			
		||||
    } member? ;
 | 
			
		||||
 | 
			
		||||
: java-printable? ( n -- ? )
 | 
			
		||||
    [ [ alpha? ] [ punct? ] ] 1|| ;
 | 
			
		||||
| 
						 | 
				
			
			@ -8,7 +8,7 @@ SYMBOL: errored
 | 
			
		|||
SYMBOL: before
 | 
			
		||||
SYMBOL: after
 | 
			
		||||
SYMBOL: quot
 | 
			
		||||
TUPLE: random-tester-error ;
 | 
			
		||||
ERROR: random-tester-error ;
 | 
			
		||||
 | 
			
		||||
: setup-test ( #data #code -- data... quot )
 | 
			
		||||
    #! Variable stack effect
 | 
			
		||||
| 
						 | 
				
			
			@ -35,7 +35,7 @@ TUPLE: random-tester-error ;
 | 
			
		|||
            "--" print
 | 
			
		||||
            [ . ] each
 | 
			
		||||
            quot get .
 | 
			
		||||
            random-tester-error construct-empty throw
 | 
			
		||||
            random-tester-error
 | 
			
		||||
        ] if
 | 
			
		||||
    ] unless clear ;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,5 +1,7 @@
 | 
			
		|||
USING: kernel namespaces sequences sorting vocabs ;
 | 
			
		||||
USING: arrays assocs generic hashtables  math math.intervals math.parser math.functions refs shuffle vectors words ;
 | 
			
		||||
USING: kernel namespaces sequences sets sorting vocabs ;
 | 
			
		||||
USING: arrays assocs generic hashtables 
 | 
			
		||||
math math.intervals math.parser math.order math.functions
 | 
			
		||||
refs shuffle vectors words ;
 | 
			
		||||
IN: random-tester.safe-words
 | 
			
		||||
 | 
			
		||||
: ?-words
 | 
			
		||||
| 
						 | 
				
			
			@ -16,7 +18,11 @@ IN: random-tester.safe-words
 | 
			
		|||
        array? integer? complex? value-ref? ref? key-ref?
 | 
			
		||||
        interval? number?
 | 
			
		||||
        wrapper? tuple?
 | 
			
		||||
        [-1,1]? between? bignum? both? either? eq? equal? even? fixnum? float? fp-nan? hashtable? interval-contains? interval-subset? interval? key-ref? key? number? odd? pair? power-of-2? ratio? rational? real? subassoc? zero? assoc? curry? vector? callstack? ! clear 3.14 [ <vector> assoc? ] compile-1
 | 
			
		||||
        [-1,1]? between? bignum? both? either? eq? equal? even? fixnum?
 | 
			
		||||
        float? fp-nan? hashtable? interval-contains? interval-subset?
 | 
			
		||||
        interval? key-ref? key? number? odd? pair? power-of-2?
 | 
			
		||||
        ratio? rational? real? zero? assoc? curry? vector? callstack?
 | 
			
		||||
 | 
			
		||||
        2^ not
 | 
			
		||||
        ! arrays
 | 
			
		||||
        resize-array <array>
 | 
			
		||||
| 
						 | 
				
			
			@ -64,6 +70,9 @@ IN: random-tester.safe-words
 | 
			
		|||
        retainstack callstack
 | 
			
		||||
        datastack
 | 
			
		||||
        callstack>array
 | 
			
		||||
 | 
			
		||||
        curry 2curry 3curry compose 3compose
 | 
			
		||||
        (assoc-each)
 | 
			
		||||
    } ;
 | 
			
		||||
 | 
			
		||||
: exit-words
 | 
			
		||||
| 
						 | 
				
			
			@ -83,8 +92,9 @@ IN: random-tester.safe-words
 | 
			
		|||
    ] { } make ;
 | 
			
		||||
 | 
			
		||||
: safe-words ( -- array )
 | 
			
		||||
    bad-words {
 | 
			
		||||
        "alists" "arrays" "assocs" ! "bit-arrays" "byte-arrays"
 | 
			
		||||
    {
 | 
			
		||||
        ! "accessors"
 | 
			
		||||
        "alists" "arrays" "assocs" "bit-arrays" "byte-arrays"
 | 
			
		||||
        ! "classes" "combinators" "compiler" "continuations"
 | 
			
		||||
        ! "core-foundation" "definitions" "documents"
 | 
			
		||||
        ! "float-arrays" "generic" "graphs" "growable"
 | 
			
		||||
| 
						 | 
				
			
			@ -92,19 +102,21 @@ IN: random-tester.safe-words
 | 
			
		|||
        "kernel" "math"
 | 
			
		||||
        "math.bitfields" "math.complex" "math.constants" "math.floats"
 | 
			
		||||
        "math.functions" "math.integers" "math.intervals" "math.libm"
 | 
			
		||||
        "math.parser" "math.ratios" "math.vectors"
 | 
			
		||||
        ! "namespaces" "quotations" "sbufs"
 | 
			
		||||
        "math.parser" "math.order" "math.ratios" "math.vectors"
 | 
			
		||||
        ! "namespaces"
 | 
			
		||||
        "quotations" "sbufs"
 | 
			
		||||
        ! "queues" "strings" "sequences"
 | 
			
		||||
        "sets"
 | 
			
		||||
        "vectors"
 | 
			
		||||
        ! "words"
 | 
			
		||||
    } [ words ] map concat seq-diff natural-sort ;
 | 
			
		||||
    } [ words ] map concat bad-words diff natural-sort ;
 | 
			
		||||
    
 | 
			
		||||
safe-words \ safe-words set-global
 | 
			
		||||
 | 
			
		||||
! foo dup (clone) = .
 | 
			
		||||
! foo dup clone = .
 | 
			
		||||
! f [ byte-array>bignum assoc-clone-like ] compile-1
 | 
			
		||||
! 2 3.14 [ construct-empty number= ] compile-1
 | 
			
		||||
! 2 3.14 [ number= ] compile-1
 | 
			
		||||
! 3.14 [ <vector> assoc? ] compile-1
 | 
			
		||||
! -3 [ ] 2 [ byte-array>bignum denominator ] compile-1
 | 
			
		||||
 | 
			
		||||
! : foo ( x -- y ) euler bitand ; { foo } compile 20 foo
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue