Merge branch 'master' of git://factorcode.org/git/factor

db4
Slava Pestov 2008-11-24 00:30:09 -06:00
commit d3d57c800b
9 changed files with 310 additions and 160 deletions

View File

@ -7,6 +7,7 @@ IN: regexp.classes
GENERIC: class-member? ( obj class -- ? ) GENERIC: class-member? ( obj class -- ? )
M: word class-member? ( obj class -- ? ) 2drop f ; M: word class-member? ( obj class -- ? ) 2drop f ;
M: integer class-member? ( obj class -- ? ) 2drop f ; M: integer class-member? ( obj class -- ? ) 2drop f ;
M: character-class-range class-member? ( obj class -- ? ) M: character-class-range class-member? ( obj class -- ? )
@ -60,3 +61,12 @@ M: java-blank-class class-member? ( obj class -- ? )
M: unmatchable-class class-member? ( obj class -- ? ) M: unmatchable-class class-member? ( obj class -- ? )
2drop f ; 2drop f ;
M: terminator-class class-member? ( obj class -- ? )
drop {
[ CHAR: \r = ]
[ CHAR: \n = ]
[ CHAR: \u000085 = ]
[ CHAR: \u002028 = ]
[ CHAR: \u002029 = ]
} 1|| ;

View File

@ -43,7 +43,8 @@ IN: regexp.dfa
dupd pop dup pick find-transitions rot dupd pop dup pick find-transitions rot
[ [
[ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep [ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep
>r swapd transition make-transition r> dfa-table>> add-transition [ swapd transition make-transition ] dip
dfa-table>> add-transition
] curry with each ] curry with each
new-transitions new-transitions
] if-empty ; ] if-empty ;

View File

@ -18,6 +18,9 @@ SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
: next-state ( regexp -- state ) : next-state ( regexp -- state )
[ state>> ] [ [ 1+ ] change-state drop ] bi ; [ state>> ] [ [ 1+ ] change-state drop ] bi ;
@ -135,7 +138,21 @@ M: non-capture-group nfa-node ( node -- )
M: reluctant-kleene-star nfa-node ( node -- ) M: reluctant-kleene-star nfa-node ( node -- )
term>> <kleene-star> nfa-node ; term>> <kleene-star> nfa-node ;
!
: add-epsilon-flag ( flag -- )
eps literal-transition add-simple-entry add-traversal-flag ;
M: beginning-of-line nfa-node ( node -- )
drop beginning-of-line add-epsilon-flag ;
M: end-of-line nfa-node ( node -- )
drop end-of-line add-epsilon-flag ;
M: beginning-of-input nfa-node ( node -- )
drop beginning-of-input add-epsilon-flag ;
M: end-of-input nfa-node ( node -- )
drop end-of-input add-epsilon-flag ;
M: negation nfa-node ( node -- ) M: negation nfa-node ( node -- )
negation-mode inc negation-mode inc

View File

@ -4,7 +4,7 @@ USING: accessors arrays assocs combinators io io.streams.string
kernel math math.parser namespaces qualified sets kernel math math.parser namespaces qualified sets
quotations sequences splitting symbols vectors math.order quotations sequences splitting symbols vectors math.order
unicode.categories strings regexp.backend regexp.utils unicode.categories strings regexp.backend regexp.utils
unicode.case words ; unicode.case words locals ;
IN: regexp.parser IN: regexp.parser
FROM: math.ranges => [a,b] ; FROM: math.ranges => [a,b] ;
@ -44,18 +44,21 @@ TUPLE: character-class-range from to ; INSTANCE: character-class-range node
SINGLETON: epsilon INSTANCE: epsilon node SINGLETON: epsilon INSTANCE: epsilon node
SINGLETON: any-char INSTANCE: any-char node SINGLETON: any-char INSTANCE: any-char node
SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node
SINGLETON: front-anchor INSTANCE: front-anchor node SINGLETON: beginning-of-input INSTANCE: beginning-of-input node
SINGLETON: back-anchor INSTANCE: back-anchor node SINGLETON: end-of-input INSTANCE: end-of-input node
SINGLETON: beginning-of-line INSTANCE: beginning-of-line node
SINGLETON: end-of-line INSTANCE: end-of-line node
TUPLE: option-on option ; INSTANCE: option-on node TUPLE: option-on option ; INSTANCE: option-on node
TUPLE: option-off option ; INSTANCE: option-off node TUPLE: option-off option ; INSTANCE: option-off node
SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ; SINGLETONS: unix-lines dotall multiline comments case-insensitive
unicode-case reversed-regexp ;
SINGLETONS: letter-class LETTER-class Letter-class digit-class SINGLETONS: letter-class LETTER-class Letter-class digit-class
alpha-class non-newline-blank-class alpha-class non-newline-blank-class
ascii-class punctuation-class java-printable-class blank-class ascii-class punctuation-class java-printable-class blank-class
control-character-class hex-digit-class java-blank-class c-identifier-class control-character-class hex-digit-class java-blank-class c-identifier-class
unmatchable-class ; terminator-class unmatchable-class word-boundary-class ;
SINGLETONS: beginning-of-group end-of-group SINGLETONS: beginning-of-group end-of-group
beginning-of-character-class end-of-character-class beginning-of-character-class end-of-character-class
@ -225,26 +228,12 @@ ERROR: invalid-range a b ;
: handle-left-brace ( -- ) : handle-left-brace ( -- )
parse-repetition parse-repetition
>r 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ r> [ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip
[ [
2dup and [ from-m-to-n ] 2dup and [ from-m-to-n ]
[ [ nip at-most-n ] [ at-least-n ] if* ] if [ [ nip at-most-n ] [ at-least-n ] if* ] if
] [ drop 0 max exactly-n ] if ; ] [ drop 0 max exactly-n ] if ;
SINGLETON: beginning-of-input
SINGLETON: end-of-input
: newlines ( -- obj1 obj2 obj3 )
CHAR: \r <constant>
CHAR: \n <constant>
2dup 2array <concatenation> ;
: beginning-of-line ( -- obj )
beginning-of-input newlines 4array <alternation> lookbehind boa ;
: end-of-line ( -- obj )
end-of-input newlines 4array <alternation> lookahead boa ;
: handle-front-anchor ( -- ) : handle-front-anchor ( -- )
get-multiline beginning-of-line beginning-of-input ? push-stack ; get-multiline beginning-of-line beginning-of-input ? push-stack ;
@ -281,13 +270,26 @@ ERROR: expected-posix-class ;
: parse-control-character ( -- n ) read1 ; : parse-control-character ( -- n ) read1 ;
ERROR: bad-escaped-literals seq ; ERROR: bad-escaped-literals seq ;
: parse-escaped-literals ( -- obj )
"\\E" read-until [ bad-escaped-literals ] unless : parse-til-E ( -- obj )
"\\E" read-until [ bad-escaped-literals ] unless ;
:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj )
parse-til-E
drop1 drop1
[ epsilon ] [ [ epsilon ] [
[ <constant> ] V{ } map-as [ quot call <constant> ] V{ } map-as
first|concatenation first|concatenation
] if-empty ; ] if-empty ; inline
: parse-escaped-literals ( -- obj )
[ ] (parse-escaped-literals) ;
: lower-case-literals ( -- obj )
[ ch>lower ] (parse-escaped-literals) ;
: upper-case-literals ( -- obj )
[ ch>upper ] (parse-escaped-literals) ;
: parse-escaped ( -- obj ) : parse-escaped ( -- obj )
read1 read1
@ -299,12 +301,12 @@ ERROR: bad-escaped-literals seq ;
{ CHAR: a [ HEX: 7 <constant> ] } { CHAR: a [ HEX: 7 <constant> ] }
{ CHAR: e [ HEX: 1b <constant> ] } { CHAR: e [ HEX: 1b <constant> ] }
{ CHAR: d [ digit-class ] }
{ CHAR: D [ digit-class <negation> ] }
{ CHAR: s [ java-blank-class ] }
{ CHAR: S [ java-blank-class <negation> ] }
{ CHAR: w [ c-identifier-class ] } { CHAR: w [ c-identifier-class ] }
{ CHAR: W [ c-identifier-class <negation> ] } { CHAR: W [ c-identifier-class <negation> ] }
{ CHAR: s [ java-blank-class ] }
{ CHAR: S [ java-blank-class <negation> ] }
{ CHAR: d [ digit-class ] }
{ CHAR: D [ digit-class <negation> ] }
{ CHAR: p [ parse-posix-class ] } { CHAR: p [ parse-posix-class ] }
{ CHAR: P [ parse-posix-class <negation> ] } { CHAR: P [ parse-posix-class <negation> ] }
@ -313,13 +315,19 @@ ERROR: bad-escaped-literals seq ;
{ CHAR: 0 [ parse-octal <constant> ] } { CHAR: 0 [ parse-octal <constant> ] }
{ CHAR: c [ parse-control-character ] } { CHAR: c [ parse-control-character ] }
! { CHAR: b [ handle-word-boundary ] } { CHAR: Q [ parse-escaped-literals ] }
! { CHAR: B [ handle-word-boundary <negation> ] }
! { CHAR: A [ handle-beginning-of-input ] }
! { CHAR: G [ end of previous match ] }
! { CHAR: Z [ handle-end-of-input ] }
! { CHAR: z [ handle-end-of-input ] } ! except for terminator
! { CHAR: b [ word-boundary-class ] }
! { CHAR: B [ word-boundary-class <negation> ] }
! { CHAR: A [ handle-beginning-of-input ] }
! { CHAR: z [ handle-end-of-input ] }
! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator
! m//g mode
! { CHAR: G [ end of previous match ] }
! Group capture
! { CHAR: 1 [ CHAR: 1 <constant> ] } ! { CHAR: 1 [ CHAR: 1 <constant> ] }
! { CHAR: 2 [ CHAR: 2 <constant> ] } ! { CHAR: 2 [ CHAR: 2 <constant> ] }
! { CHAR: 3 [ CHAR: 3 <constant> ] } ! { CHAR: 3 [ CHAR: 3 <constant> ] }
@ -330,7 +338,11 @@ ERROR: bad-escaped-literals seq ;
! { CHAR: 8 [ CHAR: 8 <constant> ] } ! { CHAR: 8 [ CHAR: 8 <constant> ] }
! { CHAR: 9 [ CHAR: 9 <constant> ] } ! { CHAR: 9 [ CHAR: 9 <constant> ] }
{ CHAR: Q [ parse-escaped-literals ] } ! Perl extensions
! can't do \l and \u because \u is already a 4-hex
{ CHAR: L [ lower-case-literals ] }
{ CHAR: U [ upper-case-literals ] }
[ <constant> ] [ <constant> ]
} case ; } case ;

View File

@ -45,6 +45,7 @@ IN: regexp-tests
! Off by default. ! Off by default.
[ f ] [ "\n" "." <regexp> matches? ] unit-test [ f ] [ "\n" "." <regexp> matches? ] unit-test
[ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test [ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
[ t ] [ "\n" R/ ./s matches? ] unit-test
[ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test [ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test
[ f ] [ "" ".+" <regexp> matches? ] unit-test [ f ] [ "" ".+" <regexp> matches? ] unit-test
@ -210,34 +211,34 @@ IN: regexp-tests
[ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test [ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test
[ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test [ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test
[ t ] [ "aaa" "AAA" <iregexp> matches? ] unit-test [ t ] [ "aaa" R/ AAA/i matches? ] unit-test
[ f ] [ "aax" "AAA" <iregexp> matches? ] unit-test [ f ] [ "aax" R/ AAA/i matches? ] unit-test
[ t ] [ "aaa" "A*" <iregexp> matches? ] unit-test [ t ] [ "aaa" R/ A*/i matches? ] unit-test
[ f ] [ "aaba" "A*" <iregexp> matches? ] unit-test [ f ] [ "aaba" R/ A*/i matches? ] unit-test
[ t ] [ "b" "[AB]" <iregexp> matches? ] unit-test [ t ] [ "b" R/ [AB]/i matches? ] unit-test
[ f ] [ "c" "[AB]" <iregexp> matches? ] unit-test [ f ] [ "c" R/ [AB]/i matches? ] unit-test
[ t ] [ "c" "[A-Z]" <iregexp> matches? ] unit-test [ t ] [ "c" R/ [A-Z]/i matches? ] unit-test
[ f ] [ "3" "[A-Z]" <iregexp> matches? ] unit-test [ f ] [ "3" R/ [A-Z]/i matches? ] unit-test
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test [ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test [ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test [ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test [ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test [ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test [ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test [ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test [ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test [ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
[ t ] [ "A" "[a-z]" <iregexp> matches? ] unit-test [ t ] [ "A" R/ [a-z]/i matches? ] unit-test
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test [ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
[ t ] [ "A" "\\p{Lower}" <iregexp> matches? ] unit-test [ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test
[ t ] [ "abc" <reversed> "abc" <rregexp> matches? ] unit-test [ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
[ t ] [ "abc" <reversed> "a[bB][cC]" <rregexp> matches? ] unit-test [ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" <rregexp> matches? ] unit-test [ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/r matches? ] unit-test
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
@ -253,7 +254,7 @@ IN: regexp-tests
[ ] [ "(\\$[\\p{XDigit}]|[\\p{Digit}])" <regexp> drop ] unit-test [ ] [ "(\\$[\\p{XDigit}]|[\\p{Digit}])" <regexp> drop ] unit-test
! Comment ! Comment inside a regular expression
[ t ] [ "ac" "a(?#boo)c" <regexp> matches? ] unit-test [ t ] [ "ac" "a(?#boo)c" <regexp> matches? ] unit-test
[ ] [ "USING: regexp kernel ; R' -{3}[+]{1,6}(?:!!)?\\s' drop" eval ] unit-test [ ] [ "USING: regexp kernel ; R' -{3}[+]{1,6}(?:!!)?\\s' drop" eval ] unit-test
@ -283,32 +284,90 @@ IN: regexp-tests
[ { "ABC" "DEF" "GHI" } ] [ { "ABC" "DEF" "GHI" } ]
[ "1ABC2DEF3GHI4" R/ [A-Z]+/ all-matches [ >string ] map ] unit-test [ "1ABC2DEF3GHI4" R/ [A-Z]+/ all-matches [ >string ] map ] unit-test
[ 3 ]
[ "1ABC2DEF3GHI4" R/ [A-Z]+/ count-matches ] unit-test
[ 0 ]
[ "123" R/ [A-Z]+/ count-matches ] unit-test
[ "1.2.3.4" ] [ "1.2.3.4" ]
[ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test [ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test
[ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test [ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test
[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test [ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test
[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test [ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test
[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test [ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
! [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
! [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
! Bug in parsing word
[ t ] [ "a" R' a' matches? ] unit-test
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
[ t ] [ "a" R/ ^a/ matches? ] unit-test
[ f ] [ "\na" R/ ^a/ matches? ] unit-test
[ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
[ f ] [ "\ra" R/ ^a/ matches? ] unit-test
[ t ] [ "a" R/ a$/ matches? ] unit-test
[ f ] [ "a\n" R/ a$/ matches? ] unit-test
[ f ] [ "a\r" R/ a$/ matches? ] unit-test
[ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
! [ t ] [ "a" R/ \Aa/ matches? ] unit-test
! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test
! [ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test
! [ f ] [ "\ra" R/ \Aa/ matches? ] unit-test
! [ t ] [ "a" R/ \Aa/m matches? ] unit-test
! [ f ] [ "\na" R/ \Aaa/m matches? ] unit-test
! [ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test
! [ f ] [ "\ra" R/ \Aa/m matches? ] unit-test
! [ t ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test
! [ t ] [ "a" R/ \Aa\z/m matches? ] unit-test
! [ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test
! [ t ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test
! [ t ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test
! [ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test
! [ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test
! [ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test
! [ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test
! [ t ] [ "a" R/ ^a/m matches? ] unit-test
! [ t ] [ "\na" R/ ^a/m matches? ] unit-test
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
! Convert to lowercase until E
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
! Convert to uppercase until E
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test ! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test ! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test ! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test
! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test ! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test ! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
@ -323,39 +382,29 @@ IN: regexp-tests
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test ! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test ! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
! Bug in parsing word ! "ab" "a(?=b*)" <regexp> match
! [ t ] [ "a" R' a' matches? ] unit-test ! "abbbbbc" "a(?=b*c)" <regexp> match
! "ab" "a(?=b*)" <regexp> match
! clear "a(?=b*)" <regexp> "ab" over match ! "baz" "(az)(?<=b)" <regexp> first-match
! clear "a(?=b*c)" <regexp> "abbbbbc" over match ! "cbaz" "a(?<=b*)" <regexp> first-match
! clear "a(?=b*)" <regexp> "ab" over match ! "baz" "a(?<=b)" <regexp> first-match
! clear "^a" <regexp> "a" over match ! "baz" "a(?<!b)" <regexp> first-match
! clear "^a" <regexp> "\na" over match ! "caz" "a(?<!b)" <regexp> first-match
! clear "^a" <regexp> "\r\na" over match
! clear "^a" <regexp> "\ra" over match
! clear "a$" <regexp> "a" over match ! "abcdefg" "a(?=bcdefg)bcd" <regexp> first-match
! clear "a$" <regexp> "a\n" over match ! "abcdefg" "a(?#bcdefg)bcd" <regexp> first-match
! clear "a$" <regexp> "a\r" over match ! "abcdefg" "a(?:bcdefg)" <regexp> first-match
! clear "a$" <regexp> "a\r\n" over match
! "(az)(?<=b)" <regexp> "baz" over first-match
! "a(?<=b*)" <regexp> "cbaz" over first-match
! "a(?<=b)" <regexp> "baz" over first-match
! "a(?<!b)" <regexp> "baz" over first-match
! "a(?<!b)" <regexp> "caz" over first-match
! "a(?=bcdefg)bcd" <regexp> "abcdefg" over first-match
! "a(?#bcdefg)bcd" <regexp> "abcdefg" over first-match
! "a(?:bcdefg)" <regexp> "abcdefg" over first-match
[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
! "a(?<=b)" <regexp> "caba" over first-match
! "caba" "a(?<=b)" <regexp> first-match
! capture group 1: "aaaa" 2: "" ! capture group 1: "aaaa" 2: ""
! "aaaa" "(a*)(a*)" <regexp> match* ! "aaaa" "(a*)(a*)" <regexp> match*

View File

@ -1,9 +1,10 @@
! Copyright (C) 2008 Doug Coleman. ! Copyright (C) 2008 Doug Coleman.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors combinators kernel math sequences USING: accessors combinators kernel math sequences strings
sets assocs prettyprint.backend make lexer namespaces parser sets assocs prettyprint.backend make lexer namespaces parser
arrays fry regexp.backend regexp.utils regexp.parser regexp.nfa arrays fry regexp.backend regexp.utils regexp.parser regexp.nfa
regexp.dfa regexp.traversal regexp.transition-tables splitting ; regexp.dfa regexp.traversal regexp.transition-tables splitting
sorting ;
IN: regexp IN: regexp
: default-regexp ( string -- regexp ) : default-regexp ( string -- regexp )
@ -73,42 +74,9 @@ IN: regexp
[ dup ] swap '[ _ next-match ] [ ] produce nip harvest ; [ dup ] swap '[ _ next-match ] [ ] produce nip harvest ;
: count-matches ( string regexp -- n ) : count-matches ( string regexp -- n )
all-matches length 1- ; all-matches length ;
: initial-option ( regexp option -- regexp' ) <PRIVATE
over options>> conjoin ;
: <regexp> ( string -- regexp )
default-regexp construct-regexp ;
: <iregexp> ( string -- regexp )
default-regexp
case-insensitive initial-option
construct-regexp ;
: <rregexp> ( string -- regexp )
default-regexp
reversed-regexp initial-option
construct-regexp ;
: parsing-regexp ( accum end -- accum )
lexer get dup skip-blank
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
lexer get dup still-parsing-line?
[ (parse-token) ] [ drop f ] if
"i" = [ <iregexp> ] [ <regexp> ] if parsed ;
: R! CHAR: ! parsing-regexp ; parsing
: R" CHAR: " parsing-regexp ; parsing
: R# CHAR: # parsing-regexp ; parsing
: R' CHAR: ' parsing-regexp ; parsing
: R( CHAR: ) parsing-regexp ; parsing
: R/ CHAR: / parsing-regexp ; parsing
: R@ CHAR: @ parsing-regexp ; parsing
: R[ CHAR: ] parsing-regexp ; parsing
: R` CHAR: ` parsing-regexp ; parsing
: R{ CHAR: } parsing-regexp ; parsing
: R| CHAR: | parsing-regexp ; parsing
: find-regexp-syntax ( string -- prefix suffix ) : find-regexp-syntax ( string -- prefix suffix )
{ {
@ -125,14 +93,67 @@ IN: regexp
{ "R| " "|" } { "R| " "|" }
} swap [ subseq? not nip ] curry assoc-find drop ; } swap [ subseq? not nip ] curry assoc-find drop ;
: option? ( option regexp -- ? ) ERROR: unknown-regexp-option option ;
options>> key? ;
: option>ch ( option -- string )
{
{ case-insensitive [ CHAR: i ] }
{ multiline [ CHAR: m ] }
{ reversed-regexp [ CHAR: r ] }
{ dotall [ CHAR: s ] }
[ unknown-regexp-option ]
} case ;
: ch>option ( ch -- option )
{
{ CHAR: i [ case-insensitive ] }
{ CHAR: m [ multiline ] }
{ CHAR: r [ reversed-regexp ] }
{ CHAR: s [ dotall ] }
[ unknown-regexp-option ]
} case ;
: string>options ( string -- options )
[ ch>option dup ] H{ } map>assoc ;
: options>string ( options -- string )
keys [ option>ch ] map natural-sort >string ;
PRIVATE>
: <optioned-regexp> ( string option-string -- regexp )
[ default-regexp ] [ string>options ] bi* >>options
construct-regexp ;
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
<PRIVATE
: parsing-regexp ( accum end -- accum )
lexer get dup skip-blank
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
lexer get dup still-parsing-line?
[ (parse-token) ] [ drop f ] if
<optioned-regexp> parsed ;
PRIVATE>
: R! CHAR: ! parsing-regexp ; parsing
: R" CHAR: " parsing-regexp ; parsing
: R# CHAR: # parsing-regexp ; parsing
: R' CHAR: ' parsing-regexp ; parsing
: R( CHAR: ) parsing-regexp ; parsing
: R/ CHAR: / parsing-regexp ; parsing
: R@ CHAR: @ parsing-regexp ; parsing
: R[ CHAR: ] parsing-regexp ; parsing
: R` CHAR: ` parsing-regexp ; parsing
: R{ CHAR: } parsing-regexp ; parsing
: R| CHAR: | parsing-regexp ; parsing
M: regexp pprint* M: regexp pprint*
[ [
[ [
dup raw>> [ raw>> dup find-regexp-syntax swap % swap % % ]
dup find-regexp-syntax swap % swap % % [ options>> options>string % ] bi
case-insensitive swap option? [ "i" % ] when
] "" make ] "" make
] keep present-text ; ] keep present-text ;

View File

@ -40,7 +40,7 @@ TUPLE: transition-table transitions start-state final-states ;
2dup [ to>> ] dip maybe-initialize-key 2dup [ to>> ] dip maybe-initialize-key
[ [ to>> ] [ obj>> ] [ from>> ] tri ] dip [ [ to>> ] [ obj>> ] [ from>> ] tri ] dip
2dup at* [ 2nip insert-at ] 2dup at* [ 2nip insert-at ]
[ drop >r >r H{ } clone [ insert-at ] keep r> r> set-at ] if ; [ drop [ H{ } clone [ insert-at ] keep ] 2dip set-at ] if ;
: add-transition ( transition transition-table -- ) : add-transition ( transition transition-table -- )
transitions>> set-transition ; transitions>> set-transition ;

View File

@ -17,6 +17,7 @@ TUPLE: dfa-traverser
capture-group-index capture-group-index
last-state current-state last-state current-state
text text
match-failed?
start-index current-index start-index current-index
matches ; matches ;
@ -37,14 +38,20 @@ TUPLE: dfa-traverser
H{ } clone >>captured-groups ; H{ } clone >>captured-groups ;
: final-state? ( dfa-traverser -- ? ) : final-state? ( dfa-traverser -- ? )
[ current-state>> ] [ dfa-table>> final-states>> ] bi [ current-state>> ]
key? ; [ dfa-table>> final-states>> ] bi key? ;
: beginning-of-text? ( dfa-traverser -- ? )
current-index>> 0 <= ; inline
: end-of-text? ( dfa-traverser -- ? )
[ current-index>> ] [ text>> length ] bi >= ; inline
: text-finished? ( dfa-traverser -- ? ) : text-finished? ( dfa-traverser -- ? )
{ {
[ current-state>> empty? ] [ current-state>> empty? ]
[ [ current-index>> ] [ text>> length ] bi >= ] [ end-of-text? ]
! [ current-index>> 0 < ] [ match-failed?>> ]
} 1|| ; } 1|| ;
: save-final-state ( dfa-straverser -- ) : save-final-state ( dfa-straverser -- )
@ -55,8 +62,47 @@ TUPLE: dfa-traverser
dup save-final-state dup save-final-state
] when text-finished? ; ] when text-finished? ;
: previous-text-character ( dfa-traverser -- ch )
[ text>> ] [ current-index>> 1- ] bi nth ;
: current-text-character ( dfa-traverser -- ch )
[ text>> ] [ current-index>> ] bi nth ;
: next-text-character ( dfa-traverser -- ch )
[ text>> ] [ current-index>> 1+ ] bi nth ;
GENERIC: flag-action ( dfa-traverser flag -- ) GENERIC: flag-action ( dfa-traverser flag -- )
M: beginning-of-input flag-action ( dfa-traverser flag -- )
drop
dup beginning-of-text? [ t >>match-failed? ] unless drop ;
M: end-of-input flag-action ( dfa-traverser flag -- )
drop
dup end-of-text? [ t >>match-failed? ] unless drop ;
M: beginning-of-line flag-action ( dfa-traverser flag -- )
drop
dup {
[ beginning-of-text? ]
[ previous-text-character terminator-class class-member? ]
} 1|| [ t >>match-failed? ] unless drop ;
M: end-of-line flag-action ( dfa-traverser flag -- )
drop
dup {
[ end-of-text? ]
[ next-text-character terminator-class class-member? ]
} 1|| [ t >>match-failed? ] unless drop ;
M: word-boundary flag-action ( dfa-traverser flag -- )
drop
dup {
[ end-of-text? ]
[ current-text-character terminator-class class-member? ]
} 1|| [ t >>match-failed? ] unless drop ;
M: lookahead-on flag-action ( dfa-traverser flag -- ) M: lookahead-on flag-action ( dfa-traverser flag -- )
drop drop
lookahead-counters>> 0 swap push ; lookahead-counters>> 0 swap push ;
@ -110,11 +156,7 @@ M: capture-group-off flag-action ( dfa-traverser flag -- )
[ [ 1+ ] change-current-index ] [ [ 1+ ] change-current-index ]
[ [ 1- ] change-current-index ] if [ [ 1- ] change-current-index ] if
dup current-state>> >>last-state dup current-state>> >>last-state
] dip ] [ first ] bi* >>current-state ;
first >>current-state ;
: match-failed ( dfa-traverser -- dfa-traverser )
V{ } clone >>matches ;
: match-literal ( transition from-state table -- to-state/f ) : match-literal ( transition from-state table -- to-state/f )
transitions>> at at ; transitions>> at at ;
@ -131,11 +173,9 @@ M: capture-group-off flag-action ( dfa-traverser flag -- )
{ [ match-literal ] [ match-class ] [ match-default ] } 3|| ; { [ match-literal ] [ match-class ] [ match-default ] } 3|| ;
: setup-match ( match -- obj state dfa-table ) : setup-match ( match -- obj state dfa-table )
{ [ [ current-index>> ] [ text>> ] bi nth ]
[ current-index>> ] [ text>> ] [ current-state>> ]
[ current-state>> ] [ dfa-table>> ] [ dfa-table>> ] tri ;
} cleave
[ nth ] 2dip ;
: do-match ( dfa-traverser -- dfa-traverser ) : do-match ( dfa-traverser -- dfa-traverser )
dup process-flags dup process-flags

View File

@ -11,14 +11,14 @@ IN: benchmark.regex-dna
: count-patterns ( string -- ) : count-patterns ( string -- )
{ {
R/ agggtaaa|tttaccct/i, R/ agggtaaa|tttaccct/i
R/ [cgt]gggtaaa|tttaccc[acg]/i, R/ [cgt]gggtaaa|tttaccc[acg]/i
R/ a[act]ggtaaa|tttacc[agt]t/i, R/ a[act]ggtaaa|tttacc[agt]t/i
R/ ag[act]gtaaa|tttac[agt]ct/i, R/ ag[act]gtaaa|tttac[agt]ct/i
R/ agg[act]taaa|ttta[agt]cct/i, R/ agg[act]taaa|ttta[agt]cct/i
R/ aggg[acg]aaa|ttt[cgt]ccct/i, R/ aggg[acg]aaa|ttt[cgt]ccct/i
R/ agggt[cgt]aa|tt[acg]accct/i, R/ agggt[cgt]aa|tt[acg]accct/i
R/ agggta[cgt]a|t[acg]taccct/i, R/ agggta[cgt]a|t[acg]taccct/i
R/ agggtaa[cgt]|[acg]ttaccct/i R/ agggtaa[cgt]|[acg]ttaccct/i
} [ } [
[ raw>> write bl ] [ raw>> write bl ]