Merge branch 'master' of git://factorcode.org/git/factor
commit
d3d57c800b
|
@ -7,6 +7,7 @@ IN: regexp.classes
|
|||
GENERIC: class-member? ( obj class -- ? )
|
||||
|
||||
M: word class-member? ( obj class -- ? ) 2drop f ;
|
||||
|
||||
M: integer class-member? ( obj class -- ? ) 2drop f ;
|
||||
|
||||
M: character-class-range class-member? ( obj class -- ? )
|
||||
|
@ -60,3 +61,12 @@ M: java-blank-class class-member? ( obj class -- ? )
|
|||
|
||||
M: unmatchable-class class-member? ( obj class -- ? )
|
||||
2drop f ;
|
||||
|
||||
M: terminator-class class-member? ( obj class -- ? )
|
||||
drop {
|
||||
[ CHAR: \r = ]
|
||||
[ CHAR: \n = ]
|
||||
[ CHAR: \u000085 = ]
|
||||
[ CHAR: \u002028 = ]
|
||||
[ CHAR: \u002029 = ]
|
||||
} 1|| ;
|
||||
|
|
|
@ -43,7 +43,8 @@ IN: regexp.dfa
|
|||
dupd pop dup pick find-transitions rot
|
||||
[
|
||||
[ [ find-closure ] 2keep nip dupd add-todo-state ] 3keep
|
||||
>r swapd transition make-transition r> dfa-table>> add-transition
|
||||
[ swapd transition make-transition ] dip
|
||||
dfa-table>> add-transition
|
||||
] curry with each
|
||||
new-transitions
|
||||
] if-empty ;
|
||||
|
|
|
@ -18,6 +18,9 @@ SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
|
|||
SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
|
||||
SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
|
||||
SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
|
||||
SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
|
||||
SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
|
||||
SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
|
||||
|
||||
: next-state ( regexp -- state )
|
||||
[ state>> ] [ [ 1+ ] change-state drop ] bi ;
|
||||
|
@ -135,7 +138,21 @@ M: non-capture-group nfa-node ( node -- )
|
|||
M: reluctant-kleene-star nfa-node ( node -- )
|
||||
term>> <kleene-star> nfa-node ;
|
||||
|
||||
!
|
||||
|
||||
: add-epsilon-flag ( flag -- )
|
||||
eps literal-transition add-simple-entry add-traversal-flag ;
|
||||
|
||||
M: beginning-of-line nfa-node ( node -- )
|
||||
drop beginning-of-line add-epsilon-flag ;
|
||||
|
||||
M: end-of-line nfa-node ( node -- )
|
||||
drop end-of-line add-epsilon-flag ;
|
||||
|
||||
M: beginning-of-input nfa-node ( node -- )
|
||||
drop beginning-of-input add-epsilon-flag ;
|
||||
|
||||
M: end-of-input nfa-node ( node -- )
|
||||
drop end-of-input add-epsilon-flag ;
|
||||
|
||||
M: negation nfa-node ( node -- )
|
||||
negation-mode inc
|
||||
|
|
|
@ -4,7 +4,7 @@ USING: accessors arrays assocs combinators io io.streams.string
|
|||
kernel math math.parser namespaces qualified sets
|
||||
quotations sequences splitting symbols vectors math.order
|
||||
unicode.categories strings regexp.backend regexp.utils
|
||||
unicode.case words ;
|
||||
unicode.case words locals ;
|
||||
IN: regexp.parser
|
||||
|
||||
FROM: math.ranges => [a,b] ;
|
||||
|
@ -44,18 +44,21 @@ TUPLE: character-class-range from to ; INSTANCE: character-class-range node
|
|||
SINGLETON: epsilon INSTANCE: epsilon node
|
||||
SINGLETON: any-char INSTANCE: any-char node
|
||||
SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node
|
||||
SINGLETON: front-anchor INSTANCE: front-anchor node
|
||||
SINGLETON: back-anchor INSTANCE: back-anchor node
|
||||
SINGLETON: beginning-of-input INSTANCE: beginning-of-input node
|
||||
SINGLETON: end-of-input INSTANCE: end-of-input node
|
||||
SINGLETON: beginning-of-line INSTANCE: beginning-of-line node
|
||||
SINGLETON: end-of-line INSTANCE: end-of-line node
|
||||
|
||||
TUPLE: option-on option ; INSTANCE: option-on node
|
||||
TUPLE: option-off option ; INSTANCE: option-off node
|
||||
SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ;
|
||||
SINGLETONS: unix-lines dotall multiline comments case-insensitive
|
||||
unicode-case reversed-regexp ;
|
||||
|
||||
SINGLETONS: letter-class LETTER-class Letter-class digit-class
|
||||
alpha-class non-newline-blank-class
|
||||
ascii-class punctuation-class java-printable-class blank-class
|
||||
control-character-class hex-digit-class java-blank-class c-identifier-class
|
||||
unmatchable-class ;
|
||||
terminator-class unmatchable-class word-boundary-class ;
|
||||
|
||||
SINGLETONS: beginning-of-group end-of-group
|
||||
beginning-of-character-class end-of-character-class
|
||||
|
@ -225,26 +228,12 @@ ERROR: invalid-range a b ;
|
|||
|
||||
: handle-left-brace ( -- )
|
||||
parse-repetition
|
||||
>r 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ r>
|
||||
[ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip
|
||||
[
|
||||
2dup and [ from-m-to-n ]
|
||||
[ [ nip at-most-n ] [ at-least-n ] if* ] if
|
||||
] [ drop 0 max exactly-n ] if ;
|
||||
|
||||
SINGLETON: beginning-of-input
|
||||
SINGLETON: end-of-input
|
||||
|
||||
: newlines ( -- obj1 obj2 obj3 )
|
||||
CHAR: \r <constant>
|
||||
CHAR: \n <constant>
|
||||
2dup 2array <concatenation> ;
|
||||
|
||||
: beginning-of-line ( -- obj )
|
||||
beginning-of-input newlines 4array <alternation> lookbehind boa ;
|
||||
|
||||
: end-of-line ( -- obj )
|
||||
end-of-input newlines 4array <alternation> lookahead boa ;
|
||||
|
||||
: handle-front-anchor ( -- )
|
||||
get-multiline beginning-of-line beginning-of-input ? push-stack ;
|
||||
|
||||
|
@ -281,13 +270,26 @@ ERROR: expected-posix-class ;
|
|||
: parse-control-character ( -- n ) read1 ;
|
||||
|
||||
ERROR: bad-escaped-literals seq ;
|
||||
: parse-escaped-literals ( -- obj )
|
||||
"\\E" read-until [ bad-escaped-literals ] unless
|
||||
|
||||
: parse-til-E ( -- obj )
|
||||
"\\E" read-until [ bad-escaped-literals ] unless ;
|
||||
|
||||
:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj )
|
||||
parse-til-E
|
||||
drop1
|
||||
[ epsilon ] [
|
||||
[ <constant> ] V{ } map-as
|
||||
[ quot call <constant> ] V{ } map-as
|
||||
first|concatenation
|
||||
] if-empty ;
|
||||
] if-empty ; inline
|
||||
|
||||
: parse-escaped-literals ( -- obj )
|
||||
[ ] (parse-escaped-literals) ;
|
||||
|
||||
: lower-case-literals ( -- obj )
|
||||
[ ch>lower ] (parse-escaped-literals) ;
|
||||
|
||||
: upper-case-literals ( -- obj )
|
||||
[ ch>upper ] (parse-escaped-literals) ;
|
||||
|
||||
: parse-escaped ( -- obj )
|
||||
read1
|
||||
|
@ -299,12 +301,12 @@ ERROR: bad-escaped-literals seq ;
|
|||
{ CHAR: a [ HEX: 7 <constant> ] }
|
||||
{ CHAR: e [ HEX: 1b <constant> ] }
|
||||
|
||||
{ CHAR: d [ digit-class ] }
|
||||
{ CHAR: D [ digit-class <negation> ] }
|
||||
{ CHAR: s [ java-blank-class ] }
|
||||
{ CHAR: S [ java-blank-class <negation> ] }
|
||||
{ CHAR: w [ c-identifier-class ] }
|
||||
{ CHAR: W [ c-identifier-class <negation> ] }
|
||||
{ CHAR: s [ java-blank-class ] }
|
||||
{ CHAR: S [ java-blank-class <negation> ] }
|
||||
{ CHAR: d [ digit-class ] }
|
||||
{ CHAR: D [ digit-class <negation> ] }
|
||||
|
||||
{ CHAR: p [ parse-posix-class ] }
|
||||
{ CHAR: P [ parse-posix-class <negation> ] }
|
||||
|
@ -313,13 +315,19 @@ ERROR: bad-escaped-literals seq ;
|
|||
{ CHAR: 0 [ parse-octal <constant> ] }
|
||||
{ CHAR: c [ parse-control-character ] }
|
||||
|
||||
! { CHAR: b [ handle-word-boundary ] }
|
||||
! { CHAR: B [ handle-word-boundary <negation> ] }
|
||||
! { CHAR: A [ handle-beginning-of-input ] }
|
||||
! { CHAR: G [ end of previous match ] }
|
||||
! { CHAR: Z [ handle-end-of-input ] }
|
||||
! { CHAR: z [ handle-end-of-input ] } ! except for terminator
|
||||
{ CHAR: Q [ parse-escaped-literals ] }
|
||||
|
||||
! { CHAR: b [ word-boundary-class ] }
|
||||
! { CHAR: B [ word-boundary-class <negation> ] }
|
||||
! { CHAR: A [ handle-beginning-of-input ] }
|
||||
! { CHAR: z [ handle-end-of-input ] }
|
||||
|
||||
! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator
|
||||
|
||||
! m//g mode
|
||||
! { CHAR: G [ end of previous match ] }
|
||||
|
||||
! Group capture
|
||||
! { CHAR: 1 [ CHAR: 1 <constant> ] }
|
||||
! { CHAR: 2 [ CHAR: 2 <constant> ] }
|
||||
! { CHAR: 3 [ CHAR: 3 <constant> ] }
|
||||
|
@ -330,7 +338,11 @@ ERROR: bad-escaped-literals seq ;
|
|||
! { CHAR: 8 [ CHAR: 8 <constant> ] }
|
||||
! { CHAR: 9 [ CHAR: 9 <constant> ] }
|
||||
|
||||
{ CHAR: Q [ parse-escaped-literals ] }
|
||||
! Perl extensions
|
||||
! can't do \l and \u because \u is already a 4-hex
|
||||
{ CHAR: L [ lower-case-literals ] }
|
||||
{ CHAR: U [ upper-case-literals ] }
|
||||
|
||||
[ <constant> ]
|
||||
} case ;
|
||||
|
||||
|
|
|
@ -45,6 +45,7 @@ IN: regexp-tests
|
|||
! Off by default.
|
||||
[ f ] [ "\n" "." <regexp> matches? ] unit-test
|
||||
[ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
|
||||
[ t ] [ "\n" R/ ./s matches? ] unit-test
|
||||
[ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test
|
||||
|
||||
[ f ] [ "" ".+" <regexp> matches? ] unit-test
|
||||
|
@ -210,34 +211,34 @@ IN: regexp-tests
|
|||
[ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test
|
||||
[ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test
|
||||
|
||||
[ t ] [ "aaa" "AAA" <iregexp> matches? ] unit-test
|
||||
[ f ] [ "aax" "AAA" <iregexp> matches? ] unit-test
|
||||
[ t ] [ "aaa" "A*" <iregexp> matches? ] unit-test
|
||||
[ f ] [ "aaba" "A*" <iregexp> matches? ] unit-test
|
||||
[ t ] [ "b" "[AB]" <iregexp> matches? ] unit-test
|
||||
[ f ] [ "c" "[AB]" <iregexp> matches? ] unit-test
|
||||
[ t ] [ "c" "[A-Z]" <iregexp> matches? ] unit-test
|
||||
[ f ] [ "3" "[A-Z]" <iregexp> matches? ] unit-test
|
||||
[ t ] [ "aaa" R/ AAA/i matches? ] unit-test
|
||||
[ f ] [ "aax" R/ AAA/i matches? ] unit-test
|
||||
[ t ] [ "aaa" R/ A*/i matches? ] unit-test
|
||||
[ f ] [ "aaba" R/ A*/i matches? ] unit-test
|
||||
[ t ] [ "b" R/ [AB]/i matches? ] unit-test
|
||||
[ f ] [ "c" R/ [AB]/i matches? ] unit-test
|
||||
[ t ] [ "c" R/ [A-Z]/i matches? ] unit-test
|
||||
[ f ] [ "3" R/ [A-Z]/i matches? ] unit-test
|
||||
|
||||
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
|
||||
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
|
||||
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
|
||||
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
|
||||
|
||||
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
|
||||
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
|
||||
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
|
||||
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
|
||||
[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
|
||||
[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
|
||||
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
|
||||
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
|
||||
|
||||
[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
|
||||
[ t ] [ "A" "[a-z]" <iregexp> matches? ] unit-test
|
||||
[ t ] [ "A" R/ [a-z]/i matches? ] unit-test
|
||||
|
||||
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
|
||||
[ t ] [ "A" "\\p{Lower}" <iregexp> matches? ] unit-test
|
||||
[ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test
|
||||
|
||||
[ t ] [ "abc" <reversed> "abc" <rregexp> matches? ] unit-test
|
||||
[ t ] [ "abc" <reversed> "a[bB][cC]" <rregexp> matches? ] unit-test
|
||||
[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" <rregexp> matches? ] unit-test
|
||||
[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
|
||||
[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
|
||||
[ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/r matches? ] unit-test
|
||||
|
||||
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
||||
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
||||
|
@ -253,7 +254,7 @@ IN: regexp-tests
|
|||
|
||||
[ ] [ "(\\$[\\p{XDigit}]|[\\p{Digit}])" <regexp> drop ] unit-test
|
||||
|
||||
! Comment
|
||||
! Comment inside a regular expression
|
||||
[ t ] [ "ac" "a(?#boo)c" <regexp> matches? ] unit-test
|
||||
|
||||
[ ] [ "USING: regexp kernel ; R' -{3}[+]{1,6}(?:!!)?\\s' drop" eval ] unit-test
|
||||
|
@ -283,32 +284,90 @@ IN: regexp-tests
|
|||
[ { "ABC" "DEF" "GHI" } ]
|
||||
[ "1ABC2DEF3GHI4" R/ [A-Z]+/ all-matches [ >string ] map ] unit-test
|
||||
|
||||
[ 3 ]
|
||||
[ "1ABC2DEF3GHI4" R/ [A-Z]+/ count-matches ] unit-test
|
||||
|
||||
[ 0 ]
|
||||
[ "123" R/ [A-Z]+/ count-matches ] unit-test
|
||||
|
||||
[ "1.2.3.4" ]
|
||||
[ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test
|
||||
|
||||
[ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test
|
||||
[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
|
||||
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
||||
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
||||
[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test
|
||||
[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test
|
||||
[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test
|
||||
|
||||
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
|
||||
|
||||
! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
|
||||
! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
|
||||
! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
|
||||
! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
|
||||
! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
|
||||
! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
|
||||
|
||||
! [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
||||
! [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
||||
|
||||
[ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
|
||||
[ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
|
||||
|
||||
! Bug in parsing word
|
||||
[ t ] [ "a" R' a' matches? ] unit-test
|
||||
|
||||
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
|
||||
|
||||
[ t ] [ "a" R/ ^a/ matches? ] unit-test
|
||||
[ f ] [ "\na" R/ ^a/ matches? ] unit-test
|
||||
[ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
|
||||
[ f ] [ "\ra" R/ ^a/ matches? ] unit-test
|
||||
|
||||
[ t ] [ "a" R/ a$/ matches? ] unit-test
|
||||
[ f ] [ "a\n" R/ a$/ matches? ] unit-test
|
||||
[ f ] [ "a\r" R/ a$/ matches? ] unit-test
|
||||
[ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
|
||||
|
||||
! [ t ] [ "a" R/ \Aa/ matches? ] unit-test
|
||||
! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test
|
||||
! [ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test
|
||||
! [ f ] [ "\ra" R/ \Aa/ matches? ] unit-test
|
||||
|
||||
! [ t ] [ "a" R/ \Aa/m matches? ] unit-test
|
||||
! [ f ] [ "\na" R/ \Aaa/m matches? ] unit-test
|
||||
! [ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test
|
||||
! [ f ] [ "\ra" R/ \Aa/m matches? ] unit-test
|
||||
|
||||
! [ t ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test
|
||||
|
||||
! [ t ] [ "a" R/ \Aa\z/m matches? ] unit-test
|
||||
! [ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test
|
||||
|
||||
! [ t ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test
|
||||
! [ t ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test
|
||||
|
||||
! [ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test
|
||||
! [ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test
|
||||
! [ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test
|
||||
! [ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test
|
||||
|
||||
! [ t ] [ "a" R/ ^a/m matches? ] unit-test
|
||||
! [ t ] [ "\na" R/ ^a/m matches? ] unit-test
|
||||
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
|
||||
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
|
||||
|
||||
! Convert to lowercase until E
|
||||
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
|
||||
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
|
||||
|
||||
! Convert to uppercase until E
|
||||
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
|
||||
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
|
||||
|
||||
! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
|
||||
! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
|
||||
! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
|
||||
! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test
|
||||
|
||||
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
|
||||
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
|
||||
|
||||
! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
|
||||
! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
|
||||
! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
|
||||
! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
|
||||
|
||||
! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test
|
||||
! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
|
||||
! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
|
||||
|
@ -323,39 +382,29 @@ IN: regexp-tests
|
|||
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
|
||||
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
|
||||
|
||||
! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
|
||||
! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
|
||||
! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
|
||||
! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
|
||||
! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
|
||||
! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
|
||||
|
||||
! Bug in parsing word
|
||||
! [ t ] [ "a" R' a' matches? ] unit-test
|
||||
! "ab" "a(?=b*)" <regexp> match
|
||||
! "abbbbbc" "a(?=b*c)" <regexp> match
|
||||
! "ab" "a(?=b*)" <regexp> match
|
||||
|
||||
! clear "a(?=b*)" <regexp> "ab" over match
|
||||
! clear "a(?=b*c)" <regexp> "abbbbbc" over match
|
||||
! clear "a(?=b*)" <regexp> "ab" over match
|
||||
! "baz" "(az)(?<=b)" <regexp> first-match
|
||||
! "cbaz" "a(?<=b*)" <regexp> first-match
|
||||
! "baz" "a(?<=b)" <regexp> first-match
|
||||
|
||||
! clear "^a" <regexp> "a" over match
|
||||
! clear "^a" <regexp> "\na" over match
|
||||
! clear "^a" <regexp> "\r\na" over match
|
||||
! clear "^a" <regexp> "\ra" over match
|
||||
! "baz" "a(?<!b)" <regexp> first-match
|
||||
! "caz" "a(?<!b)" <regexp> first-match
|
||||
|
||||
! clear "a$" <regexp> "a" over match
|
||||
! clear "a$" <regexp> "a\n" over match
|
||||
! clear "a$" <regexp> "a\r" over match
|
||||
! clear "a$" <regexp> "a\r\n" over match
|
||||
|
||||
! "(az)(?<=b)" <regexp> "baz" over first-match
|
||||
! "a(?<=b*)" <regexp> "cbaz" over first-match
|
||||
! "a(?<=b)" <regexp> "baz" over first-match
|
||||
|
||||
! "a(?<!b)" <regexp> "baz" over first-match
|
||||
! "a(?<!b)" <regexp> "caz" over first-match
|
||||
|
||||
! "a(?=bcdefg)bcd" <regexp> "abcdefg" over first-match
|
||||
! "a(?#bcdefg)bcd" <regexp> "abcdefg" over first-match
|
||||
! "a(?:bcdefg)" <regexp> "abcdefg" over first-match
|
||||
|
||||
[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
|
||||
|
||||
! "a(?<=b)" <regexp> "caba" over first-match
|
||||
! "abcdefg" "a(?=bcdefg)bcd" <regexp> first-match
|
||||
! "abcdefg" "a(?#bcdefg)bcd" <regexp> first-match
|
||||
! "abcdefg" "a(?:bcdefg)" <regexp> first-match
|
||||
|
||||
! "caba" "a(?<=b)" <regexp> first-match
|
||||
|
||||
! capture group 1: "aaaa" 2: ""
|
||||
! "aaaa" "(a*)(a*)" <regexp> match*
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
! Copyright (C) 2008 Doug Coleman.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: accessors combinators kernel math sequences
|
||||
USING: accessors combinators kernel math sequences strings
|
||||
sets assocs prettyprint.backend make lexer namespaces parser
|
||||
arrays fry regexp.backend regexp.utils regexp.parser regexp.nfa
|
||||
regexp.dfa regexp.traversal regexp.transition-tables splitting ;
|
||||
regexp.dfa regexp.traversal regexp.transition-tables splitting
|
||||
sorting ;
|
||||
IN: regexp
|
||||
|
||||
: default-regexp ( string -- regexp )
|
||||
|
@ -73,42 +74,9 @@ IN: regexp
|
|||
[ dup ] swap '[ _ next-match ] [ ] produce nip harvest ;
|
||||
|
||||
: count-matches ( string regexp -- n )
|
||||
all-matches length 1- ;
|
||||
all-matches length ;
|
||||
|
||||
: initial-option ( regexp option -- regexp' )
|
||||
over options>> conjoin ;
|
||||
|
||||
: <regexp> ( string -- regexp )
|
||||
default-regexp construct-regexp ;
|
||||
|
||||
: <iregexp> ( string -- regexp )
|
||||
default-regexp
|
||||
case-insensitive initial-option
|
||||
construct-regexp ;
|
||||
|
||||
: <rregexp> ( string -- regexp )
|
||||
default-regexp
|
||||
reversed-regexp initial-option
|
||||
construct-regexp ;
|
||||
|
||||
: parsing-regexp ( accum end -- accum )
|
||||
lexer get dup skip-blank
|
||||
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
|
||||
lexer get dup still-parsing-line?
|
||||
[ (parse-token) ] [ drop f ] if
|
||||
"i" = [ <iregexp> ] [ <regexp> ] if parsed ;
|
||||
|
||||
: R! CHAR: ! parsing-regexp ; parsing
|
||||
: R" CHAR: " parsing-regexp ; parsing
|
||||
: R# CHAR: # parsing-regexp ; parsing
|
||||
: R' CHAR: ' parsing-regexp ; parsing
|
||||
: R( CHAR: ) parsing-regexp ; parsing
|
||||
: R/ CHAR: / parsing-regexp ; parsing
|
||||
: R@ CHAR: @ parsing-regexp ; parsing
|
||||
: R[ CHAR: ] parsing-regexp ; parsing
|
||||
: R` CHAR: ` parsing-regexp ; parsing
|
||||
: R{ CHAR: } parsing-regexp ; parsing
|
||||
: R| CHAR: | parsing-regexp ; parsing
|
||||
<PRIVATE
|
||||
|
||||
: find-regexp-syntax ( string -- prefix suffix )
|
||||
{
|
||||
|
@ -125,14 +93,67 @@ IN: regexp
|
|||
{ "R| " "|" }
|
||||
} swap [ subseq? not nip ] curry assoc-find drop ;
|
||||
|
||||
: option? ( option regexp -- ? )
|
||||
options>> key? ;
|
||||
ERROR: unknown-regexp-option option ;
|
||||
|
||||
: option>ch ( option -- string )
|
||||
{
|
||||
{ case-insensitive [ CHAR: i ] }
|
||||
{ multiline [ CHAR: m ] }
|
||||
{ reversed-regexp [ CHAR: r ] }
|
||||
{ dotall [ CHAR: s ] }
|
||||
[ unknown-regexp-option ]
|
||||
} case ;
|
||||
|
||||
: ch>option ( ch -- option )
|
||||
{
|
||||
{ CHAR: i [ case-insensitive ] }
|
||||
{ CHAR: m [ multiline ] }
|
||||
{ CHAR: r [ reversed-regexp ] }
|
||||
{ CHAR: s [ dotall ] }
|
||||
[ unknown-regexp-option ]
|
||||
} case ;
|
||||
|
||||
: string>options ( string -- options )
|
||||
[ ch>option dup ] H{ } map>assoc ;
|
||||
|
||||
: options>string ( options -- string )
|
||||
keys [ option>ch ] map natural-sort >string ;
|
||||
|
||||
PRIVATE>
|
||||
|
||||
: <optioned-regexp> ( string option-string -- regexp )
|
||||
[ default-regexp ] [ string>options ] bi* >>options
|
||||
construct-regexp ;
|
||||
|
||||
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
|
||||
|
||||
<PRIVATE
|
||||
|
||||
: parsing-regexp ( accum end -- accum )
|
||||
lexer get dup skip-blank
|
||||
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
|
||||
lexer get dup still-parsing-line?
|
||||
[ (parse-token) ] [ drop f ] if
|
||||
<optioned-regexp> parsed ;
|
||||
|
||||
PRIVATE>
|
||||
|
||||
: R! CHAR: ! parsing-regexp ; parsing
|
||||
: R" CHAR: " parsing-regexp ; parsing
|
||||
: R# CHAR: # parsing-regexp ; parsing
|
||||
: R' CHAR: ' parsing-regexp ; parsing
|
||||
: R( CHAR: ) parsing-regexp ; parsing
|
||||
: R/ CHAR: / parsing-regexp ; parsing
|
||||
: R@ CHAR: @ parsing-regexp ; parsing
|
||||
: R[ CHAR: ] parsing-regexp ; parsing
|
||||
: R` CHAR: ` parsing-regexp ; parsing
|
||||
: R{ CHAR: } parsing-regexp ; parsing
|
||||
: R| CHAR: | parsing-regexp ; parsing
|
||||
|
||||
M: regexp pprint*
|
||||
[
|
||||
[
|
||||
dup raw>>
|
||||
dup find-regexp-syntax swap % swap % %
|
||||
case-insensitive swap option? [ "i" % ] when
|
||||
[ raw>> dup find-regexp-syntax swap % swap % % ]
|
||||
[ options>> options>string % ] bi
|
||||
] "" make
|
||||
] keep present-text ;
|
||||
|
|
|
@ -40,7 +40,7 @@ TUPLE: transition-table transitions start-state final-states ;
|
|||
2dup [ to>> ] dip maybe-initialize-key
|
||||
[ [ to>> ] [ obj>> ] [ from>> ] tri ] dip
|
||||
2dup at* [ 2nip insert-at ]
|
||||
[ drop >r >r H{ } clone [ insert-at ] keep r> r> set-at ] if ;
|
||||
[ drop [ H{ } clone [ insert-at ] keep ] 2dip set-at ] if ;
|
||||
|
||||
: add-transition ( transition transition-table -- )
|
||||
transitions>> set-transition ;
|
||||
|
|
|
@ -17,6 +17,7 @@ TUPLE: dfa-traverser
|
|||
capture-group-index
|
||||
last-state current-state
|
||||
text
|
||||
match-failed?
|
||||
start-index current-index
|
||||
matches ;
|
||||
|
||||
|
@ -37,14 +38,20 @@ TUPLE: dfa-traverser
|
|||
H{ } clone >>captured-groups ;
|
||||
|
||||
: final-state? ( dfa-traverser -- ? )
|
||||
[ current-state>> ] [ dfa-table>> final-states>> ] bi
|
||||
key? ;
|
||||
[ current-state>> ]
|
||||
[ dfa-table>> final-states>> ] bi key? ;
|
||||
|
||||
: beginning-of-text? ( dfa-traverser -- ? )
|
||||
current-index>> 0 <= ; inline
|
||||
|
||||
: end-of-text? ( dfa-traverser -- ? )
|
||||
[ current-index>> ] [ text>> length ] bi >= ; inline
|
||||
|
||||
: text-finished? ( dfa-traverser -- ? )
|
||||
{
|
||||
[ current-state>> empty? ]
|
||||
[ [ current-index>> ] [ text>> length ] bi >= ]
|
||||
! [ current-index>> 0 < ]
|
||||
[ end-of-text? ]
|
||||
[ match-failed?>> ]
|
||||
} 1|| ;
|
||||
|
||||
: save-final-state ( dfa-straverser -- )
|
||||
|
@ -55,8 +62,47 @@ TUPLE: dfa-traverser
|
|||
dup save-final-state
|
||||
] when text-finished? ;
|
||||
|
||||
: previous-text-character ( dfa-traverser -- ch )
|
||||
[ text>> ] [ current-index>> 1- ] bi nth ;
|
||||
|
||||
: current-text-character ( dfa-traverser -- ch )
|
||||
[ text>> ] [ current-index>> ] bi nth ;
|
||||
|
||||
: next-text-character ( dfa-traverser -- ch )
|
||||
[ text>> ] [ current-index>> 1+ ] bi nth ;
|
||||
|
||||
GENERIC: flag-action ( dfa-traverser flag -- )
|
||||
|
||||
|
||||
M: beginning-of-input flag-action ( dfa-traverser flag -- )
|
||||
drop
|
||||
dup beginning-of-text? [ t >>match-failed? ] unless drop ;
|
||||
|
||||
M: end-of-input flag-action ( dfa-traverser flag -- )
|
||||
drop
|
||||
dup end-of-text? [ t >>match-failed? ] unless drop ;
|
||||
|
||||
M: beginning-of-line flag-action ( dfa-traverser flag -- )
|
||||
drop
|
||||
dup {
|
||||
[ beginning-of-text? ]
|
||||
[ previous-text-character terminator-class class-member? ]
|
||||
} 1|| [ t >>match-failed? ] unless drop ;
|
||||
|
||||
M: end-of-line flag-action ( dfa-traverser flag -- )
|
||||
drop
|
||||
dup {
|
||||
[ end-of-text? ]
|
||||
[ next-text-character terminator-class class-member? ]
|
||||
} 1|| [ t >>match-failed? ] unless drop ;
|
||||
|
||||
M: word-boundary flag-action ( dfa-traverser flag -- )
|
||||
drop
|
||||
dup {
|
||||
[ end-of-text? ]
|
||||
[ current-text-character terminator-class class-member? ]
|
||||
} 1|| [ t >>match-failed? ] unless drop ;
|
||||
|
||||
M: lookahead-on flag-action ( dfa-traverser flag -- )
|
||||
drop
|
||||
lookahead-counters>> 0 swap push ;
|
||||
|
@ -110,11 +156,7 @@ M: capture-group-off flag-action ( dfa-traverser flag -- )
|
|||
[ [ 1+ ] change-current-index ]
|
||||
[ [ 1- ] change-current-index ] if
|
||||
dup current-state>> >>last-state
|
||||
] dip
|
||||
first >>current-state ;
|
||||
|
||||
: match-failed ( dfa-traverser -- dfa-traverser )
|
||||
V{ } clone >>matches ;
|
||||
] [ first ] bi* >>current-state ;
|
||||
|
||||
: match-literal ( transition from-state table -- to-state/f )
|
||||
transitions>> at at ;
|
||||
|
@ -131,11 +173,9 @@ M: capture-group-off flag-action ( dfa-traverser flag -- )
|
|||
{ [ match-literal ] [ match-class ] [ match-default ] } 3|| ;
|
||||
|
||||
: setup-match ( match -- obj state dfa-table )
|
||||
{
|
||||
[ current-index>> ] [ text>> ]
|
||||
[ current-state>> ] [ dfa-table>> ]
|
||||
} cleave
|
||||
[ nth ] 2dip ;
|
||||
[ [ current-index>> ] [ text>> ] bi nth ]
|
||||
[ current-state>> ]
|
||||
[ dfa-table>> ] tri ;
|
||||
|
||||
: do-match ( dfa-traverser -- dfa-traverser )
|
||||
dup process-flags
|
||||
|
|
|
@ -11,14 +11,14 @@ IN: benchmark.regex-dna
|
|||
|
||||
: count-patterns ( string -- )
|
||||
{
|
||||
R/ agggtaaa|tttaccct/i,
|
||||
R/ [cgt]gggtaaa|tttaccc[acg]/i,
|
||||
R/ a[act]ggtaaa|tttacc[agt]t/i,
|
||||
R/ ag[act]gtaaa|tttac[agt]ct/i,
|
||||
R/ agg[act]taaa|ttta[agt]cct/i,
|
||||
R/ aggg[acg]aaa|ttt[cgt]ccct/i,
|
||||
R/ agggt[cgt]aa|tt[acg]accct/i,
|
||||
R/ agggta[cgt]a|t[acg]taccct/i,
|
||||
R/ agggtaaa|tttaccct/i
|
||||
R/ [cgt]gggtaaa|tttaccc[acg]/i
|
||||
R/ a[act]ggtaaa|tttacc[agt]t/i
|
||||
R/ ag[act]gtaaa|tttac[agt]ct/i
|
||||
R/ agg[act]taaa|ttta[agt]cct/i
|
||||
R/ aggg[acg]aaa|ttt[cgt]ccct/i
|
||||
R/ agggt[cgt]aa|tt[acg]accct/i
|
||||
R/ agggta[cgt]a|t[acg]taccct/i
|
||||
R/ agggtaa[cgt]|[acg]ttaccct/i
|
||||
} [
|
||||
[ raw>> write bl ]
|
||||
|
|
Loading…
Reference in New Issue