better literal syntax for regexps. support ^ and $ (not in multiline mode yet) but not yet in multiline mode. support for \A and \Z \z is next. removed some crud from the parser, and added more commented out unit tests...
parent
ce54c54ba0
commit
db3c21663e
|
@ -7,6 +7,7 @@ IN: regexp.classes
|
||||||
GENERIC: class-member? ( obj class -- ? )
|
GENERIC: class-member? ( obj class -- ? )
|
||||||
|
|
||||||
M: word class-member? ( obj class -- ? ) 2drop f ;
|
M: word class-member? ( obj class -- ? ) 2drop f ;
|
||||||
|
|
||||||
M: integer class-member? ( obj class -- ? ) 2drop f ;
|
M: integer class-member? ( obj class -- ? ) 2drop f ;
|
||||||
|
|
||||||
M: character-class-range class-member? ( obj class -- ? )
|
M: character-class-range class-member? ( obj class -- ? )
|
||||||
|
@ -60,3 +61,12 @@ M: java-blank-class class-member? ( obj class -- ? )
|
||||||
|
|
||||||
M: unmatchable-class class-member? ( obj class -- ? )
|
M: unmatchable-class class-member? ( obj class -- ? )
|
||||||
2drop f ;
|
2drop f ;
|
||||||
|
|
||||||
|
M: terminator-class class-member? ( obj class -- ? )
|
||||||
|
drop {
|
||||||
|
[ CHAR: \r = ]
|
||||||
|
[ CHAR: \n = ]
|
||||||
|
[ CHAR: \u000085 = ]
|
||||||
|
[ CHAR: \u002028 = ]
|
||||||
|
[ CHAR: \u002029 = ]
|
||||||
|
} 1|| ;
|
||||||
|
|
|
@ -18,6 +18,9 @@ SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
|
||||||
SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
|
SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
|
||||||
SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
|
SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
|
||||||
SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
|
SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
|
||||||
|
SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
|
||||||
|
SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
|
||||||
|
SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
|
||||||
|
|
||||||
: next-state ( regexp -- state )
|
: next-state ( regexp -- state )
|
||||||
[ state>> ] [ [ 1+ ] change-state drop ] bi ;
|
[ state>> ] [ [ 1+ ] change-state drop ] bi ;
|
||||||
|
@ -135,7 +138,21 @@ M: non-capture-group nfa-node ( node -- )
|
||||||
M: reluctant-kleene-star nfa-node ( node -- )
|
M: reluctant-kleene-star nfa-node ( node -- )
|
||||||
term>> <kleene-star> nfa-node ;
|
term>> <kleene-star> nfa-node ;
|
||||||
|
|
||||||
!
|
|
||||||
|
: add-epsilon-flag ( flag -- )
|
||||||
|
eps literal-transition add-simple-entry add-traversal-flag ;
|
||||||
|
|
||||||
|
M: beginning-of-line nfa-node ( node -- )
|
||||||
|
drop beginning-of-line add-epsilon-flag ;
|
||||||
|
|
||||||
|
M: end-of-line nfa-node ( node -- )
|
||||||
|
drop end-of-line add-epsilon-flag ;
|
||||||
|
|
||||||
|
M: beginning-of-input nfa-node ( node -- )
|
||||||
|
drop beginning-of-input add-epsilon-flag ;
|
||||||
|
|
||||||
|
M: end-of-input nfa-node ( node -- )
|
||||||
|
drop end-of-input add-epsilon-flag ;
|
||||||
|
|
||||||
M: negation nfa-node ( node -- )
|
M: negation nfa-node ( node -- )
|
||||||
negation-mode inc
|
negation-mode inc
|
||||||
|
|
|
@ -4,7 +4,7 @@ USING: accessors arrays assocs combinators io io.streams.string
|
||||||
kernel math math.parser namespaces qualified sets
|
kernel math math.parser namespaces qualified sets
|
||||||
quotations sequences splitting symbols vectors math.order
|
quotations sequences splitting symbols vectors math.order
|
||||||
unicode.categories strings regexp.backend regexp.utils
|
unicode.categories strings regexp.backend regexp.utils
|
||||||
unicode.case words ;
|
unicode.case words locals ;
|
||||||
IN: regexp.parser
|
IN: regexp.parser
|
||||||
|
|
||||||
FROM: math.ranges => [a,b] ;
|
FROM: math.ranges => [a,b] ;
|
||||||
|
@ -44,18 +44,21 @@ TUPLE: character-class-range from to ; INSTANCE: character-class-range node
|
||||||
SINGLETON: epsilon INSTANCE: epsilon node
|
SINGLETON: epsilon INSTANCE: epsilon node
|
||||||
SINGLETON: any-char INSTANCE: any-char node
|
SINGLETON: any-char INSTANCE: any-char node
|
||||||
SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node
|
SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node
|
||||||
SINGLETON: front-anchor INSTANCE: front-anchor node
|
SINGLETON: beginning-of-input INSTANCE: beginning-of-input node
|
||||||
SINGLETON: back-anchor INSTANCE: back-anchor node
|
SINGLETON: end-of-input INSTANCE: end-of-input node
|
||||||
|
SINGLETON: beginning-of-line INSTANCE: beginning-of-line node
|
||||||
|
SINGLETON: end-of-line INSTANCE: end-of-line node
|
||||||
|
|
||||||
TUPLE: option-on option ; INSTANCE: option-on node
|
TUPLE: option-on option ; INSTANCE: option-on node
|
||||||
TUPLE: option-off option ; INSTANCE: option-off node
|
TUPLE: option-off option ; INSTANCE: option-off node
|
||||||
SINGLETONS: unix-lines dotall multiline comments case-insensitive unicode-case reversed-regexp ;
|
SINGLETONS: unix-lines dotall multiline comments case-insensitive
|
||||||
|
unicode-case reversed-regexp ;
|
||||||
|
|
||||||
SINGLETONS: letter-class LETTER-class Letter-class digit-class
|
SINGLETONS: letter-class LETTER-class Letter-class digit-class
|
||||||
alpha-class non-newline-blank-class
|
alpha-class non-newline-blank-class
|
||||||
ascii-class punctuation-class java-printable-class blank-class
|
ascii-class punctuation-class java-printable-class blank-class
|
||||||
control-character-class hex-digit-class java-blank-class c-identifier-class
|
control-character-class hex-digit-class java-blank-class c-identifier-class
|
||||||
unmatchable-class ;
|
terminator-class unmatchable-class word-boundary-class ;
|
||||||
|
|
||||||
SINGLETONS: beginning-of-group end-of-group
|
SINGLETONS: beginning-of-group end-of-group
|
||||||
beginning-of-character-class end-of-character-class
|
beginning-of-character-class end-of-character-class
|
||||||
|
@ -231,20 +234,6 @@ ERROR: invalid-range a b ;
|
||||||
[ [ nip at-most-n ] [ at-least-n ] if* ] if
|
[ [ nip at-most-n ] [ at-least-n ] if* ] if
|
||||||
] [ drop 0 max exactly-n ] if ;
|
] [ drop 0 max exactly-n ] if ;
|
||||||
|
|
||||||
SINGLETON: beginning-of-input
|
|
||||||
SINGLETON: end-of-input
|
|
||||||
|
|
||||||
: newlines ( -- obj1 obj2 obj3 )
|
|
||||||
CHAR: \r <constant>
|
|
||||||
CHAR: \n <constant>
|
|
||||||
2dup 2array <concatenation> ;
|
|
||||||
|
|
||||||
: beginning-of-line ( -- obj )
|
|
||||||
beginning-of-input newlines 4array <alternation> lookbehind boa ;
|
|
||||||
|
|
||||||
: end-of-line ( -- obj )
|
|
||||||
end-of-input newlines 4array <alternation> lookahead boa ;
|
|
||||||
|
|
||||||
: handle-front-anchor ( -- )
|
: handle-front-anchor ( -- )
|
||||||
get-multiline beginning-of-line beginning-of-input ? push-stack ;
|
get-multiline beginning-of-line beginning-of-input ? push-stack ;
|
||||||
|
|
||||||
|
@ -281,13 +270,26 @@ ERROR: expected-posix-class ;
|
||||||
: parse-control-character ( -- n ) read1 ;
|
: parse-control-character ( -- n ) read1 ;
|
||||||
|
|
||||||
ERROR: bad-escaped-literals seq ;
|
ERROR: bad-escaped-literals seq ;
|
||||||
: parse-escaped-literals ( -- obj )
|
|
||||||
"\\E" read-until [ bad-escaped-literals ] unless
|
: parse-til-E ( -- obj )
|
||||||
|
"\\E" read-until [ bad-escaped-literals ] unless ;
|
||||||
|
|
||||||
|
:: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj )
|
||||||
|
parse-til-E
|
||||||
drop1
|
drop1
|
||||||
[ epsilon ] [
|
[ epsilon ] [
|
||||||
[ <constant> ] V{ } map-as
|
[ quot call <constant> ] V{ } map-as
|
||||||
first|concatenation
|
first|concatenation
|
||||||
] if-empty ;
|
] if-empty ; inline
|
||||||
|
|
||||||
|
: parse-escaped-literals ( -- obj )
|
||||||
|
[ ] (parse-escaped-literals) ;
|
||||||
|
|
||||||
|
: lower-case-literals ( -- obj )
|
||||||
|
[ ch>lower ] (parse-escaped-literals) ;
|
||||||
|
|
||||||
|
: upper-case-literals ( -- obj )
|
||||||
|
[ ch>upper ] (parse-escaped-literals) ;
|
||||||
|
|
||||||
: parse-escaped ( -- obj )
|
: parse-escaped ( -- obj )
|
||||||
read1
|
read1
|
||||||
|
@ -299,12 +301,12 @@ ERROR: bad-escaped-literals seq ;
|
||||||
{ CHAR: a [ HEX: 7 <constant> ] }
|
{ CHAR: a [ HEX: 7 <constant> ] }
|
||||||
{ CHAR: e [ HEX: 1b <constant> ] }
|
{ CHAR: e [ HEX: 1b <constant> ] }
|
||||||
|
|
||||||
{ CHAR: d [ digit-class ] }
|
|
||||||
{ CHAR: D [ digit-class <negation> ] }
|
|
||||||
{ CHAR: s [ java-blank-class ] }
|
|
||||||
{ CHAR: S [ java-blank-class <negation> ] }
|
|
||||||
{ CHAR: w [ c-identifier-class ] }
|
{ CHAR: w [ c-identifier-class ] }
|
||||||
{ CHAR: W [ c-identifier-class <negation> ] }
|
{ CHAR: W [ c-identifier-class <negation> ] }
|
||||||
|
{ CHAR: s [ java-blank-class ] }
|
||||||
|
{ CHAR: S [ java-blank-class <negation> ] }
|
||||||
|
{ CHAR: d [ digit-class ] }
|
||||||
|
{ CHAR: D [ digit-class <negation> ] }
|
||||||
|
|
||||||
{ CHAR: p [ parse-posix-class ] }
|
{ CHAR: p [ parse-posix-class ] }
|
||||||
{ CHAR: P [ parse-posix-class <negation> ] }
|
{ CHAR: P [ parse-posix-class <negation> ] }
|
||||||
|
@ -313,13 +315,19 @@ ERROR: bad-escaped-literals seq ;
|
||||||
{ CHAR: 0 [ parse-octal <constant> ] }
|
{ CHAR: 0 [ parse-octal <constant> ] }
|
||||||
{ CHAR: c [ parse-control-character ] }
|
{ CHAR: c [ parse-control-character ] }
|
||||||
|
|
||||||
! { CHAR: b [ handle-word-boundary ] }
|
{ CHAR: Q [ parse-escaped-literals ] }
|
||||||
! { CHAR: B [ handle-word-boundary <negation> ] }
|
|
||||||
! { CHAR: A [ handle-beginning-of-input ] }
|
|
||||||
! { CHAR: G [ end of previous match ] }
|
|
||||||
! { CHAR: Z [ handle-end-of-input ] }
|
|
||||||
! { CHAR: z [ handle-end-of-input ] } ! except for terminator
|
|
||||||
|
|
||||||
|
! { CHAR: b [ word-boundary-class ] }
|
||||||
|
! { CHAR: B [ word-boundary-class <negation> ] }
|
||||||
|
! { CHAR: A [ handle-beginning-of-input ] }
|
||||||
|
! { CHAR: z [ handle-end-of-input ] }
|
||||||
|
|
||||||
|
! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator
|
||||||
|
|
||||||
|
! m//g mode
|
||||||
|
! { CHAR: G [ end of previous match ] }
|
||||||
|
|
||||||
|
! Group capture
|
||||||
! { CHAR: 1 [ CHAR: 1 <constant> ] }
|
! { CHAR: 1 [ CHAR: 1 <constant> ] }
|
||||||
! { CHAR: 2 [ CHAR: 2 <constant> ] }
|
! { CHAR: 2 [ CHAR: 2 <constant> ] }
|
||||||
! { CHAR: 3 [ CHAR: 3 <constant> ] }
|
! { CHAR: 3 [ CHAR: 3 <constant> ] }
|
||||||
|
@ -330,7 +338,11 @@ ERROR: bad-escaped-literals seq ;
|
||||||
! { CHAR: 8 [ CHAR: 8 <constant> ] }
|
! { CHAR: 8 [ CHAR: 8 <constant> ] }
|
||||||
! { CHAR: 9 [ CHAR: 9 <constant> ] }
|
! { CHAR: 9 [ CHAR: 9 <constant> ] }
|
||||||
|
|
||||||
{ CHAR: Q [ parse-escaped-literals ] }
|
! Perl extensions
|
||||||
|
! can't do \l and \u because \u is already a 4-hex
|
||||||
|
{ CHAR: L [ lower-case-literals ] }
|
||||||
|
{ CHAR: U [ upper-case-literals ] }
|
||||||
|
|
||||||
[ <constant> ]
|
[ <constant> ]
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
|
|
|
@ -45,6 +45,7 @@ IN: regexp-tests
|
||||||
! Off by default.
|
! Off by default.
|
||||||
[ f ] [ "\n" "." <regexp> matches? ] unit-test
|
[ f ] [ "\n" "." <regexp> matches? ] unit-test
|
||||||
[ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
|
[ t ] [ "\n" "(?s)." <regexp> matches? ] unit-test
|
||||||
|
[ t ] [ "\n" R/ ./s matches? ] unit-test
|
||||||
[ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test
|
[ f ] [ "\n\n" "(?s).(?-s)." <regexp> matches? ] unit-test
|
||||||
|
|
||||||
[ f ] [ "" ".+" <regexp> matches? ] unit-test
|
[ f ] [ "" ".+" <regexp> matches? ] unit-test
|
||||||
|
@ -210,34 +211,34 @@ IN: regexp-tests
|
||||||
[ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test
|
[ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test
|
||||||
[ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test
|
[ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test
|
||||||
|
|
||||||
[ t ] [ "aaa" "AAA" <iregexp> matches? ] unit-test
|
[ t ] [ "aaa" R/ AAA/i matches? ] unit-test
|
||||||
[ f ] [ "aax" "AAA" <iregexp> matches? ] unit-test
|
[ f ] [ "aax" R/ AAA/i matches? ] unit-test
|
||||||
[ t ] [ "aaa" "A*" <iregexp> matches? ] unit-test
|
[ t ] [ "aaa" R/ A*/i matches? ] unit-test
|
||||||
[ f ] [ "aaba" "A*" <iregexp> matches? ] unit-test
|
[ f ] [ "aaba" R/ A*/i matches? ] unit-test
|
||||||
[ t ] [ "b" "[AB]" <iregexp> matches? ] unit-test
|
[ t ] [ "b" R/ [AB]/i matches? ] unit-test
|
||||||
[ f ] [ "c" "[AB]" <iregexp> matches? ] unit-test
|
[ f ] [ "c" R/ [AB]/i matches? ] unit-test
|
||||||
[ t ] [ "c" "[A-Z]" <iregexp> matches? ] unit-test
|
[ t ] [ "c" R/ [A-Z]/i matches? ] unit-test
|
||||||
[ f ] [ "3" "[A-Z]" <iregexp> matches? ] unit-test
|
[ f ] [ "3" R/ [A-Z]/i matches? ] unit-test
|
||||||
|
|
||||||
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
|
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
|
||||||
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
|
[ t ] [ "a" "(?i)a" <regexp> matches? ] unit-test
|
||||||
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
|
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
|
||||||
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
|
[ t ] [ "A" "(?i)a" <regexp> matches? ] unit-test
|
||||||
|
|
||||||
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
|
[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
|
||||||
[ t ] [ "a" "(?-i)a" <iregexp> matches? ] unit-test
|
[ t ] [ "a" R/ (?-i)a/i matches? ] unit-test
|
||||||
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
|
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
|
||||||
[ f ] [ "A" "(?-i)a" <iregexp> matches? ] unit-test
|
[ f ] [ "A" R/ (?-i)a/i matches? ] unit-test
|
||||||
|
|
||||||
[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
|
[ f ] [ "A" "[a-z]" <regexp> matches? ] unit-test
|
||||||
[ t ] [ "A" "[a-z]" <iregexp> matches? ] unit-test
|
[ t ] [ "A" R/ [a-z]/i matches? ] unit-test
|
||||||
|
|
||||||
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
|
[ f ] [ "A" "\\p{Lower}" <regexp> matches? ] unit-test
|
||||||
[ t ] [ "A" "\\p{Lower}" <iregexp> matches? ] unit-test
|
[ t ] [ "A" R/ \p{Lower}/i matches? ] unit-test
|
||||||
|
|
||||||
[ t ] [ "abc" <reversed> "abc" <rregexp> matches? ] unit-test
|
[ t ] [ "abc" <reversed> R/ abc/r matches? ] unit-test
|
||||||
[ t ] [ "abc" <reversed> "a[bB][cC]" <rregexp> matches? ] unit-test
|
[ t ] [ "abc" <reversed> R/ a[bB][cC]/r matches? ] unit-test
|
||||||
[ t ] [ "adcbe" "a(?r)(bcd)(?-r)e" <rregexp> matches? ] unit-test
|
[ t ] [ "adcbe" R/ a(?r)(bcd)(?-r)e/r matches? ] unit-test
|
||||||
|
|
||||||
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
[ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
||||||
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
[ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
|
||||||
|
@ -293,6 +294,9 @@ IN: regexp-tests
|
||||||
[ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test
|
[ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test
|
||||||
|
|
||||||
[ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test
|
[ f ] [ "ab" "a(?!b)" <regexp> first-match ] unit-test
|
||||||
|
[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
|
||||||
|
[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
||||||
|
[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
||||||
[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test
|
[ "a" ] [ "ab" "a(?=b)(?=b)" <regexp> first-match >string ] unit-test
|
||||||
[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test
|
[ "a" ] [ "ba" "a(?<=b)(?<=b)" <regexp> first-match >string ] unit-test
|
||||||
[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test
|
[ "a" ] [ "cab" "a(?=b)(?<=c)" <regexp> first-match >string ] unit-test
|
||||||
|
@ -305,19 +309,65 @@ IN: regexp-tests
|
||||||
|
|
||||||
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
|
! [ "{Lower}" <regexp> ] [ invalid-range? ] must-fail-with
|
||||||
|
|
||||||
! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
|
[ t ] [ "a" R/ ^a/ matches? ] unit-test
|
||||||
! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
|
[ f ] [ "\na" R/ ^a/ matches? ] unit-test
|
||||||
! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
|
[ f ] [ "\r\na" R/ ^a/ matches? ] unit-test
|
||||||
! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
|
[ f ] [ "\ra" R/ ^a/ matches? ] unit-test
|
||||||
! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
|
|
||||||
! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
|
|
||||||
|
|
||||||
! [ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
[ t ] [ "a" R/ a$/ matches? ] unit-test
|
||||||
! [ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
|
[ f ] [ "a\n" R/ a$/ matches? ] unit-test
|
||||||
|
[ f ] [ "a\r" R/ a$/ matches? ] unit-test
|
||||||
|
[ f ] [ "a\r\n" R/ a$/ matches? ] unit-test
|
||||||
|
|
||||||
|
! [ t ] [ "a" R/ \Aa/ matches? ] unit-test
|
||||||
|
! [ f ] [ "\na" R/ \Aaa/ matches? ] unit-test
|
||||||
|
! [ f ] [ "\r\na" R/ \Aa/ matches? ] unit-test
|
||||||
|
! [ f ] [ "\ra" R/ \Aa/ matches? ] unit-test
|
||||||
|
|
||||||
|
! [ t ] [ "a" R/ \Aa/m matches? ] unit-test
|
||||||
|
! [ f ] [ "\na" R/ \Aaa/m matches? ] unit-test
|
||||||
|
! [ f ] [ "\r\na" R/ \Aa/m matches? ] unit-test
|
||||||
|
! [ f ] [ "\ra" R/ \Aa/m matches? ] unit-test
|
||||||
|
|
||||||
|
! [ t ] [ "\r\n\n\n\nam" R/ ^am/m matches? ] unit-test
|
||||||
|
|
||||||
|
! [ t ] [ "a" R/ \Aa\z/m matches? ] unit-test
|
||||||
|
! [ f ] [ "a\n" R/ \Aa\z/m matches? ] unit-test
|
||||||
|
|
||||||
|
! [ t ] [ "a\r\n" R/ \Aa\Z/m matches? ] unit-test
|
||||||
|
! [ t ] [ "a\n" R/ \Aa\Z/m matches? ] unit-test
|
||||||
|
|
||||||
|
! [ t ] [ "a" R/ \Aa\Z/m matches? ] unit-test
|
||||||
|
! [ f ] [ "\na" R/ \Aaa\Z/m matches? ] unit-test
|
||||||
|
! [ f ] [ "\r\na" R/ \Aa\Z/m matches? ] unit-test
|
||||||
|
! [ f ] [ "\ra" R/ \Aa\Z/m matches? ] unit-test
|
||||||
|
|
||||||
|
! [ t ] [ "a" R/ ^a/m matches? ] unit-test
|
||||||
|
! [ t ] [ "\na" R/ ^a/m matches? ] unit-test
|
||||||
|
! [ t ] [ "\r\na" R/ ^a/m matches? ] unit-test
|
||||||
|
! [ t ] [ "\ra" R/ ^a/m matches? ] unit-test
|
||||||
|
|
||||||
|
! Convert to lowercase until E
|
||||||
|
[ f ] [ "AA" R/ \LAA\E/ matches? ] unit-test
|
||||||
|
[ t ] [ "aa" R/ \LAA\E/ matches? ] unit-test
|
||||||
|
|
||||||
|
! Convert to uppercase until E
|
||||||
|
[ t ] [ "AA" R/ \Uaa\E/ matches? ] unit-test
|
||||||
|
[ f ] [ "aa" R/ \Uaa\E/ matches? ] unit-test
|
||||||
|
|
||||||
|
! [ t ] [ "a" "a$" R/ a$/m matches? ] unit-test
|
||||||
|
! [ t ] [ "a\n" "a$" R/ a$/m matches? ] unit-test
|
||||||
|
! [ t ] [ "a\r" "a$" R/ a$/m matches? ] unit-test
|
||||||
|
! [ t ] [ "a\r\n" "a$" R/ a$/m matches? ] unit-test
|
||||||
|
|
||||||
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
|
! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
|
||||||
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
|
! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
|
||||||
|
|
||||||
|
! [ t ] [ "foo" "\\bfoo\\b" <regexp> matches? ] unit-test
|
||||||
|
! [ t ] [ "afoob" "\\Bfoo\\B" <regexp> matches? ] unit-test
|
||||||
|
! [ t ] [ "afoob" "\\bfoo\\b" <regexp> matches? ] unit-test
|
||||||
|
! [ f ] [ "foo" "\\Bfoo\\B" <regexp> matches? ] unit-test
|
||||||
|
|
||||||
! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test
|
! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test
|
||||||
! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
|
! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
|
||||||
! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
|
! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
|
||||||
|
@ -332,34 +382,29 @@ IN: regexp-tests
|
||||||
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
|
! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
|
||||||
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
|
! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
|
||||||
|
|
||||||
! clear "a(?=b*)" <regexp> "ab" over match
|
! [ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
|
||||||
! clear "a(?=b*c)" <regexp> "abbbbbc" over match
|
! [ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
|
||||||
! clear "a(?=b*)" <regexp> "ab" over match
|
! [ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
|
||||||
|
! [ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
|
||||||
|
! [ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
|
||||||
|
! [ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
|
||||||
|
|
||||||
! clear "^a" <regexp> "a" over match
|
! "ab" "a(?=b*)" <regexp> match
|
||||||
! clear "^a" <regexp> "\na" over match
|
! "abbbbbc" "a(?=b*c)" <regexp> match
|
||||||
! clear "^a" <regexp> "\r\na" over match
|
! "ab" "a(?=b*)" <regexp> match
|
||||||
! clear "^a" <regexp> "\ra" over match
|
|
||||||
|
|
||||||
! clear "a$" <regexp> "a" over match
|
! "baz" "(az)(?<=b)" <regexp> first-match
|
||||||
! clear "a$" <regexp> "a\n" over match
|
! "cbaz" "a(?<=b*)" <regexp> first-match
|
||||||
! clear "a$" <regexp> "a\r" over match
|
! "baz" "a(?<=b)" <regexp> first-match
|
||||||
! clear "a$" <regexp> "a\r\n" over match
|
|
||||||
|
|
||||||
! "(az)(?<=b)" <regexp> "baz" over first-match
|
! "baz" "a(?<!b)" <regexp> first-match
|
||||||
! "a(?<=b*)" <regexp> "cbaz" over first-match
|
! "caz" "a(?<!b)" <regexp> first-match
|
||||||
! "a(?<=b)" <regexp> "baz" over first-match
|
|
||||||
|
|
||||||
! "a(?<!b)" <regexp> "baz" over first-match
|
! "abcdefg" "a(?=bcdefg)bcd" <regexp> first-match
|
||||||
! "a(?<!b)" <regexp> "caz" over first-match
|
! "abcdefg" "a(?#bcdefg)bcd" <regexp> first-match
|
||||||
|
! "abcdefg" "a(?:bcdefg)" <regexp> first-match
|
||||||
|
|
||||||
! "a(?=bcdefg)bcd" <regexp> "abcdefg" over first-match
|
! "caba" "a(?<=b)" <regexp> first-match
|
||||||
! "a(?#bcdefg)bcd" <regexp> "abcdefg" over first-match
|
|
||||||
! "a(?:bcdefg)" <regexp> "abcdefg" over first-match
|
|
||||||
|
|
||||||
[ "a" ] [ "ac" "a(?!b)" <regexp> first-match >string ] unit-test
|
|
||||||
|
|
||||||
! "a(?<=b)" <regexp> "caba" over first-match
|
|
||||||
|
|
||||||
! capture group 1: "aaaa" 2: ""
|
! capture group 1: "aaaa" 2: ""
|
||||||
! "aaaa" "(a*)(a*)" <regexp> match*
|
! "aaaa" "(a*)(a*)" <regexp> match*
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
! Copyright (C) 2008 Doug Coleman.
|
! Copyright (C) 2008 Doug Coleman.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors combinators kernel math sequences
|
USING: accessors combinators kernel math sequences strings
|
||||||
sets assocs prettyprint.backend make lexer namespaces parser
|
sets assocs prettyprint.backend make lexer namespaces parser
|
||||||
arrays fry regexp.backend regexp.utils regexp.parser regexp.nfa
|
arrays fry regexp.backend regexp.utils regexp.parser regexp.nfa
|
||||||
regexp.dfa regexp.traversal regexp.transition-tables splitting ;
|
regexp.dfa regexp.traversal regexp.transition-tables splitting
|
||||||
|
sorting ;
|
||||||
IN: regexp
|
IN: regexp
|
||||||
|
|
||||||
: default-regexp ( string -- regexp )
|
: default-regexp ( string -- regexp )
|
||||||
|
@ -75,40 +76,7 @@ IN: regexp
|
||||||
: count-matches ( string regexp -- n )
|
: count-matches ( string regexp -- n )
|
||||||
all-matches length ;
|
all-matches length ;
|
||||||
|
|
||||||
: initial-option ( regexp option -- regexp' )
|
<PRIVATE
|
||||||
over options>> conjoin ;
|
|
||||||
|
|
||||||
: <regexp> ( string -- regexp )
|
|
||||||
default-regexp construct-regexp ;
|
|
||||||
|
|
||||||
: <iregexp> ( string -- regexp )
|
|
||||||
default-regexp
|
|
||||||
case-insensitive initial-option
|
|
||||||
construct-regexp ;
|
|
||||||
|
|
||||||
: <rregexp> ( string -- regexp )
|
|
||||||
default-regexp
|
|
||||||
reversed-regexp initial-option
|
|
||||||
construct-regexp ;
|
|
||||||
|
|
||||||
: parsing-regexp ( accum end -- accum )
|
|
||||||
lexer get dup skip-blank
|
|
||||||
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
|
|
||||||
lexer get dup still-parsing-line?
|
|
||||||
[ (parse-token) ] [ drop f ] if
|
|
||||||
"i" = [ <iregexp> ] [ <regexp> ] if parsed ;
|
|
||||||
|
|
||||||
: R! CHAR: ! parsing-regexp ; parsing
|
|
||||||
: R" CHAR: " parsing-regexp ; parsing
|
|
||||||
: R# CHAR: # parsing-regexp ; parsing
|
|
||||||
: R' CHAR: ' parsing-regexp ; parsing
|
|
||||||
: R( CHAR: ) parsing-regexp ; parsing
|
|
||||||
: R/ CHAR: / parsing-regexp ; parsing
|
|
||||||
: R@ CHAR: @ parsing-regexp ; parsing
|
|
||||||
: R[ CHAR: ] parsing-regexp ; parsing
|
|
||||||
: R` CHAR: ` parsing-regexp ; parsing
|
|
||||||
: R{ CHAR: } parsing-regexp ; parsing
|
|
||||||
: R| CHAR: | parsing-regexp ; parsing
|
|
||||||
|
|
||||||
: find-regexp-syntax ( string -- prefix suffix )
|
: find-regexp-syntax ( string -- prefix suffix )
|
||||||
{
|
{
|
||||||
|
@ -125,14 +93,67 @@ IN: regexp
|
||||||
{ "R| " "|" }
|
{ "R| " "|" }
|
||||||
} swap [ subseq? not nip ] curry assoc-find drop ;
|
} swap [ subseq? not nip ] curry assoc-find drop ;
|
||||||
|
|
||||||
: option? ( option regexp -- ? )
|
ERROR: unknown-regexp-option option ;
|
||||||
options>> key? ;
|
|
||||||
|
: option>ch ( option -- string )
|
||||||
|
{
|
||||||
|
{ case-insensitive [ CHAR: i ] }
|
||||||
|
{ multiline [ CHAR: m ] }
|
||||||
|
{ reversed-regexp [ CHAR: r ] }
|
||||||
|
{ dotall [ CHAR: s ] }
|
||||||
|
[ unknown-regexp-option ]
|
||||||
|
} case ;
|
||||||
|
|
||||||
|
: ch>option ( ch -- option )
|
||||||
|
{
|
||||||
|
{ CHAR: i [ case-insensitive ] }
|
||||||
|
{ CHAR: m [ multiline ] }
|
||||||
|
{ CHAR: r [ reversed-regexp ] }
|
||||||
|
{ CHAR: s [ dotall ] }
|
||||||
|
[ unknown-regexp-option ]
|
||||||
|
} case ;
|
||||||
|
|
||||||
|
: string>options ( string -- options )
|
||||||
|
[ ch>option dup ] H{ } map>assoc ;
|
||||||
|
|
||||||
|
: options>string ( options -- string )
|
||||||
|
keys [ option>ch ] map natural-sort >string ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
|
: <optioned-regexp> ( string option-string -- regexp )
|
||||||
|
[ default-regexp ] [ string>options ] bi* >>options
|
||||||
|
construct-regexp ;
|
||||||
|
|
||||||
|
: <regexp> ( string -- regexp ) "" <optioned-regexp> ;
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
|
: parsing-regexp ( accum end -- accum )
|
||||||
|
lexer get dup skip-blank
|
||||||
|
[ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
|
||||||
|
lexer get dup still-parsing-line?
|
||||||
|
[ (parse-token) ] [ drop f ] if
|
||||||
|
<optioned-regexp> parsed ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
|
: R! CHAR: ! parsing-regexp ; parsing
|
||||||
|
: R" CHAR: " parsing-regexp ; parsing
|
||||||
|
: R# CHAR: # parsing-regexp ; parsing
|
||||||
|
: R' CHAR: ' parsing-regexp ; parsing
|
||||||
|
: R( CHAR: ) parsing-regexp ; parsing
|
||||||
|
: R/ CHAR: / parsing-regexp ; parsing
|
||||||
|
: R@ CHAR: @ parsing-regexp ; parsing
|
||||||
|
: R[ CHAR: ] parsing-regexp ; parsing
|
||||||
|
: R` CHAR: ` parsing-regexp ; parsing
|
||||||
|
: R{ CHAR: } parsing-regexp ; parsing
|
||||||
|
: R| CHAR: | parsing-regexp ; parsing
|
||||||
|
|
||||||
M: regexp pprint*
|
M: regexp pprint*
|
||||||
[
|
[
|
||||||
[
|
[
|
||||||
dup raw>>
|
[ raw>> dup find-regexp-syntax swap % swap % % ]
|
||||||
dup find-regexp-syntax swap % swap % %
|
[ options>> options>string % ] bi
|
||||||
case-insensitive swap option? [ "i" % ] when
|
|
||||||
] "" make
|
] "" make
|
||||||
] keep present-text ;
|
] keep present-text ;
|
||||||
|
|
|
@ -17,6 +17,7 @@ TUPLE: dfa-traverser
|
||||||
capture-group-index
|
capture-group-index
|
||||||
last-state current-state
|
last-state current-state
|
||||||
text
|
text
|
||||||
|
match-failed?
|
||||||
start-index current-index
|
start-index current-index
|
||||||
matches ;
|
matches ;
|
||||||
|
|
||||||
|
@ -37,14 +38,20 @@ TUPLE: dfa-traverser
|
||||||
H{ } clone >>captured-groups ;
|
H{ } clone >>captured-groups ;
|
||||||
|
|
||||||
: final-state? ( dfa-traverser -- ? )
|
: final-state? ( dfa-traverser -- ? )
|
||||||
[ current-state>> ] [ dfa-table>> final-states>> ] bi
|
[ current-state>> ]
|
||||||
key? ;
|
[ dfa-table>> final-states>> ] bi key? ;
|
||||||
|
|
||||||
|
: beginning-of-text? ( dfa-traverser -- ? )
|
||||||
|
current-index>> 0 <= ; inline
|
||||||
|
|
||||||
|
: end-of-text? ( dfa-traverser -- ? )
|
||||||
|
[ current-index>> ] [ text>> length ] bi >= ; inline
|
||||||
|
|
||||||
: text-finished? ( dfa-traverser -- ? )
|
: text-finished? ( dfa-traverser -- ? )
|
||||||
{
|
{
|
||||||
[ current-state>> empty? ]
|
[ current-state>> empty? ]
|
||||||
[ [ current-index>> ] [ text>> length ] bi >= ]
|
[ end-of-text? ]
|
||||||
! [ current-index>> 0 < ]
|
[ match-failed?>> ]
|
||||||
} 1|| ;
|
} 1|| ;
|
||||||
|
|
||||||
: save-final-state ( dfa-straverser -- )
|
: save-final-state ( dfa-straverser -- )
|
||||||
|
@ -55,8 +62,47 @@ TUPLE: dfa-traverser
|
||||||
dup save-final-state
|
dup save-final-state
|
||||||
] when text-finished? ;
|
] when text-finished? ;
|
||||||
|
|
||||||
|
: previous-text-character ( dfa-traverser -- ch )
|
||||||
|
[ text>> ] [ current-index>> 1- ] bi nth ;
|
||||||
|
|
||||||
|
: current-text-character ( dfa-traverser -- ch )
|
||||||
|
[ text>> ] [ current-index>> ] bi nth ;
|
||||||
|
|
||||||
|
: next-text-character ( dfa-traverser -- ch )
|
||||||
|
[ text>> ] [ current-index>> 1+ ] bi nth ;
|
||||||
|
|
||||||
GENERIC: flag-action ( dfa-traverser flag -- )
|
GENERIC: flag-action ( dfa-traverser flag -- )
|
||||||
|
|
||||||
|
|
||||||
|
M: beginning-of-input flag-action ( dfa-traverser flag -- )
|
||||||
|
drop
|
||||||
|
dup beginning-of-text? [ t >>match-failed? ] unless drop ;
|
||||||
|
|
||||||
|
M: end-of-input flag-action ( dfa-traverser flag -- )
|
||||||
|
drop
|
||||||
|
dup end-of-text? [ t >>match-failed? ] unless drop ;
|
||||||
|
|
||||||
|
M: beginning-of-line flag-action ( dfa-traverser flag -- )
|
||||||
|
drop
|
||||||
|
dup {
|
||||||
|
[ beginning-of-text? ]
|
||||||
|
[ previous-text-character terminator-class class-member? ]
|
||||||
|
} 1|| [ t >>match-failed? ] unless drop ;
|
||||||
|
|
||||||
|
M: end-of-line flag-action ( dfa-traverser flag -- )
|
||||||
|
drop
|
||||||
|
dup {
|
||||||
|
[ end-of-text? ]
|
||||||
|
[ next-text-character terminator-class class-member? ]
|
||||||
|
} 1|| [ t >>match-failed? ] unless drop ;
|
||||||
|
|
||||||
|
M: word-boundary flag-action ( dfa-traverser flag -- )
|
||||||
|
drop
|
||||||
|
dup {
|
||||||
|
[ end-of-text? ]
|
||||||
|
[ current-text-character terminator-class class-member? ]
|
||||||
|
} 1|| [ t >>match-failed? ] unless drop ;
|
||||||
|
|
||||||
M: lookahead-on flag-action ( dfa-traverser flag -- )
|
M: lookahead-on flag-action ( dfa-traverser flag -- )
|
||||||
drop
|
drop
|
||||||
lookahead-counters>> 0 swap push ;
|
lookahead-counters>> 0 swap push ;
|
||||||
|
@ -110,11 +156,7 @@ M: capture-group-off flag-action ( dfa-traverser flag -- )
|
||||||
[ [ 1+ ] change-current-index ]
|
[ [ 1+ ] change-current-index ]
|
||||||
[ [ 1- ] change-current-index ] if
|
[ [ 1- ] change-current-index ] if
|
||||||
dup current-state>> >>last-state
|
dup current-state>> >>last-state
|
||||||
] dip
|
] [ first ] bi* >>current-state ;
|
||||||
first >>current-state ;
|
|
||||||
|
|
||||||
: match-failed ( dfa-traverser -- dfa-traverser )
|
|
||||||
V{ } clone >>matches ;
|
|
||||||
|
|
||||||
: match-literal ( transition from-state table -- to-state/f )
|
: match-literal ( transition from-state table -- to-state/f )
|
||||||
transitions>> at at ;
|
transitions>> at at ;
|
||||||
|
@ -131,11 +173,9 @@ M: capture-group-off flag-action ( dfa-traverser flag -- )
|
||||||
{ [ match-literal ] [ match-class ] [ match-default ] } 3|| ;
|
{ [ match-literal ] [ match-class ] [ match-default ] } 3|| ;
|
||||||
|
|
||||||
: setup-match ( match -- obj state dfa-table )
|
: setup-match ( match -- obj state dfa-table )
|
||||||
{
|
[ [ current-index>> ] [ text>> ] bi nth ]
|
||||||
[ current-index>> ] [ text>> ]
|
[ current-state>> ]
|
||||||
[ current-state>> ] [ dfa-table>> ]
|
[ dfa-table>> ] tri ;
|
||||||
} cleave
|
|
||||||
[ nth ] 2dip ;
|
|
||||||
|
|
||||||
: do-match ( dfa-traverser -- dfa-traverser )
|
: do-match ( dfa-traverser -- dfa-traverser )
|
||||||
dup process-flags
|
dup process-flags
|
||||||
|
|
Loading…
Reference in New Issue