From ba680fae6f18e778954caceb59eac6b786788017 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sat, 22 Nov 2008 21:04:09 -0600 Subject: [PATCH] handle dotall mode -- . matches newlines when it's on. dotall mode is off by default. rearrange unit tests a bit --- basis/regexp/classes/classes.factor | 3 ++ basis/regexp/parser/parser.factor | 3 +- basis/regexp/regexp-tests.factor | 79 ++++++++++++++--------------- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index 240b27a9cc..7b729b2e50 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -14,6 +14,9 @@ M: character-class-range class-member? ( obj class -- ? ) M: any-char class-member? ( obj class -- ? ) 2drop t ; + +M: any-char-no-nl class-member? ( obj class -- ? ) + drop CHAR: \n = not ; M: letter-class class-member? ( obj class -- ? ) drop letter? ; diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index 6a1d40c573..7f1d92a1ab 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -43,6 +43,7 @@ INSTANCE: comment-group parentheses-group TUPLE: character-class-range from to ; INSTANCE: character-class-range node SINGLETON: epsilon INSTANCE: epsilon node SINGLETON: any-char INSTANCE: any-char node +SINGLETON: any-char-no-nl INSTANCE: any-char-no-nl node SINGLETON: front-anchor INSTANCE: front-anchor node SINGLETON: back-anchor INSTANCE: back-anchor node @@ -172,7 +173,7 @@ DEFER: (parse-regexp) [ drop1 (parse-special-group) ] [ capture-group f nested-parse-regexp ] if ; -: handle-dot ( -- ) any-char push-stack ; +: handle-dot ( -- ) get-dotall any-char any-char-no-nl ? push-stack ; : handle-pipe ( -- ) pipe push-stack ; : (handle-star) ( obj -- kleene-star ) peek1 { diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor index c0252b2ff4..872ea3e788 100644 --- a/basis/regexp/regexp-tests.factor +++ b/basis/regexp/regexp-tests.factor @@ -40,7 +40,11 @@ IN: regexp-tests [ f ] [ "" "." matches? ] unit-test [ t ] [ "a" "." matches? ] unit-test [ t ] [ "." "." matches? ] unit-test -! [ f ] [ "\n" "." matches? ] unit-test + +! Dotall mode -- when on, . matches newlines. +! Off by default. +[ f ] [ "\n" "." matches? ] unit-test +[ t ] [ "\n" "(?s)." matches? ] unit-test [ f ] [ "" ".+" matches? ] unit-test [ t ] [ "a" ".+" matches? ] unit-test @@ -170,7 +174,6 @@ IN: regexp-tests [ f ] [ "ABC" "\\p{Lower}{3}" matches? ] unit-test [ t ] [ "ABC" "\\p{Upper}{3}" matches? ] unit-test [ f ] [ "abc" "\\p{Upper}{3}" matches? ] unit-test -! [ f ] [ "abc" "[\\p{Upper}]{3}" matches? ] unit-test [ t ] [ "ABC" "[\\p{Upper}]{3}" matches? ] unit-test @@ -252,7 +255,40 @@ IN: regexp-tests ! Comment [ t ] [ "ac" "a(?#boo)c" matches? ] unit-test +[ ] [ "USING: regexp kernel ; R' -{3}[+]{1,6}(?:!!)?\\s' drop" eval ] unit-test +[ ] [ "USING: regexp kernel ; R' (ftp|http|https)://(\\w+:?\\w*@)?(\\S+)(:[0-9]+)?(/|/([\\w#!:.?+=&%@!\\-/]))?' drop" eval ] unit-test + +[ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test + +[ "ab" ] [ "ab" "(a|ab)(bc)?" first-match >string ] unit-test +[ "abc" ] [ "abc" "(a|ab)(bc)?" first-match >string ] unit-test + +[ "ab" ] [ "ab" "(ab|a)(bc)?" first-match >string ] unit-test +[ "abc" ] [ "abc" "(ab|a)(bc)?" first-match >string ] unit-test + +[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" first-match >string ] unit-test + +[ t ] [ "a:b" ".+:?" matches? ] unit-test + +[ 1 ] [ "hello" ".+?" match length ] unit-test + +[ { "1" "2" "3" "4" } ] +[ "1ABC2DEF3GHI4" R/ [A-Z]+/ re-split [ >string ] map ] unit-test + +[ { "1" "2" "3" "4" } ] +[ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ re-split [ >string ] map ] unit-test + +[ { "ABC" "DEF" "GHI" } ] +[ "1ABC2DEF3GHI4" R/ [A-Z]+/ all-matches [ >string ] map ] unit-test + +[ "1.2.3.4" ] +[ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test + +[ f ] [ "ab" "a(?!b)" first-match ] unit-test +[ "a" ] [ "ab" "a(?=b)(?=b)" first-match >string ] unit-test +[ "a" ] [ "ba" "a(?<=b)(?<=b)" first-match >string ] unit-test +[ "a" ] [ "cab" "a(?=b)(?<=c)" first-match >string ] unit-test ! [ "{Lower}" ] [ invalid-range? ] must-fail-with @@ -286,21 +322,10 @@ IN: regexp-tests ! [ t ] [ "fooxbar" "foo\\Bxbar" matches? ] unit-test ! [ f ] [ "foo" "foo\\Bbar" matches? ] unit-test -[ ] [ "USING: regexp kernel ; R' -{3}[+]{1,6}(?:!!)?\\s' drop" eval ] unit-test - -[ ] [ "USING: regexp kernel ; R' (ftp|http|https)://(\\w+:?\\w*@)?(\\S+)(:[0-9]+)?(/|/([\\w#!:.?+=&%@!\\-/]))?' drop" eval ] unit-test - -[ ] [ "USING: regexp kernel ; R' \\*[^\s*][^*]*\\*' drop" eval ] unit-test ! Bug in parsing word ! [ t ] [ "a" R' a' matches? ] unit-test -! ((A)(B(C))) -! 1. ((A)(B(C))) -! 2. (A) -! 3. (B(C)) -! 4. (C) - ! clear "a(?=b*)" "ab" over match ! clear "a(?=b*c)" "abbbbbc" over match ! clear "a(?=b*)" "ab" over match @@ -327,38 +352,10 @@ IN: regexp-tests ! "a(?:bcdefg)" "abcdefg" over first-match [ "a" ] [ "ac" "a(?!b)" first-match >string ] unit-test -[ f ] [ "ab" "a(?!b)" first-match ] unit-test ! "a(?<=b)" "caba" over first-match -[ "a" ] [ "ab" "a(?=b)(?=b)" first-match >string ] unit-test -[ "a" ] [ "ba" "a(?<=b)(?<=b)" first-match >string ] unit-test -[ "a" ] [ "cab" "a(?=b)(?<=c)" first-match >string ] unit-test ! capture group 1: "aaaa" 2: "" ! "aaaa" "(a*)(a*)" match* ! "aaaa" "(a*)(a+)" match* - -[ "ab" ] [ "ab" "(a|ab)(bc)?" first-match >string ] unit-test -[ "abc" ] [ "abc" "(a|ab)(bc)?" first-match >string ] unit-test - -[ "ab" ] [ "ab" "(ab|a)(bc)?" first-match >string ] unit-test -[ "abc" ] [ "abc" "(ab|a)(bc)?" first-match >string ] unit-test - -[ "b" ] [ "aaaaaaaaaaaaaaaaaaaaaaab" "((a*)*b)*b" first-match >string ] unit-test - -[ t ] [ "a:b" ".+:?" matches? ] unit-test - -[ 1 ] [ "hello" ".+?" match length ] unit-test - -[ { "1" "2" "3" "4" } ] -[ "1ABC2DEF3GHI4" R/ [A-Z]+/ re-split [ >string ] map ] unit-test - -[ { "1" "2" "3" "4" } ] -[ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ re-split [ >string ] map ] unit-test - -[ { "ABC" "DEF" "GHI" } ] -[ "1ABC2DEF3GHI4" R/ [A-Z]+/ all-matches [ >string ] map ] unit-test - -[ "1.2.3.4" ] -[ "1ABC2DEF3GHI4JK" R/ [A-Z]+/ "." re-replace ] unit-test