add more unit tests, add match-head

2008-05-21 18:36:01 -05:00 · 2008-05-21 18:36:01 -05:00 · c1d961748b
parent 39950b9884
commit c1d961748b
2 changed files with 141 additions and 1 deletions
--- a/extra/regexp4/regexp4-tests.factor
+++ b/extra/regexp4/regexp4-tests.factor
@ -131,6 +131,116 @@ IN: regexp4-tests
 [ f ] [ "\\" "[^\\\\]" <regexp> matches? ] unit-test
 [ t ] [ "a" "[^\\\\]" <regexp> matches? ] unit-test

+! Shorthand digit class \d inside a bracket expression, plain and negated.
+[ t ] [ "0" "[\\d]" <regexp> matches? ] unit-test
+[ f ] [ "a" "[\\d]" <regexp> matches? ] unit-test
+[ f ] [ "0" "[^\\d]" <regexp> matches? ] unit-test
+[ t ] [ "a" "[^\\d]" <regexp> matches? ] unit-test
+
+! Alternation combined with bounded repetition {m,} and {m,n}.
+[ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" <regexp> matches? ] unit-test
+[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" <regexp> matches? ] unit-test
+[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" <regexp> matches? ] unit-test
+
+! Bounded repetition applied to \d and to an explicit digit range.
+[ t ] [ "1000" "\\d{4,6}" <regexp> matches? ] unit-test
+[ t ] [ "1000" "[0-9]{4,6}" <regexp> matches? ] unit-test
+
+! Named classes \p{Lower} and \p{Upper} with an exact repetition count.
+[ t ] [ "abc" "\\p{Lower}{3}" <regexp> matches? ] unit-test
+[ f ] [ "ABC" "\\p{Lower}{3}" <regexp> matches? ] unit-test
+[ t ] [ "ABC" "\\p{Upper}{3}" <regexp> matches? ] unit-test
+[ f ] [ "abc" "\\p{Upper}{3}" <regexp> matches? ] unit-test
+
+! The same named class nested inside a bracket expression.
+[ f ] [ "abc" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
+[ t ] [ "ABC" "[\\p{Upper}]{3}" <regexp> matches? ] unit-test
+
+! \Q...\E quoting: the quoted text is matched literally; an empty quote
+! matches only the empty string.
+[ t ] [ "" "\\Q\\E" <regexp> matches? ] unit-test
+[ f ] [ "a" "\\Q\\E" <regexp> matches? ] unit-test
+[ t ] [ "|*+" "\\Q|*+\\E" <regexp> matches? ] unit-test
+[ f ] [ "abc" "\\Q|*+\\E" <regexp> matches? ] unit-test
+
+! Numeric character escapes: octal \0123 is "S"; hex \x78 and \u000078 are "x".
+[ t ] [ "S" "\\0123" <regexp> matches? ] unit-test
+[ t ] [ "SXY" "\\0123XY" <regexp> matches? ] unit-test
+[ t ] [ "x" "\\x78" <regexp> matches? ] unit-test
+[ f ] [ "y" "\\x78" <regexp> matches? ] unit-test
+[ t ] [ "x" "\\u000078" <regexp> matches? ] unit-test
+[ f ] [ "y" "\\u000078" <regexp> matches? ] unit-test
+
+! One-or-more (+) followed by a literal; the whole string must be consumed.
+[ t ] [ "ab" "a+b" <regexp> matches? ] unit-test
+[ f ] [ "b" "a+b" <regexp> matches? ] unit-test
+[ t ] [ "aab" "a+b" <regexp> matches? ] unit-test
+[ f ] [ "abb" "a+b" <regexp> matches? ] unit-test
+
+! Zero-or-more (*) binds to the preceding character only.
+[ t ] [ "abbbb" "ab*" <regexp> matches? ] unit-test
+[ t ] [ "a" "ab*" <regexp> matches? ] unit-test
+[ f ] [ "abab" "ab*" <regexp> matches? ] unit-test
+
+! An escaped dot matches only a literal ".".
+[ f ] [ "x" "\\." <regexp> matches? ] unit-test
+[ t ] [ "." "\\." <regexp> matches? ] unit-test
+
+! Greedy a+ can give back an "a" for the trailing "ab"; a++ is expected not
+! to, so "a++ab" fails on "aaaab" while "a++cb" still matches "aaacb".
+[ t ] [ "aaaab" "a+ab" <regexp> matches? ] unit-test
+[ f ] [ "aaaxb" "a+ab" <regexp> matches? ] unit-test
+[ t ] [ "aaacb" "a+cb" <regexp> matches? ] unit-test
+[ f ] [ "aaaab" "a++ab" <regexp> matches? ] unit-test
+[ t ] [ "aaacb" "a++cb" <regexp> matches? ] unit-test
+
+! match-head reports how much of the start of the string was matched:
+! a* and aa? take as many a's as they can, while the ?-suffixed forms
+! a+? and aa?? take as few as the rest of the pattern allows.
+[ 3 ] [ "aaacb" "a*" <regexp> match-head ] unit-test
+[ 1 ] [ "aaacb" "a+?" <regexp> match-head ] unit-test
+[ 2 ] [ "aaacb" "aa?" <regexp> match-head ] unit-test
+[ 1 ] [ "aaacb" "aa??" <regexp> match-head ] unit-test
+[ 3 ] [ "aacb" "aa?c" <regexp> match-head ] unit-test
+[ 3 ] [ "aacb" "aa??c" <regexp> match-head ] unit-test
+
+! Disabled: these pass an extra t before <regexp>.
+! NOTE(review): presumably a case-insensitivity flag -- confirm before enabling.
+! [ t ] [ "aaa" "AAA" t <regexp> matches? ] unit-test
+! [ f ] [ "aax" "AAA" t <regexp> matches? ] unit-test
+! [ t ] [ "aaa" "A*" t <regexp> matches? ] unit-test
+! [ f ] [ "aaba" "A*" t <regexp> matches? ] unit-test
+! [ t ] [ "b" "[AB]" t <regexp> matches? ] unit-test
+! [ f ] [ "c" "[AB]" t <regexp> matches? ] unit-test
+! [ t ] [ "c" "[A-Z]" t <regexp> matches? ] unit-test
+! [ f ] [ "3" "[A-Z]" t <regexp> matches? ] unit-test
+
+! Smoke test: a long, alternation-heavy pattern must compile without error.
+! The compiled regexp is dropped; nothing is matched against it.
+[ ] [
+    "(0[lL]?|[1-9]\\d{0,9}(\\d{0,9}[lL])?|0[xX]\\p{XDigit}{1,8}(\\p{XDigit}{0,8}[lL])?|0[0-7]{1,11}([0-7]{0,11}[lL])?|([0-9]+\\.[0-9]*|\\.[0-9]+)([eE][+-]?[0-9]+)?[fFdD]?|[0-9]+([eE][+-]?[0-9]+[fFdD]?|([eE][+-]?[0-9]+)?[fFdD]))"
+    <regexp> drop
+] unit-test
+
+! Negative lookahead: (?!foo) rejects input that starts with "foo".
+[ t ] [ "fxxbar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
+[ f ] [ "foobar" "(?!foo).{3}bar" <regexp> matches? ] unit-test
+
+! Disabled: positive lookahead (?=...). Note the second case expects
+! match-head to yield f when nothing matches.
+! [ 3 ] [ "foobar" "foo(?=bar)" <regexp> match-head ] unit-test
+! [ f ] [ "foobxr" "foo(?=bar)" <regexp> match-head ] unit-test
+
+! Disabled: \z anchor.
+! [ f ] [ "foobxr" "foo\\z" <regexp> match-head ] unit-test
+! [ 3 ] [ "foo" "foo\\z" <regexp> match-head ] unit-test
+
+! Disabled: \b assertions.
+! [ 3 ] [ "foo bar" "foo\\b" <regexp> match-head ] unit-test
+! [ f ] [ "fooxbar" "foo\\b" <regexp> matches? ] unit-test
+! [ t ] [ "foo" "foo\\b" <regexp> matches? ] unit-test
+! [ t ] [ "foo bar" "foo\\b bar" <regexp> matches? ] unit-test
+! [ f ] [ "fooxbar" "foo\\bxbar" <regexp> matches? ] unit-test
+! [ f ] [ "foo" "foo\\bbar" <regexp> matches? ] unit-test
+
+! Disabled: \B assertions.
+! [ f ] [ "foo bar" "foo\\B" <regexp> matches? ] unit-test
+! [ 3 ] [ "fooxbar" "foo\\B" <regexp> match-head ] unit-test
+! [ t ] [ "foo" "foo\\B" <regexp> matches? ] unit-test
+! [ f ] [ "foo bar" "foo\\B bar" <regexp> matches? ] unit-test
+! [ t ] [ "fooxbar" "foo\\Bxbar" <regexp> matches? ] unit-test
+! [ f ] [ "foo" "foo\\Bbar" <regexp> matches? ] unit-test
+
+! Disabled: "." and "-" inside a bracket expression, and an escaped dot
+! followed by a class.
+! [ t ] [ "s@f" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
+! [ f ] [ "a" "[a-z.-]@[a-z]" <regexp> matches? ] unit-test
+! [ t ] [ ".o" "\\.[a-z]" <regexp> matches? ] unit-test
+
+! Bug in parsing word: the R' ... ' literal must produce a regexp that
+! matches? accepts, here the one-character pattern "a".
+[ t ] [
+    "a"
+    R' a'
+    matches?
+] unit-test
+
+
+
+
+
+

 ! ((A)(B(C)))
 ! 1.  ((A)(B(C)))
--- a/extra/regexp4/regexp4.factor
+++ b/extra/regexp4/regexp4.factor
@ -4,7 +4,7 @@ USING: accessors arrays assocs combinators kernel math
 sequences namespaces locals combinators.lib state-tables
 math.parser state-parser sets dlists unicode.categories
 math.order quotations shuffle math.ranges splitting
-symbols fry ;
+symbols fry parser ;
 IN: regexp4

 SYMBOLS: eps start-state final-state beginning-of-text
@ -544,6 +544,33 @@ ERROR: unsupported-token token ;
        <vector-table> >>nfa
        dup [ parse-raw-regexp ] [ subset-construction ] bi ;

+! Literal syntax for regexps
+: parse-options ( string -- ? )
+    #! Lame: maps the option string "" to f and "i" to t. There is no
+    #! default branch, so any other string is left for case to reject.
+    #! NOTE(review): t presumably selects case-insensitive matching -- confirm.
+    {
+        { "" [ f ] }
+        { "i" [ t ] }
+    } case ;
+
+: parse-regexp ( accum end -- accum )
+    #! Shared body of the R<delimiter> parsing words: takes the text on the
+    #! current lexer line up to the character `end`, compiles it with
+    #! <regexp>, and appends the result to the parse accumulator.
+    lexer get dup skip-blank
+    ! NOTE(review): assumes change-lexer-column calls the quotation with
+    ! ( column line-text ) and stores its result as the new column -- confirm.
+    ! Under that assumption the quotation finds `end` at or after the column,
+    ! leaves the text before it as the pattern string, and sets the column to
+    ! one past the delimiter.
+    ! NOTE(review): if `end` is absent from the line, index-from presumably
+    ! yields f and 1+ fails -- confirm how an unterminated literal is reported.
+    [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
+    ! Disabled: reading an option suffix after the closing delimiter.
+    ! lexer get dup still-parsing-line?
+    ! [ (parse-token) parse-options ] [ drop f ] if
+    <regexp> parsed ;
+
+! Parsing words for regexp literals. Each word is R followed by an opening
+! delimiter and hands parse-regexp the character that closes the literal,
+! e.g. R' a' or R/ a+b/. The openers ( [ { close with ) ] } respectively;
+! every other delimiter closes with itself.
+: R! CHAR: ! parse-regexp ; parsing
+: R" CHAR: " parse-regexp ; parsing
+: R# CHAR: # parse-regexp ; parsing
+: R' CHAR: ' parse-regexp ; parsing
+: R( CHAR: ) parse-regexp ; parsing
+: R/ CHAR: / parse-regexp ; parsing
+: R@ CHAR: @ parse-regexp ; parsing
+: R[ CHAR: ] parse-regexp ; parsing
+: R` CHAR: ` parse-regexp ; parsing
+: R{ CHAR: } parse-regexp ; parsing
+: R| CHAR: | parse-regexp ; parsing
+
 TUPLE: dfa-traverser
    dfa
    last-state current-state
@ -611,6 +638,9 @@ TUPLE: dfa-traverser
 ! t when match yields a result and the string's length equals that result's
 ! range-length minus one; f otherwise, including when match yields f.
 : matches? ( string regexp -- ? )
    dupd match [ [ length ] [ range-length 1- ] bi* = ] [ drop f ] if* ;

+! Length of the part of string matched from its start (the length slot of
+! the value returned by match), or f when match finds nothing. matches?
+! already treats an f result from match as "no match"; without the same
+! guard here, length>> would be applied to f and throw.
+: match-head ( string regexp -- end/f )
+    match [ length>> ] [ f ] if* ;
+
 ! character classes
 ! TUPLE: range-class from to ;
 ! TUPLE: or-class left right ;