From 50948ae9db8c6fb90bbca4f5d4f8f4be6ce07c5a Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Mon, 26 Nov 2007 17:19:29 -0600 Subject: [PATCH] Add character classes, fails on one test case [^] Add lots of unit tests --- extra/regexp/regexp-tests.factor | 66 ++++++++++++++++++++++++++++++++ extra/regexp/regexp.factor | 54 ++++++++++++++++++++++---- 2 files changed, 112 insertions(+), 8 deletions(-) diff --git a/extra/regexp/regexp-tests.factor b/extra/regexp/regexp-tests.factor index 597a4f5143..5ebd6dc4d3 100644 --- a/extra/regexp/regexp-tests.factor +++ b/extra/regexp/regexp-tests.factor @@ -29,6 +29,7 @@ IN: regexp-tests [ f ] [ "" "." matches? ] unit-test [ t ] [ "a" "." matches? ] unit-test [ t ] [ "." "." matches? ] unit-test +[ f ] [ "\n" "." matches? ] unit-test [ f ] [ "" ".+" matches? ] unit-test [ t ] [ "a" ".+" matches? ] unit-test @@ -75,3 +76,68 @@ IN: regexp-tests [ t ] [ "aaa" "a{1,3}" matches? ] unit-test [ f ] [ "aaaa" "a{1,3}" matches? ] unit-test +[ f ] [ "" "[a]" matches? ] unit-test +[ t ] [ "a" "[a]" matches? ] unit-test +[ t ] [ "a" "[abc]" matches? ] unit-test +[ f ] [ "b" "[a]" matches? ] unit-test +[ f ] [ "d" "[abc]" matches? ] unit-test +[ t ] [ "ab" "[abc]{1,2}" matches? ] unit-test +[ f ] [ "abc" "[abc]{1,2}" matches? ] unit-test + +[ f ] [ "" "[^a]" matches? ] unit-test +[ f ] [ "a" "[^a]" matches? ] unit-test +[ f ] [ "a" "[^abc]" matches? ] unit-test +[ t ] [ "b" "[^a]" matches? ] unit-test +[ t ] [ "d" "[^abc]" matches? ] unit-test +[ f ] [ "ab" "[^abc]{1,2}" matches? ] unit-test +[ f ] [ "abc" "[^abc]{1,2}" matches? ] unit-test + +[ t ] [ "]" "[]]" matches? ] unit-test +[ f ] [ "]" "[^]]" matches? ] unit-test + +[ "^" "[^]" matches? ] unit-test-fails +[ t ] [ "^" "[]^]" matches? ] unit-test +[ t ] [ "]" "[]^]" matches? ] unit-test + +[ t ] [ "[" "[[]" matches? ] unit-test +[ f ] [ "^" "[^^]" matches? ] unit-test +[ t ] [ "a" "[^^]" matches? ] unit-test + +[ t ] [ "-" "[-]" matches? ] unit-test +[ f ] [ "a" "[-]" matches? ] unit-test +[ f ] [ "-" "[^-]" matches? ] unit-test +[ t ] [ "a" "[^-]" matches? ] unit-test + +[ t ] [ "-" "[-a]" matches? ] unit-test +[ t ] [ "a" "[-a]" matches? ] unit-test +[ t ] [ "-" "[a-]" matches? ] unit-test +[ t ] [ "a" "[a-]" matches? ] unit-test +[ f ] [ "b" "[a-]" matches? ] unit-test +[ f ] [ "-" "[^-]" matches? ] unit-test +[ t ] [ "a" "[^-]" matches? ] unit-test + +[ f ] [ "-" "[a-c]" matches? ] unit-test +[ t ] [ "-" "[^a-c]" matches? ] unit-test +[ t ] [ "b" "[a-c]" matches? ] unit-test +[ f ] [ "b" "[^a-c]" matches? ] unit-test + +[ t ] [ "-" "[a-c-]" matches? ] unit-test +[ f ] [ "-" "[^a-c-]" matches? ] unit-test + +[ t ] [ "\\" "[\\\\]" matches? ] unit-test +[ f ] [ "a" "[\\\\]" matches? ] unit-test +[ f ] [ "\\" "[^\\\\]" matches? ] unit-test +[ t ] [ "a" "[^\\\\]" matches? ] unit-test + +[ t ] [ "0" "[\\d]" matches? ] unit-test +[ f ] [ "a" "[\\d]" matches? ] unit-test +[ f ] [ "0" "[^\\d]" matches? ] unit-test +[ t ] [ "a" "[^\\d]" matches? ] unit-test + +[ t ] [ "a" "[a-z]{1,}|[A-Z]{2,4}|b*|c|(f|g)*" matches? ] unit-test +[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}|b*|c|(f|g)*" matches? ] unit-test +[ t ] [ "a" "[a-z]{1,2}|[A-Z]{3,3}" matches? ] unit-test + +[ t ] [ "1000" "\\d{4,6}" matches? ] unit-test +! [ t ] [ "1000" "[0-9]{4,6}" matches? ] unit-test + diff --git a/extra/regexp/regexp.factor b/extra/regexp/regexp.factor index 02d66ee59b..8fdc1bed8b 100644 --- a/extra/regexp/regexp.factor +++ b/extra/regexp/regexp.factor @@ -1,6 +1,6 @@ USING: arrays combinators kernel lazy-lists math math.parser namespaces parser parser-combinators parser-combinators.simple -promises sequences sequences.lib strings ; +promises quotations sequences sequences.lib strings ; USING: continuations io prettyprint ; IN: regexp @@ -9,22 +9,29 @@ IN: regexp : escaped-char { - { CHAR: d [ [ digit? ] satisfy ] } - { CHAR: D [ [ digit? not ] satisfy ] } - { CHAR: s [ [ blank? ] satisfy ] } - { CHAR: S [ [ blank? not ] satisfy ] } - [ ] + { CHAR: d [ [ digit? ] ] } + { CHAR: D [ [ digit? not ] ] } + { CHAR: s [ [ blank? ] ] } + { CHAR: S [ [ blank? not ] ] } + { CHAR: \\ [ [ CHAR: \\ = ] ] } + [ "bad \\, use \\\\ to match a literal \\" throw ] } case ; : 'escaped-char' "\\" token any-char-parser &> [ escaped-char ] <@ ; +! Must escape to use as literals +! : meta-chars "[\\^$.|?*+()" ; + : 'ordinary-char' - [ "^*+?|(){}[]" member? not ] satisfy [ 1string token ] <@ ; + [ "\\^*+?|(){}[" member? not ] satisfy ; : 'char' 'escaped-char' 'ordinary-char' <|> ; -: 'string' 'char' <+> [ [ <&> ] reduce* ] <@ ; +: 'string' + 'char' <+> [ + [ dup quotation? [ satisfy ] [ 1token ] if ] [ <&> ] map-reduce + ] <@ ; : exactly-n ( parser n -- parser' ) swap and-parser construct-boa ; @@ -54,10 +61,41 @@ C: group-result 'regexp' [ [ ] <@ ] <@ ")" token <& &> ; +! Special cases: ]\\^- +: predicates>cond ( seq -- quot ) + #! Takes an array of quotation predicates/objects and makes a cond + #! Makes a predicate of each obj like so: [ dup obj = ] + #! Leaves quotations alone + #! The cond returns a boolean, t if one of the predicates matches + [ + dup callable? [ [ = ] curry ] unless + [ dup ] swap compose [ drop t ] 2array + ] map { [ t ] [ drop f ] } add [ cond ] curry ; + +: 'range' + any-char-parser "-" token <& any-char-parser <&> + [ first2 [ between? ] 2curry ] <@ ; + +: 'character-class-contents' + 'escaped-char' + 'range' <|> + [ "\\]" member? not ] satisfy <|> ; + +: 'character-class' + "[" token + "^" token 'character-class-contents' <+> <&:> + [ predicates>cond [ not ] compose satisfy ] <@ + "]" token [ first ] <@ 'character-class-contents' <*> <&:> + [ predicates>cond satisfy ] <@ <|> + 'character-class-contents' <+> [ predicates>cond satisfy ] <@ <|> + &> + "]" token <& ; + : 'term' 'any-char' 'string' <|> 'grouping' <|> + 'character-class' <|> <+> [ dup length 1 = [ first ] [ and-parser construct-boa ] if