From 113ea0962bde33389c975e22578c761c985e1691 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Mon, 3 Dec 2007 19:20:47 -0500 Subject: [PATCH] Regexp bug fixes and improved literal syntax --- extra/regexp/regexp-tests.factor | 12 ++++ extra/regexp/regexp.factor | 110 +++++++++++++++++++------------ 2 files changed, 81 insertions(+), 41 deletions(-) mode change 100644 => 100755 extra/regexp/regexp-tests.factor mode change 100644 => 100755 extra/regexp/regexp.factor diff --git a/extra/regexp/regexp-tests.factor b/extra/regexp/regexp-tests.factor old mode 100644 new mode 100755 index 021592a772..5cec0af0a9 --- a/extra/regexp/regexp-tests.factor +++ b/extra/regexp/regexp-tests.factor @@ -160,3 +160,15 @@ IN: regexp-tests [ f ] [ "y" "\\x78" matches? ] unit-test [ t ] [ "x" "\\u0078" matches? ] unit-test [ f ] [ "y" "\\u0078" matches? ] unit-test + +[ t ] [ "ab" "a+b" matches? ] unit-test +[ f ] [ "b" "a+b" matches? ] unit-test +[ t ] [ "aab" "a+b" matches? ] unit-test +[ f ] [ "abb" "a+b" matches? ] unit-test + +[ t ] [ "abbbb" "ab*" matches? ] unit-test +[ t ] [ "a" "ab*" matches? ] unit-test +[ f ] [ "abab" "ab*" matches? ] unit-test + +[ f ] [ "x" "\\." matches? ] unit-test +[ t ] [ "." "\\." matches? ] unit-test diff --git a/extra/regexp/regexp.factor b/extra/regexp/regexp.factor old mode 100644 new mode 100755 index d377085018..f1f2d3b1e4 --- a/extra/regexp/regexp.factor +++ b/extra/regexp/regexp.factor @@ -1,15 +1,20 @@ USING: arrays combinators kernel lazy-lists math math.parser namespaces parser parser-combinators parser-combinators.simple promises quotations sequences combinators.lib strings macros -assocs ; +assocs prettyprint.backend ; IN: regexp : or-predicates ( quots -- quot ) [ \ dup add* ] map [ [ t ] ] f short-circuit \ nip add ; -: octal-digit? ( n -- ? ) CHAR: 0 CHAR: 7 between? ; +MACRO: fast-member? ( str -- quot ) + [ dup ] H{ } map>assoc [ key? ] curry ; -: decimal-digit? ( n -- ? ) CHAR: 0 CHAR: 9 between? ; +: octal-digit? ( n -- ? ) + CHAR: 0 CHAR: 7 between? ; + +: decimal-digit? ( n -- ? ) + CHAR: 0 CHAR: 9 between? ; : hex-digit? ( n -- ? ) dup decimal-digit? @@ -19,9 +24,6 @@ IN: regexp dup 0 HEX: 1f between? swap HEX: 7f = or ; -MACRO: fast-member? ( str -- quot ) - [ dup ] H{ } map>assoc [ key? ] curry ; - : punct? ( n -- ? ) "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" fast-member? ; @@ -54,13 +56,6 @@ MACRO: fast-member? ( str -- quot ) "u" token 'hex-digit' 4 exactly-n &> <|> [ hex> ] <@ ; -: 'control-character' ( -- parser ) - "c" token [ LETTER? ] satisfy &> ; - -: 'simple-escape' ( -- parser ) - 'octal' 'hex' 'control-character' <|> <|> - [ [ = ] curry ] <@ ; - : satisfy-tokens ( assoc -- parser ) [ >r token r> [ nip ] curry <@ ] { } assoc>map ; @@ -102,21 +97,26 @@ MACRO: fast-member? ( str -- quot ) { "Space" [ java-blank? ] } } satisfy-tokens "p{" "}" surrounded-by ; +: 'simple-escape' ( -- parser ) + 'octal' + 'hex' <|> + "c" token [ LETTER? ] satisfy &> <|> + any-char-parser <|> + [ [ = ] curry ] <@ ; + : 'escape' ( -- parser ) "\\" token - 'simple-escape' - 'simple-escape-char' <|> + 'simple-escape-char' 'predefined-char-class' <|> - 'posix-character-class' <|> &> ; + 'posix-character-class' <|> + 'simple-escape' <|> &> ; -: 'any-char' "." token [ drop [ drop t ] ] <@ ; +: 'any-char' + "." token [ drop [ drop t ] ] <@ ; : 'char' 'any-char' 'escape' 'ordinary-char' <|> <|> [ satisfy ] <@ ; -: 'string' - 'char' <+> [ ] <@ ; - DEFER: 'regexp' TUPLE: group-result str ; @@ -152,47 +152,75 @@ C: group-result : 'escaped-seq' ( -- parser ) any-char-parser <*> [ token ] <@ "\\Q" "\\E" surrounded-by ; -: 'term' ( -- parser ) +: 'simple' ( -- parser ) 'escaped-seq' 'grouping' <|> - 'string' <|> - 'character-class' <|> - <+> [ ] <@ ; + 'char' <|> + 'character-class' <|> ; : 'interval' ( -- parser ) - 'term' 'integer' "{" "}" surrounded-by <&> [ first2 exactly-n ] <@ - 'term' 'integer' "{" ",}" surrounded-by <&> [ first2 at-least-n ] <@ <|> - 'term' 'integer' "{," "}" surrounded-by <&> [ first2 at-most-n ] <@ <|> - 'term' 'integer' "," token <& 'integer' <&> "{" "}" surrounded-by <&> [ first2 first2 from-m-to-n ] <@ <|> ; + 'simple' 'integer' "{" "}" surrounded-by <&> [ first2 exactly-n ] <@ + 'simple' 'integer' "{" ",}" surrounded-by <&> [ first2 at-least-n ] <@ <|> + 'simple' 'integer' "{," "}" surrounded-by <&> [ first2 at-most-n ] <@ <|> + 'simple' 'integer' "," token <& 'integer' <&> "{" "}" surrounded-by <&> [ first2 first2 from-m-to-n ] <@ <|> ; : 'repetition' ( -- parser ) - 'term' "*" token <& [ <*> ] <@ - 'term' "+" token <& [ <+> ] <@ <|> - 'term' "?" token <& [ ] <@ <|> ; + 'simple' "*" token <& [ <*> ] <@ + 'simple' "+" token <& [ <+> ] <@ <|> + 'simple' "?" token <& [ ] <@ <|> ; -: 'simple' 'term' 'repetition' 'interval' <|> <|> ; - -LAZY: 'union' ( -- parser ) - 'simple' "|" token nonempty-list-of [ ] <@ ; +: 'term' ( -- parser ) + 'simple' 'repetition' 'interval' <|> <|> + <+> [ ] <@ ; LAZY: 'regexp' ( -- parser ) - 'repetition' 'union' <|> ; + 'term' "|" token nonempty-list-of [ ] <@ ; -: 'regexp' just parse-1 ; +TUPLE: regexp source parser ; + +: dup 'regexp' just parse-1 regexp construct-boa ; GENERIC: >regexp ( obj -- parser ) -M: string >regexp 'regexp' just parse-1 ; + +M: string >regexp ; + M: object >regexp ; -: matches? ( string regexp -- ? ) >regexp just parse nil? not ; +: matches? ( string regexp -- ? ) + >regexp regexp-parser just parse nil? not ; +! Literal syntax for regexps : parse-regexp ( accum end -- accum ) lexer get dup skip-blank [ [ index* dup 1+ swap ] 2keep swapd subseq swap ] change-column parsed ; -: R/ CHAR: / parse-regexp ; parsing -: R| CHAR: | parse-regexp ; parsing +: R! CHAR: ! parse-regexp ; parsing : R" CHAR: " parse-regexp ; parsing +: R# CHAR: # parse-regexp ; parsing : R' CHAR: ' parse-regexp ; parsing +: R( CHAR: ) parse-regexp ; parsing +: R/ CHAR: / parse-regexp ; parsing +: R@ CHAR: @ parse-regexp ; parsing +: R[ CHAR: ] parse-regexp ; parsing : R` CHAR: ` parse-regexp ; parsing +: R{ CHAR: } parse-regexp ; parsing +: R| CHAR: | parse-regexp ; parsing + +: find-regexp-syntax ( string -- prefix suffix ) + { + { "R/ " "/" } + { "R! " "!" } + { "R\" " "\"" } + { "R# " "#" } + { "R' " "'" } + { "R( " ")" } + { "R@ " "@" } + { "R[ " "]" } + { "R` " "`" } + { "R{ " "}" } + { "R| " "|" } + } swap [ subseq? not nip ] curry assoc-find drop ; + +M: regexp pprint* + dup regexp-source dup find-regexp-syntax pprint-string ;