From ababfe80efee2df3e9ec2e1fae5064b5d91c3aff Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Mon, 16 Mar 2009 17:53:38 -0500 Subject: [PATCH] More regexp docs; unix line ending support --- basis/regexp/ast/ast.factor | 3 +- basis/regexp/classes/classes.factor | 2 +- basis/regexp/compiler/compiler.factor | 9 ++- basis/regexp/nfa/nfa.factor | 9 ++- basis/regexp/parser/parser.factor | 36 +++++------ basis/regexp/regexp-docs.factor | 86 ++++++++++++++++++++++++--- basis/regexp/regexp-tests.factor | 10 ++++ 7 files changed, 118 insertions(+), 37 deletions(-) diff --git a/basis/regexp/ast/ast.factor b/basis/regexp/ast/ast.factor index ffaed2db62..1c11ed5c7d 100644 --- a/basis/regexp/ast/ast.factor +++ b/basis/regexp/ast/ast.factor @@ -37,8 +37,7 @@ C: with-options TUPLE: options on off ; C: options -SINGLETONS: unix-lines dotall multiline comments case-insensitive -unicode-case reversed-regexp ; +SINGLETONS: unix-lines dotall multiline case-insensitive reversed-regexp ; : ( term -- term' ) f 2array ; diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor index d26ff7f69c..e3a1774585 100644 --- a/basis/regexp/classes/classes.factor +++ b/basis/regexp/classes/classes.factor @@ -12,7 +12,7 @@ ascii-class punctuation-class java-printable-class blank-class control-character-class hex-digit-class java-blank-class c-identifier-class unmatchable-class terminator-class word-boundary-class ; -SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file word-break ; +SINGLETONS: beginning-of-input ^ end-of-input $ end-of-file ^unix $unix word-break ; TUPLE: range from to ; C: range diff --git a/basis/regexp/compiler/compiler.factor b/basis/regexp/compiler/compiler.factor index b55cab6294..95511965d1 100644 --- a/basis/regexp/compiler/compiler.factor +++ b/basis/regexp/compiler/compiler.factor @@ -17,9 +17,6 @@ SYMBOL: backwards? 
M: t question>quot drop [ 2drop t ] ; M: f question>quot drop [ 2drop f ] ; -M: not-class question>quot - class>> question>quot [ not ] compose ; - M: beginning-of-input question>quot drop [ drop zero? ] ; @@ -40,6 +37,12 @@ M: $ question>quot M: ^ question>quot drop [ { [ drop zero? ] [ [ 1- ] dip ?nth "\r\n" member? ] } 2|| ] ; +M: $unix question>quot + drop [ { [ length = ] [ ?nth CHAR: \n = ] } 2|| ] ; + +M: ^unix question>quot + drop [ { [ drop zero? ] [ [ 1- ] dip ?nth CHAR: \n = ] } 2|| ] ; + M: word-break question>quot drop [ word-break-at? ] ; diff --git a/basis/regexp/nfa/nfa.factor b/basis/regexp/nfa/nfa.factor index 20be6b87d8..d59d4818ec 100644 --- a/basis/regexp/nfa/nfa.factor +++ b/basis/regexp/nfa/nfa.factor @@ -60,11 +60,16 @@ GENERIC: modify-epsilon ( tag -- newtag ) M: object modify-epsilon ; +: line-option ( multiline unix-lines default -- option ) + multiline option? [ + drop [ unix-lines option? ] 2dip swap ? + ] [ 2nip ] if ; + M: $ modify-epsilon - multiline option? [ drop end-of-input ] unless ; + $unix end-of-input line-option ; M: ^ modify-epsilon - multiline option? [ drop beginning-of-input ] unless ; + ^unix beginning-of-input line-option ; M: tagged-epsilon nfa-node clone [ modify-epsilon ] change-tag add-simple-entry ; diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor index c6a69f2508..7b2d6af2c1 100644 --- a/basis/regexp/parser/parser.factor +++ b/basis/regexp/parser/parser.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: peg.ebnf kernel math.parser sequences assocs arrays fry math combinators regexp.classes strings splitting peg locals accessors -regexp.ast ; +regexp.ast unicode.case ; IN: regexp.parser : allowed-char? ( ch -- ? 
) @@ -19,20 +19,19 @@ ERROR: bad-number ; ERROR: bad-class name ; : name>class ( name -- class ) - { - { "Lower" letter-class } - { "Upper" LETTER-class } - { "Alpha" Letter-class } - { "ASCII" ascii-class } - { "Digit" digit-class } - { "Alnum" alpha-class } - { "Punct" punctuation-class } - { "Graph" java-printable-class } - { "Print" java-printable-class } - { "Blank" non-newline-blank-class } - { "Cntrl" control-character-class } - { "XDigit" hex-digit-class } - { "Space" java-blank-class } + >string >case-fold { + { "lower" letter-class } + { "upper" LETTER-class } + { "alpha" Letter-class } + { "ascii" ascii-class } + { "digit" digit-class } + { "alnum" alpha-class } + { "punct" punctuation-class } + { "graph" java-printable-class } + { "blank" non-newline-blank-class } + { "cntrl" control-character-class } + { "xdigit" hex-digit-class } + { "space" java-blank-class } ! TODO: unicode-character-class } [ bad-class ] at-error ; @@ -66,11 +65,8 @@ ERROR: bad-class name ; { CHAR: i case-insensitive } { CHAR: d unix-lines } { CHAR: m multiline } - { CHAR: n multiline } { CHAR: r reversed-regexp } { CHAR: s dotall } - { CHAR: u unicode-case } - { CHAR: x comments } } ; : ch>option ( ch -- singleton ) @@ -101,8 +97,8 @@ CharacterInBracket = !("}") Character QuotedCharacter = !("\\E") . 
-Escape = "p{" CharacterInBracket*:s "}" => [[ s >string name>class ]] - | "P{" CharacterInBracket*:s "}" => [[ s >string name>class ]] +Escape = "p{" CharacterInBracket*:s "}" => [[ s name>class ]] + | "P{" CharacterInBracket*:s "}" => [[ s name>class ]] | "Q" QuotedCharacter*:s "\\E" => [[ s ]] | "u" Character:a Character:b Character:c Character:d => [[ { a b c d } hex> ensure-number ]] diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor index b35f8d1cf3..a7cb0a3715 100644 --- a/basis/regexp/regexp-docs.factor +++ b/basis/regexp/regexp-docs.factor @@ -33,20 +33,71 @@ ARTICLE: { "regexp" "construction" } "Constructing regular expressions" "Another approach is to use " { $vocab-link "regexp.combinators" } "." ; ARTICLE: { "regexp" "syntax" } "Regular expression syntax" -"Regexp syntax is largely compatible with Perl, Java and extended POSIX regexps, but not completely. A new addition is the inclusion of a negation operator, with the syntax " { $snippet "(?~foo)" } " to match everything that does not match " { $snippet "foo" } "." +"Regexp syntax is largely compatible with Perl, Java and extended POSIX regexps, but not completely. Below, the syntax is documented." { $heading "Characters" } +"At their core, regular expressions consist of character literals. For example, " { $snippet "R/ f/" } " is a regular expression matching just the string 'f'. In addition, the normal escape codes are provided, like " { $snippet "\\t" } " for the tab character and " { $snippet "\\uxxxxxx" } " for an arbitrary Unicode code point, by its hex value. In addition, any character can be preceded by a backslash to escape it, unless this has special meaning. For example, to match a literal opening parenthesis, use " { $snippet "\\(" } "." +{ $heading "Concatenation, alternation and grouping" } +"Regular expressions can be built out of multiple characters by concatenation. For example, " { $snippet "R/ ab/" } " matches a followed by b. 
The " { $snippet "|" } " (alternation) operator can construct a regexp which matches one of two alternatives. Parentheses can be used for grouping. So " { $snippet "R/ f(oo|ar)/" } " would match either 'foo' or 'far'." { $heading "Character classes" } +"Square brackets define a convenient way to refer to a set of characters. For example, " { $snippet "[ab]" } " refers to either a or b. And " { $snippet "[a-z]" } " refers to all of the characters between a and z, in code point order. You can use these together, as in " { $snippet "[ac-fz]" } " which matches all of the characters between c and f, in addition to a and z. Character classes can be negated using a caret, as in " { $snippet "[^a]" } " which matches all characters which are not a." { $heading "Predefined character classes" } +"Several character classes are predefined, both for convenience and because they are too large to represent directly. In Factor regular expressions, all character classes are Unicode-aware." +{ $table + { { $snippet "\\d" } "Digits" } + { { $snippet "\\D" } "Not digits" } + { { $snippet "\\s" } "Whitespace" } + { { $snippet "\\S" } "Not whitespace" } + { { $snippet "\\w" } "Word character (alphanumeric or underscore)" } + { { $snippet "\\W" } "Not word character" } + { { $snippet "\\p{property}" } "Character which fulfils the property" } + { { $snippet "\\P{property}" } "Character which does not fulfil the property" } } +"Properties for " { $snippet "\\p" } " and " { $snippet "\\P" } " (case-insensitive):" +{ $table + { { $snippet "\\p{lower}" } "Lower case letters" } + { { $snippet "\\p{upper}" } "Upper case letters" } + { { $snippet "\\p{alpha}" } "Letters" } + { { $snippet "\\p{ascii}" } "Characters in the ASCII range" } + { { $snippet "\\p{alnum}" } "Letters or numbers" } + { { $snippet "\\p{punct}" } "Punctuation" } + { { $snippet "\\p{blank}" } "Non-newline whitespace" } + { { $snippet "\\p{cntrl}" } "Control character" } + { { $snippet "\\p{space}" } "Whitespace" } + { { 
$snippet "\\p{xdigit}" } "Hexadecimal digit" } } ! In the future: Unicode +"Full unicode properties are not yet supported." { $heading "Boundaries" } +"Special operators exist to match certain points in the string. These are called 'zero-width' because they do not consume any characters." +{ $table + { { $snippet "^" } "Beginning of a line" } + { { $snippet "$" } "End of a line" } + { { $snippet "\\A" } "Beginning of text" } + { { $snippet "\\z" } "End of text" } + { { $snippet "\\Z" } "Almost end of text: only thing after is newline" } + { { $snippet "\\b" } "Word boundary (by Unicode word boundaries)" } + { { $snippet "\\B" } "Not word boundary (by Unicode word boundaries)" } } { $heading "Greedy quantifiers" } -{ $heading "Reluctant quantifiers" } -{ $heading "Posessive quantifiers" } -{ $heading "Logical operations" } +"It is possible to have a regular expression which matches a variable number of occurrences of another regular expression." +{ $table + { { $snippet "a*" } "Zero or more occurrences of a" } + { { $snippet "a+" } "One or more occurrences of a" } + { { $snippet "a?" } "Zero or one occurrences of a" } + { { $snippet "a{n}" } "n occurrences of a" } + { { $snippet "a{n,}" } "At least n occurrences of a" } + { { $snippet "a{,m}" } "At most m occurrences of a" } + { { $snippet "a{n,m}" } "Between n and m occurrences of a" } } +"All of these quantifiers are " { $emphasis "greedy" } ", meaning that they take as many repetitions as possible within the larger regular expression. Reluctant and possessive quantifiers are not yet supported." { $heading "Lookaround" } +"Operators are provided to look ahead and behind the current point in the regular expression. These can be used in any context, but they're the most useful at the beginning or end of a regular expression." 
+{ $table + { { $snippet "(?=a)" } "Asserts that the current position is immediately followed by a" } + { { $snippet "(?!a)" } "Asserts that the current position is not immediately followed by a" } + { { $snippet "(?<=a)" } "Asserts that the current position is immediately preceded by a" } + { { $snippet "(? matches? ] unit-test [ 3 ] [ "caba" "(?<=b)a" first-match from>> ] unit-test + +[ t ] [ "\ra" R/ .^a/ms matches? ] unit-test +[ f ] [ "\ra" R/ .^a/mds matches? ] unit-test +[ t ] [ "\na" R/ .^a/ms matches? ] unit-test +[ t ] [ "\na" R/ .^a/mds matches? ] unit-test + +[ t ] [ "a\r" R/ a$./ms matches? ] unit-test +[ f ] [ "a\r" R/ a$./mds matches? ] unit-test +[ t ] [ "a\n" R/ a$./ms matches? ] unit-test +[ t ] [ "a\n" R/ a$./mds matches? ] unit-test