From f3038f2ae866d994ed9d31d56a2eb8b79f4b33a9 Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <littledan@Macintosh-122.local>
Date: Fri, 20 Mar 2009 19:53:54 -0500
Subject: [PATCH 1/6] More docs for Unicode and simple-flat-file; moving more
 parsing code there

---
 basis/regexp/regexp-docs.factor               |  4 +--
 .../simple-flat-file-docs.factor              | 22 ++++++++++++--
 .../simple-flat-file/simple-flat-file.factor  | 24 ++++++++++++++-
 basis/unicode/breaks/breaks.factor            |  5 ++--
 basis/unicode/data/data-docs.factor           | 30 +++++++++++++++----
 basis/unicode/data/data.factor                | 26 +---------------
 basis/unicode/script/script.factor            |  8 ++---
 basis/unicode/syntax/syntax.factor            |  7 ++---
 8 files changed, 77 insertions(+), 49 deletions(-)

diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor
index 6d9f03781d..01a727d017 100644
--- a/basis/regexp/regexp-docs.factor
+++ b/basis/regexp/regexp-docs.factor
@@ -45,7 +45,7 @@ ARTICLE: { "regexp" "construction" } "Constructing regular expressions"
 ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
 "Regexp syntax is largely compatible with Perl, Java and extended POSIX regexps, but not completely. Below, the syntax is documented."
 { $heading "Characters" }
-"At its core, regular expressions consist of character literals. For example, " { $snippet "R/ f/" } " is a regular expression matching just the string 'f'. In addition, the normal escape codes are provided, like " { $snippet "\\t" } " for the tab character and " { $snippet "\\uxxxxxx" } "for an arbitrary Unicode code point, by its hex value. In addition, any character can be preceded by a backslash to escape it, unless this has special meaning. For example, to match a literal opening parenthesis, use " { $snippet "\\(" } "."
+"At its core, regular expressions consist of character literals. For example, " { $snippet "R/ f/" } " is a regular expression matching just the string 'f'. In addition, the normal escape codes are provided, like " { $snippet "\\t" } " for the tab character and " { $snippet "\\uxxxxxx" } " for an arbitrary Unicode code point, by its hex value. In addition, any character can be preceded by a backslash to escape it, unless this has special meaning. For example, to match a literal opening parenthesis, use " { $snippet "\\(" } "."
 { $heading "Concatenation, alternation and grouping" }
 "Regular expressions can be built out of multiple characters by concatenation. For example, " { $snippet "R/ ab/" } " matches a followed by b. The " { $snippet "|" } " (alternation) operator can construct a regexp which matches one of two alternatives. Parentheses can be used for gropuing. So " { $snippet "R/ f(oo|ar)/" } " would match either 'foo' or 'far'."
 { $heading "Character classes" }
@@ -72,7 +72,7 @@ ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
     { { $snippet "\\p{blank}" } "Non-newline whitespace" }
     { { $snippet "\\p{cntrl}" } "Control character" }
     { { $snippet "\\p{space}" } "Whitespace" }
-    { { $snippet "\\p{xdigit}" } "Hexidecimal digit" }
+    { { $snippet "\\p{xdigit}" } "Hexadecimal digit" }
     { { $snippet "\\p{Nd}" } "Character in Unicode category Nd" } 
     { { $snippet "\\p{Z}" } "Character in Unicode category beginning with Z" } 
     { { $snippet "\\p{script=Cham}" } "Character in the Cham writing system" } }
diff --git a/basis/simple-flat-file/simple-flat-file-docs.factor b/basis/simple-flat-file/simple-flat-file-docs.factor
index 9ed5de7d2b..0223d94af9 100644
--- a/basis/simple-flat-file/simple-flat-file-docs.factor
+++ b/basis/simple-flat-file/simple-flat-file-docs.factor
@@ -1,8 +1,24 @@
-USING: help.syntax help.markup strings ;
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: help.syntax help.markup strings biassocs arrays ;
 IN: simple-flat-file
 
 ABOUT: "simple-flat-file"
 
 ARTICLE: "simple-flat-file" "Parsing simple flat files"
-"The " { $vocab-link "simple-flat-file" } " vocabulary provides words for loading and parsing simple flat files in a particular format which is common for encoding tasks."
-{ $subsection flat-file>biassoc } ;
+"The " { $vocab-link "simple-flat-file" } " vocabulary provides words for loading and parsing simple flat files in a particular format which is common for encoding and Unicode tasks."
+{ $subsection flat-file>biassoc }
+{ $subsection load-interval-file }
+{ $subsection data } ;
+
+HELP: load-interval-file
+{ $values { "filename" string } { "table" "an interval map" } }
+{ $description "This loads a file that looks like Script.txt in the Unicode Character Database and converts it into an efficient interval map, where the keys are characters and the values are strings for the properties." } ;
+
+HELP: data
+{ $values { "filename" string } { "data" array } }
+{ $description "This loads a file that's delineated by semicolons and lines, returning an array of lines, where each line is an array split by the semicolons, with whitespace trimmed off." } ;
+
+HELP: flat-file>biassoc
+{ $values { "filename" string } { "biassoc" biassoc } }
+{ $description "This loads a flat file, in the form that many encoding resource files are in, with two columns of numeric data in hex, and returns a biassoc associating them." } ;
diff --git a/basis/simple-flat-file/simple-flat-file.factor b/basis/simple-flat-file/simple-flat-file.factor
index 6e53c97738..88a64b7746 100644
--- a/basis/simple-flat-file/simple-flat-file.factor
+++ b/basis/simple-flat-file/simple-flat-file.factor
@@ -1,7 +1,7 @@
 ! Copyright (C) 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
 USING: sequences splitting kernel math.parser io.files io.encodings.utf8
-biassocs ascii ;
+biassocs ascii namespaces arrays make assocs interval-maps sets ;
 IN: simple-flat-file
 
 : drop-comments ( seq -- newseq )
@@ -30,3 +30,25 @@ IN: simple-flat-file
 
 : data ( filename -- data )
     utf8 file-lines drop-comments [ split-; ] map ;
+
+SYMBOL: interned
+
+: range, ( value key -- )
+    swap interned get
+    [ = ] with find nip 2array , ;
+
+: expand-ranges ( assoc -- interval-map )
+    [
+        [
+            swap CHAR: . over member? [
+                ".." split1 [ hex> ] bi@ 2array
+            ] [ hex> ] if range,
+        ] assoc-each
+    ] { } make <interval-map> ;
+
+: process-interval-file ( ranges -- table )
+    dup values prune interned
+    [ expand-ranges ] with-variable ;
+
+: load-interval-file ( filename -- table )
+    data process-interval-file ;
diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor
index 91f6a45911..f397ebb2de 100644
--- a/basis/unicode/breaks/breaks.factor
+++ b/basis/unicode/breaks/breaks.factor
@@ -5,7 +5,8 @@ combinators splitting sequences math.parser io.files io assocs
 arrays namespaces make math.ranges unicode.normalize
 unicode.normalize.private values io.encodings.ascii
 unicode.syntax unicode.data compiler.units fry
-alien.syntax sets accessors interval-maps memoize locals words ;
+alien.syntax sets accessors interval-maps memoize locals words
+simple-flat-file ;
 IN: unicode.breaks
 
 <PRIVATE
@@ -127,7 +128,7 @@ to: grapheme-table
 
 VALUE: word-break-table
 
-"vocab:unicode/data/WordBreakProperty.txt" load-key-value
+"vocab:unicode/data/WordBreakProperty.txt" load-interval-file
 to: word-break-table
 
 C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
diff --git a/basis/unicode/data/data-docs.factor b/basis/unicode/data/data-docs.factor
index d1a458eb48..82706729bf 100644
--- a/basis/unicode/data/data-docs.factor
+++ b/basis/unicode/data/data-docs.factor
@@ -6,7 +6,7 @@ IN: unicode.data
 ABOUT: "unicode.data"
 
 ARTICLE: "unicode.data" "Unicode data tables"
-"The " { $vocab-link "unicode.data" "unicode.data" } " vocabulary contains core Unicode data tables and code for parsing this from files."
+"The " { $vocab-link "unicode.data" "unicode.data" } " vocabulary contains core Unicode data tables and code for parsing this from files. The following words access these data tables."
 { $subsection canonical-entry }
 { $subsection combine-chars }
 { $subsection combining-class }
@@ -14,7 +14,11 @@ ARTICLE: "unicode.data" "Unicode data tables"
 { $subsection name>char }
 { $subsection char>name }
 { $subsection property? }
-{ $subsection load-key-value } ;
+{ $subsection category }
+{ $subsection ch>upper }
+{ $subsection ch>lower } 
+{ $subsection ch>title } 
+{ $subsection special-case } ;
 
 HELP: canonical-entry
 { $values { "char" "a code point" } { "seq" string } }
@@ -48,6 +52,22 @@ HELP: property?
 { $values { "char" "a code point" } { "property" string } { "?" "a boolean" } }
 { $description "Tests whether the code point is listed under the given property in PropList.txt in the Unicode Character Database." } ;
 
-HELP: load-key-value
-{ $values { "filename" string } { "table" "an interval map" } }
-{ $description "This loads a file that looks like Script.txt in the Unicode Character Database and converts it into an efficient interval map, where the keys are characters and the values are strings for the properties." } ;
+HELP: category
+{ $values { "char" "a code point" } { "category" string } }
+{ $description "Returns the general category of a code point, in the form of a string. This will always be a string within the ASCII range of length two. If the code point is unassigned, then it returns " { $snippet "Cn" } "." } ;
+
+HELP: ch>upper
+{ $values { "ch" "a code point" } { "upper" "a code point" } }
+{ $description "Returns the simple upper-cased version of the code point, if it exists. This does not handle context-sensitive or locale-dependent properties of linguistically accurate case conversion, and does not correctly handle characters which become multiple characters on conversion to this case." } ;
+
+HELP: ch>lower
+{ $values { "ch" "a code point" } { "lower" "a code point" } }
+{ $description "Returns the simple lower-cased version of the code point, if it exists. This does not handle context-sensitive or locale-dependent properties of linguistically accurate case conversion, and does not correctly handle characters which become multiple characters on conversion to this case." } ;
+
+HELP: ch>title
+{ $values { "ch" "a code point" } { "title" "a code point" } }
+{ $description "Returns the simple title-cased version of the code point, if it exists. This does not handle context-sensitive or locale-dependent properties of linguistically accurate case conversion, and does not correctly handle characters which become multiple characters on conversion to this case." } ;
+
+HELP: special-case
+{ $values { "ch" "a code point" } { "casing-tuple" { "a tuple, or " { $link f } } } }
+{ $description "If a code point has special casing behavior, returns a tuple which represents that information." } ;
diff --git a/basis/unicode/data/data.factor b/basis/unicode/data/data.factor
index e94036a85e..779ae64d48 100644
--- a/basis/unicode/data/data.factor
+++ b/basis/unicode/data/data.factor
@@ -58,7 +58,7 @@ CONSTANT: num-chars HEX: 2FA1E
 
 PRIVATE>
 
-: category# ( char -- category )
+: category# ( char -- n )
     ! There are a few characters that should be Cn
     ! that this gives Cf or Mn
     ! Cf = 26; Mn = 5; Cn = 29
@@ -219,27 +219,3 @@ load-properties to: properties
 
 [ name>char [ "Invalid character" throw ] unless* ]
 name>char-hook set-global
-
-SYMBOL: interned
-
-: range, ( value key -- )
-    swap interned get
-    [ = ] with find nip 2array , ;
-
-: expand-ranges ( assoc -- interval-map )
-    [
-        [
-            swap CHAR: . over member? [
-                ".." split1 [ hex> ] bi@ 2array
-            ] [ hex> ] if range,
-        ] assoc-each
-    ] { } make <interval-map> ;
-
-: process-key-value ( ranges -- table )
-    dup values prune interned
-    [ expand-ranges ] with-variable ;
-
-PRIVATE>
-
-: load-key-value ( filename -- table )
-    data process-key-value ;
diff --git a/basis/unicode/script/script.factor b/basis/unicode/script/script.factor
index ed80476084..4243c81623 100644
--- a/basis/unicode/script/script.factor
+++ b/basis/unicode/script/script.factor
@@ -1,17 +1,13 @@
 ! Copyright (C) 2008 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors values kernel sequences assocs io.files
-io.encodings ascii math.ranges io splitting math.parser
-namespaces make byte-arrays locals math sets io.encodings.ascii
-words words.symbol compiler.units arrays interval-maps
-unicode.data ;
+USING: values interval-maps simple-flat-file ;
 IN: unicode.script
 
 <PRIVATE
 
 VALUE: script-table
 
-"vocab:unicode/script/Scripts.txt" load-key-value
+"vocab:unicode/script/Scripts.txt" load-interval-file
 to: script-table
 
 PRIVATE>
diff --git a/basis/unicode/syntax/syntax.factor b/basis/unicode/syntax/syntax.factor
index 5bd8c05e15..a42adb42ef 100644
--- a/basis/unicode/syntax/syntax.factor
+++ b/basis/unicode/syntax/syntax.factor
@@ -2,7 +2,7 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: unicode.data kernel math sequences parser lexer
 bit-arrays namespaces make sequences.private arrays quotations
-assocs classes.predicate math.order strings.parser ;
+assocs classes.predicate math.order strings.parser sets ;
 IN: unicode.syntax
 
 <PRIVATE
@@ -30,9 +30,6 @@ PRIVATE>
 : CATEGORY:
     CREATE ";" parse-tokens define-category ; parsing
 
-: seq-minus ( seq1 seq2 -- diff )
-    [ member? not ] curry filter ;
-
 : CATEGORY-NOT:
     CREATE ";" parse-tokens
-    categories swap seq-minus define-category ; parsing
+    categories swap diff define-category ; parsing

From 62638fb4d30fc1b6126ab84737cfec7305546b56 Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <littledan@Macintosh-122.local>
Date: Sat, 21 Mar 2009 01:11:45 -0500
Subject: [PATCH 2/6] Moving unicode.syntax to unicode.categories.syntax;
 documenting and modifying syntax

---
 basis/unicode/breaks/breaks.factor            |  8 ++---
 basis/unicode/case/case.factor                |  4 +--
 .../unicode/categories/categories-docs.factor |  7 +++-
 basis/unicode/categories/categories.factor    | 13 +++----
 .../{ => categories}/syntax/authors.txt       |  0
 .../{ => categories}/syntax/summary.txt       |  0
 .../categories/syntax/syntax-docs.factor      | 19 ++++++++++
 .../categories/syntax/syntax-tests.factor     |  3 ++
 basis/unicode/categories/syntax/syntax.factor | 36 +++++++++++++++++++
 .../unicode/{ => categories}/syntax/tags.txt  |  0
 basis/unicode/collation/collation.factor      |  2 +-
 basis/unicode/normalize/normalize.factor      |  2 +-
 basis/unicode/syntax/syntax.factor            | 35 ------------------
 basis/unicode/unicode-docs.factor             |  2 +-
 basis/xml/char-classes/char-classes.factor    | 25 ++++++++-----
 15 files changed, 96 insertions(+), 60 deletions(-)
 rename basis/unicode/{ => categories}/syntax/authors.txt (100%)
 rename basis/unicode/{ => categories}/syntax/summary.txt (100%)
 create mode 100644 basis/unicode/categories/syntax/syntax-docs.factor
 create mode 100644 basis/unicode/categories/syntax/syntax-tests.factor
 create mode 100644 basis/unicode/categories/syntax/syntax.factor
 rename basis/unicode/{ => categories}/syntax/tags.txt (100%)
 delete mode 100644 basis/unicode/syntax/syntax.factor

diff --git a/basis/unicode/breaks/breaks.factor b/basis/unicode/breaks/breaks.factor
index f397ebb2de..22d6cddfb9 100644
--- a/basis/unicode/breaks/breaks.factor
+++ b/basis/unicode/breaks/breaks.factor
@@ -4,7 +4,7 @@ USING: combinators.short-circuit unicode.categories kernel math
 combinators splitting sequences math.parser io.files io assocs
 arrays namespaces make math.ranges unicode.normalize
 unicode.normalize.private values io.encodings.ascii
-unicode.syntax unicode.data compiler.units fry
+unicode.data compiler.units fry unicode.categories.syntax
 alien.syntax sets accessors interval-maps memoize locals words
 simple-flat-file ;
 IN: unicode.breaks
@@ -32,9 +32,9 @@ CATEGORY: grapheme-control Zl Zp Cc Cf ;
         [ drop Control ]
     } case ;
 
-CATEGORY: (extend) Me Mn ;
-: extend? ( ch -- ? )
-    { [ (extend)? ] [ "Other_Grapheme_Extend" property? ] } 1|| ;
+CATEGORY: extend
+    Me Mn |
+    "Other_Grapheme_Extend" property? ;
 
 : loe? ( ch -- ? )
     "Logical_Order_Exception" property? ;
diff --git a/basis/unicode/case/case.factor b/basis/unicode/case/case.factor
index fa842b8b81..1ad3931746 100644
--- a/basis/unicode/case/case.factor
+++ b/basis/unicode/case/case.factor
@@ -1,8 +1,8 @@
 ! Copyright (C) 2008, 2009 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: unicode.data sequences namespaces
-sbufs make unicode.syntax unicode.normalize math hints
-unicode.categories combinators unicode.syntax assocs combinators.short-circuit
+sbufs make unicode.normalize math hints
+unicode.categories combinators assocs combinators.short-circuit
 strings splitting kernel accessors unicode.breaks fry locals ;
 QUALIFIED: ascii
 IN: unicode.case
diff --git a/basis/unicode/categories/categories-docs.factor b/basis/unicode/categories/categories-docs.factor
index b0870e28fb..924b197417 100644
--- a/basis/unicode/categories/categories-docs.factor
+++ b/basis/unicode/categories/categories-docs.factor
@@ -12,6 +12,9 @@ HELP: Letter
 HELP: alpha
 { $class-description "The class of alphanumeric characters." } ;
 
+HELP: math
+{ $class-description "The class of Unicode math characters." } ;
+
 HELP: blank
 { $class-description "The class of whitespace characters." } ;
 
@@ -54,6 +57,8 @@ ARTICLE: "unicode.categories" "Character classes"
 { $subsection uncased }
 { $subsection uncased? }
 { $subsection character }
-{ $subsection character? } ;
+{ $subsection character? }
+{ $subsection math }
+{ $subsection math? } ;
 
 ABOUT: "unicode.categories"
diff --git a/basis/unicode/categories/categories.factor b/basis/unicode/categories/categories.factor
index 0464e31b12..126c03c869 100644
--- a/basis/unicode/categories/categories.factor
+++ b/basis/unicode/categories/categories.factor
@@ -1,15 +1,16 @@
 ! Copyright (C) 2008 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: unicode.syntax ;
+USING: unicode.categories.syntax sequences unicode.data ;
 IN: unicode.categories
 
-CATEGORY: blank Zs Zl Zp \r\n ;
-CATEGORY: letter Ll ;
-CATEGORY: LETTER Lu ;
-CATEGORY: Letter Lu Ll Lt Lm Lo ;
+CATEGORY: blank Zs Zl Zp | "\r\n" member? ;
+CATEGORY: letter Ll | "Other_Lowercase" property? ;
+CATEGORY: LETTER Lu | "Other_Uppercase" property? ;
+CATEGORY: Letter Lu Ll Lt Lm Lo Nl ;
 CATEGORY: digit Nd Nl No ;
 CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
-CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
+CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No | "Other_Alphabetic" property? ;
 CATEGORY: control Cc ;
 CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ; 
 CATEGORY-NOT: character Cn ;
+CATEGORY: math Sm | "Other_Math" property? ;
diff --git a/basis/unicode/syntax/authors.txt b/basis/unicode/categories/syntax/authors.txt
similarity index 100%
rename from basis/unicode/syntax/authors.txt
rename to basis/unicode/categories/syntax/authors.txt
diff --git a/basis/unicode/syntax/summary.txt b/basis/unicode/categories/syntax/summary.txt
similarity index 100%
rename from basis/unicode/syntax/summary.txt
rename to basis/unicode/categories/syntax/summary.txt
diff --git a/basis/unicode/categories/syntax/syntax-docs.factor b/basis/unicode/categories/syntax/syntax-docs.factor
new file mode 100644
index 0000000000..6293b92c72
--- /dev/null
+++ b/basis/unicode/categories/syntax/syntax-docs.factor
@@ -0,0 +1,19 @@
+! Copyright (C) 2008 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: help.syntax help.markup ;
+IN: unicode.categories.syntax
+
+ABOUT: "unicode.categories.syntax"
+
+ARTICLE: "unicode.categories.syntax" "Unicode category syntax"
+"There is special syntax sugar for making predicate classes which are unions of Unicode general categories, plus some other code."
+{ $subsection POSTPONE: CATEGORY: }
+{ $subsection POSTPONE: CATEGORY-NOT: } ;
+
+HELP: CATEGORY:
+{ $syntax "CATEGORY: foo Nl Pd Lu | \"Diacritic\" property? ;" }
+{ $description "This defines a predicate class which is a subset of code points. In this example, " { $snippet "foo" } " is the class of characters which are in the general category Nl or Pd or Lu, or which have the Diacritic property." } ;
+
+HELP: CATEGORY-NOT:
+{ $syntax "CATEGORY-NOT: foo Nl Pd Lu | \"Diacritic\" property? ;" }
+{ $description "This defines a predicate class which is a subset of code points, the complement of what " { $link POSTPONE: CATEGORY: } " would define. In this example, " { $snippet "foo" } " is the class of characters which are neither in the general category Nl or Pd or Lu, nor have the Diacritic property." } ;
diff --git a/basis/unicode/categories/syntax/syntax-tests.factor b/basis/unicode/categories/syntax/syntax-tests.factor
new file mode 100644
index 0000000000..1ec622fc98
--- /dev/null
+++ b/basis/unicode/categories/syntax/syntax-tests.factor
@@ -0,0 +1,3 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+
diff --git a/basis/unicode/categories/syntax/syntax.factor b/basis/unicode/categories/syntax/syntax.factor
new file mode 100644
index 0000000000..593bb0bbdd
--- /dev/null
+++ b/basis/unicode/categories/syntax/syntax.factor
@@ -0,0 +1,36 @@
+! Copyright (C) 2008, 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: unicode.data kernel math sequences parser
+bit-arrays namespaces sequences.private arrays classes.parser
+assocs classes.predicate sets fry splitting accessors ;
+IN: unicode.categories.syntax
+
+! For use in CATEGORY:
+SYMBOLS: Cn Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So Zs Zl Zp Cc Cf Cs Co | ;
+
+<PRIVATE
+
+: >category-array ( categories -- bitarray )
+    categories [ swap member? ] with map >bit-array ;
+
+: [category] ( categories code -- quot )
+    [ >category-array ] dip
+    '[ dup category# _ nth-unsafe [ drop t ] _ if ] ;
+
+: define-category ( word categories code -- )
+    [category] integer swap define-predicate-class ;
+
+: parse-category ( -- word tokens quot )
+    CREATE-CLASS \ ; parse-until { | } split1
+    [ [ name>> ] map ]
+    [ [ [ ] like ] [ [ drop f ] ] if* ] bi* ;
+
+PRIVATE>
+
+: CATEGORY:
+    parse-category define-category ; parsing
+
+: CATEGORY-NOT:
+    parse-category
+    [ categories swap diff ] dip
+    define-category ; parsing
diff --git a/basis/unicode/syntax/tags.txt b/basis/unicode/categories/syntax/tags.txt
similarity index 100%
rename from basis/unicode/syntax/tags.txt
rename to basis/unicode/categories/syntax/tags.txt
diff --git a/basis/unicode/collation/collation.factor b/basis/unicode/collation/collation.factor
index 0c51ea4352..b6eddccae0 100755
--- a/basis/unicode/collation/collation.factor
+++ b/basis/unicode/collation/collation.factor
@@ -4,7 +4,7 @@ USING: combinators.short-circuit sequences io.files
 io.encodings.ascii kernel values splitting accessors math.parser
 ascii io assocs strings math namespaces make sorting combinators
 math.order arrays unicode.normalize unicode.data locals
-unicode.syntax macros sequences.deep words unicode.breaks
+macros sequences.deep words unicode.breaks
 quotations combinators.short-circuit simple-flat-file ;
 IN: unicode.collation
 
diff --git a/basis/unicode/normalize/normalize.factor b/basis/unicode/normalize/normalize.factor
index 602d9555ea..aca96a5694 100644
--- a/basis/unicode/normalize/normalize.factor
+++ b/basis/unicode/normalize/normalize.factor
@@ -2,7 +2,7 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: ascii sequences namespaces make unicode.data kernel math arrays
 locals sorting.insertion accessors assocs math.order combinators
-unicode.syntax strings sbufs hints combinators.short-circuit vectors ;
+strings sbufs hints combinators.short-circuit vectors ;
 IN: unicode.normalize
 
 <PRIVATE
diff --git a/basis/unicode/syntax/syntax.factor b/basis/unicode/syntax/syntax.factor
deleted file mode 100644
index a42adb42ef..0000000000
--- a/basis/unicode/syntax/syntax.factor
+++ /dev/null
@@ -1,35 +0,0 @@
-! Copyright (C) 2008 Daniel Ehrenberg.
-! See http://factorcode.org/license.txt for BSD license.
-USING: unicode.data kernel math sequences parser lexer
-bit-arrays namespaces make sequences.private arrays quotations
-assocs classes.predicate math.order strings.parser sets ;
-IN: unicode.syntax
-
-<PRIVATE
-
-: >category-array ( categories -- bitarray )
-    categories [ swap member? ] with map >bit-array ;
-
-: as-string ( strings -- bit-array )
-    concat unescape-string ;
-
-: [category] ( categories -- quot )
-    [
-        [ [ categories member? not ] filter as-string ] keep 
-        [ categories member? ] filter >category-array
-        [ dup category# ] % , [ nth-unsafe [ drop t ] ] %
-        \ member? 2array >quotation ,
-        \ if ,
-    ] [ ] make ;
-
-: define-category ( word categories -- )
-    [category] integer swap define-predicate-class ;
-
-PRIVATE>
-
-: CATEGORY:
-    CREATE ";" parse-tokens define-category ; parsing
-
-: CATEGORY-NOT:
-    CREATE ";" parse-tokens
-    categories swap diff define-category ; parsing
diff --git a/basis/unicode/unicode-docs.factor b/basis/unicode/unicode-docs.factor
index 4ae326ac84..9450b49f0b 100644
--- a/basis/unicode/unicode-docs.factor
+++ b/basis/unicode/unicode-docs.factor
@@ -15,7 +15,7 @@ $nl
 { $vocab-subsection "Word and grapheme breaks" "unicode.breaks" }
 { $vocab-subsection "Unicode normalization" "unicode.normalize" }
 "The following are mostly for internal use:"
-{ $vocab-subsection "Unicode syntax" "unicode.syntax" }
+{ $vocab-subsection "Unicode category syntax" "unicode.categories.syntax" }
 { $vocab-subsection "Unicode data tables" "unicode.data" }
 { $see-also "ascii" "io.encodings" } ;
 
diff --git a/basis/xml/char-classes/char-classes.factor b/basis/xml/char-classes/char-classes.factor
index d510c8a881..153fca0bb7 100644
--- a/basis/xml/char-classes/char-classes.factor
+++ b/basis/xml/char-classes/char-classes.factor
@@ -1,19 +1,26 @@
 ! Copyright (C) 2005, 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
-USING: kernel sequences unicode.syntax math math.order combinators
-hints ;
+USING: kernel sequences unicode.categories.syntax math math.order
+combinators hints ;
 IN: xml.char-classes
 
-CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_: ;
-: 1.0name-start? ( char -- ? )
-    dup 1.0name-start*? [ drop t ] 
-    [ HEX: 2BB HEX: 2C1 between? ] if ;
+CATEGORY: 1.0name-start
+    Ll Lu Lo Lt Nl | {
+        [ HEX: 2BB HEX: 2C1 between? ]
+        [ "\u000559\u0006E5\u0006E6_:" member? ]
+    } 1|| ;
 
-CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387: ;
+CATEGORY: 1.0name-char
+    Ll Lu Lo Lt Nl Mc Me Mn Lm Nd |
+    "_-.\u000387:" member? ;
 
-CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _: ;
+CATEGORY: 1.1name-start
+    Ll Lu Lo Lm Ln Nl |
+    "_:" member? ;
 
-CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ;
+CATEGORY: 1.1name-char
+    Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf |
+    "_-.\u0000b7:" member? ;
 
 : name-start? ( 1.0? char -- ? )
     swap [ 1.0name-start? ] [ 1.1name-start? ] if ;

From c6fc88f28f8d846da3d47fbfadfbf385cba01751 Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <littledan@Macintosh-122.local>
Date: Sat, 21 Mar 2009 01:18:25 -0500
Subject: [PATCH 3/6] Unicode categories syntax cleanup

---
 basis/unicode/categories/syntax/syntax.factor | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/basis/unicode/categories/syntax/syntax.factor b/basis/unicode/categories/syntax/syntax.factor
index 593bb0bbdd..93f7919b6b 100644
--- a/basis/unicode/categories/syntax/syntax.factor
+++ b/basis/unicode/categories/syntax/syntax.factor
@@ -17,8 +17,14 @@ SYMBOLS: Cn Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So Zs
     [ >category-array ] dip
     '[ dup category# _ nth-unsafe [ drop t ] _ if ] ;
 
+: integer-predicate-class ( word predicate -- )
+    integer swap define-predicate-class ;
+
 : define-category ( word categories code -- )
-    [category] integer swap define-predicate-class ;
+    [category] integer-predicate-class ;
+
+: define-not-category ( word categories code -- )
+    [category] [ not ] compose integer-predicate-class ;
 
 : parse-category ( -- word tokens quot )
     CREATE-CLASS \ ; parse-until { | } split1
@@ -31,6 +37,4 @@ PRIVATE>
     parse-category define-category ; parsing
 
 : CATEGORY-NOT:
-    parse-category
-    [ categories swap diff ] dip
-    define-category ; parsing
+    parse-category define-not-category ; parsing

From d6f9947bff87e7c16d1e918056f70a0f8b1cc02b Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <littledan@Macintosh-122.local>
Date: Sat, 21 Mar 2009 02:53:36 -0500
Subject: [PATCH 4/6] Regexp character class intersection and difference syntax

---
 basis/regexp/classes/classes.factor     | 11 ++++++++++-
 basis/regexp/parser/parser-tests.factor |  2 +-
 basis/regexp/parser/parser.factor       | 18 +++++++++++++----
 basis/regexp/regexp-tests.factor        | 26 +++++++++++++++++++++++++
 4 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/basis/regexp/classes/classes.factor b/basis/regexp/classes/classes.factor
index e114dea260..a1c4e3ca2a 100644
--- a/basis/regexp/classes/classes.factor
+++ b/basis/regexp/classes/classes.factor
@@ -230,7 +230,10 @@ TUPLE: class-partition integers not-integers simples not-simples and or other ;
     dup or-class flatten partition-classes
     dup not-integers>> length {
         { 0 [ nip make-or-class ] }
-        { 1 [ not-integers>> first [ class>> '[ _ swap class-member? ] any? ] keep or ] }
+        { 1 [
+            not-integers>> first
+            [ class>> '[ _ swap class-member? ] any? ] keep or
+        ] }
         [ 3drop t ]
     } case ;
 
@@ -251,6 +254,12 @@ M: or-class <not-class>
 M: t <not-class> drop f ;
 M: f <not-class> drop t ;
 
+: <minus-class> ( a b -- a-b )
+    <not-class> 2array <and-class> ;
+
+: <sym-diff-class> ( a b -- a~b )
+    2array [ <or-class> ] [ <and-class> ] bi <minus-class> ;
+
 M: primitive-class class-member?
     class>> class-member? ;
 
diff --git a/basis/regexp/parser/parser-tests.factor b/basis/regexp/parser/parser-tests.factor
index d606015f61..5ea9753fba 100644
--- a/basis/regexp/parser/parser-tests.factor
+++ b/basis/regexp/parser/parser-tests.factor
@@ -11,7 +11,7 @@ IN: regexp.parser.tests
     "a|b" "a.b" "a|b|c" "abc|b" "a|bcd" "a|(b)" "(?-i:a)" "||"
     "(a)|b" "(a|b)" "((a)|(b))" "(?:a)" "(?i:a)" "|b" "b|"
     "[abc]" "[a-c]" "[^a-c]" "[^]]" "[]a]" "[[]" "[]-a]" "[a-]" "[-]"
-    "[--a]" "foo*" "(foo)*" "(a|b)|c" "(foo){2,3}" "(foo){2,}"
+    "foo*" "(foo)*" "(a|b)|c" "(foo){2,3}" "(foo){2,}"
     "(foo){2}" "{2,3}" "{," "{,}" "}" "foo}" "[^]-a]" "[^-]a]"
     "[a-]" "[^a-]" "[^a-]" "a{,2}" "(?#foobar)"
     "\\p{Space}" "\\t" "\\[" "[\\]]" "\\P{Space}"
diff --git a/basis/regexp/parser/parser.factor b/basis/regexp/parser/parser.factor
index e8de469a94..9fcadc4008 100644
--- a/basis/regexp/parser/parser.factor
+++ b/basis/regexp/parser/parser.factor
@@ -148,19 +148,29 @@ Character = EscapeSequence
           | "^" => [[ ^ <tagged-epsilon> ]]
           | . ?[ allowed-char? ]?
 
-AnyRangeCharacter = EscapeSequence | .
+AnyRangeCharacter = !("&&"|"||"|"--"|"~~") (EscapeSequence | .)
 
 RangeCharacter = !("]") AnyRangeCharacter
 
-Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
+Range = RangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b <range-class> ]]
       | RangeCharacter
 
-StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
+StartRange = AnyRangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b <range-class> ]]
            | AnyRangeCharacter
 
 Ranges = StartRange:s Range*:r => [[ r s prefix ]]
 
-CharClass = "^"?:n Ranges:e => [[ e n char-class ]]
+BasicCharClass =  "^"?:n Ranges:e => [[ e n char-class ]]
+
+CharClass = BasicCharClass:b "&&" CharClass:c
+                => [[ b c 2array <and-class> ]]
+          | BasicCharClass:b "||" CharClass:c
+                => [[ b c 2array <or-class> ]]
+          | BasicCharClass:b "~~" CharClass:c
+                => [[ b c <sym-diff-class> ]]
+          | BasicCharClass:b "--" CharClass:c
+                => [[ b c <minus-class> ]]
+          | BasicCharClass
 
 Options = [idmsux]*
 
diff --git a/basis/regexp/regexp-tests.factor b/basis/regexp/regexp-tests.factor
index 999caeaed6..2234386803 100644
--- a/basis/regexp/regexp-tests.factor
+++ b/basis/regexp/regexp-tests.factor
@@ -508,3 +508,29 @@ IN: regexp-tests
 [ t ] [ " " R/ \P{LL}/ matches? ] unit-test
 [ f ] [ "a" R/ \P{sCriPt = latin}/ matches? ] unit-test
 [ t ] [ " " R/ \P{SCRIPT = laTIn}/ matches? ] unit-test
+
+! Logical operators
+[ t ] [ "a" R/ [\p{script=latin}\p{lower}]/ matches? ] unit-test
+[ t ] [ "π" R/ [\p{script=latin}\p{lower}]/ matches? ] unit-test
+[ t ] [ "A" R/ [\p{script=latin}\p{lower}]/ matches? ] unit-test
+[ f ] [ "3" R/ [\p{script=latin}\p{lower}]/ matches? ] unit-test
+
+[ t ] [ "a" R/ [\p{script=latin}||\p{lower}]/ matches? ] unit-test
+[ t ] [ "π" R/ [\p{script=latin}||\p{lower}]/ matches? ] unit-test
+[ t ] [ "A" R/ [\p{script=latin}||\p{lower}]/ matches? ] unit-test
+[ f ] [ "3" R/ [\p{script=latin}||\p{lower}]/ matches? ] unit-test
+
+[ t ] [ "a" R/ [\p{script=latin}&&\p{lower}]/ matches? ] unit-test
+[ f ] [ "π" R/ [\p{script=latin}&&\p{lower}]/ matches? ] unit-test
+[ f ] [ "A" R/ [\p{script=latin}&&\p{lower}]/ matches? ] unit-test
+[ f ] [ "3" R/ [\p{script=latin}&&\p{lower}]/ matches? ] unit-test
+
+[ f ] [ "a" R/ [\p{script=latin}~~\p{lower}]/ matches? ] unit-test
+[ t ] [ "π" R/ [\p{script=latin}~~\p{lower}]/ matches? ] unit-test
+[ t ] [ "A" R/ [\p{script=latin}~~\p{lower}]/ matches? ] unit-test
+[ f ] [ "3" R/ [\p{script=latin}~~\p{lower}]/ matches? ] unit-test
+
+[ f ] [ "a" R/ [\p{script=latin}--\p{lower}]/ matches? ] unit-test
+[ f ] [ "π" R/ [\p{script=latin}--\p{lower}]/ matches? ] unit-test
+[ t ] [ "A" R/ [\p{script=latin}--\p{lower}]/ matches? ] unit-test
+[ f ] [ "3" R/ [\p{script=latin}--\p{lower}]/ matches? ] unit-test

From 4f83e2057ad090d357a61767a0e858cd1b65ca14 Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <littledan@Macintosh-122.local>
Date: Sat, 21 Mar 2009 03:17:20 -0500
Subject: [PATCH 5/6] Making unicode.categories.syntax use member?, simplifying
 the code

---
 basis/unicode/categories/syntax/syntax.factor | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/basis/unicode/categories/syntax/syntax.factor b/basis/unicode/categories/syntax/syntax.factor
index 93f7919b6b..87f143aadc 100644
--- a/basis/unicode/categories/syntax/syntax.factor
+++ b/basis/unicode/categories/syntax/syntax.factor
@@ -1,6 +1,6 @@
 ! Copyright (C) 2008, 2009 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: unicode.data kernel math sequences parser
+USING: unicode.data kernel math sequences parser unicode.data.private
 bit-arrays namespaces sequences.private arrays classes.parser
 assocs classes.predicate sets fry splitting accessors ;
 IN: unicode.categories.syntax
@@ -10,12 +10,8 @@ SYMBOLS: Cn Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So Zs
 
 <PRIVATE
 
-: >category-array ( categories -- bitarray )
-    categories [ swap member? ] with map >bit-array ;
-
 : [category] ( categories code -- quot )
-    [ >category-array ] dip
-    '[ dup category# _ nth-unsafe [ drop t ] _ if ] ;
+    '[ dup category# _ member? [ drop t ] _ if ] ;
 
 : integer-predicate-class ( word predicate -- )
     integer swap define-predicate-class ;
@@ -28,7 +24,7 @@ SYMBOLS: Cn Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So Zs
 
 : parse-category ( -- word tokens quot )
     CREATE-CLASS \ ; parse-until { | } split1
-    [ [ name>> ] map ]
+    [ [ name>> categories-map at ] map ]
     [ [ [ ] like ] [ [ drop f ] ] if* ] bi* ;
 
 PRIVATE>

From 62c2ee8d3b8d0a6aec1dcf19b64a70fdcdff6e9e Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <littledan@Macintosh-122.local>
Date: Sat, 21 Mar 2009 03:22:50 -0500
Subject: [PATCH 6/6] Documenting character class operations

---
 basis/regexp/regexp-docs.factor | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/basis/regexp/regexp-docs.factor b/basis/regexp/regexp-docs.factor
index 01a727d017..3a914f4283 100644
--- a/basis/regexp/regexp-docs.factor
+++ b/basis/regexp/regexp-docs.factor
@@ -76,6 +76,8 @@ ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
     { { $snippet "\\p{Nd}" } "Character in Unicode category Nd" } 
     { { $snippet "\\p{Z}" } "Character in Unicode category beginning with Z" } 
     { { $snippet "\\p{script=Cham}" } "Character in the Cham writing system" } }
+{ $heading "Character class operations" }
+"Character classes can be composed using four binary operations: " { $snippet "|| && ~~ --" } ". These do the operations union, intersection, symmetric difference and difference, respectively. For example, characters which are lower-case but not Latin script could be matched as " { $snippet "[\\p{lower}--\\p{script=latin}]" } ". These operations are right-associative, and " { $snippet "^" } " binds tighter than them. There is no syntax for grouping."
 { $heading "Boundaries" }
 "Special operators exist to match certain points in the string. These are called 'zero-width' because they do not consume any characters."
 { $table