Update unicode.categories.syntax
commit
3de9723d22
|
@ -230,7 +230,10 @@ TUPLE: class-partition integers not-integers simples not-simples and or other ;
|
||||||
dup or-class flatten partition-classes
|
dup or-class flatten partition-classes
|
||||||
dup not-integers>> length {
|
dup not-integers>> length {
|
||||||
{ 0 [ nip make-or-class ] }
|
{ 0 [ nip make-or-class ] }
|
||||||
{ 1 [ not-integers>> first [ class>> '[ _ swap class-member? ] any? ] keep or ] }
|
{ 1 [
|
||||||
|
not-integers>> first
|
||||||
|
[ class>> '[ _ swap class-member? ] any? ] keep or
|
||||||
|
] }
|
||||||
[ 3drop t ]
|
[ 3drop t ]
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
|
@ -251,6 +254,12 @@ M: or-class <not-class>
|
||||||
M: t <not-class> drop f ;
|
M: t <not-class> drop f ;
|
||||||
M: f <not-class> drop t ;
|
M: f <not-class> drop t ;
|
||||||
|
|
||||||
|
: <minus-class> ( a b -- a-b )
|
||||||
|
<not-class> 2array <and-class> ;
|
||||||
|
|
||||||
|
: <sym-diff-class> ( a b -- a~b )
|
||||||
|
2array [ <or-class> ] [ <and-class> ] bi <minus-class> ;
|
||||||
|
|
||||||
M: primitive-class class-member?
|
M: primitive-class class-member?
|
||||||
class>> class-member? ;
|
class>> class-member? ;
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ IN: regexp.parser.tests
|
||||||
"a|b" "a.b" "a|b|c" "abc|b" "a|bcd" "a|(b)" "(?-i:a)" "||"
|
"a|b" "a.b" "a|b|c" "abc|b" "a|bcd" "a|(b)" "(?-i:a)" "||"
|
||||||
"(a)|b" "(a|b)" "((a)|(b))" "(?:a)" "(?i:a)" "|b" "b|"
|
"(a)|b" "(a|b)" "((a)|(b))" "(?:a)" "(?i:a)" "|b" "b|"
|
||||||
"[abc]" "[a-c]" "[^a-c]" "[^]]" "[]a]" "[[]" "[]-a]" "[a-]" "[-]"
|
"[abc]" "[a-c]" "[^a-c]" "[^]]" "[]a]" "[[]" "[]-a]" "[a-]" "[-]"
|
||||||
"[--a]" "foo*" "(foo)*" "(a|b)|c" "(foo){2,3}" "(foo){2,}"
|
"foo*" "(foo)*" "(a|b)|c" "(foo){2,3}" "(foo){2,}"
|
||||||
"(foo){2}" "{2,3}" "{," "{,}" "}" "foo}" "[^]-a]" "[^-]a]"
|
"(foo){2}" "{2,3}" "{," "{,}" "}" "foo}" "[^]-a]" "[^-]a]"
|
||||||
"[a-]" "[^a-]" "[^a-]" "a{,2}" "(?#foobar)"
|
"[a-]" "[^a-]" "[^a-]" "a{,2}" "(?#foobar)"
|
||||||
"\\p{Space}" "\\t" "\\[" "[\\]]" "\\P{Space}"
|
"\\p{Space}" "\\t" "\\[" "[\\]]" "\\P{Space}"
|
||||||
|
|
|
@ -148,19 +148,29 @@ Character = EscapeSequence
|
||||||
| "^" => [[ ^ <tagged-epsilon> ]]
|
| "^" => [[ ^ <tagged-epsilon> ]]
|
||||||
| . ?[ allowed-char? ]?
|
| . ?[ allowed-char? ]?
|
||||||
|
|
||||||
AnyRangeCharacter = EscapeSequence | .
|
AnyRangeCharacter = !("&&"|"||"|"--"|"~~") (EscapeSequence | .)
|
||||||
|
|
||||||
RangeCharacter = !("]") AnyRangeCharacter
|
RangeCharacter = !("]") AnyRangeCharacter
|
||||||
|
|
||||||
Range = RangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
|
Range = RangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b <range-class> ]]
|
||||||
| RangeCharacter
|
| RangeCharacter
|
||||||
|
|
||||||
StartRange = AnyRangeCharacter:a "-" RangeCharacter:b => [[ a b <range-class> ]]
|
StartRange = AnyRangeCharacter:a "-" !("-") RangeCharacter:b => [[ a b <range-class> ]]
|
||||||
| AnyRangeCharacter
|
| AnyRangeCharacter
|
||||||
|
|
||||||
Ranges = StartRange:s Range*:r => [[ r s prefix ]]
|
Ranges = StartRange:s Range*:r => [[ r s prefix ]]
|
||||||
|
|
||||||
CharClass = "^"?:n Ranges:e => [[ e n char-class ]]
|
BasicCharClass = "^"?:n Ranges:e => [[ e n char-class ]]
|
||||||
|
|
||||||
|
CharClass = BasicCharClass:b "&&" CharClass:c
|
||||||
|
=> [[ b c 2array <and-class> ]]
|
||||||
|
| BasicCharClass:b "||" CharClass:c
|
||||||
|
=> [[ b c 2array <or-class> ]]
|
||||||
|
| BasicCharClass:b "~~" CharClass:c
|
||||||
|
=> [[ b c <sym-diff-class> ]]
|
||||||
|
| BasicCharClass:b "--" CharClass:c
|
||||||
|
=> [[ b c <minus-class> ]]
|
||||||
|
| BasicCharClass
|
||||||
|
|
||||||
Options = [idmsux]*
|
Options = [idmsux]*
|
||||||
|
|
||||||
|
|
|
@ -45,7 +45,7 @@ ARTICLE: { "regexp" "construction" } "Constructing regular expressions"
|
||||||
ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
|
ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
|
||||||
"Regexp syntax is largely compatible with Perl, Java and extended POSIX regexps, but not completely. Below, the syntax is documented."
|
"Regexp syntax is largely compatible with Perl, Java and extended POSIX regexps, but not completely. Below, the syntax is documented."
|
||||||
{ $heading "Characters" }
|
{ $heading "Characters" }
|
||||||
"At its core, regular expressions consist of character literals. For example, " { $snippet "R/ f/" } " is a regular expression matching just the string 'f'. In addition, the normal escape codes are provided, like " { $snippet "\\t" } " for the tab character and " { $snippet "\\uxxxxxx" } "for an arbitrary Unicode code point, by its hex value. In addition, any character can be preceded by a backslash to escape it, unless this has special meaning. For example, to match a literal opening parenthesis, use " { $snippet "\\(" } "."
|
"At its core, regular expressions consist of character literals. For example, " { $snippet "R/ f/" } " is a regular expression matching just the string 'f'. In addition, the normal escape codes are provided, like " { $snippet "\\t" } " for the tab character and " { $snippet "\\uxxxxxx" } " for an arbitrary Unicode code point, by its hex value. In addition, any character can be preceded by a backslash to escape it, unless this has special meaning. For example, to match a literal opening parenthesis, use " { $snippet "\\(" } "."
|
||||||
{ $heading "Concatenation, alternation and grouping" }
|
{ $heading "Concatenation, alternation and grouping" }
|
||||||
"Regular expressions can be built out of multiple characters by concatenation. For example, " { $snippet "R/ ab/" } " matches a followed by b. The " { $snippet "|" } " (alternation) operator can construct a regexp which matches one of two alternatives. Parentheses can be used for gropuing. So " { $snippet "R/ f(oo|ar)/" } " would match either 'foo' or 'far'."
|
"Regular expressions can be built out of multiple characters by concatenation. For example, " { $snippet "R/ ab/" } " matches a followed by b. The " { $snippet "|" } " (alternation) operator can construct a regexp which matches one of two alternatives. Parentheses can be used for grouping. So " { $snippet "R/ f(oo|ar)/" } " would match either 'foo' or 'far'."
|
||||||
{ $heading "Character classes" }
|
{ $heading "Character classes" }
|
||||||
|
@ -72,10 +72,12 @@ ARTICLE: { "regexp" "syntax" } "Regular expression syntax"
|
||||||
{ { $snippet "\\p{blank}" } "Non-newline whitespace" }
|
{ { $snippet "\\p{blank}" } "Non-newline whitespace" }
|
||||||
{ { $snippet "\\p{cntrl}" } "Control character" }
|
{ { $snippet "\\p{cntrl}" } "Control character" }
|
||||||
{ { $snippet "\\p{space}" } "Whitespace" }
|
{ { $snippet "\\p{space}" } "Whitespace" }
|
||||||
{ { $snippet "\\p{xdigit}" } "Hexidecimal digit" }
|
{ { $snippet "\\p{xdigit}" } "Hexadecimal digit" }
|
||||||
{ { $snippet "\\p{Nd}" } "Character in Unicode category Nd" }
|
{ { $snippet "\\p{Nd}" } "Character in Unicode category Nd" }
|
||||||
{ { $snippet "\\p{Z}" } "Character in Unicode category beginning with Z" }
|
{ { $snippet "\\p{Z}" } "Character in Unicode category beginning with Z" }
|
||||||
{ { $snippet "\\p{script=Cham}" } "Character in the Cham writing system" } }
|
{ { $snippet "\\p{script=Cham}" } "Character in the Cham writing system" } }
|
||||||
|
{ $heading "Character class operations" }
|
||||||
|
"Character classes can be composed using four binary operations: " { $snippet "|| && ~~ --" } ". These do the operations union, intersection, symmetric difference and difference, respectively. For example, characters which are lower-case but not Latin script could be matched as " { $snippet "[\\p{lower}--\\p{script=latin}]" } ". These operations are right-associative, and " { $snippet "^" } " binds tighter than them. There is no syntax for grouping."
|
||||||
{ $heading "Boundaries" }
|
{ $heading "Boundaries" }
|
||||||
"Special operators exist to match certain points in the string. These are called 'zero-width' because they do not consume any characters."
|
"Special operators exist to match certain points in the string. These are called 'zero-width' because they do not consume any characters."
|
||||||
{ $table
|
{ $table
|
||||||
|
|
|
@ -508,3 +508,29 @@ IN: regexp-tests
|
||||||
[ t ] [ " " R/ \P{LL}/ matches? ] unit-test
|
[ t ] [ " " R/ \P{LL}/ matches? ] unit-test
|
||||||
[ f ] [ "a" R/ \P{sCriPt = latin}/ matches? ] unit-test
|
[ f ] [ "a" R/ \P{sCriPt = latin}/ matches? ] unit-test
|
||||||
[ t ] [ " " R/ \P{SCRIPT = laTIn}/ matches? ] unit-test
|
[ t ] [ " " R/ \P{SCRIPT = laTIn}/ matches? ] unit-test
|
||||||
|
|
||||||
|
! Logical operators
|
||||||
|
[ t ] [ "a" R/ [\p{script=latin}\p{lower}]/ matches? ] unit-test
|
||||||
|
[ t ] [ "π" R/ [\p{script=latin}\p{lower}]/ matches? ] unit-test
|
||||||
|
[ t ] [ "A" R/ [\p{script=latin}\p{lower}]/ matches? ] unit-test
|
||||||
|
[ f ] [ "3" R/ [\p{script=latin}\p{lower}]/ matches? ] unit-test
|
||||||
|
|
||||||
|
[ t ] [ "a" R/ [\p{script=latin}||\p{lower}]/ matches? ] unit-test
|
||||||
|
[ t ] [ "π" R/ [\p{script=latin}||\p{lower}]/ matches? ] unit-test
|
||||||
|
[ t ] [ "A" R/ [\p{script=latin}||\p{lower}]/ matches? ] unit-test
|
||||||
|
[ f ] [ "3" R/ [\p{script=latin}||\p{lower}]/ matches? ] unit-test
|
||||||
|
|
||||||
|
[ t ] [ "a" R/ [\p{script=latin}&&\p{lower}]/ matches? ] unit-test
|
||||||
|
[ f ] [ "π" R/ [\p{script=latin}&&\p{lower}]/ matches? ] unit-test
|
||||||
|
[ f ] [ "A" R/ [\p{script=latin}&&\p{lower}]/ matches? ] unit-test
|
||||||
|
[ f ] [ "3" R/ [\p{script=latin}&&\p{lower}]/ matches? ] unit-test
|
||||||
|
|
||||||
|
[ f ] [ "a" R/ [\p{script=latin}~~\p{lower}]/ matches? ] unit-test
|
||||||
|
[ t ] [ "π" R/ [\p{script=latin}~~\p{lower}]/ matches? ] unit-test
|
||||||
|
[ t ] [ "A" R/ [\p{script=latin}~~\p{lower}]/ matches? ] unit-test
|
||||||
|
[ f ] [ "3" R/ [\p{script=latin}~~\p{lower}]/ matches? ] unit-test
|
||||||
|
|
||||||
|
[ f ] [ "a" R/ [\p{script=latin}--\p{lower}]/ matches? ] unit-test
|
||||||
|
[ f ] [ "π" R/ [\p{script=latin}--\p{lower}]/ matches? ] unit-test
|
||||||
|
[ t ] [ "A" R/ [\p{script=latin}--\p{lower}]/ matches? ] unit-test
|
||||||
|
[ f ] [ "3" R/ [\p{script=latin}--\p{lower}]/ matches? ] unit-test
|
||||||
|
|
|
@ -1,8 +1,24 @@
|
||||||
USING: help.syntax help.markup strings ;
|
! Copyright (C) 2009 Daniel Ehrenberg
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: help.syntax help.markup strings biassocs arrays ;
|
||||||
IN: simple-flat-file
|
IN: simple-flat-file
|
||||||
|
|
||||||
ABOUT: "simple-flat-file"
|
ABOUT: "simple-flat-file"
|
||||||
|
|
||||||
ARTICLE: "simple-flat-file" "Parsing simple flat files"
|
ARTICLE: "simple-flat-file" "Parsing simple flat files"
|
||||||
"The " { $vocab-link "simple-flat-file" } " vocabulary provides words for loading and parsing simple flat files in a particular format which is common for encoding tasks."
|
"The " { $vocab-link "simple-flat-file" } " vocabulary provides words for loading and parsing simple flat files in a particular format which is common for encoding and Unicode tasks."
|
||||||
{ $subsection flat-file>biassoc } ;
|
{ $subsection flat-file>biassoc }
|
||||||
|
{ $subsection load-interval-file }
|
||||||
|
{ $subsection data } ;
|
||||||
|
|
||||||
|
HELP: load-interval-file
|
||||||
|
{ $values { "filename" string } { "table" "an interval map" } }
|
||||||
|
{ $description "This loads a file that looks like Script.txt in the Unicode Character Database and converts it into an efficient interval map, where the keys are characters and the values are strings for the properties." } ;
|
||||||
|
|
||||||
|
HELP: data
|
||||||
|
{ $values { "filename" string } { "data" array } }
|
||||||
|
{ $description "This loads a file that's delimited by semicolons and lines, returning an array of lines, where each line is an array split by the semicolons, with whitespace trimmed off." } ;
|
||||||
|
|
||||||
|
HELP: flat-file>biassoc
|
||||||
|
{ $values { "filename" string } { "biassoc" biassoc } }
|
||||||
|
{ $description "This loads a flat file, in the form that many encoding resource files are in, with two columns of numeric data in hex, and returns a biassoc associating them." } ;
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
! Copyright (C) 2009 Daniel Ehrenberg
|
! Copyright (C) 2009 Daniel Ehrenberg
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: sequences splitting kernel math.parser io.files io.encodings.utf8
|
USING: sequences splitting kernel math.parser io.files io.encodings.utf8
|
||||||
biassocs ascii ;
|
biassocs ascii namespaces arrays make assocs interval-maps sets ;
|
||||||
IN: simple-flat-file
|
IN: simple-flat-file
|
||||||
|
|
||||||
: drop-comments ( seq -- newseq )
|
: drop-comments ( seq -- newseq )
|
||||||
|
@ -30,3 +30,25 @@ IN: simple-flat-file
|
||||||
|
|
||||||
: data ( filename -- data )
|
: data ( filename -- data )
|
||||||
utf8 file-lines drop-comments [ split-; ] map ;
|
utf8 file-lines drop-comments [ split-; ] map ;
|
||||||
|
|
||||||
|
SYMBOL: interned
|
||||||
|
|
||||||
|
: range, ( value key -- )
|
||||||
|
swap interned get
|
||||||
|
[ = ] with find nip 2array , ;
|
||||||
|
|
||||||
|
: expand-ranges ( assoc -- interval-map )
|
||||||
|
[
|
||||||
|
[
|
||||||
|
swap CHAR: . over member? [
|
||||||
|
".." split1 [ hex> ] bi@ 2array
|
||||||
|
] [ hex> ] if range,
|
||||||
|
] assoc-each
|
||||||
|
] { } make <interval-map> ;
|
||||||
|
|
||||||
|
: process-interval-file ( ranges -- table )
|
||||||
|
dup values prune interned
|
||||||
|
[ expand-ranges ] with-variable ;
|
||||||
|
|
||||||
|
: load-interval-file ( filename -- table )
|
||||||
|
data process-interval-file ;
|
||||||
|
|
|
@ -4,8 +4,9 @@ USING: combinators.short-circuit unicode.categories kernel math
|
||||||
combinators splitting sequences math.parser io.files io assocs
|
combinators splitting sequences math.parser io.files io assocs
|
||||||
arrays namespaces make math.ranges unicode.normalize
|
arrays namespaces make math.ranges unicode.normalize
|
||||||
unicode.normalize.private values io.encodings.ascii
|
unicode.normalize.private values io.encodings.ascii
|
||||||
unicode.syntax unicode.data compiler.units fry
|
unicode.data compiler.units fry unicode.categories.syntax
|
||||||
alien.syntax sets accessors interval-maps memoize locals words ;
|
alien.syntax sets accessors interval-maps memoize locals words
|
||||||
|
simple-flat-file ;
|
||||||
IN: unicode.breaks
|
IN: unicode.breaks
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
@ -31,9 +32,9 @@ CATEGORY: grapheme-control Zl Zp Cc Cf ;
|
||||||
[ drop Control ]
|
[ drop Control ]
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
CATEGORY: (extend) Me Mn ;
|
CATEGORY: extend
|
||||||
: extend? ( ch -- ? )
|
Me Mn |
|
||||||
{ [ (extend)? ] [ "Other_Grapheme_Extend" property? ] } 1|| ;
|
"Other_Grapheme_Extend" property? ;
|
||||||
|
|
||||||
: loe? ( ch -- ? )
|
: loe? ( ch -- ? )
|
||||||
"Logical_Order_Exception" property? ;
|
"Logical_Order_Exception" property? ;
|
||||||
|
@ -127,7 +128,7 @@ to: grapheme-table
|
||||||
|
|
||||||
VALUE: word-break-table
|
VALUE: word-break-table
|
||||||
|
|
||||||
"vocab:unicode/data/WordBreakProperty.txt" load-key-value
|
"vocab:unicode/data/WordBreakProperty.txt" load-interval-file
|
||||||
to: word-break-table
|
to: word-break-table
|
||||||
|
|
||||||
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
|
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
! Copyright (C) 2008, 2009 Daniel Ehrenberg.
|
! Copyright (C) 2008, 2009 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: unicode.data sequences namespaces
|
USING: unicode.data sequences namespaces
|
||||||
sbufs make unicode.syntax unicode.normalize math hints
|
sbufs make unicode.normalize math hints
|
||||||
unicode.categories combinators unicode.syntax assocs combinators.short-circuit
|
unicode.categories combinators assocs combinators.short-circuit
|
||||||
strings splitting kernel accessors unicode.breaks fry locals ;
|
strings splitting kernel accessors unicode.breaks fry locals ;
|
||||||
QUALIFIED: ascii
|
QUALIFIED: ascii
|
||||||
IN: unicode.case
|
IN: unicode.case
|
||||||
|
|
|
@ -12,6 +12,9 @@ HELP: Letter
|
||||||
HELP: alpha
|
HELP: alpha
|
||||||
{ $class-description "The class of alphanumeric characters." } ;
|
{ $class-description "The class of alphanumeric characters." } ;
|
||||||
|
|
||||||
|
HELP: math
|
||||||
|
{ $class-description "The class of Unicode math characters." } ;
|
||||||
|
|
||||||
HELP: blank
|
HELP: blank
|
||||||
{ $class-description "The class of whitespace characters." } ;
|
{ $class-description "The class of whitespace characters." } ;
|
||||||
|
|
||||||
|
@ -54,6 +57,8 @@ ARTICLE: "unicode.categories" "Character classes"
|
||||||
{ $subsection uncased }
|
{ $subsection uncased }
|
||||||
{ $subsection uncased? }
|
{ $subsection uncased? }
|
||||||
{ $subsection character }
|
{ $subsection character }
|
||||||
{ $subsection character? } ;
|
{ $subsection character? }
|
||||||
|
{ $subsection math }
|
||||||
|
{ $subsection math? } ;
|
||||||
|
|
||||||
ABOUT: "unicode.categories"
|
ABOUT: "unicode.categories"
|
||||||
|
|
|
@ -1,15 +1,16 @@
|
||||||
! Copyright (C) 2008 Daniel Ehrenberg.
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: unicode.syntax ;
|
USING: unicode.categories.syntax sequences unicode.data ;
|
||||||
IN: unicode.categories
|
IN: unicode.categories
|
||||||
|
|
||||||
CATEGORY: blank Zs Zl Zp \r\n ;
|
CATEGORY: blank Zs Zl Zp | "\r\n" member? ;
|
||||||
CATEGORY: letter Ll ;
|
CATEGORY: letter Ll | "Other_Lowercase" property? ;
|
||||||
CATEGORY: LETTER Lu ;
|
CATEGORY: LETTER Lu | "Other_Uppercase" property? ;
|
||||||
CATEGORY: Letter Lu Ll Lt Lm Lo ;
|
CATEGORY: Letter Lu Ll Lt Lm Lo Nl ;
|
||||||
CATEGORY: digit Nd Nl No ;
|
CATEGORY: digit Nd Nl No ;
|
||||||
CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
|
CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
|
||||||
CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No ;
|
CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No | "Other_Alphabetic" property? ;
|
||||||
CATEGORY: control Cc ;
|
CATEGORY: control Cc ;
|
||||||
CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
|
CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
|
||||||
CATEGORY-NOT: character Cn ;
|
CATEGORY-NOT: character Cn ;
|
||||||
|
CATEGORY: math Sm | "Other_Math" property? ;
|
||||||
|
|
|
@ -0,0 +1,19 @@
|
||||||
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: help.syntax help.markup ;
|
||||||
|
IN: unicode.categories.syntax
|
||||||
|
|
||||||
|
ABOUT: "unicode.categories.syntax"
|
||||||
|
|
||||||
|
ARTICLE: "unicode.categories.syntax" "Unicode category syntax"
|
||||||
|
"There is special syntax sugar for making predicate classes which are unions of Unicode general categories, plus some other code."
|
||||||
|
{ $subsection POSTPONE: CATEGORY: }
|
||||||
|
{ $subsection POSTPONE: CATEGORY-NOT: } ;
|
||||||
|
|
||||||
|
HELP: CATEGORY:
|
||||||
|
{ $syntax "CATEGORY: foo Nl Pd Lu | \"Diacritic\" property? ;" }
|
||||||
|
{ $description "This defines a predicate class which is a subset of code points. In this example, " { $snippet "foo" } " is the class of characters which are in the general category Nl or Pd or Lu, or which have the Diacritic property." } ;
|
||||||
|
|
||||||
|
HELP: CATEGORY-NOT:
|
||||||
|
{ $syntax "CATEGORY-NOT: foo Nl Pd Lu | \"Diacritic\" property? ;" }
|
||||||
|
{ $description "This defines a predicate class which is a subset of code points, the complement of what " { $link POSTPONE: CATEGORY: } " would define. In this example, " { $snippet "foo" } " is the class of characters which are neither in the general category Nl or Pd or Lu, nor have the Diacritic property." } ;
|
|
@ -0,0 +1,3 @@
|
||||||
|
! Copyright (C) 2009 Daniel Ehrenberg.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
|
|
@ -0,0 +1,34 @@
|
||||||
|
! Copyright (C) 2008, 2009 Daniel Ehrenberg.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: unicode.data kernel math sequences parser unicode.data.private
|
||||||
|
bit-arrays namespaces sequences.private arrays classes.parser
|
||||||
|
assocs classes.predicate sets fry splitting accessors ;
|
||||||
|
IN: unicode.categories.syntax
|
||||||
|
|
||||||
|
! For use in CATEGORY:
|
||||||
|
SYMBOLS: Cn Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So Zs Zl Zp Cc Cf Cs Co | ;
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
|
: [category] ( categories code -- quot )
|
||||||
|
'[ dup category# _ member? [ drop t ] _ if ] ;
|
||||||
|
|
||||||
|
: integer-predicate-class ( word predicate -- )
|
||||||
|
integer swap define-predicate-class ;
|
||||||
|
|
||||||
|
: define-category ( word categories code -- )
|
||||||
|
[category] integer-predicate-class ;
|
||||||
|
|
||||||
|
: define-not-category ( word categories code -- )
|
||||||
|
[category] [ not ] compose integer-predicate-class ;
|
||||||
|
|
||||||
|
: parse-category ( -- word tokens quot )
|
||||||
|
CREATE-CLASS \ ; parse-until { | } split1
|
||||||
|
[ [ name>> categories-map at ] map ]
|
||||||
|
[ [ [ ] like ] [ [ drop f ] ] if* ] bi* ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
|
SYNTAX: CATEGORY: parse-category define-category ;
|
||||||
|
|
||||||
|
SYNTAX: CATEGORY-NOT: parse-category define-not-category ;
|
|
@ -4,7 +4,7 @@ USING: combinators.short-circuit sequences io.files
|
||||||
io.encodings.ascii kernel values splitting accessors math.parser
|
io.encodings.ascii kernel values splitting accessors math.parser
|
||||||
ascii io assocs strings math namespaces make sorting combinators
|
ascii io assocs strings math namespaces make sorting combinators
|
||||||
math.order arrays unicode.normalize unicode.data locals
|
math.order arrays unicode.normalize unicode.data locals
|
||||||
unicode.syntax macros sequences.deep words unicode.breaks
|
macros sequences.deep words unicode.breaks
|
||||||
quotations combinators.short-circuit simple-flat-file ;
|
quotations combinators.short-circuit simple-flat-file ;
|
||||||
IN: unicode.collation
|
IN: unicode.collation
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ IN: unicode.data
|
||||||
ABOUT: "unicode.data"
|
ABOUT: "unicode.data"
|
||||||
|
|
||||||
ARTICLE: "unicode.data" "Unicode data tables"
|
ARTICLE: "unicode.data" "Unicode data tables"
|
||||||
"The " { $vocab-link "unicode.data" "unicode.data" } " vocabulary contains core Unicode data tables and code for parsing this from files."
|
"The " { $vocab-link "unicode.data" "unicode.data" } " vocabulary contains core Unicode data tables and code for parsing this from files. The following words access these data tables."
|
||||||
{ $subsection canonical-entry }
|
{ $subsection canonical-entry }
|
||||||
{ $subsection combine-chars }
|
{ $subsection combine-chars }
|
||||||
{ $subsection combining-class }
|
{ $subsection combining-class }
|
||||||
|
@ -14,7 +14,11 @@ ARTICLE: "unicode.data" "Unicode data tables"
|
||||||
{ $subsection name>char }
|
{ $subsection name>char }
|
||||||
{ $subsection char>name }
|
{ $subsection char>name }
|
||||||
{ $subsection property? }
|
{ $subsection property? }
|
||||||
{ $subsection load-key-value } ;
|
{ $subsection category }
|
||||||
|
{ $subsection ch>upper }
|
||||||
|
{ $subsection ch>lower }
|
||||||
|
{ $subsection ch>title }
|
||||||
|
{ $subsection special-case } ;
|
||||||
|
|
||||||
HELP: canonical-entry
|
HELP: canonical-entry
|
||||||
{ $values { "char" "a code point" } { "seq" string } }
|
{ $values { "char" "a code point" } { "seq" string } }
|
||||||
|
@ -48,6 +52,22 @@ HELP: property?
|
||||||
{ $values { "char" "a code point" } { "property" string } { "?" "a boolean" } }
|
{ $values { "char" "a code point" } { "property" string } { "?" "a boolean" } }
|
||||||
{ $description "Tests whether the code point is listed under the given property in PropList.txt in the Unicode Character Database." } ;
|
{ $description "Tests whether the code point is listed under the given property in PropList.txt in the Unicode Character Database." } ;
|
||||||
|
|
||||||
HELP: load-key-value
|
HELP: category
|
||||||
{ $values { "filename" string } { "table" "an interval map" } }
|
{ $values { "char" "a code point" } { "category" string } }
|
||||||
{ $description "This loads a file that looks like Script.txt in the Unicode Character Database and converts it into an efficient interval map, where the keys are characters and the values are strings for the properties." } ;
|
{ $description "Returns the general category of a code point, in the form of a string. This will always be a string within the ASCII range of length two. If the code point is unassigned, then it returns " { $snippet "Cn" } "." } ;
|
||||||
|
|
||||||
|
HELP: ch>upper
|
||||||
|
{ $values { "ch" "a code point" } { "upper" "a code point" } }
|
||||||
|
{ $description "Returns the simple upper-cased version of the code point, if it exists. This does not handle context-sensitive or locale-dependent properties of linguistically accurate case conversion, and does not correctly handle characters which become multiple characters on conversion to this case." } ;
|
||||||
|
|
||||||
|
HELP: ch>lower
|
||||||
|
{ $values { "ch" "a code point" } { "lower" "a code point" } }
|
||||||
|
{ $description "Returns the simple lower-cased version of the code point, if it exists. This does not handle context-sensitive or locale-dependent properties of linguistically accurate case conversion, and does not correctly handle characters which become multiple characters on conversion to this case." } ;
|
||||||
|
|
||||||
|
HELP: ch>title
|
||||||
|
{ $values { "ch" "a code point" } { "title" "a code point" } }
|
||||||
|
{ $description "Returns the simple title-cased version of the code point, if it exists. This does not handle context-sensitive or locale-dependent properties of linguistically accurate case conversion, and does not correctly handle characters which become multiple characters on conversion to this case." } ;
|
||||||
|
|
||||||
|
HELP: special-case
|
||||||
|
{ $values { "ch" "a code point" } { "casing-tuple" { "a tuple, or " { $link f } } } }
|
||||||
|
{ $description "If a code point has special casing behavior, returns a tuple which represents that information." } ;
|
||||||
|
|
|
@ -58,7 +58,7 @@ CONSTANT: num-chars HEX: 2FA1E
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
: category# ( char -- category )
|
: category# ( char -- n )
|
||||||
! There are a few characters that should be Cn
|
! There are a few characters that should be Cn
|
||||||
! that this gives Cf or Mn
|
! that this gives Cf or Mn
|
||||||
! Cf = 26; Mn = 5; Cn = 29
|
! Cf = 26; Mn = 5; Cn = 29
|
||||||
|
@ -219,27 +219,3 @@ load-properties to: properties
|
||||||
|
|
||||||
[ name>char [ "Invalid character" throw ] unless* ]
|
[ name>char [ "Invalid character" throw ] unless* ]
|
||||||
name>char-hook set-global
|
name>char-hook set-global
|
||||||
|
|
||||||
SYMBOL: interned
|
|
||||||
|
|
||||||
: range, ( value key -- )
|
|
||||||
swap interned get
|
|
||||||
[ = ] with find nip 2array , ;
|
|
||||||
|
|
||||||
: expand-ranges ( assoc -- interval-map )
|
|
||||||
[
|
|
||||||
[
|
|
||||||
swap CHAR: . over member? [
|
|
||||||
".." split1 [ hex> ] bi@ 2array
|
|
||||||
] [ hex> ] if range,
|
|
||||||
] assoc-each
|
|
||||||
] { } make <interval-map> ;
|
|
||||||
|
|
||||||
: process-key-value ( ranges -- table )
|
|
||||||
dup values prune interned
|
|
||||||
[ expand-ranges ] with-variable ;
|
|
||||||
|
|
||||||
PRIVATE>
|
|
||||||
|
|
||||||
: load-key-value ( filename -- table )
|
|
||||||
data process-key-value ;
|
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: ascii sequences namespaces make unicode.data kernel math arrays
|
USING: ascii sequences namespaces make unicode.data kernel math arrays
|
||||||
locals sorting.insertion accessors assocs math.order combinators
|
locals sorting.insertion accessors assocs math.order combinators
|
||||||
unicode.syntax strings sbufs hints combinators.short-circuit vectors ;
|
strings sbufs hints combinators.short-circuit vectors ;
|
||||||
IN: unicode.normalize
|
IN: unicode.normalize
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
|
@ -1,17 +1,13 @@
|
||||||
! Copyright (C) 2008 Daniel Ehrenberg.
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors values kernel sequences assocs io.files
|
USING: values interval-maps simple-flat-file ;
|
||||||
io.encodings ascii math.ranges io splitting math.parser
|
|
||||||
namespaces make byte-arrays locals math sets io.encodings.ascii
|
|
||||||
words words.symbol compiler.units arrays interval-maps
|
|
||||||
unicode.data ;
|
|
||||||
IN: unicode.script
|
IN: unicode.script
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
||||||
VALUE: script-table
|
VALUE: script-table
|
||||||
|
|
||||||
"vocab:unicode/script/Scripts.txt" load-key-value
|
"vocab:unicode/script/Scripts.txt" load-interval-file
|
||||||
to: script-table
|
to: script-table
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
|
@ -1,38 +0,0 @@
|
||||||
! Copyright (C) 2008 Daniel Ehrenberg.
|
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
|
||||||
USING: unicode.data kernel math sequences parser lexer
|
|
||||||
bit-arrays namespaces make sequences.private arrays quotations
|
|
||||||
assocs classes.predicate math.order strings.parser ;
|
|
||||||
IN: unicode.syntax
|
|
||||||
|
|
||||||
<PRIVATE
|
|
||||||
|
|
||||||
: >category-array ( categories -- bitarray )
|
|
||||||
categories [ swap member? ] with map >bit-array ;
|
|
||||||
|
|
||||||
: as-string ( strings -- bit-array )
|
|
||||||
concat unescape-string ;
|
|
||||||
|
|
||||||
: [category] ( categories -- quot )
|
|
||||||
[
|
|
||||||
[ [ categories member? not ] filter as-string ] keep
|
|
||||||
[ categories member? ] filter >category-array
|
|
||||||
[ dup category# ] % , [ nth-unsafe [ drop t ] ] %
|
|
||||||
\ member? 2array >quotation ,
|
|
||||||
\ if ,
|
|
||||||
] [ ] make ;
|
|
||||||
|
|
||||||
: define-category ( word categories -- )
|
|
||||||
[category] integer swap define-predicate-class ;
|
|
||||||
|
|
||||||
PRIVATE>
|
|
||||||
|
|
||||||
SYNTAX: CATEGORY:
|
|
||||||
CREATE ";" parse-tokens define-category ;
|
|
||||||
|
|
||||||
: seq-minus ( seq1 seq2 -- diff )
|
|
||||||
[ member? not ] curry filter ;
|
|
||||||
|
|
||||||
SYNTAX: CATEGORY-NOT:
|
|
||||||
CREATE ";" parse-tokens
|
|
||||||
categories swap seq-minus define-category ;
|
|
|
@ -15,7 +15,7 @@ $nl
|
||||||
{ $vocab-subsection "Word and grapheme breaks" "unicode.breaks" }
|
{ $vocab-subsection "Word and grapheme breaks" "unicode.breaks" }
|
||||||
{ $vocab-subsection "Unicode normalization" "unicode.normalize" }
|
{ $vocab-subsection "Unicode normalization" "unicode.normalize" }
|
||||||
"The following are mostly for internal use:"
|
"The following are mostly for internal use:"
|
||||||
{ $vocab-subsection "Unicode syntax" "unicode.syntax" }
|
{ $vocab-subsection "Unicode category syntax" "unicode.categories.syntax" }
|
||||||
{ $vocab-subsection "Unicode data tables" "unicode.data" }
|
{ $vocab-subsection "Unicode data tables" "unicode.data" }
|
||||||
{ $see-also "ascii" "io.encodings" } ;
|
{ $see-also "ascii" "io.encodings" } ;
|
||||||
|
|
||||||
|
|
|
@ -1,19 +1,26 @@
|
||||||
! Copyright (C) 2005, 2009 Daniel Ehrenberg
|
! Copyright (C) 2005, 2009 Daniel Ehrenberg
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: kernel sequences unicode.syntax math math.order combinators
|
USING: kernel sequences unicode.categories.syntax math math.order
|
||||||
hints ;
|
combinators combinators.short-circuit hints ;
|
||||||
IN: xml.char-classes
|
IN: xml.char-classes
|
||||||
|
|
||||||
CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_: ;
|
CATEGORY: 1.0name-start
|
||||||
: 1.0name-start? ( char -- ? )
|
Ll Lu Lo Lt Nl | {
|
||||||
dup 1.0name-start*? [ drop t ]
|
[ HEX: 2BB HEX: 2C1 between? ]
|
||||||
[ HEX: 2BB HEX: 2C1 between? ] if ;
|
[ "\u000559\u0006E5\u0006E6_:" member? ]
|
||||||
|
} 1|| ;
|
||||||
|
|
||||||
CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387: ;
|
CATEGORY: 1.0name-char
|
||||||
|
Ll Lu Lo Lt Nl Mc Me Mn Lm Nd |
|
||||||
|
"_-.\u000387:" member? ;
|
||||||
|
|
||||||
CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _: ;
|
! Predicate class used by name-start? when its 1.0? flag is false:
! code points in general category Ll, Lu, Lo, Lm or Nl, plus "_" and ":".
! ("Ln" was dropped: it is not a general category and is not a symbol
! declared by unicode.categories.syntax, so it cannot be parsed here.)
CATEGORY: 1.1name-start
|
||||||
|
Ll Lu Lo Lm Nl |
|
||||||
|
"_:" member? ;
|
||||||
|
|
||||||
CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ;
|
! Predicate class of code points in general category
! Ll, Lu, Lo, Lm, Nl, Mc, Mn, Nd, Pc or Cf, plus the characters
! "_", "-", ".", U+00B7 and ":".
! ("Ln" was dropped: it is not a general category and is not a symbol
! declared by unicode.categories.syntax, so it cannot be parsed here.)
CATEGORY: 1.1name-char
|
||||||
|
Ll Lu Lo Lm Nl Mc Mn Nd Pc Cf |
|
||||||
|
"_-.\u0000b7:" member? ;
|
||||||
|
|
||||||
: name-start? ( 1.0? char -- ? )
|
: name-start? ( 1.0? char -- ? )
|
||||||
swap [ 1.0name-start? ] [ 1.1name-start? ] if ;
|
swap [ 1.0name-start? ] [ 1.1name-start? ] if ;
|
||||||
|
|
Loading…
Reference in New Issue