diff --git a/extra/interval-maps/interval-maps.factor b/extra/interval-maps/interval-maps.factor index 904b76ce94..888665030d 100755 --- a/extra/interval-maps/interval-maps.factor +++ b/extra/interval-maps/interval-maps.factor @@ -44,6 +44,9 @@ PRIVATE> >intervals ensure-disjoint >tuple-array interval-map boa ; +: ( specification -- map ) + [ dup 2array ] map ; + :: coalesce ( alist -- specification ) ! Only works with integer keys, because they're discrete ! Makes 2array keys diff --git a/extra/unicode/PropList.txt b/extra/unicode/PropList.txt deleted file mode 100644 index 8c6b662800..0000000000 --- a/extra/unicode/PropList.txt +++ /dev/null @@ -1,20 +0,0 @@ -# ================================================ -# Note: This is only a portion of the original PropList.txt - -09BE ; Other_Grapheme_Extend # Mc BENGALI VOWEL SIGN AA -09D7 ; Other_Grapheme_Extend # Mc BENGALI AU LENGTH MARK -0B3E ; Other_Grapheme_Extend # Mc ORIYA VOWEL SIGN AA -0B57 ; Other_Grapheme_Extend # Mc ORIYA AU LENGTH MARK -0BBE ; Other_Grapheme_Extend # Mc TAMIL VOWEL SIGN AA -0BD7 ; Other_Grapheme_Extend # Mc TAMIL AU LENGTH MARK -0CC2 ; Other_Grapheme_Extend # Mc KANNADA VOWEL SIGN UU -0CD5..0CD6 ; Other_Grapheme_Extend # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK -0D3E ; Other_Grapheme_Extend # Mc MALAYALAM VOWEL SIGN AA -0D57 ; Other_Grapheme_Extend # Mc MALAYALAM AU LENGTH MARK -0DCF ; Other_Grapheme_Extend # Mc SINHALA VOWEL SIGN AELA-PILLA -0DDF ; Other_Grapheme_Extend # Mc SINHALA VOWEL SIGN GAYANUKITTA -200C..200D ; Other_Grapheme_Extend # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER -1D165 ; Other_Grapheme_Extend # Mc MUSICAL SYMBOL COMBINING STEM -1D16E..1D172 ; Other_Grapheme_Extend # Mc [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5 - -# Total code points: 21 diff --git a/extra/unicode/breaks/breaks.factor b/extra/unicode/breaks/breaks.factor index 53f81ccbf9..23dfc16e78 100755 --- a/extra/unicode/breaks/breaks.factor +++ b/extra/unicode/breaks/breaks.factor @@ -1,7 +1,8 @@ USING: unicode.categories kernel math combinators splitting sequences math.parser io.files io assocs arrays namespaces math.ranges unicode.normalize values io.encodings.ascii -unicode.syntax unicode.data compiler.units alien.syntax sets ; +unicode.syntax unicode.data compiler.units alien.syntax sets +combinators.lib ; IN: unicode.breaks C-ENUM: Any L V T Extend Control CR LF graphemes ; @@ -20,22 +21,10 @@ CATEGORY: grapheme-control Zl Zp Cc Cf ; [ drop Control ] } case ; -: trim-blank ( str -- newstr ) - [ blank? ] right-trim ; - -: process-other-extend ( lines -- set ) - [ "#" split1 drop ";" split1 drop trim-blank ] map harvest - [ ".." split1 [ dup ] unless* [ hex> ] bi@ [a,b] ] map - concat unique ; - -: other-extend-lines ( -- lines ) - "resource:extra/unicode/PropList.txt" ascii file-lines ; - -VALUE: other-extend - CATEGORY: (extend) Me Mn ; : extend? ( ch -- ? ) - dup (extend)? [ ] [ other-extend key? ] ?if ; + [ (extend)? ] + [ "Other_Grapheme_Extend" property? ] or? ; : grapheme-class ( ch -- class ) { @@ -108,10 +97,7 @@ VALUE: grapheme-table unclip-last-slice grapheme-class swap [ grapheme-class dup rot grapheme-break? ] find-last-index ?1+ nip ; -[ - other-extend-lines process-other-extend \ other-extend set-value +init-grapheme-table table +[ make-grapheme-table finish-table ] with-variable +\ grapheme-table set-value - init-grapheme-table table - [ make-grapheme-table finish-table ] with-variable - \ grapheme-table set-value -] with-compilation-unit diff --git a/extra/unicode/SpecialCasing.txt b/extra/unicode/case/SpecialCasing.txt similarity index 100% rename from extra/unicode/SpecialCasing.txt rename to extra/unicode/case/SpecialCasing.txt diff --git a/extra/unicode/collation/collation.factor b/extra/unicode/collation/collation.factor index 786693158f..fd1466d1dd 100755 --- a/extra/unicode/collation/collation.factor +++ b/extra/unicode/collation/collation.factor @@ -8,10 +8,6 @@ VALUE: ducet TUPLE: weight primary secondary tertiary ignorable? ; -: remove-comments ( lines -- lines ) - [ "#" split1 drop "@" split1 drop ] map - [ empty? not ] filter ; - : parse-weight ( string -- weight ) "]" split but-last [ weight new swap rest unclip CHAR: * = swapd >>ignorable? @@ -24,17 +20,27 @@ TUPLE: weight primary secondary tertiary ignorable? ; [ " " split [ hex> ] "" map-as ] [ parse-weight ] bi* ; : parse-ducet ( stream -- ducet ) - lines remove-comments + lines filter-comments [ parse-line ] H{ } map>assoc ; "resource:extra/unicode/collation/allkeys.txt" ascii parse-ducet \ ducet set-value -: derive-weight ( char -- weight ) - ! This should check Noncharacter_Code_Point - ! If yes, then ignore the character - ! otherwise, apply derivation formula with the right base - drop { } ; +: base ( char -- base ) + dup "Unified_Ideograph" property? + [ -16 shift zero? HEX: FB40 HEX: FB80 ? ] + [ drop HEX: FBC0 ] if ; + +: AAAA ( char -- weight ) + [ base ] [ -15 shift ] bi + HEX: 20 2 f weight boa ; + +: BBBB ( char -- weight ) + HEX: 7FFF bitand HEX: 8000 bitor 0 0 f weight boa ; + +: derive-weight ( char -- weights ) + first dup "Noncharacter_Code_Point" property? + [ drop { } ] + [ [ AAAA ] [ BBBB ] bi 2array ] if ; : last ( -- char ) building get empty? [ 0 ] [ building get peek peek ] if ; diff --git a/extra/unicode/data/PropList.txt b/extra/unicode/data/PropList.txt new file mode 100644 index 0000000000..3bbbd76929 --- /dev/null +++ b/extra/unicode/data/PropList.txt @@ -0,0 +1,1207 @@ +# PropList-5.1.0.txt +# Date: 2008-03-20, 17:55:27 GMT [MD] +# +# Unicode Character Database +# Copyright (c) 1991-2008 Unicode, Inc. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# For documentation, see UCD.html + +# ================================================ + +0009..000D ; White_Space # Cc [5] .. +0020 ; White_Space # Zs SPACE +0085 ; White_Space # Cc +00A0 ; White_Space # Zs NO-BREAK SPACE +1680 ; White_Space # Zs OGHAM SPACE MARK +180E ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR +2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE +2028 ; White_Space # Zl LINE SEPARATOR +2029 ; White_Space # Zp PARAGRAPH SEPARATOR +202F ; White_Space # Zs NARROW NO-BREAK SPACE +205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE +3000 ; White_Space # Zs IDEOGRAPHIC SPACE + +# Total code points: 26 + +# ================================================ + +200E..200F ; Bidi_Control # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK +202A..202E ; Bidi_Control # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE + +# Total code points: 7 + +# ================================================ + +200C..200D ; Join_Control # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER + +# Total code points: 2 + +# ================================================ + +002D ; Dash # Pd HYPHEN-MINUS +058A ; Dash # Pd ARMENIAN HYPHEN +05BE ; Dash # Pd HEBREW PUNCTUATION MAQAF +1806 ; Dash # Pd MONGOLIAN TODO SOFT HYPHEN +2010..2015 ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR +2053 ; Dash # Po SWUNG DASH +207B ; Dash # Sm SUPERSCRIPT MINUS +208B ; Dash # Sm SUBSCRIPT MINUS +2212 ; Dash # Sm MINUS SIGN +2E17 ; Dash # Pd DOUBLE OBLIQUE HYPHEN +2E1A ; Dash # Pd HYPHEN WITH DIAERESIS +301C ; Dash # Pd WAVE DASH +3030 ; Dash # Pd WAVY DASH +30A0 ; Dash # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN +FE31..FE32 ; Dash # Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH +FE58 ; Dash # Pd SMALL EM DASH +FE63 ; Dash # Pd SMALL HYPHEN-MINUS +FF0D ; Dash # Pd FULLWIDTH HYPHEN-MINUS + +# Total code points: 24 + +# ================================================ + +002D ; Hyphen # Pd HYPHEN-MINUS +00AD ; Hyphen # Cf SOFT HYPHEN +058A ; Hyphen # Pd ARMENIAN HYPHEN +1806 ; Hyphen # Pd MONGOLIAN TODO SOFT HYPHEN +2010..2011 ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN +2E17 ; Hyphen # Pd DOUBLE OBLIQUE HYPHEN +30FB ; Hyphen # Po KATAKANA MIDDLE DOT +FE63 ; Hyphen # Pd SMALL HYPHEN-MINUS +FF0D ; Hyphen # Pd FULLWIDTH HYPHEN-MINUS +FF65 ; Hyphen # Po HALFWIDTH KATAKANA MIDDLE DOT + +# Total code points: 11 + +# ================================================ + +0022 ; Quotation_Mark # Po QUOTATION MARK +0027 ; Quotation_Mark # Po APOSTROPHE +00AB ; Quotation_Mark # Pi LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +00BB ; Quotation_Mark # Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +2018 ; Quotation_Mark # Pi LEFT SINGLE QUOTATION MARK +2019 ; Quotation_Mark # Pf RIGHT SINGLE QUOTATION MARK +201A ; Quotation_Mark # Ps SINGLE LOW-9 QUOTATION MARK +201B..201C ; Quotation_Mark # Pi [2] SINGLE HIGH-REVERSED-9 QUOTATION MARK..LEFT DOUBLE QUOTATION MARK +201D ; Quotation_Mark # Pf RIGHT DOUBLE QUOTATION MARK +201E ; Quotation_Mark # Ps DOUBLE LOW-9 QUOTATION MARK +201F ; Quotation_Mark # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK +2039 ; Quotation_Mark # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK +203A ; Quotation_Mark # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +300C ; Quotation_Mark # Ps LEFT CORNER BRACKET +300D ; Quotation_Mark # Pe RIGHT CORNER BRACKET +300E ; Quotation_Mark # Ps LEFT WHITE CORNER BRACKET +300F ; Quotation_Mark # Pe RIGHT WHITE CORNER BRACKET +301D ; Quotation_Mark # Ps REVERSED DOUBLE PRIME QUOTATION MARK +301E..301F ; Quotation_Mark # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK +FE41 ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET +FE42 ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET +FE43 ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET +FE44 ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET +FF02 ; Quotation_Mark # Po FULLWIDTH QUOTATION MARK +FF07 ; Quotation_Mark # Po FULLWIDTH APOSTROPHE +FF62 ; Quotation_Mark # Ps HALFWIDTH LEFT CORNER BRACKET +FF63 ; Quotation_Mark # Pe HALFWIDTH RIGHT CORNER BRACKET + +# Total code points: 29 + +# ================================================ + +0021 ; Terminal_Punctuation # Po EXCLAMATION MARK +002C ; Terminal_Punctuation # Po COMMA +002E ; Terminal_Punctuation # Po FULL STOP +003A..003B ; Terminal_Punctuation # Po [2] COLON..SEMICOLON +003F ; Terminal_Punctuation # Po QUESTION MARK +037E ; Terminal_Punctuation # Po GREEK QUESTION MARK +0387 ; Terminal_Punctuation # Po GREEK ANO TELEIA +0589 ; Terminal_Punctuation # Po ARMENIAN FULL STOP +05C3 ; Terminal_Punctuation # Po HEBREW PUNCTUATION SOF PASUQ +060C ; Terminal_Punctuation # Po ARABIC COMMA +061B ; Terminal_Punctuation # Po ARABIC SEMICOLON +061F ; Terminal_Punctuation # Po ARABIC QUESTION MARK +06D4 ; Terminal_Punctuation # Po ARABIC FULL STOP +0700..070A ; Terminal_Punctuation # Po [11] SYRIAC END OF PARAGRAPH..SYRIAC CONTRACTION +070C ; Terminal_Punctuation # Po SYRIAC HARKLEAN METOBELUS +07F8..07F9 ; Terminal_Punctuation # Po [2] NKO COMMA..NKO EXCLAMATION MARK +0964..0965 ; Terminal_Punctuation # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA +0E5A..0E5B ; Terminal_Punctuation # Po [2] THAI CHARACTER ANGKHANKHU..THAI CHARACTER KHOMUT +0F08 ; Terminal_Punctuation # Po TIBETAN MARK SBRUL SHAD +0F0D..0F12 ; Terminal_Punctuation # Po [6] TIBETAN MARK SHAD..TIBETAN MARK RGYA GRAM SHAD +104A..104B ; Terminal_Punctuation # Po [2] MYANMAR SIGN LITTLE SECTION..MYANMAR SIGN SECTION +1361..1368 ; Terminal_Punctuation # Po [8] ETHIOPIC WORDSPACE..ETHIOPIC PARAGRAPH SEPARATOR +166D..166E ; Terminal_Punctuation # Po [2] CANADIAN SYLLABICS CHI SIGN..CANADIAN SYLLABICS FULL STOP +16EB..16ED ; Terminal_Punctuation # Po [3] RUNIC SINGLE PUNCTUATION..RUNIC CROSS PUNCTUATION +17D4..17D6 ; Terminal_Punctuation # Po [3] KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH +17DA ; Terminal_Punctuation # Po KHMER SIGN KOOMUUT +1802..1805 ; Terminal_Punctuation # Po [4] MONGOLIAN COMMA..MONGOLIAN FOUR DOTS +1808..1809 ; Terminal_Punctuation # Po [2] MONGOLIAN MANCHU COMMA..MONGOLIAN MANCHU FULL STOP +1944..1945 ; Terminal_Punctuation # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK +1B5A..1B5B ; Terminal_Punctuation # Po [2] BALINESE PANTI..BALINESE PAMADA +1B5D..1B5F ; Terminal_Punctuation # Po [3] BALINESE CARIK PAMUNGKAH..BALINESE CARIK PAREREN +1C3B..1C3F ; Terminal_Punctuation # Po [5] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION TSHOOK +1C7E..1C7F ; Terminal_Punctuation # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD +203C..203D ; Terminal_Punctuation # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG +2047..2049 ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK +2E2E ; Terminal_Punctuation # Po REVERSED QUESTION MARK +3001..3002 ; Terminal_Punctuation # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP +A60D..A60F ; Terminal_Punctuation # Po [3] VAI COMMA..VAI QUESTION MARK +A876..A877 ; Terminal_Punctuation # Po [2] PHAGS-PA MARK SHAD..PHAGS-PA MARK DOUBLE SHAD +A8CE..A8CF ; Terminal_Punctuation # Po [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA +A92F ; Terminal_Punctuation # Po KAYAH LI SIGN SHYA +AA5D..AA5F ; Terminal_Punctuation # Po [3] CHAM PUNCTUATION DANDA..CHAM PUNCTUATION TRIPLE DANDA +FE50..FE52 ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP +FE54..FE57 ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK +FF01 ; Terminal_Punctuation # Po FULLWIDTH EXCLAMATION MARK +FF0C ; Terminal_Punctuation # Po FULLWIDTH COMMA +FF0E ; Terminal_Punctuation # Po FULLWIDTH FULL STOP +FF1A..FF1B ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON +FF1F ; Terminal_Punctuation # Po FULLWIDTH QUESTION MARK +FF61 ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC FULL STOP +FF64 ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA +1039F ; Terminal_Punctuation # Po UGARITIC WORD DIVIDER +103D0 ; Terminal_Punctuation # Po OLD PERSIAN WORD DIVIDER +1091F ; Terminal_Punctuation # Po PHOENICIAN WORD SEPARATOR +12470..12473 ; Terminal_Punctuation # Po [4] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON + +# Total code points: 119 + +# ================================================ + +005E ; Other_Math # Sk CIRCUMFLEX ACCENT +03D0..03D2 ; Other_Math # L& [3] GREEK BETA SYMBOL..GREEK UPSILON WITH HOOK SYMBOL +03D5 ; Other_Math # L& GREEK PHI SYMBOL +03F0..03F1 ; Other_Math # L& [2] GREEK KAPPA SYMBOL..GREEK RHO SYMBOL +03F4..03F5 ; Other_Math # L& [2] GREEK CAPITAL THETA SYMBOL..GREEK LUNATE EPSILON SYMBOL +2016 ; Other_Math # Po DOUBLE VERTICAL LINE +2032..2034 ; Other_Math # Po [3] PRIME..TRIPLE PRIME +2040 ; Other_Math # Pc CHARACTER TIE +2061..2064 ; Other_Math # Cf [4] FUNCTION APPLICATION..INVISIBLE PLUS +207D ; Other_Math # Ps SUPERSCRIPT LEFT PARENTHESIS +207E ; Other_Math # Pe SUPERSCRIPT RIGHT PARENTHESIS +208D ; Other_Math # Ps SUBSCRIPT LEFT PARENTHESIS +208E ; Other_Math # Pe SUBSCRIPT RIGHT PARENTHESIS +20D0..20DC ; Other_Math # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE +20E1 ; Other_Math # Mn COMBINING LEFT RIGHT ARROW ABOVE +20E5..20E6 ; Other_Math # Mn [2] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING DOUBLE VERTICAL STROKE OVERLAY +20EB..20EF ; Other_Math # Mn [5] COMBINING LONG DOUBLE SOLIDUS OVERLAY..COMBINING RIGHT ARROW BELOW +2102 ; Other_Math # L& DOUBLE-STRUCK CAPITAL C +210A..2113 ; Other_Math # L& [10] SCRIPT SMALL G..SCRIPT SMALL L +2115 ; Other_Math # L& DOUBLE-STRUCK CAPITAL N +2119..211D ; Other_Math # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R +2124 ; Other_Math # L& DOUBLE-STRUCK CAPITAL Z +2128 ; Other_Math # L& BLACK-LETTER CAPITAL Z +2129 ; Other_Math # So TURNED GREEK SMALL LETTER IOTA +212C..212D ; Other_Math # L& [2] SCRIPT CAPITAL B..BLACK-LETTER CAPITAL C +212F..2131 ; Other_Math # L& [3] SCRIPT SMALL E..SCRIPT CAPITAL F +2133..2134 ; Other_Math # L& [2] SCRIPT CAPITAL M..SCRIPT SMALL O +2135..2138 ; Other_Math # Lo [4] ALEF SYMBOL..DALET SYMBOL +213C..213F ; Other_Math # L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI +2145..2149 ; Other_Math # L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J +2195..2199 ; Other_Math # So [5] UP DOWN ARROW..SOUTH WEST ARROW +219C..219F ; Other_Math # So [4] LEFTWARDS WAVE ARROW..UPWARDS TWO HEADED ARROW +21A1..21A2 ; Other_Math # So [2] DOWNWARDS TWO HEADED ARROW..LEFTWARDS ARROW WITH TAIL +21A4..21A5 ; Other_Math # So [2] LEFTWARDS ARROW FROM BAR..UPWARDS ARROW FROM BAR +21A7 ; Other_Math # So DOWNWARDS ARROW FROM BAR +21A9..21AD ; Other_Math # So [5] LEFTWARDS ARROW WITH HOOK..LEFT RIGHT WAVE ARROW +21B0..21B1 ; Other_Math # So [2] UPWARDS ARROW WITH TIP LEFTWARDS..UPWARDS ARROW WITH TIP RIGHTWARDS +21B6..21B7 ; Other_Math # So [2] ANTICLOCKWISE TOP SEMICIRCLE ARROW..CLOCKWISE TOP SEMICIRCLE ARROW +21BC..21CD ; Other_Math # So [18] LEFTWARDS HARPOON WITH BARB UPWARDS..LEFTWARDS DOUBLE ARROW WITH STROKE +21D0..21D1 ; Other_Math # So [2] LEFTWARDS DOUBLE ARROW..UPWARDS DOUBLE ARROW +21D3 ; Other_Math # So DOWNWARDS DOUBLE ARROW +21D5..21DB ; Other_Math # So [7] UP DOWN DOUBLE ARROW..RIGHTWARDS TRIPLE ARROW +21DD ; Other_Math # So RIGHTWARDS SQUIGGLE ARROW +21E4..21E5 ; Other_Math # So [2] LEFTWARDS ARROW TO BAR..RIGHTWARDS ARROW TO BAR +23B4..23B5 ; Other_Math # So [2] TOP SQUARE BRACKET..BOTTOM SQUARE BRACKET +23B7 ; Other_Math # So RADICAL SYMBOL BOTTOM +23D0 ; Other_Math # So VERTICAL LINE EXTENSION +23E2 ; Other_Math # So WHITE TRAPEZIUM +25A0..25A1 ; Other_Math # So [2] BLACK SQUARE..WHITE SQUARE +25AE..25B6 ; Other_Math # So [9] BLACK VERTICAL RECTANGLE..BLACK RIGHT-POINTING TRIANGLE +25BC..25C0 ; Other_Math # So [5] BLACK DOWN-POINTING TRIANGLE..BLACK LEFT-POINTING TRIANGLE +25C6..25C7 ; Other_Math # So [2] BLACK DIAMOND..WHITE DIAMOND +25CA..25CB ; Other_Math # So [2] LOZENGE..WHITE CIRCLE +25CF..25D3 ; Other_Math # So [5] BLACK CIRCLE..CIRCLE WITH UPPER HALF BLACK +25E2 ; Other_Math # So BLACK LOWER RIGHT TRIANGLE +25E4 ; Other_Math # So BLACK UPPER LEFT TRIANGLE +25E7..25EC ; Other_Math # So [6] SQUARE WITH LEFT HALF BLACK..WHITE UP-POINTING TRIANGLE WITH DOT +2605..2606 ; Other_Math # So [2] BLACK STAR..WHITE STAR +2640 ; Other_Math # So FEMALE SIGN +2642 ; Other_Math # So MALE SIGN +2660..2663 ; Other_Math # So [4] BLACK SPADE SUIT..BLACK CLUB SUIT +266D..266E ; Other_Math # So [2] MUSIC FLAT SIGN..MUSIC NATURAL SIGN +27C5 ; Other_Math # Ps LEFT S-SHAPED BAG DELIMITER +27C6 ; Other_Math # Pe RIGHT S-SHAPED BAG DELIMITER +27E6 ; Other_Math # Ps MATHEMATICAL LEFT WHITE SQUARE BRACKET +27E7 ; Other_Math # Pe MATHEMATICAL RIGHT WHITE SQUARE BRACKET +27E8 ; Other_Math # Ps MATHEMATICAL LEFT ANGLE BRACKET +27E9 ; Other_Math # Pe MATHEMATICAL RIGHT ANGLE BRACKET +27EA ; Other_Math # Ps MATHEMATICAL LEFT DOUBLE ANGLE BRACKET +27EB ; Other_Math # Pe MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET +27EC ; Other_Math # Ps MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET +27ED ; Other_Math # Pe MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET +27EE ; Other_Math # Ps MATHEMATICAL LEFT FLATTENED PARENTHESIS +27EF ; Other_Math # Pe MATHEMATICAL RIGHT FLATTENED PARENTHESIS +2983 ; Other_Math # Ps LEFT WHITE CURLY BRACKET +2984 ; Other_Math # Pe RIGHT WHITE CURLY BRACKET +2985 ; Other_Math # Ps LEFT WHITE PARENTHESIS +2986 ; Other_Math # Pe RIGHT WHITE PARENTHESIS +2987 ; Other_Math # Ps Z NOTATION LEFT IMAGE BRACKET +2988 ; Other_Math # Pe Z NOTATION RIGHT IMAGE BRACKET +2989 ; Other_Math # Ps Z NOTATION LEFT BINDING BRACKET +298A ; Other_Math # Pe Z NOTATION RIGHT BINDING BRACKET +298B ; Other_Math # Ps LEFT SQUARE BRACKET WITH UNDERBAR +298C ; Other_Math # Pe RIGHT SQUARE BRACKET WITH UNDERBAR +298D ; Other_Math # Ps LEFT SQUARE BRACKET WITH TICK IN TOP CORNER +298E ; Other_Math # Pe RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER +298F ; Other_Math # Ps LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER +2990 ; Other_Math # Pe RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER +2991 ; Other_Math # Ps LEFT ANGLE BRACKET WITH DOT +2992 ; Other_Math # Pe RIGHT ANGLE BRACKET WITH DOT +2993 ; Other_Math # Ps LEFT ARC LESS-THAN BRACKET +2994 ; Other_Math # Pe RIGHT ARC GREATER-THAN BRACKET +2995 ; Other_Math # Ps DOUBLE LEFT ARC GREATER-THAN BRACKET +2996 ; Other_Math # Pe DOUBLE RIGHT ARC LESS-THAN BRACKET +2997 ; Other_Math # Ps LEFT BLACK TORTOISE SHELL BRACKET +2998 ; Other_Math # Pe RIGHT BLACK TORTOISE SHELL BRACKET +29D8 ; Other_Math # Ps LEFT WIGGLY FENCE +29D9 ; Other_Math # Pe RIGHT WIGGLY FENCE +29DA ; Other_Math # Ps LEFT DOUBLE WIGGLY FENCE +29DB ; Other_Math # Pe RIGHT DOUBLE WIGGLY FENCE +29FC ; Other_Math # Ps LEFT-POINTING CURVED ANGLE BRACKET +29FD ; Other_Math # Pe RIGHT-POINTING CURVED ANGLE BRACKET +FE61 ; Other_Math # Po SMALL ASTERISK +FE63 ; Other_Math # Pd SMALL HYPHEN-MINUS +FE68 ; Other_Math # Po SMALL REVERSE SOLIDUS +FF3C ; Other_Math # Po FULLWIDTH REVERSE SOLIDUS +FF3E ; Other_Math # Sk FULLWIDTH CIRCUMFLEX ACCENT +1D400..1D454 ; Other_Math # L& [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G +1D456..1D49C ; Other_Math # L& [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A +1D49E..1D49F ; Other_Math # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D +1D4A2 ; Other_Math # L& MATHEMATICAL SCRIPT CAPITAL G +1D4A5..1D4A6 ; Other_Math # L& [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K +1D4A9..1D4AC ; Other_Math # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q +1D4AE..1D4B9 ; Other_Math # L& [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D +1D4BB ; Other_Math # L& MATHEMATICAL SCRIPT SMALL F +1D4BD..1D4C3 ; Other_Math # L& [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N +1D4C5..1D505 ; Other_Math # L& [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B +1D507..1D50A ; Other_Math # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G +1D50D..1D514 ; Other_Math # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q +1D516..1D51C ; Other_Math # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y +1D51E..1D539 ; Other_Math # L& [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B +1D53B..1D53E ; Other_Math # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G +1D540..1D544 ; Other_Math # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M +1D546 ; Other_Math # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O +1D54A..1D550 ; Other_Math # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y +1D552..1D6A5 ; Other_Math # L& [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J +1D6A8..1D6C0 ; Other_Math # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA +1D6C2..1D6DA ; Other_Math # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA +1D6DC..1D6FA ; Other_Math # L& [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA +1D6FC..1D714 ; Other_Math # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA +1D716..1D734 ; Other_Math # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA +1D736..1D74E ; Other_Math # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA +1D750..1D76E ; Other_Math # L& [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA +1D770..1D788 ; Other_Math # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA +1D78A..1D7A8 ; Other_Math # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA +1D7AA..1D7C2 ; Other_Math # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA +1D7C4..1D7CB ; Other_Math # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA +1D7CE..1D7FF ; Other_Math # Nd [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE + +# Total code points: 1216 + +# ================================================ + +0030..0039 ; Hex_Digit # Nd [10] DIGIT ZERO..DIGIT NINE +0041..0046 ; Hex_Digit # L& [6] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER F +0061..0066 ; Hex_Digit # L& [6] LATIN SMALL LETTER A..LATIN SMALL LETTER F +FF10..FF19 ; Hex_Digit # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE +FF21..FF26 ; Hex_Digit # L& [6] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER F +FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER F + +# Total code points: 44 + +# ================================================ + +0030..0039 ; ASCII_Hex_Digit # Nd [10] DIGIT ZERO..DIGIT NINE +0041..0046 ; ASCII_Hex_Digit # L& [6] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER F +0061..0066 ; ASCII_Hex_Digit # L& [6] LATIN SMALL LETTER A..LATIN SMALL LETTER F + +# Total code points: 22 + +# ================================================ + +0345 ; Other_Alphabetic # Mn COMBINING GREEK YPOGEGRAMMENI +05B0..05BD ; Other_Alphabetic # Mn [14] HEBREW POINT SHEVA..HEBREW POINT METEG +05BF ; Other_Alphabetic # Mn HEBREW POINT RAFE +05C1..05C2 ; Other_Alphabetic # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT +05C4..05C5 ; Other_Alphabetic # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT +05C7 ; Other_Alphabetic # Mn HEBREW POINT QAMATS QATAN +0610..061A ; Other_Alphabetic # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA +064B..0657 ; Other_Alphabetic # Mn [13] ARABIC FATHATAN..ARABIC INVERTED DAMMA +0659..065E ; Other_Alphabetic # Mn [6] ARABIC ZWARAKAY..ARABIC FATHA WITH TWO DOTS +0670 ; Other_Alphabetic # Mn ARABIC LETTER SUPERSCRIPT ALEF +06D6..06DC ; Other_Alphabetic # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN +06E1..06E4 ; Other_Alphabetic # Mn [4] ARABIC SMALL HIGH DOTLESS HEAD OF KHAH..ARABIC SMALL HIGH MADDA +06E7..06E8 ; Other_Alphabetic # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON +06ED ; Other_Alphabetic # Mn ARABIC SMALL LOW MEEM +0711 ; Other_Alphabetic # Mn SYRIAC LETTER SUPERSCRIPT ALAPH +0730..073F ; Other_Alphabetic # Mn [16] SYRIAC PTHAHA ABOVE..SYRIAC RWAHA +07A6..07B0 ; Other_Alphabetic # Mn [11] THAANA ABAFILI..THAANA SUKUN +0901..0902 ; Other_Alphabetic # Mn [2] DEVANAGARI SIGN CANDRABINDU..DEVANAGARI SIGN ANUSVARA +0903 ; Other_Alphabetic # Mc DEVANAGARI SIGN VISARGA +093E..0940 ; Other_Alphabetic # Mc [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II +0941..0948 ; Other_Alphabetic # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI +0949..094C ; Other_Alphabetic # Mc [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU +0962..0963 ; Other_Alphabetic # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL +0981 ; Other_Alphabetic # Mn BENGALI SIGN CANDRABINDU +0982..0983 ; Other_Alphabetic # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA +09BE..09C0 ; Other_Alphabetic # Mc [3] BENGALI VOWEL SIGN AA..BENGALI VOWEL SIGN II +09C1..09C4 ; Other_Alphabetic # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR +09C7..09C8 ; Other_Alphabetic # Mc [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI +09CB..09CC ; Other_Alphabetic # Mc [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU +09D7 ; Other_Alphabetic # Mc BENGALI AU LENGTH MARK +09E2..09E3 ; Other_Alphabetic # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL +0A01..0A02 ; Other_Alphabetic # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI +0A03 ; Other_Alphabetic # Mc GURMUKHI SIGN VISARGA +0A3E..0A40 ; Other_Alphabetic # Mc [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II +0A41..0A42 ; Other_Alphabetic # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU +0A47..0A48 ; Other_Alphabetic # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI +0A4B..0A4C ; Other_Alphabetic # Mn [2] GURMUKHI VOWEL SIGN OO..GURMUKHI VOWEL SIGN AU +0A51 ; Other_Alphabetic # Mn GURMUKHI SIGN UDAAT +0A70..0A71 ; Other_Alphabetic # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK +0A75 ; Other_Alphabetic # Mn GURMUKHI SIGN YAKASH +0A81..0A82 ; Other_Alphabetic # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA +0A83 ; Other_Alphabetic # Mc GUJARATI SIGN VISARGA +0ABE..0AC0 ; Other_Alphabetic # Mc [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II +0AC1..0AC5 ; Other_Alphabetic # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E +0AC7..0AC8 ; Other_Alphabetic # Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI +0AC9 ; Other_Alphabetic # Mc GUJARATI VOWEL SIGN CANDRA O +0ACB..0ACC ; Other_Alphabetic # Mc [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU +0AE2..0AE3 ; Other_Alphabetic # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL +0B01 ; Other_Alphabetic # Mn ORIYA SIGN CANDRABINDU +0B02..0B03 ; Other_Alphabetic # Mc [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA +0B3E ; Other_Alphabetic # Mc ORIYA VOWEL SIGN AA +0B3F ; Other_Alphabetic # Mn ORIYA VOWEL SIGN I +0B40 ; Other_Alphabetic # Mc ORIYA VOWEL SIGN II +0B41..0B44 ; Other_Alphabetic # Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR +0B47..0B48 ; Other_Alphabetic # Mc [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI +0B4B..0B4C ; Other_Alphabetic # Mc [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU +0B56 ; Other_Alphabetic # Mn ORIYA AI LENGTH MARK +0B57 ; Other_Alphabetic # Mc ORIYA AU LENGTH MARK +0B62..0B63 ; Other_Alphabetic # Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL +0B82 ; Other_Alphabetic # Mn TAMIL SIGN ANUSVARA +0BBE..0BBF ; Other_Alphabetic # Mc [2] TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN I +0BC0 ; Other_Alphabetic # Mn TAMIL VOWEL SIGN II +0BC1..0BC2 ; Other_Alphabetic # Mc [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU +0BC6..0BC8 ; Other_Alphabetic # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI +0BCA..0BCC ; Other_Alphabetic # Mc [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU +0BD7 ; Other_Alphabetic # Mc TAMIL AU LENGTH MARK +0C01..0C03 ; Other_Alphabetic # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA +0C3E..0C40 ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II +0C41..0C44 ; Other_Alphabetic # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR +0C46..0C48 ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI +0C4A..0C4C ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN O..TELUGU VOWEL SIGN AU +0C55..0C56 ; Other_Alphabetic # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK +0C62..0C63 ; Other_Alphabetic # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL +0C82..0C83 ; Other_Alphabetic # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA +0CBE ; Other_Alphabetic # Mc KANNADA VOWEL SIGN AA +0CBF ; Other_Alphabetic # Mn KANNADA VOWEL SIGN I +0CC0..0CC4 ; Other_Alphabetic # Mc [5] KANNADA VOWEL SIGN II..KANNADA VOWEL SIGN VOCALIC RR +0CC6 ; Other_Alphabetic # Mn KANNADA VOWEL SIGN E +0CC7..0CC8 ; Other_Alphabetic # Mc [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI +0CCA..0CCB ; Other_Alphabetic # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO +0CCC ; Other_Alphabetic # Mn KANNADA VOWEL SIGN AU +0CD5..0CD6 ; Other_Alphabetic # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK +0CE2..0CE3 ; Other_Alphabetic # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL +0D02..0D03 ; Other_Alphabetic # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA +0D3E..0D40 ; Other_Alphabetic # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II +0D41..0D44 ; Other_Alphabetic # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR +0D46..0D48 ; Other_Alphabetic # Mc [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI +0D4A..0D4C ; Other_Alphabetic # Mc [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU +0D57 ; Other_Alphabetic # Mc MALAYALAM AU LENGTH MARK +0D62..0D63 ; Other_Alphabetic # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL +0D82..0D83 ; Other_Alphabetic # Mc [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA +0DCF..0DD1 ; Other_Alphabetic # Mc [3] SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA +0DD2..0DD4 ; Other_Alphabetic # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA +0DD6 ; Other_Alphabetic # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA +0DD8..0DDF ; Other_Alphabetic # Mc [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA +0DF2..0DF3 ; Other_Alphabetic # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA +0E31 ; Other_Alphabetic # Mn THAI CHARACTER MAI HAN-AKAT +0E34..0E3A ; Other_Alphabetic # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU +0E4D ; Other_Alphabetic # Mn THAI CHARACTER NIKHAHIT +0EB1 ; Other_Alphabetic # Mn LAO VOWEL SIGN MAI KAN +0EB4..0EB9 ; Other_Alphabetic # Mn [6] LAO VOWEL SIGN I..LAO VOWEL SIGN UU +0EBB..0EBC ; Other_Alphabetic # Mn [2] LAO VOWEL SIGN MAI KON..LAO SEMIVOWEL SIGN LO +0ECD ; Other_Alphabetic # Mn LAO NIGGAHITA +0F71..0F7E ; Other_Alphabetic # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO +0F7F ; Other_Alphabetic # Mc TIBETAN SIGN RNAM BCAD +0F80..0F81 ; Other_Alphabetic # Mn [2] TIBETAN VOWEL SIGN REVERSED I..TIBETAN VOWEL SIGN REVERSED II +0F90..0F97 ; Other_Alphabetic # Mn [8] TIBETAN SUBJOINED LETTER KA..TIBETAN SUBJOINED LETTER JA +0F99..0FBC ; Other_Alphabetic # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA +102B..102C ; Other_Alphabetic # Mc [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA +102D..1030 ; Other_Alphabetic # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU +1031 ; Other_Alphabetic # Mc MYANMAR VOWEL SIGN E +1032..1036 ; Other_Alphabetic # Mn [5] MYANMAR VOWEL SIGN AI..MYANMAR SIGN ANUSVARA +1038 ; Other_Alphabetic # Mc MYANMAR SIGN VISARGA +103B..103C ; Other_Alphabetic # Mc [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA +103D..103E ; Other_Alphabetic # Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA +1056..1057 ; Other_Alphabetic # Mc [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR +1058..1059 ; Other_Alphabetic # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL +105E..1060 ; Other_Alphabetic # Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA +1062 ; Other_Alphabetic # Mc MYANMAR VOWEL SIGN SGAW KAREN EU +1067..1068 ; Other_Alphabetic # Mc [2] MYANMAR VOWEL SIGN WESTERN PWO KAREN EU..MYANMAR VOWEL SIGN WESTERN PWO KAREN UE +1071..1074 ; Other_Alphabetic # Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE +1082 ; Other_Alphabetic # Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA +1083..1084 ; Other_Alphabetic # Mc [2] MYANMAR VOWEL SIGN SHAN AA..MYANMAR VOWEL SIGN SHAN E +1085..1086 ; Other_Alphabetic # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y +135F ; Other_Alphabetic # Mn ETHIOPIC COMBINING GEMINATION MARK +1712..1713 ; Other_Alphabetic # Mn [2] TAGALOG VOWEL SIGN I..TAGALOG VOWEL SIGN U +1732..1733 ; Other_Alphabetic # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U +1752..1753 ; Other_Alphabetic # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U +1772..1773 ; Other_Alphabetic # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U +17B6 ; Other_Alphabetic # Mc KHMER VOWEL SIGN AA +17B7..17BD ; Other_Alphabetic # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA +17BE..17C5 ; Other_Alphabetic # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU +17C6 ; Other_Alphabetic # Mn KHMER SIGN NIKAHIT +17C7..17C8 ; Other_Alphabetic # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU +18A9 ; Other_Alphabetic # Mn MONGOLIAN LETTER ALI GALI DAGALGA +1920..1922 ; Other_Alphabetic # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U +1923..1926 ; Other_Alphabetic # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU +1927..1928 ; Other_Alphabetic # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O +1929..192B ; Other_Alphabetic # Mc [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA +1930..1931 ; Other_Alphabetic # Mc [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA +1932 ; Other_Alphabetic # Mn LIMBU SMALL LETTER ANUSVARA +1933..1938 ; Other_Alphabetic # Mc [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA +19B0..19C0 ; Other_Alphabetic # Mc [17] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE VOWEL SIGN IY +19C8..19C9 ; Other_Alphabetic # Mc [2] NEW TAI LUE TONE MARK-1..NEW TAI LUE TONE MARK-2 +1A17..1A18 ; Other_Alphabetic # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U +1A19..1A1B ; Other_Alphabetic # Mc [3] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN AE +1B00..1B03 ; Other_Alphabetic # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG +1B04 ; Other_Alphabetic # Mc BALINESE SIGN BISAH +1B35 ; Other_Alphabetic # Mc BALINESE VOWEL SIGN TEDUNG +1B36..1B3A ; Other_Alphabetic # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA +1B3B ; Other_Alphabetic # Mc BALINESE VOWEL SIGN RA REPA TEDUNG +1B3C ; Other_Alphabetic # Mn BALINESE VOWEL SIGN LA LENGA +1B3D..1B41 ; Other_Alphabetic # Mc [5] BALINESE VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG +1B42 ; Other_Alphabetic # Mn BALINESE VOWEL SIGN PEPET +1B43 ; Other_Alphabetic # Mc BALINESE VOWEL SIGN PEPET TEDUNG +1B80..1B81 ; Other_Alphabetic # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR +1B82 ; Other_Alphabetic # Mc SUNDANESE SIGN PANGWISAD +1BA1 ; Other_Alphabetic # Mc SUNDANESE CONSONANT SIGN PAMINGKAL +1BA2..1BA5 ; Other_Alphabetic # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU +1BA6..1BA7 ; Other_Alphabetic # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG +1BA8..1BA9 ; Other_Alphabetic # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG +1C24..1C2B ; Other_Alphabetic # Mc [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU +1C2C..1C33 ; Other_Alphabetic # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T +1C34..1C35 ; Other_Alphabetic # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG +24B6..24E9 ; Other_Alphabetic # So [52] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN SMALL LETTER Z +2DE0..2DFF ; Other_Alphabetic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS +A823..A824 ; Other_Alphabetic # Mc [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I +A825..A826 ; Other_Alphabetic # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E +A827 ; Other_Alphabetic # Mc SYLOTI NAGRI VOWEL SIGN OO +A880..A881 ; Other_Alphabetic # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA +A8B4..A8C3 ; Other_Alphabetic # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU +A926..A92A ; Other_Alphabetic # Mn [5] KAYAH LI VOWEL UE..KAYAH LI VOWEL O +A947..A951 ; Other_Alphabetic # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R +A952 ; Other_Alphabetic # Mc REJANG CONSONANT SIGN H +AA29..AA2E ; Other_Alphabetic # Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE +AA2F..AA30 ; Other_Alphabetic # Mc [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI +AA31..AA32 ; Other_Alphabetic # Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE +AA33..AA34 ; Other_Alphabetic # Mc [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA +AA35..AA36 ; Other_Alphabetic # Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA +AA43 ; Other_Alphabetic # Mn CHAM CONSONANT SIGN FINAL NG +AA4C ; Other_Alphabetic # Mn CHAM CONSONANT SIGN FINAL M +AA4D ; Other_Alphabetic # Mc CHAM CONSONANT SIGN FINAL H +FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA +10A01..10A03 ; Other_Alphabetic # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R +10A05..10A06 ; Other_Alphabetic # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O +10A0C..10A0F ; Other_Alphabetic # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA + +# Total code points: 663 + +# ================================================ + +3006 ; Ideographic # Lo IDEOGRAPHIC CLOSING MARK +3007 ; Ideographic # Nl IDEOGRAPHIC NUMBER ZERO +3021..3029 ; Ideographic # Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE +3038..303A ; Ideographic # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY +3400..4DB5 ; Ideographic # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5 +4E00..9FC3 ; Ideographic # Lo [20932] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FC3 +F900..FA2D ; Ideographic # Lo [302] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA2D +FA30..FA6A ; Ideographic # Lo [59] CJK COMPATIBILITY IDEOGRAPH-FA30..CJK COMPATIBILITY IDEOGRAPH-FA6A +FA70..FAD9 ; Ideographic # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 +20000..2A6D6 ; Ideographic # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6 +2F800..2FA1D ; Ideographic # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D + +# Total code points: 71248 + +# ================================================ + +005E ; Diacritic # Sk CIRCUMFLEX ACCENT +0060 ; Diacritic # Sk GRAVE ACCENT +00A8 ; Diacritic # Sk DIAERESIS +00AF ; Diacritic # Sk MACRON +00B4 ; Diacritic # Sk ACUTE ACCENT +00B7 ; Diacritic # Po MIDDLE DOT +00B8 ; Diacritic # Sk CEDILLA +02B0..02C1 ; Diacritic # Lm [18] MODIFIER LETTER SMALL H..MODIFIER LETTER REVERSED GLOTTAL STOP +02C2..02C5 ; Diacritic # Sk [4] MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LETTER DOWN ARROWHEAD +02C6..02D1 ; Diacritic # Lm [12] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER HALF TRIANGULAR COLON +02D2..02DF ; Diacritic # Sk [14] MODIFIER LETTER CENTRED RIGHT HALF RING..MODIFIER LETTER CROSS ACCENT +02E0..02E4 ; Diacritic # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP +02E5..02EB ; Diacritic # Sk [7] MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER YANG DEPARTING TONE MARK +02EC ; Diacritic # Lm MODIFIER LETTER VOICING +02ED ; Diacritic # Sk MODIFIER LETTER UNASPIRATED +02EE ; Diacritic # Lm MODIFIER LETTER DOUBLE APOSTROPHE +02EF..02FF ; Diacritic # Sk [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW +0300..034E ; Diacritic # Mn [79] COMBINING GRAVE ACCENT..COMBINING UPWARDS ARROW BELOW +0350..0357 ; Diacritic # Mn [8] COMBINING RIGHT ARROWHEAD ABOVE..COMBINING RIGHT HALF RING ABOVE +035D..0362 ; Diacritic # Mn [6] COMBINING DOUBLE BREVE..COMBINING DOUBLE RIGHTWARDS ARROW BELOW +0374 ; Diacritic # Lm GREEK NUMERAL SIGN +0375 ; Diacritic # Sk GREEK LOWER NUMERAL SIGN +037A ; Diacritic # Lm GREEK YPOGEGRAMMENI +0384..0385 ; Diacritic # Sk [2] GREEK TONOS..GREEK DIALYTIKA TONOS +0483..0487 ; Diacritic # Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE +0559 ; Diacritic # Lm ARMENIAN MODIFIER LETTER LEFT HALF RING +0591..05A1 ; Diacritic # Mn [17] HEBREW ACCENT ETNAHTA..HEBREW ACCENT PAZER +05A3..05BD ; Diacritic # Mn [27] HEBREW ACCENT MUNAH..HEBREW POINT METEG +05BF ; Diacritic # Mn HEBREW POINT RAFE +05C1..05C2 ; Diacritic # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT +05C4 ; Diacritic # Mn HEBREW MARK UPPER DOT +064B..0652 ; Diacritic # Mn [8] ARABIC FATHATAN..ARABIC SUKUN +0657..0658 ; Diacritic # Mn [2] ARABIC INVERTED DAMMA..ARABIC MARK NOON GHUNNA +06DF..06E0 ; Diacritic # Mn [2] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH UPRIGHT RECTANGULAR ZERO +06E5..06E6 ; Diacritic # Lm [2] ARABIC SMALL WAW..ARABIC SMALL YEH +06EA..06EC ; Diacritic # Mn [3] ARABIC EMPTY CENTRE LOW STOP..ARABIC ROUNDED HIGH STOP WITH FILLED CENTRE +0730..074A ; Diacritic # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH +07A6..07B0 ; Diacritic # Mn [11] THAANA ABAFILI..THAANA SUKUN +07EB..07F3 ; Diacritic # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE +07F4..07F5 ; Diacritic # Lm [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE +093C ; Diacritic # Mn DEVANAGARI SIGN NUKTA +094D ; Diacritic # Mn DEVANAGARI SIGN VIRAMA +0951..0954 ; Diacritic # Mn [4] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI ACUTE ACCENT +0971 ; Diacritic # Lm DEVANAGARI SIGN HIGH SPACING DOT +09BC ; Diacritic # Mn BENGALI SIGN NUKTA +09CD ; Diacritic # Mn BENGALI SIGN VIRAMA +0A3C ; Diacritic # Mn GURMUKHI SIGN NUKTA +0A4D ; Diacritic # Mn GURMUKHI SIGN VIRAMA +0ABC ; Diacritic # Mn GUJARATI SIGN NUKTA +0ACD ; Diacritic # Mn GUJARATI SIGN VIRAMA +0B3C ; Diacritic # Mn ORIYA SIGN NUKTA +0B4D ; Diacritic # Mn ORIYA SIGN VIRAMA +0BCD ; Diacritic # Mn TAMIL SIGN VIRAMA +0C4D ; Diacritic # Mn TELUGU SIGN VIRAMA +0CBC ; Diacritic # Mn KANNADA SIGN NUKTA +0CCD ; Diacritic # Mn KANNADA SIGN VIRAMA +0D4D ; Diacritic # Mn MALAYALAM SIGN VIRAMA +0DCA ; Diacritic # Mn SINHALA SIGN AL-LAKUNA +0E47..0E4C ; Diacritic # Mn [6] THAI CHARACTER MAITAIKHU..THAI CHARACTER THANTHAKHAT +0E4E ; Diacritic # Mn THAI CHARACTER YAMAKKAN +0EC8..0ECC ; Diacritic # Mn [5] LAO TONE MAI EK..LAO CANCELLATION MARK +0F18..0F19 ; Diacritic # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS +0F35 ; Diacritic # Mn TIBETAN MARK NGAS BZUNG NYI ZLA +0F37 ; Diacritic # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS +0F39 ; Diacritic # Mn TIBETAN MARK TSA -PHRU +0F3E..0F3F ; Diacritic # Mc [2] TIBETAN SIGN YAR TSHES..TIBETAN SIGN MAR TSHES +0F82..0F84 ; Diacritic # Mn [3] TIBETAN SIGN NYI ZLA NAA DA..TIBETAN MARK HALANTA +0F86..0F87 ; Diacritic # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS +0FC6 ; Diacritic # Mn TIBETAN SYMBOL PADMA GDAN +1037 ; Diacritic # Mn MYANMAR SIGN DOT BELOW +1039..103A ; Diacritic # Mn [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT +1087..108C ; Diacritic # Mc [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3 +108D ; Diacritic # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE +108F ; Diacritic # Mc MYANMAR SIGN RUMAI PALAUNG TONE-5 +17C9..17D3 ; Diacritic # Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT +17DD ; Diacritic # Mn KHMER SIGN ATTHACAN +1939..193B ; Diacritic # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I +1B34 ; Diacritic # Mn BALINESE SIGN REREKAN +1B44 ; Diacritic # Mc BALINESE ADEG ADEG +1B6B..1B73 ; Diacritic # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG +1BAA ; Diacritic # Mc SUNDANESE SIGN PAMAAEH +1C36..1C37 ; Diacritic # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA +1C78..1C7D ; Diacritic # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD +1D2C..1D61 ; Diacritic # Lm [54] MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI +1D62..1D6A ; Diacritic # L& [9] LATIN SUBSCRIPT SMALL LETTER I..GREEK SUBSCRIPT SMALL LETTER CHI +1DC4..1DCF ; Diacritic # Mn [12] COMBINING MACRON-ACUTE..COMBINING ZIGZAG BELOW +1DFE..1DFF ; Diacritic # Mn [2] COMBINING LEFT ARROWHEAD ABOVE..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW +1FBD ; Diacritic # Sk GREEK KORONIS +1FBF..1FC1 ; Diacritic # Sk [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI +1FCD..1FCF ; Diacritic # Sk [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI +1FDD..1FDF ; Diacritic # Sk [3] GREEK DASIA AND VARIA..GREEK DASIA AND PERISPOMENI +1FED..1FEF ; Diacritic # Sk [3] GREEK DIALYTIKA AND VARIA..GREEK VARIA +1FFD..1FFE ; Diacritic # Sk [2] GREEK OXIA..GREEK DASIA +2E2F ; Diacritic # Lm VERTICAL TILDE +302A..302F ; Diacritic # Mn [6] IDEOGRAPHIC LEVEL TONE MARK..HANGUL DOUBLE DOT TONE MARK +3099..309A ; Diacritic # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK +309B..309C ; Diacritic # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK +30FC ; Diacritic # Lm KATAKANA-HIRAGANA PROLONGED SOUND MARK +A66F ; Diacritic # Mn COMBINING CYRILLIC VZMET +A67C..A67D ; Diacritic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK +A67F ; Diacritic # Lm CYRILLIC PAYEROK +A717..A71F ; Diacritic # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK +A720..A721 ; Diacritic # Sk [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE +A788 ; Diacritic # Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT +A8C4 ; Diacritic # Mn SAURASHTRA SIGN VIRAMA +A92B..A92D ; Diacritic # Mn [3] KAYAH LI TONE PLOPHU..KAYAH LI TONE CALYA PLOPHU +A92E ; Diacritic # Po KAYAH LI SIGN CWI +A953 ; Diacritic # Mc REJANG VIRAMA +FB1E ; Diacritic # Mn HEBREW POINT JUDEO-SPANISH VARIKA +FE20..FE26 ; Diacritic # Mn [7] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON +FF3E ; Diacritic # Sk FULLWIDTH CIRCUMFLEX ACCENT +FF40 ; Diacritic # Sk FULLWIDTH GRAVE ACCENT +FF70 ; Diacritic # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK +FF9E..FF9F ; Diacritic # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK +FFE3 ; Diacritic # Sk FULLWIDTH MACRON +1D167..1D169 ; Diacritic # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 +1D16D..1D172 ; Diacritic # Mc [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5 +1D17B..1D182 ; Diacritic # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE +1D185..1D18B ; Diacritic # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE +1D1AA..1D1AD ; Diacritic # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO + +# Total code points: 565 + +# ================================================ + +00B7 ; Extender # Po MIDDLE DOT +02D0..02D1 ; Extender # Lm [2] MODIFIER LETTER TRIANGULAR COLON..MODIFIER LETTER HALF TRIANGULAR COLON +0640 ; Extender # Lm ARABIC TATWEEL +07FA ; Extender # Lm NKO LAJANYALAN +0E46 ; Extender # Lm THAI CHARACTER MAIYAMOK +0EC6 ; Extender # Lm LAO KO LA +1843 ; Extender # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN +1C36 ; Extender # Mn LEPCHA SIGN RAN +1C7B ; Extender # Lm OL CHIKI RELAA +3005 ; Extender # Lm IDEOGRAPHIC ITERATION MARK +3031..3035 ; Extender # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF +309D..309E ; Extender # Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK +30FC..30FE ; Extender # Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK +A015 ; Extender # Lm YI SYLLABLE WU +A60C ; Extender # Lm VAI SYLLABLE LENGTHENER +FF70 ; Extender # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK + +# Total code points: 24 + +# ================================================ + +02B0..02B8 ; Other_Lowercase # Lm [9] MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y +02C0..02C1 ; Other_Lowercase # Lm [2] MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP +02E0..02E4 ; Other_Lowercase # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP +0345 ; Other_Lowercase # Mn COMBINING GREEK YPOGEGRAMMENI +037A ; Other_Lowercase # Lm GREEK YPOGEGRAMMENI +1D2C..1D61 ; Other_Lowercase # Lm [54] MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI +1D78 ; Other_Lowercase # Lm MODIFIER LETTER CYRILLIC EN +1D9B..1DBF ; Other_Lowercase # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA +2090..2094 ; Other_Lowercase # Lm [5] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER SCHWA +2170..217F ; Other_Lowercase # Nl [16] SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND +24D0..24E9 ; Other_Lowercase # So [26] CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z +2C7D ; Other_Lowercase # Lm MODIFIER LETTER CAPITAL V +A770 ; Other_Lowercase # Lm MODIFIER LETTER US + +# Total code points: 159 + +# ================================================ + +2160..216F ; Other_Uppercase # Nl [16] ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND +24B6..24CF ; Other_Uppercase # So [26] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z + +# Total code points: 42 + +# ================================================ + +FDD0..FDEF ; Noncharacter_Code_Point # Cn [32] .. +FFFE..FFFF ; Noncharacter_Code_Point # Cn [2] .. +1FFFE..1FFFF ; Noncharacter_Code_Point # Cn [2] .. +2FFFE..2FFFF ; Noncharacter_Code_Point # Cn [2] .. +3FFFE..3FFFF ; Noncharacter_Code_Point # Cn [2] .. +4FFFE..4FFFF ; Noncharacter_Code_Point # Cn [2] .. +5FFFE..5FFFF ; Noncharacter_Code_Point # Cn [2] .. +6FFFE..6FFFF ; Noncharacter_Code_Point # Cn [2] .. +7FFFE..7FFFF ; Noncharacter_Code_Point # Cn [2] .. +8FFFE..8FFFF ; Noncharacter_Code_Point # Cn [2] .. +9FFFE..9FFFF ; Noncharacter_Code_Point # Cn [2] .. +AFFFE..AFFFF ; Noncharacter_Code_Point # Cn [2] .. +BFFFE..BFFFF ; Noncharacter_Code_Point # Cn [2] .. +CFFFE..CFFFF ; Noncharacter_Code_Point # Cn [2] .. +DFFFE..DFFFF ; Noncharacter_Code_Point # Cn [2] .. +EFFFE..EFFFF ; Noncharacter_Code_Point # Cn [2] .. +FFFFE..FFFFF ; Noncharacter_Code_Point # Cn [2] .. +10FFFE..10FFFF; Noncharacter_Code_Point # Cn [2] .. + +# Total code points: 66 + +# ================================================ + +09BE ; Other_Grapheme_Extend # Mc BENGALI VOWEL SIGN AA +09D7 ; Other_Grapheme_Extend # Mc BENGALI AU LENGTH MARK +0B3E ; Other_Grapheme_Extend # Mc ORIYA VOWEL SIGN AA +0B57 ; Other_Grapheme_Extend # Mc ORIYA AU LENGTH MARK +0BBE ; Other_Grapheme_Extend # Mc TAMIL VOWEL SIGN AA +0BD7 ; Other_Grapheme_Extend # Mc TAMIL AU LENGTH MARK +0CC2 ; Other_Grapheme_Extend # Mc KANNADA VOWEL SIGN UU +0CD5..0CD6 ; Other_Grapheme_Extend # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK +0D3E ; Other_Grapheme_Extend # Mc MALAYALAM VOWEL SIGN AA +0D57 ; Other_Grapheme_Extend # Mc MALAYALAM AU LENGTH MARK +0DCF ; Other_Grapheme_Extend # Mc SINHALA VOWEL SIGN AELA-PILLA +0DDF ; Other_Grapheme_Extend # Mc SINHALA VOWEL SIGN GAYANUKITTA +200C..200D ; Other_Grapheme_Extend # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER +FF9E..FF9F ; Other_Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK +1D165 ; Other_Grapheme_Extend # Mc MUSICAL SYMBOL COMBINING STEM +1D16E..1D172 ; Other_Grapheme_Extend # Mc [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5 + +# Total code points: 23 + +# ================================================ + +2FF0..2FF1 ; IDS_Binary_Operator # So [2] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO BELOW +2FF4..2FFB ; IDS_Binary_Operator # So [8] IDEOGRAPHIC DESCRIPTION CHARACTER FULL SURROUND..IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID + +# Total code points: 10 + +# ================================================ + +2FF2..2FF3 ; IDS_Trinary_Operator # So [2] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW + +# Total code points: 2 + +# ================================================ + +2E80..2E99 ; Radical # So [26] CJK RADICAL REPEAT..CJK RADICAL RAP +2E9B..2EF3 ; Radical # So [89] CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE +2F00..2FD5 ; Radical # So [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE + +# Total code points: 329 + +# ================================================ + +3400..4DB5 ; Unified_Ideograph # Lo [6582] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DB5 +4E00..9FC3 ; Unified_Ideograph # Lo [20932] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FC3 +FA0E..FA0F ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA0E..CJK COMPATIBILITY IDEOGRAPH-FA0F +FA11 ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA11 +FA13..FA14 ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA13..CJK COMPATIBILITY IDEOGRAPH-FA14 +FA1F ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA1F +FA21 ; Unified_Ideograph # Lo CJK COMPATIBILITY IDEOGRAPH-FA21 +FA23..FA24 ; Unified_Ideograph # Lo [2] CJK COMPATIBILITY IDEOGRAPH-FA23..CJK COMPATIBILITY IDEOGRAPH-FA24 +FA27..FA29 ; Unified_Ideograph # Lo [3] CJK COMPATIBILITY IDEOGRAPH-FA27..CJK COMPATIBILITY IDEOGRAPH-FA29 +20000..2A6D6 ; Unified_Ideograph # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6 + +# Total code points: 70237 + +# ================================================ + +034F ; Other_Default_Ignorable_Code_Point # Mn COMBINING GRAPHEME JOINER +115F..1160 ; Other_Default_Ignorable_Code_Point # Lo [2] HANGUL CHOSEONG FILLER..HANGUL JUNGSEONG FILLER +2065..2069 ; Other_Default_Ignorable_Code_Point # Cn [5] .. +3164 ; Other_Default_Ignorable_Code_Point # Lo HANGUL FILLER +FFA0 ; Other_Default_Ignorable_Code_Point # Lo HALFWIDTH HANGUL FILLER +FFF0..FFF8 ; Other_Default_Ignorable_Code_Point # Cn [9] .. +E0000 ; Other_Default_Ignorable_Code_Point # Cn +E0002..E001F ; Other_Default_Ignorable_Code_Point # Cn [30] .. +E0080..E00FF ; Other_Default_Ignorable_Code_Point # Cn [128] .. +E01F0..E0FFF ; Other_Default_Ignorable_Code_Point # Cn [3600] .. + +# Total code points: 3778 + +# ================================================ + +0340..0341 ; Deprecated # Mn [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK +17A3 ; Deprecated # Lo KHMER INDEPENDENT VOWEL QAQ +17D3 ; Deprecated # Mn KHMER SIGN BATHAMASAT +206A..206F ; Deprecated # Cf [6] INHIBIT SYMMETRIC SWAPPING..NOMINAL DIGIT SHAPES +E0001 ; Deprecated # Cf LANGUAGE TAG +E0020..E007F ; Deprecated # Cf [96] TAG SPACE..CANCEL TAG + +# Total code points: 107 + +# ================================================ + +0069..006A ; Soft_Dotted # L& [2] LATIN SMALL LETTER I..LATIN SMALL LETTER J +012F ; Soft_Dotted # L& LATIN SMALL LETTER I WITH OGONEK +0249 ; Soft_Dotted # L& LATIN SMALL LETTER J WITH STROKE +0268 ; Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE +029D ; Soft_Dotted # L& LATIN SMALL LETTER J WITH CROSSED-TAIL +02B2 ; Soft_Dotted # Lm MODIFIER LETTER SMALL J +03F3 ; Soft_Dotted # L& GREEK LETTER YOT +0456 ; Soft_Dotted # L& CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I +0458 ; Soft_Dotted # L& CYRILLIC SMALL LETTER JE +1D62 ; Soft_Dotted # L& LATIN SUBSCRIPT SMALL LETTER I +1D96 ; Soft_Dotted # L& LATIN SMALL LETTER I WITH RETROFLEX HOOK +1DA4 ; Soft_Dotted # Lm MODIFIER LETTER SMALL I WITH STROKE +1DA8 ; Soft_Dotted # Lm MODIFIER LETTER SMALL J WITH CROSSED-TAIL +1E2D ; Soft_Dotted # L& LATIN SMALL LETTER I WITH TILDE BELOW +1ECB ; Soft_Dotted # L& LATIN SMALL LETTER I WITH DOT BELOW +2071 ; Soft_Dotted # L& SUPERSCRIPT LATIN SMALL LETTER I +2148..2149 ; Soft_Dotted # L& [2] DOUBLE-STRUCK ITALIC SMALL I..DOUBLE-STRUCK ITALIC SMALL J +2C7C ; Soft_Dotted # L& LATIN SUBSCRIPT SMALL LETTER J +1D422..1D423 ; Soft_Dotted # L& [2] MATHEMATICAL BOLD SMALL I..MATHEMATICAL BOLD SMALL J +1D456..1D457 ; Soft_Dotted # L& [2] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL ITALIC SMALL J +1D48A..1D48B ; Soft_Dotted # L& [2] MATHEMATICAL BOLD ITALIC SMALL I..MATHEMATICAL BOLD ITALIC SMALL J +1D4BE..1D4BF ; Soft_Dotted # L& [2] MATHEMATICAL SCRIPT SMALL I..MATHEMATICAL SCRIPT SMALL J +1D4F2..1D4F3 ; Soft_Dotted # L& [2] MATHEMATICAL BOLD SCRIPT SMALL I..MATHEMATICAL BOLD SCRIPT SMALL J +1D526..1D527 ; Soft_Dotted # L& [2] MATHEMATICAL FRAKTUR SMALL I..MATHEMATICAL FRAKTUR SMALL J +1D55A..1D55B ; Soft_Dotted # L& [2] MATHEMATICAL DOUBLE-STRUCK SMALL I..MATHEMATICAL DOUBLE-STRUCK SMALL J +1D58E..1D58F ; Soft_Dotted # L& [2] MATHEMATICAL BOLD FRAKTUR SMALL I..MATHEMATICAL BOLD FRAKTUR SMALL J +1D5C2..1D5C3 ; Soft_Dotted # L& [2] MATHEMATICAL SANS-SERIF SMALL I..MATHEMATICAL SANS-SERIF SMALL J +1D5F6..1D5F7 ; Soft_Dotted # L& [2] MATHEMATICAL SANS-SERIF BOLD SMALL I..MATHEMATICAL SANS-SERIF BOLD SMALL J +1D62A..1D62B ; Soft_Dotted # L& [2] MATHEMATICAL SANS-SERIF ITALIC SMALL I..MATHEMATICAL SANS-SERIF ITALIC SMALL J +1D65E..1D65F ; Soft_Dotted # L& [2] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL I..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL J +1D692..1D693 ; Soft_Dotted # L& [2] MATHEMATICAL MONOSPACE SMALL I..MATHEMATICAL MONOSPACE SMALL J + +# Total code points: 46 + +# ================================================ + +0E40..0E44 ; Logical_Order_Exception # Lo [5] THAI CHARACTER SARA E..THAI CHARACTER SARA AI MAIMALAI +0EC0..0EC4 ; Logical_Order_Exception # Lo [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI + +# Total code points: 10 + +# ================================================ + +2118 ; Other_ID_Start # So SCRIPT CAPITAL P +212E ; Other_ID_Start # So ESTIMATED SYMBOL +309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK + +# Total code points: 4 + +# ================================================ + +00B7 ; Other_ID_Continue # Po MIDDLE DOT +0387 ; Other_ID_Continue # Po GREEK ANO TELEIA +1369..1371 ; Other_ID_Continue # No [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE + +# Total code points: 11 + +# ================================================ + +0021 ; STerm # Po EXCLAMATION MARK +002E ; STerm # Po FULL STOP +003F ; STerm # Po QUESTION MARK +055C ; STerm # Po ARMENIAN EXCLAMATION MARK +055E ; STerm # Po ARMENIAN QUESTION MARK +0589 ; STerm # Po ARMENIAN FULL STOP +061F ; STerm # Po ARABIC QUESTION MARK +06D4 ; STerm # Po ARABIC FULL STOP +0700..0702 ; STerm # Po [3] SYRIAC END OF PARAGRAPH..SYRIAC SUBLINEAR FULL STOP +07F9 ; STerm # Po NKO EXCLAMATION MARK +0964..0965 ; STerm # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA +104A..104B ; STerm # Po [2] MYANMAR SIGN LITTLE SECTION..MYANMAR SIGN SECTION +1362 ; STerm # Po ETHIOPIC FULL STOP +1367..1368 ; STerm # Po [2] ETHIOPIC QUESTION MARK..ETHIOPIC PARAGRAPH SEPARATOR +166E ; STerm # Po CANADIAN SYLLABICS FULL STOP +1803 ; STerm # Po MONGOLIAN FULL STOP +1809 ; STerm # Po MONGOLIAN MANCHU FULL STOP +1944..1945 ; STerm # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK +1B5A..1B5B ; STerm # Po [2] BALINESE PANTI..BALINESE PAMADA +1B5E..1B5F ; STerm # Po [2] BALINESE CARIK SIKI..BALINESE CARIK PAREREN +1C3B..1C3C ; STerm # Po [2] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION NYET THYOOM TA-ROL +1C7E..1C7F ; STerm # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD +203C..203D ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG +2047..2049 ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK +2E2E ; STerm # Po REVERSED QUESTION MARK +3002 ; STerm # Po IDEOGRAPHIC FULL STOP +A60E..A60F ; STerm # Po [2] VAI FULL STOP..VAI QUESTION MARK +A876..A877 ; STerm # Po [2] PHAGS-PA MARK SHAD..PHAGS-PA MARK DOUBLE SHAD +A8CE..A8CF ; STerm # Po [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA +A92F ; STerm # Po KAYAH LI SIGN SHYA +AA5D..AA5F ; STerm # Po [3] CHAM PUNCTUATION DANDA..CHAM PUNCTUATION TRIPLE DANDA +FE52 ; STerm # Po SMALL FULL STOP +FE56..FE57 ; STerm # Po [2] SMALL QUESTION MARK..SMALL EXCLAMATION MARK +FF01 ; STerm # Po FULLWIDTH EXCLAMATION MARK +FF0E ; STerm # Po FULLWIDTH FULL STOP +FF1F ; STerm # Po FULLWIDTH QUESTION MARK +FF61 ; STerm # Po HALFWIDTH IDEOGRAPHIC FULL STOP + +# Total code points: 56 + +# ================================================ + +180B..180D ; Variation_Selector # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE +FE00..FE0F ; Variation_Selector # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 +E0100..E01EF ; Variation_Selector # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 + +# Total code points: 259 + +# ================================================ + +0009..000D ; Pattern_White_Space # Cc [5] .. +0020 ; Pattern_White_Space # Zs SPACE +0085 ; Pattern_White_Space # Cc +200E..200F ; Pattern_White_Space # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK +2028 ; Pattern_White_Space # Zl LINE SEPARATOR +2029 ; Pattern_White_Space # Zp PARAGRAPH SEPARATOR + +# Total code points: 11 + +# ================================================ + +0021..0023 ; Pattern_Syntax # Po [3] EXCLAMATION MARK..NUMBER SIGN +0024 ; Pattern_Syntax # Sc DOLLAR SIGN +0025..0027 ; Pattern_Syntax # Po [3] PERCENT SIGN..APOSTROPHE +0028 ; Pattern_Syntax # Ps LEFT PARENTHESIS +0029 ; Pattern_Syntax # Pe RIGHT PARENTHESIS +002A ; Pattern_Syntax # Po ASTERISK +002B ; Pattern_Syntax # Sm PLUS SIGN +002C ; Pattern_Syntax # Po COMMA +002D ; Pattern_Syntax # Pd HYPHEN-MINUS +002E..002F ; Pattern_Syntax # Po [2] FULL STOP..SOLIDUS +003A..003B ; Pattern_Syntax # Po [2] COLON..SEMICOLON +003C..003E ; Pattern_Syntax # Sm [3] LESS-THAN SIGN..GREATER-THAN SIGN +003F..0040 ; Pattern_Syntax # Po [2] QUESTION MARK..COMMERCIAL AT +005B ; Pattern_Syntax # Ps LEFT SQUARE BRACKET +005C ; Pattern_Syntax # Po REVERSE SOLIDUS +005D ; Pattern_Syntax # Pe RIGHT SQUARE BRACKET +005E ; Pattern_Syntax # Sk CIRCUMFLEX ACCENT +0060 ; Pattern_Syntax # Sk GRAVE ACCENT +007B ; Pattern_Syntax # Ps LEFT CURLY BRACKET +007C ; Pattern_Syntax # Sm VERTICAL LINE +007D ; Pattern_Syntax # Pe RIGHT CURLY BRACKET +007E ; Pattern_Syntax # Sm TILDE +00A1 ; Pattern_Syntax # Po INVERTED EXCLAMATION MARK +00A2..00A5 ; Pattern_Syntax # Sc [4] CENT SIGN..YEN SIGN +00A6..00A7 ; Pattern_Syntax # So [2] BROKEN BAR..SECTION SIGN +00A9 ; Pattern_Syntax # So COPYRIGHT SIGN +00AB ; Pattern_Syntax # Pi LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +00AC ; Pattern_Syntax # Sm NOT SIGN +00AE ; Pattern_Syntax # So REGISTERED SIGN +00B0 ; Pattern_Syntax # So DEGREE SIGN +00B1 ; Pattern_Syntax # Sm PLUS-MINUS SIGN +00B6 ; Pattern_Syntax # So PILCROW SIGN +00BB ; Pattern_Syntax # Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +00BF ; Pattern_Syntax # Po INVERTED QUESTION MARK +00D7 ; Pattern_Syntax # Sm MULTIPLICATION SIGN +00F7 ; Pattern_Syntax # Sm DIVISION SIGN +2010..2015 ; Pattern_Syntax # Pd [6] HYPHEN..HORIZONTAL BAR +2016..2017 ; Pattern_Syntax # Po [2] DOUBLE VERTICAL LINE..DOUBLE LOW LINE +2018 ; Pattern_Syntax # Pi LEFT SINGLE QUOTATION MARK +2019 ; Pattern_Syntax # Pf RIGHT SINGLE QUOTATION MARK +201A ; Pattern_Syntax # Ps SINGLE LOW-9 QUOTATION MARK +201B..201C ; Pattern_Syntax # Pi [2] SINGLE HIGH-REVERSED-9 QUOTATION MARK..LEFT DOUBLE QUOTATION MARK +201D ; Pattern_Syntax # Pf RIGHT DOUBLE QUOTATION MARK +201E ; Pattern_Syntax # Ps DOUBLE LOW-9 QUOTATION MARK +201F ; Pattern_Syntax # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK +2020..2027 ; Pattern_Syntax # Po [8] DAGGER..HYPHENATION POINT +2030..2038 ; Pattern_Syntax # Po [9] PER MILLE SIGN..CARET +2039 ; Pattern_Syntax # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK +203A ; Pattern_Syntax # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +203B..203E ; Pattern_Syntax # Po [4] REFERENCE MARK..OVERLINE +2041..2043 ; Pattern_Syntax # Po [3] CARET INSERTION POINT..HYPHEN BULLET +2044 ; Pattern_Syntax # Sm FRACTION SLASH +2045 ; Pattern_Syntax # Ps LEFT SQUARE BRACKET WITH QUILL +2046 ; Pattern_Syntax # Pe RIGHT SQUARE BRACKET WITH QUILL +2047..2051 ; Pattern_Syntax # Po [11] DOUBLE QUESTION MARK..TWO ASTERISKS ALIGNED VERTICALLY +2052 ; Pattern_Syntax # Sm COMMERCIAL MINUS SIGN +2053 ; Pattern_Syntax # Po SWUNG DASH +2055..205E ; Pattern_Syntax # Po [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS +2190..2194 ; Pattern_Syntax # Sm [5] LEFTWARDS ARROW..LEFT RIGHT ARROW +2195..2199 ; Pattern_Syntax # So [5] UP DOWN ARROW..SOUTH WEST ARROW +219A..219B ; Pattern_Syntax # Sm [2] LEFTWARDS ARROW WITH STROKE..RIGHTWARDS ARROW WITH STROKE +219C..219F ; Pattern_Syntax # So [4] LEFTWARDS WAVE ARROW..UPWARDS TWO HEADED ARROW +21A0 ; Pattern_Syntax # Sm RIGHTWARDS TWO HEADED ARROW +21A1..21A2 ; Pattern_Syntax # So [2] DOWNWARDS TWO HEADED ARROW..LEFTWARDS ARROW WITH TAIL +21A3 ; Pattern_Syntax # Sm RIGHTWARDS ARROW WITH TAIL +21A4..21A5 ; Pattern_Syntax # So [2] LEFTWARDS ARROW FROM BAR..UPWARDS ARROW FROM BAR +21A6 ; Pattern_Syntax # Sm RIGHTWARDS ARROW FROM BAR +21A7..21AD ; Pattern_Syntax # So [7] DOWNWARDS ARROW FROM BAR..LEFT RIGHT WAVE ARROW +21AE ; Pattern_Syntax # Sm LEFT RIGHT ARROW WITH STROKE +21AF..21CD ; Pattern_Syntax # So [31] DOWNWARDS ZIGZAG ARROW..LEFTWARDS DOUBLE ARROW WITH STROKE +21CE..21CF ; Pattern_Syntax # Sm [2] LEFT RIGHT DOUBLE ARROW WITH STROKE..RIGHTWARDS DOUBLE ARROW WITH STROKE +21D0..21D1 ; Pattern_Syntax # So [2] LEFTWARDS DOUBLE ARROW..UPWARDS DOUBLE ARROW +21D2 ; Pattern_Syntax # Sm RIGHTWARDS DOUBLE ARROW +21D3 ; Pattern_Syntax # So DOWNWARDS DOUBLE ARROW +21D4 ; Pattern_Syntax # Sm LEFT RIGHT DOUBLE ARROW +21D5..21F3 ; Pattern_Syntax # So [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW +21F4..22FF ; Pattern_Syntax # Sm [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP +2300..2307 ; Pattern_Syntax # So [8] DIAMETER SIGN..WAVY LINE +2308..230B ; Pattern_Syntax # Sm [4] LEFT CEILING..RIGHT FLOOR +230C..231F ; Pattern_Syntax # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER +2320..2321 ; Pattern_Syntax # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL +2322..2328 ; Pattern_Syntax # So [7] FROWN..KEYBOARD +2329 ; Pattern_Syntax # Ps LEFT-POINTING ANGLE BRACKET +232A ; Pattern_Syntax # Pe RIGHT-POINTING ANGLE BRACKET +232B..237B ; Pattern_Syntax # So [81] ERASE TO THE LEFT..NOT CHECK MARK +237C ; Pattern_Syntax # Sm RIGHT ANGLE WITH DOWNWARDS ZIGZAG ARROW +237D..239A ; Pattern_Syntax # So [30] SHOULDERED OPEN BOX..CLEAR SCREEN SYMBOL +239B..23B3 ; Pattern_Syntax # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM +23B4..23DB ; Pattern_Syntax # So [40] TOP SQUARE BRACKET..FUSE +23DC..23E1 ; Pattern_Syntax # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET +23E2..23E7 ; Pattern_Syntax # So [6] WHITE TRAPEZIUM..ELECTRICAL INTERSECTION +23E8..23FF ; Pattern_Syntax # Cn [24] .. +2400..2426 ; Pattern_Syntax # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO +2427..243F ; Pattern_Syntax # Cn [25] .. +2440..244A ; Pattern_Syntax # So [11] OCR HOOK..OCR DOUBLE BACKSLASH +244B..245F ; Pattern_Syntax # Cn [21] .. +2500..25B6 ; Pattern_Syntax # So [183] BOX DRAWINGS LIGHT HORIZONTAL..BLACK RIGHT-POINTING TRIANGLE +25B7 ; Pattern_Syntax # Sm WHITE RIGHT-POINTING TRIANGLE +25B8..25C0 ; Pattern_Syntax # So [9] BLACK RIGHT-POINTING SMALL TRIANGLE..BLACK LEFT-POINTING TRIANGLE +25C1 ; Pattern_Syntax # Sm WHITE LEFT-POINTING TRIANGLE +25C2..25F7 ; Pattern_Syntax # So [54] BLACK LEFT-POINTING SMALL TRIANGLE..WHITE CIRCLE WITH UPPER RIGHT QUADRANT +25F8..25FF ; Pattern_Syntax # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE +2600..266E ; Pattern_Syntax # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN +266F ; Pattern_Syntax # Sm MUSIC SHARP SIGN +2670..269D ; Pattern_Syntax # So [46] WEST SYRIAC CROSS..OUTLINED WHITE STAR +269E..269F ; Pattern_Syntax # Cn [2] .. +26A0..26BC ; Pattern_Syntax # So [29] WARNING SIGN..SESQUIQUADRATE +26BD..26BF ; Pattern_Syntax # Cn [3] .. +26C0..26C3 ; Pattern_Syntax # So [4] WHITE DRAUGHTS MAN..BLACK DRAUGHTS KING +26C4..2700 ; Pattern_Syntax # Cn [61] .. +2701..2704 ; Pattern_Syntax # So [4] UPPER BLADE SCISSORS..WHITE SCISSORS +2705 ; Pattern_Syntax # Cn +2706..2709 ; Pattern_Syntax # So [4] TELEPHONE LOCATION SIGN..ENVELOPE +270A..270B ; Pattern_Syntax # Cn [2] .. +270C..2727 ; Pattern_Syntax # So [28] VICTORY HAND..WHITE FOUR POINTED STAR +2728 ; Pattern_Syntax # Cn +2729..274B ; Pattern_Syntax # So [35] STRESS OUTLINED WHITE STAR..HEAVY EIGHT TEARDROP-SPOKED PROPELLER ASTERISK +274C ; Pattern_Syntax # Cn +274D ; Pattern_Syntax # So SHADOWED WHITE CIRCLE +274E ; Pattern_Syntax # Cn +274F..2752 ; Pattern_Syntax # So [4] LOWER RIGHT DROP-SHADOWED WHITE SQUARE..UPPER RIGHT SHADOWED WHITE SQUARE +2753..2755 ; Pattern_Syntax # Cn [3] .. +2756 ; Pattern_Syntax # So BLACK DIAMOND MINUS WHITE X +2757 ; Pattern_Syntax # Cn +2758..275E ; Pattern_Syntax # So [7] LIGHT VERTICAL BAR..HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT +275F..2760 ; Pattern_Syntax # Cn [2] .. +2761..2767 ; Pattern_Syntax # So [7] CURVED STEM PARAGRAPH SIGN ORNAMENT..ROTATED FLORAL HEART BULLET +2768 ; Pattern_Syntax # Ps MEDIUM LEFT PARENTHESIS ORNAMENT +2769 ; Pattern_Syntax # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT +276A ; Pattern_Syntax # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT +276B ; Pattern_Syntax # Pe MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT +276C ; Pattern_Syntax # Ps MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT +276D ; Pattern_Syntax # Pe MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT +276E ; Pattern_Syntax # Ps HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT +276F ; Pattern_Syntax # Pe HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT +2770 ; Pattern_Syntax # Ps HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT +2771 ; Pattern_Syntax # Pe HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT +2772 ; Pattern_Syntax # Ps LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT +2773 ; Pattern_Syntax # Pe LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT +2774 ; Pattern_Syntax # Ps MEDIUM LEFT CURLY BRACKET ORNAMENT +2775 ; Pattern_Syntax # Pe MEDIUM RIGHT CURLY BRACKET ORNAMENT +2794 ; Pattern_Syntax # So HEAVY WIDE-HEADED RIGHTWARDS ARROW +2795..2797 ; Pattern_Syntax # Cn [3] .. +2798..27AF ; Pattern_Syntax # So [24] HEAVY SOUTH EAST ARROW..NOTCHED LOWER RIGHT-SHADOWED WHITE RIGHTWARDS ARROW +27B0 ; Pattern_Syntax # Cn +27B1..27BE ; Pattern_Syntax # So [14] NOTCHED UPPER RIGHT-SHADOWED WHITE RIGHTWARDS ARROW..OPEN-OUTLINED RIGHTWARDS ARROW +27BF ; Pattern_Syntax # Cn +27C0..27C4 ; Pattern_Syntax # Sm [5] THREE DIMENSIONAL ANGLE..OPEN SUPERSET +27C5 ; Pattern_Syntax # Ps LEFT S-SHAPED BAG DELIMITER +27C6 ; Pattern_Syntax # Pe RIGHT S-SHAPED BAG DELIMITER +27C7..27CA ; Pattern_Syntax # Sm [4] OR WITH DOT INSIDE..VERTICAL BAR WITH HORIZONTAL STROKE +27CB ; Pattern_Syntax # Cn +27CC ; Pattern_Syntax # Sm LONG DIVISION +27CD..27CF ; Pattern_Syntax # Cn [3] .. +27D0..27E5 ; Pattern_Syntax # Sm [22] WHITE DIAMOND WITH CENTRED DOT..WHITE SQUARE WITH RIGHTWARDS TICK +27E6 ; Pattern_Syntax # Ps MATHEMATICAL LEFT WHITE SQUARE BRACKET +27E7 ; Pattern_Syntax # Pe MATHEMATICAL RIGHT WHITE SQUARE BRACKET +27E8 ; Pattern_Syntax # Ps MATHEMATICAL LEFT ANGLE BRACKET +27E9 ; Pattern_Syntax # Pe MATHEMATICAL RIGHT ANGLE BRACKET +27EA ; Pattern_Syntax # Ps MATHEMATICAL LEFT DOUBLE ANGLE BRACKET +27EB ; Pattern_Syntax # Pe MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET +27EC ; Pattern_Syntax # Ps MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET +27ED ; Pattern_Syntax # Pe MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET +27EE ; Pattern_Syntax # Ps MATHEMATICAL LEFT FLATTENED PARENTHESIS +27EF ; Pattern_Syntax # Pe MATHEMATICAL RIGHT FLATTENED PARENTHESIS +27F0..27FF ; Pattern_Syntax # Sm [16] UPWARDS QUADRUPLE ARROW..LONG RIGHTWARDS SQUIGGLE ARROW +2800..28FF ; Pattern_Syntax # So [256] BRAILLE PATTERN BLANK..BRAILLE PATTERN DOTS-12345678 +2900..2982 ; Pattern_Syntax # Sm [131] RIGHTWARDS TWO-HEADED ARROW WITH VERTICAL STROKE..Z NOTATION TYPE COLON +2983 ; Pattern_Syntax # Ps LEFT WHITE CURLY BRACKET +2984 ; Pattern_Syntax # Pe RIGHT WHITE CURLY BRACKET +2985 ; Pattern_Syntax # Ps LEFT WHITE PARENTHESIS +2986 ; Pattern_Syntax # Pe RIGHT WHITE PARENTHESIS +2987 ; Pattern_Syntax # Ps Z NOTATION LEFT IMAGE BRACKET +2988 ; Pattern_Syntax # Pe Z NOTATION RIGHT IMAGE BRACKET +2989 ; Pattern_Syntax # Ps Z NOTATION LEFT BINDING BRACKET +298A ; Pattern_Syntax # Pe Z NOTATION RIGHT BINDING BRACKET +298B ; Pattern_Syntax # Ps LEFT SQUARE BRACKET WITH UNDERBAR +298C ; Pattern_Syntax # Pe RIGHT SQUARE BRACKET WITH UNDERBAR +298D ; Pattern_Syntax # Ps LEFT SQUARE BRACKET WITH TICK IN TOP CORNER +298E ; Pattern_Syntax # Pe RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER +298F ; Pattern_Syntax # Ps LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER +2990 ; Pattern_Syntax # Pe RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER +2991 ; Pattern_Syntax # Ps LEFT ANGLE BRACKET WITH DOT +2992 ; Pattern_Syntax # Pe RIGHT ANGLE BRACKET WITH DOT +2993 ; Pattern_Syntax # Ps LEFT ARC LESS-THAN BRACKET +2994 ; Pattern_Syntax # Pe RIGHT ARC GREATER-THAN BRACKET +2995 ; Pattern_Syntax # Ps DOUBLE LEFT ARC GREATER-THAN BRACKET +2996 ; Pattern_Syntax # Pe DOUBLE RIGHT ARC LESS-THAN BRACKET +2997 ; Pattern_Syntax # Ps LEFT BLACK TORTOISE SHELL BRACKET +2998 ; Pattern_Syntax # Pe RIGHT BLACK TORTOISE SHELL BRACKET +2999..29D7 ; Pattern_Syntax # Sm [63] DOTTED FENCE..BLACK HOURGLASS +29D8 ; Pattern_Syntax # Ps LEFT WIGGLY FENCE +29D9 ; Pattern_Syntax # Pe RIGHT WIGGLY FENCE +29DA ; Pattern_Syntax # Ps LEFT DOUBLE WIGGLY FENCE +29DB ; Pattern_Syntax # Pe RIGHT DOUBLE WIGGLY FENCE +29DC..29FB ; Pattern_Syntax # Sm [32] INCOMPLETE INFINITY..TRIPLE PLUS +29FC ; Pattern_Syntax # Ps LEFT-POINTING CURVED ANGLE BRACKET +29FD ; Pattern_Syntax # Pe RIGHT-POINTING CURVED ANGLE BRACKET +29FE..2AFF ; Pattern_Syntax # Sm [258] TINY..N-ARY WHITE VERTICAL BAR +2B00..2B2F ; Pattern_Syntax # So [48] NORTH EAST WHITE ARROW..WHITE VERTICAL ELLIPSE +2B30..2B44 ; Pattern_Syntax # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET +2B45..2B46 ; Pattern_Syntax # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW +2B47..2B4C ; Pattern_Syntax # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR +2B4D..2B4F ; Pattern_Syntax # Cn [3] .. +2B50..2B54 ; Pattern_Syntax # So [5] WHITE MEDIUM STAR..WHITE RIGHT-POINTING PENTAGON +2B55..2BFF ; Pattern_Syntax # Cn [171] .. +2E00..2E01 ; Pattern_Syntax # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER +2E02 ; Pattern_Syntax # Pi LEFT SUBSTITUTION BRACKET +2E03 ; Pattern_Syntax # Pf RIGHT SUBSTITUTION BRACKET +2E04 ; Pattern_Syntax # Pi LEFT DOTTED SUBSTITUTION BRACKET +2E05 ; Pattern_Syntax # Pf RIGHT DOTTED SUBSTITUTION BRACKET +2E06..2E08 ; Pattern_Syntax # Po [3] RAISED INTERPOLATION MARKER..DOTTED TRANSPOSITION MARKER +2E09 ; Pattern_Syntax # Pi LEFT TRANSPOSITION BRACKET +2E0A ; Pattern_Syntax # Pf RIGHT TRANSPOSITION BRACKET +2E0B ; Pattern_Syntax # Po RAISED SQUARE +2E0C ; Pattern_Syntax # Pi LEFT RAISED OMISSION BRACKET +2E0D ; Pattern_Syntax # Pf RIGHT RAISED OMISSION BRACKET +2E0E..2E16 ; Pattern_Syntax # Po [9] EDITORIAL CORONIS..DOTTED RIGHT-POINTING ANGLE +2E17 ; Pattern_Syntax # Pd DOUBLE OBLIQUE HYPHEN +2E18..2E19 ; Pattern_Syntax # Po [2] INVERTED INTERROBANG..PALM BRANCH +2E1A ; Pattern_Syntax # Pd HYPHEN WITH DIAERESIS +2E1B ; Pattern_Syntax # Po TILDE WITH RING ABOVE +2E1C ; Pattern_Syntax # Pi LEFT LOW PARAPHRASE BRACKET +2E1D ; Pattern_Syntax # Pf RIGHT LOW PARAPHRASE BRACKET +2E1E..2E1F ; Pattern_Syntax # Po [2] TILDE WITH DOT ABOVE..TILDE WITH DOT BELOW +2E20 ; Pattern_Syntax # Pi LEFT VERTICAL BAR WITH QUILL +2E21 ; Pattern_Syntax # Pf RIGHT VERTICAL BAR WITH QUILL +2E22 ; Pattern_Syntax # Ps TOP LEFT HALF BRACKET +2E23 ; Pattern_Syntax # Pe TOP RIGHT HALF BRACKET +2E24 ; Pattern_Syntax # Ps BOTTOM LEFT HALF BRACKET +2E25 ; Pattern_Syntax # Pe BOTTOM RIGHT HALF BRACKET +2E26 ; Pattern_Syntax # Ps LEFT SIDEWAYS U BRACKET +2E27 ; Pattern_Syntax # Pe RIGHT SIDEWAYS U BRACKET +2E28 ; Pattern_Syntax # Ps LEFT DOUBLE PARENTHESIS +2E29 ; Pattern_Syntax # Pe RIGHT DOUBLE PARENTHESIS +2E2A..2E2E ; Pattern_Syntax # Po [5] TWO DOTS OVER ONE DOT PUNCTUATION..REVERSED QUESTION MARK +2E2F ; Pattern_Syntax # Lm VERTICAL TILDE +2E30 ; Pattern_Syntax # Po RING POINT +2E31..2E7F ; Pattern_Syntax # Cn [79] .. +3001..3003 ; Pattern_Syntax # Po [3] IDEOGRAPHIC COMMA..DITTO MARK +3008 ; Pattern_Syntax # Ps LEFT ANGLE BRACKET +3009 ; Pattern_Syntax # Pe RIGHT ANGLE BRACKET +300A ; Pattern_Syntax # Ps LEFT DOUBLE ANGLE BRACKET +300B ; Pattern_Syntax # Pe RIGHT DOUBLE ANGLE BRACKET +300C ; Pattern_Syntax # Ps LEFT CORNER BRACKET +300D ; Pattern_Syntax # Pe RIGHT CORNER BRACKET +300E ; Pattern_Syntax # Ps LEFT WHITE CORNER BRACKET +300F ; Pattern_Syntax # Pe RIGHT WHITE CORNER BRACKET +3010 ; Pattern_Syntax # Ps LEFT BLACK LENTICULAR BRACKET +3011 ; Pattern_Syntax # Pe RIGHT BLACK LENTICULAR BRACKET +3012..3013 ; Pattern_Syntax # So [2] POSTAL MARK..GETA MARK +3014 ; Pattern_Syntax # Ps LEFT TORTOISE SHELL BRACKET +3015 ; Pattern_Syntax # Pe RIGHT TORTOISE SHELL BRACKET +3016 ; Pattern_Syntax # Ps LEFT WHITE LENTICULAR BRACKET +3017 ; Pattern_Syntax # Pe RIGHT WHITE LENTICULAR BRACKET +3018 ; Pattern_Syntax # Ps LEFT WHITE TORTOISE SHELL BRACKET +3019 ; Pattern_Syntax # Pe RIGHT WHITE TORTOISE SHELL BRACKET +301A ; Pattern_Syntax # Ps LEFT WHITE SQUARE BRACKET +301B ; Pattern_Syntax # Pe RIGHT WHITE SQUARE BRACKET +301C ; Pattern_Syntax # Pd WAVE DASH +301D ; Pattern_Syntax # Ps REVERSED DOUBLE PRIME QUOTATION MARK +301E..301F ; Pattern_Syntax # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK +3020 ; Pattern_Syntax # So POSTAL MARK FACE +3030 ; Pattern_Syntax # Pd WAVY DASH +FD3E ; Pattern_Syntax # Ps ORNATE LEFT PARENTHESIS +FD3F ; Pattern_Syntax # Pe ORNATE RIGHT PARENTHESIS +FE45..FE46 ; Pattern_Syntax # Po [2] SESAME DOT..WHITE SESAME DOT + +# Total code points: 2760 + +# EOF diff --git a/extra/unicode/data/SpecialCasing.txt b/extra/unicode/data/SpecialCasing.txt new file mode 100644 index 0000000000..4bfe148b06 --- /dev/null +++ b/extra/unicode/data/SpecialCasing.txt @@ -0,0 +1,264 @@ +# SpecialCasing-5.0.0.txt +# Date: 2006-03-03, 08:23:36 GMT [MD] +# +# Unicode Character Database +# Copyright (c) 1991-2006 Unicode, Inc. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# For documentation, see UCD.html +# +# Special Casing Properties +# +# This file is a supplement to the UnicodeData file. +# It contains additional information about the casing of Unicode characters. +# (For compatibility, the UnicodeData.txt file only contains case mappings for +# characters where they are 1-1, and does not have locale-specific mappings.) +# For more information, see the discussion of Case Mappings in the Unicode Standard. +# +# All code points not listed in this file that do not have a simple case mappings +# in UnicodeData.txt map to themselves. +# ================================================================================ +# Format +# ================================================================================ +# The entries in this file are in the following machine-readable format: +# +# ; ; ; <upper> ; (<condition_list> ;)? # <comment> +# +# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more +# than one character, they are separated by spaces. Other than as used to separate +# elements, spaces are to be ignored. +# +# The <condition_list> is optional. Where present, it consists of one or more locale IDs +# or contexts, separated by spaces. In these conditions: +# - A condition list overrides the normal behavior if all of the listed conditions are true. +# - The context is always the context of the characters in the original string, +# NOT in the resulting string. +# - Case distinctions in the condition list are not significant. +# - Conditions preceded by "Not_" represent the negation of the condition. +# +# A locale ID is defined by taking any language tag as defined by +# RFC 3066 (or its successor), and replacing '-' by '_'. +# +# A context for a character C is defined by Section 3.13 Default Case +# Operations, of The Unicode Standard, Version 5.0. +# (This is identical to the context defined by Unicode 4.1.0, +# as specified in http://www.unicode.org/versions/Unicode4.1.0/) +# +# Parsers of this file must be prepared to deal with future additions to this format: +# * Additional contexts +# * Additional fields +# ================================================================================ + +# ================================================================================ +# Unconditional mappings +# ================================================================================ + +# The German es-zed is special--the normal mapping is to SS. +# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>)) + +00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S + +# Preserve canonical equivalence for I with dot. Turkic is handled below. + +0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE + +# Ligatures + +FB00; FB00; 0046 0066; 0046 0046; # LATIN SMALL LIGATURE FF +FB01; FB01; 0046 0069; 0046 0049; # LATIN SMALL LIGATURE FI +FB02; FB02; 0046 006C; 0046 004C; # LATIN SMALL LIGATURE FL +FB03; FB03; 0046 0066 0069; 0046 0046 0049; # LATIN SMALL LIGATURE FFI +FB04; FB04; 0046 0066 006C; 0046 0046 004C; # LATIN SMALL LIGATURE FFL +FB05; FB05; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE LONG S T +FB06; FB06; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE ST + +0587; 0587; 0535 0582; 0535 0552; # ARMENIAN SMALL LIGATURE ECH YIWN +FB13; FB13; 0544 0576; 0544 0546; # ARMENIAN SMALL LIGATURE MEN NOW +FB14; FB14; 0544 0565; 0544 0535; # ARMENIAN SMALL LIGATURE MEN ECH +FB15; FB15; 0544 056B; 0544 053B; # ARMENIAN SMALL LIGATURE MEN INI +FB16; FB16; 054E 0576; 054E 0546; # ARMENIAN SMALL LIGATURE VEW NOW +FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH + +# No corresponding uppercase precomposed character + +0149; 0149; 02BC 004E; 02BC 004E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE +0390; 0390; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS +03B0; 03B0; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS +01F0; 01F0; 004A 030C; 004A 030C; # LATIN SMALL LETTER J WITH CARON +1E96; 1E96; 0048 0331; 0048 0331; # LATIN SMALL LETTER H WITH LINE BELOW +1E97; 1E97; 0054 0308; 0054 0308; # LATIN SMALL LETTER T WITH DIAERESIS +1E98; 1E98; 0057 030A; 0057 030A; # LATIN SMALL LETTER W WITH RING ABOVE +1E99; 1E99; 0059 030A; 0059 030A; # LATIN SMALL LETTER Y WITH RING ABOVE +1E9A; 1E9A; 0041 02BE; 0041 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING +1F50; 1F50; 03A5 0313; 03A5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI +1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA +1F54; 1F54; 03A5 0313 0301; 03A5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA +1F56; 1F56; 03A5 0313 0342; 03A5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI +1FB6; 1FB6; 0391 0342; 0391 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI +1FC6; 1FC6; 0397 0342; 0397 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI +1FD2; 1FD2; 0399 0308 0300; 0399 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA +1FD3; 1FD3; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA +1FD6; 1FD6; 0399 0342; 0399 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI +1FD7; 1FD7; 0399 0308 0342; 0399 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI +1FE2; 1FE2; 03A5 0308 0300; 03A5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA +1FE3; 1FE3; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA +1FE4; 1FE4; 03A1 0313; 03A1 0313; # GREEK SMALL LETTER RHO WITH PSILI +1FE6; 1FE6; 03A5 0342; 03A5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI +1FE7; 1FE7; 03A5 0308 0342; 03A5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI +1FF6; 1FF6; 03A9 0342; 03A9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI + +# IMPORTANT-when capitalizing iota-subscript (0345) +# It MUST be in normalized form--moved to the end of any sequence of combining marks. +# This is because logically it represents a following base character! +# E.g. <iota_subscript> (<Mn> | <Mc> | <Me>)+ => (<Mn> | <Mc> | <Me>)+ <iota_subscript> +# It should never be the first character in a word, so in titlecasing it can be left as is. + +# The following cases are already in the UnicodeData file, so are only commented here. + +# 0345; 0345; 0345; 0399; # COMBINING GREEK YPOGEGRAMMENI + +# All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript) +# have special uppercases. +# Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase! + +1F80; 1F80; 1F88; 1F08 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI +1F81; 1F81; 1F89; 1F09 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI +1F82; 1F82; 1F8A; 1F0A 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1F83; 1F83; 1F8B; 1F0B 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F84; 1F84; 1F8C; 1F0C 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F85; 1F85; 1F8D; 1F0D 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F86; 1F86; 1F8E; 1F0E 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1F87; 1F87; 1F8F; 1F0F 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F88; 1F80; 1F88; 1F08 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +1F89; 1F81; 1F89; 1F09 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +1F8A; 1F82; 1F8A; 1F0A 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F8B; 1F83; 1F8B; 1F0B 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F8C; 1F84; 1F8C; 1F0C 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F8D; 1F85; 1F8D; 1F0D 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F8E; 1F86; 1F8E; 1F0E 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F8F; 1F87; 1F8F; 1F0F 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F90; 1F90; 1F98; 1F28 0399; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI +1F91; 1F91; 1F99; 1F29 0399; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI +1F92; 1F92; 1F9A; 1F2A 0399; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1F93; 1F93; 1F9B; 1F2B 0399; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F94; 1F94; 1F9C; 1F2C 0399; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F95; 1F95; 1F9D; 1F2D 0399; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F96; 1F96; 1F9E; 1F2E 0399; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1F97; 1F97; 1F9F; 1F2F 0399; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F98; 1F90; 1F98; 1F28 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +1F99; 1F91; 1F99; 1F29 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +1F9A; 1F92; 1F9A; 1F2A 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F9B; 1F93; 1F9B; 1F2B 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F9C; 1F94; 1F9C; 1F2C 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F9D; 1F95; 1F9D; 1F2D 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F9E; 1F96; 1F9E; 1F2E 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F9F; 1F97; 1F9F; 1F2F 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FA0; 1FA0; 1FA8; 1F68 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI +1FA1; 1FA1; 1FA9; 1F69 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI +1FA2; 1FA2; 1FAA; 1F6A 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1FA3; 1FA3; 1FAB; 1F6B 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1FA4; 1FA4; 1FAC; 1F6C 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1FA5; 1FA5; 1FAD; 1F6D 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1FA6; 1FA6; 1FAE; 1F6E 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1FA7; 1FA7; 1FAF; 1F6F 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1FA8; 1FA0; 1FA8; 1F68 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +1FA9; 1FA1; 1FA9; 1F69 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +1FAA; 1FA2; 1FAA; 1F6A 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1FAB; 1FA3; 1FAB; 1F6B 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1FAC; 1FA4; 1FAC; 1F6C 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1FAD; 1FA5; 1FAD; 1F6D 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1FAE; 1FA6; 1FAE; 1F6E 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1FAF; 1FA7; 1FAF; 1F6F 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FB3; 1FB3; 1FBC; 0391 0399; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI +1FBC; 1FB3; 1FBC; 0391 0399; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FC3; 1FC3; 1FCC; 0397 0399; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI +1FCC; 1FC3; 1FCC; 0397 0399; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FF3; 1FF3; 1FFC; 03A9 0399; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI +1FFC; 1FF3; 1FFC; 03A9 0399; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI + +# Some characters with YPOGEGRAMMENI also have no corresponding titlecases + +1FB2; 1FB2; 1FBA 0345; 1FBA 0399; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI +1FB4; 1FB4; 0386 0345; 0386 0399; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI +1FC2; 1FC2; 1FCA 0345; 1FCA 0399; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI +1FC4; 1FC4; 0389 0345; 0389 0399; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI +1FF2; 1FF2; 1FFA 0345; 1FFA 0399; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI +1FF4; 1FF4; 038F 0345; 038F 0399; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI + +1FB7; 1FB7; 0391 0342 0345; 0391 0342 0399; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI +1FC7; 1FC7; 0397 0342 0345; 0397 0342 0399; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI +1FF7; 1FF7; 03A9 0342 0345; 03A9 0342 0399; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI + +# ================================================================================ +# Conditional mappings +# ================================================================================ + +# Special case for final form of sigma + +03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA + +# Note: the following cases for non-final are already in the UnicodeData file. + +# 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA +# 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA +# 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA + +# Note: the following cases are not included, since they would case-fold in lowercasing + +# 03C3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK SMALL LETTER SIGMA +# 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA + +# ================================================================================ +# Locale-sensitive mappings +# ================================================================================ + +# Lithuanian + +# Lithuanian retains the dot in a lowercase i when followed by accents. + +# Remove DOT ABOVE after "i" with upper or titlecase + +0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE + +# Introduce an explicit dot above when lowercasing capital I's and J's +# whenever there are more accents above. +# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) + +0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I +004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J +012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK +00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE +00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE +0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE + +# ================================================================================ + +# Turkish and Azeri + +# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri +# The following rules handle those cases. + +0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE + +# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. +# This matches the behavior of the canonically equivalent I-dot_above + +0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE +0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE + +# When lowercasing, unless an I is before a dot_above, it turns into a dotless i. + +0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I +0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I + +# When uppercasing, i turns into a dotted capital I + +0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I +0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I + +# Note: the following case is already in the UnicodeData file. + +# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I + +# EOF + diff --git a/extra/unicode/UnicodeData.txt b/extra/unicode/data/UnicodeData.txt similarity index 100% rename from extra/unicode/UnicodeData.txt rename to extra/unicode/data/UnicodeData.txt diff --git a/extra/unicode/data/data.factor b/extra/unicode/data/data.factor index b411e4e209..706a8adef4 100755 --- a/extra/unicode/data/data.factor +++ b/extra/unicode/data/data.factor @@ -1,7 +1,8 @@ USING: assocs math kernel sequences io.files hashtables quotations splitting arrays math.parser hash2 math.order byte-arrays words namespaces words compiler.units parser -io.encodings.ascii values ; +io.encodings.ascii values interval-maps ascii sets assocs.lib +combinators.lib ; IN: unicode.data ! Convenience functions @@ -10,15 +11,21 @@ IN: unicode.data ! Loading data from UnicodeData.txt +: split-; ( line -- array ) + ";" split [ [ blank? ] trim ] map ; + : data ( filename -- data ) - ascii file-lines [ ";" split ] map ; + ascii file-lines [ split-; ] map ; : load-data ( -- data ) - "resource:extra/unicode/UnicodeData.txt" data ; + "resource:extra/unicode/data/UnicodeData.txt" data ; + +: filter-comments ( lines -- lines ) + [ "#@" split first ] map harvest ; : (process-data) ( index data -- newdata ) + filter-comments [ [ nth ] keep first swap 2array ] with map - [ second empty? not ] filter [ >r hex> r> ] assoc-map ; : process-data ( index data -- hash ) @@ -34,7 +41,7 @@ IN: unicode.data dup [ swap (chain-decomposed) ] curry assoc-map ; : first* ( seq -- ? ) - second dup empty? [ ] [ first ] ?if ; + second [ empty? ] [ first ] or? ; : (process-decomposed) ( data -- alist ) 5 swap (process-data) @@ -46,12 +53,12 @@ IN: unicode.data [ second length 2 = ] filter ! using 1009 as the size, the maximum load is 4 [ first2 first2 rot 3array ] map 1009 alist>hash2 - ] keep - >hashtable chain-decomposed ; + ] [ >hashtable chain-decomposed ] bi ; -: process-compat ( data -- hash ) +: process-compatibility ( data -- hash ) (process-decomposed) [ dup first* [ first2 rest 2array ] unless ] map + [ second empty? not ] filter >hashtable chain-decomposed ; : process-combining ( data -- hash ) @@ -99,30 +106,51 @@ C: <code-point> code-point 4 head [ multihex ] map first4 <code-point> swap first set ; +! Extra properties +: properties-lines ( -- lines ) + "resource:extra/unicode/data/PropList.txt" + ascii file-lines ; + +: parse-properties ( -- {{[a,b],prop}} ) + properties-lines filter-comments [ + split-; first2 + [ ".." split1 [ dup ] unless* [ hex> ] bi@ 2array ] dip + ] { } map>assoc ; + +: properties>intervals ( properties -- assoc[str,interval] ) + dup values prune [ f ] H{ } map>assoc + [ [ insert-at ] curry assoc-each ] keep + [ <interval-set> ] assoc-map ; + +: load-properties ( -- assoc ) + parse-properties properties>intervals ; + +! Special casing data +: load-special-casing ( -- special-casing ) + "resource:extra/unicode/data/SpecialCasing.txt" data + [ length 5 = ] filter + [ [ set-code-point ] each ] H{ } make-assoc ; + VALUE: simple-lower VALUE: simple-upper VALUE: simple-title VALUE: canonical-map VALUE: combine-map VALUE: class-map -VALUE: compat-map +VALUE: compatibility-map VALUE: category-map VALUE: name-map VALUE: special-casing +VALUE: properties : canonical-entry ( char -- seq ) canonical-map at ; : combine-chars ( a b -- char/f ) combine-map hash2 ; -: compat-entry ( char -- seq ) compat-map at ; +: compatibility-entry ( char -- seq ) compatibility-map at ; : combining-class ( char -- n ) class-map at ; : non-starter? ( char -- ? ) class-map key? ; : name>char ( string -- char ) name-map at ; : char>name ( char -- string ) name-map value-at ; - -! Special casing data -: load-special-casing ( -- special-casing ) - "resource:extra/unicode/SpecialCasing.txt" data - [ length 5 = ] filter - [ [ set-code-point ] each ] H{ } make-assoc ; +: property? ( char property -- ? ) properties at interval-key? ; load-data dup process-names \ name-map set-value @@ -132,6 +160,9 @@ dup process-names \ name-map set-value dup process-combining \ class-map set-value dup process-canonical \ canonical-map set-value \ combine-map set-value -dup process-compat \ compat-map set-value +dup process-compatibility \ compatibility-map set-value process-category \ category-map set-value + load-special-casing \ special-casing set-value + +load-properties \ properties set-value diff --git a/extra/unicode/normalize/normalize.factor b/extra/unicode/normalize/normalize.factor index 97bcf145d9..dca8dbf161 100755 --- a/extra/unicode/normalize/normalize.factor +++ b/extra/unicode/normalize/normalize.factor @@ -1,5 +1,5 @@ USING: sequences namespaces unicode.data kernel math arrays -locals combinators.lib sequences.lib ; +locals combinators.lib sequences.lib combinators.lib ; IN: unicode.normalize ! Conjoining Jamo behavior @@ -55,15 +55,17 @@ IN: unicode.normalize : reorder-back ( string i -- ) over [ non-starter? not ] find-last-from drop ?1+ reorder-next 2drop ; -: decompose ( string quot -- decomposed ) +:: decompose ( string quot -- decomposed ) ! When there are 8 and 32-bit strings, this'll be ! equivalent to clone on 8 and the contents of the last ! main quotation on 32. - over [ 127 < ] all? [ drop ] [ - swap [ [ - dup hangul? [ hangul>jamo % drop ] - [ dup rot call [ % ] [ , ] ?if ] if - ] with each ] "" make + string [ 127 < ] all? [ string ] [ + [ + string [ + dup hangul? [ hangul>jamo % ] + [ dup quot call [ % ] [ , ] ?if ] if + ] each + ] "" make dup reorder ] if ; inline @@ -71,7 +73,7 @@ IN: unicode.normalize [ canonical-entry ] decompose ; : nfkd ( string -- string ) - [ compat-entry ] decompose ; + [ compatibility-entry ] decompose ; : string-append ( s1 s2 -- string ) ! This could be more optimized, diff --git a/extra/unicode/script/script.factor b/extra/unicode/script/script.factor index 2d07ba2caa..949b464669 100755 --- a/extra/unicode/script/script.factor +++ b/extra/unicode/script/script.factor @@ -1,7 +1,7 @@ USING: values kernel sequences assocs io.files io.encodings ascii math.ranges io splitting math.parser namespaces byte-arrays locals math sets io.encodings.ascii -words compiler.units arrays interval-maps ; +words compiler.units arrays interval-maps unicode.data ; IN: unicode.script <PRIVATE @@ -10,9 +10,7 @@ SYMBOL: interned : parse-script ( stream -- assoc ) ! assoc is code point/range => name - lines [ "#" split1 drop ] map harvest [ - ";" split1 [ [ blank? ] trim ] bi@ - ] H{ } map>assoc ; + lines filter-comments [ split-; ] map >hashtable ; : range, ( value key -- ) swap interned get