Fixing Unicode collation, except for Korean
parent
8ea775f9b5
commit
5646d067e4
|
@ -5,3 +5,7 @@ USING: tools.test kernel unicode.categories words sequences unicode.syntax ;
|
|||
printable? alpha? control? uncased? character?
|
||||
} [ execute ] with map ] unit-test
|
||||
[ "Nd" ] [ CHAR: 3 category ] unit-test
|
||||
[ "Lo" ] [ HEX: 3400 category ] unit-test
|
||||
[ "Lo" ] [ HEX: 3450 category ] unit-test
|
||||
[ "Lo" ] [ HEX: 4DB5 category ] unit-test
|
||||
[ "Cs" ] [ HEX: DD00 category ] unit-test
|
||||
|
|
|
@ -21,3 +21,5 @@ IN: unicode.collation.tests
|
|||
: failures
|
||||
parse-test dup 2 <clumps>
|
||||
[ string<=> +lt+ = not ] assoc-filter dup assoc-size ;
|
||||
|
||||
[ 7 ] [ failures 2nip ] unit-test
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
USING: sequences io.files io.encodings.ascii kernel values
|
||||
splitting accessors math.parser ascii io assocs strings math
|
||||
namespaces sorting combinators math.order arrays
|
||||
unicode.normalize unicode.data combinators.lib locals ;
|
||||
unicode.normalize unicode.data combinators.lib locals
|
||||
unicode.syntax macros sequences.deep words unicode.breaks
|
||||
quotations ;
|
||||
IN: unicode.collation
|
||||
|
||||
VALUE: ducet
|
||||
|
@ -27,9 +29,12 @@ TUPLE: weight primary secondary tertiary ignorable? ;
|
|||
ascii <file-reader> parse-ducet \ ducet set-value
|
||||
|
||||
: base ( char -- base )
|
||||
dup "Unified_Ideograph" property?
|
||||
[ -16 shift zero? HEX: FB40 HEX: FB80 ? ]
|
||||
[ drop HEX: FBC0 ] if ;
|
||||
{
|
||||
{ [ dup HEX: 3400 HEX: 4DB5 between? ] [ drop HEX: FB80 ] } ! Extension A
|
||||
{ [ dup HEX: 20000 HEX: 2A6D6 between? ] [ drop HEX: FB80 ] } ! Extension B
|
||||
{ [ dup HEX: 4E00 HEX: 9FC3 between? ] [ drop HEX: FB40 ] } ! CJK
|
||||
[ drop HEX: FBC0 ] ! Other
|
||||
} cond ;
|
||||
|
||||
: AAAA ( char -- weight )
|
||||
[ base ] [ -15 shift ] bi + HEX: 20 2 f weight boa ;
|
||||
|
@ -37,8 +42,12 @@ ascii <file-reader> parse-ducet \ ducet set-value
|
|||
: BBBB ( char -- weight )
|
||||
HEX: 7FFF bitand HEX: 8000 bitor 0 0 f weight boa ;
|
||||
|
||||
: illegal? ( char -- ? )
|
||||
[ "Noncharacter_Code_Point" property? ]
|
||||
[ category "Cs" = ] or? ;
|
||||
|
||||
: derive-weight ( char -- weights )
|
||||
first dup "Noncharacter_Code_Point" property?
|
||||
first dup illegal?
|
||||
[ drop { } ]
|
||||
[ [ AAAA ] [ BBBB ] bi 2array ] if ;
|
||||
|
||||
|
@ -67,12 +76,34 @@ ascii <file-reader> parse-ducet \ ducet set-value
|
|||
[ drop ] [ 1string , ] if
|
||||
] if ;
|
||||
|
||||
: terminator 1 0 0 f weight boa ;
|
||||
|
||||
MACRO: const ( seq -- seq )
|
||||
[ dup word? [ execute ] when ] deep-map 1quotation ;
|
||||
|
||||
! : char, ( char -- )
|
||||
! [
|
||||
! building get peek [ first ] bi@ dup jamo? [
|
||||
! over jamo? [
|
||||
! [ grapheme-class ] bi@ swap 2array
|
||||
! { { T L } { V L } { V T } } const
|
||||
! member? [ terminator , ] when
|
||||
! ] [ 2drop terminator , ] if
|
||||
! ] [ 2drop ] if
|
||||
! ] [ , ] bi ;
|
||||
|
||||
! : insert-terminators ( graphemes -- graphemes )
|
||||
! Insert a terminator between hangul syllables
|
||||
! [ unclip , [ char, ] each ] { } make ;
|
||||
|
||||
: string>graphemes ( string -- graphemes )
|
||||
[ [ add ] each ] { } make ;
|
||||
[ [ add ] each ] { } make ; ! insert-terminators ;
|
||||
|
||||
: graphemes>weights ( graphemes -- weights )
|
||||
[ dup ducet at [ ] [ derive-weight ] ?if ]
|
||||
{ } map-as concat ;
|
||||
[
|
||||
dup weight? [ 1array ] ! From tailoring
|
||||
[ dup ducet at [ ] [ derive-weight ] ?if ] if
|
||||
] { } map-as concat ;
|
||||
|
||||
: append-weights ( weights quot -- )
|
||||
swap [ ignorable?>> not ] filter
|
||||
|
|
|
@ -2,9 +2,30 @@ USING: assocs math kernel sequences io.files hashtables
|
|||
quotations splitting arrays math.parser hash2 math.order
|
||||
byte-arrays words namespaces words compiler.units parser
|
||||
io.encodings.ascii values interval-maps ascii sets assocs.lib
|
||||
combinators.lib ;
|
||||
combinators.lib combinators locals math.ranges sorting ;
|
||||
IN: unicode.data
|
||||
|
||||
VALUE: simple-lower
|
||||
VALUE: simple-upper
|
||||
VALUE: simple-title
|
||||
VALUE: canonical-map
|
||||
VALUE: combine-map
|
||||
VALUE: class-map
|
||||
VALUE: compatibility-map
|
||||
VALUE: category-map
|
||||
VALUE: name-map
|
||||
VALUE: special-casing
|
||||
VALUE: properties
|
||||
|
||||
: canonical-entry ( char -- seq ) canonical-map at ;
|
||||
: combine-chars ( a b -- char/f ) combine-map hash2 ;
|
||||
: compatibility-entry ( char -- seq ) compatibility-map at ;
|
||||
: combining-class ( char -- n ) class-map at ;
|
||||
: non-starter? ( char -- ? ) class-map key? ;
|
||||
: name>char ( string -- char ) name-map at ;
|
||||
: char>name ( char -- string ) name-map value-at ;
|
||||
: property? ( char property -- ? ) properties at interval-key? ;
|
||||
|
||||
! Convenience functions
|
||||
: ?between? ( n/f from to -- ? )
|
||||
pick [ between? ] [ 3drop f ] if ;
|
||||
|
@ -69,23 +90,34 @@ IN: unicode.data
|
|||
|
||||
: categories ( -- names )
|
||||
! For non-existent characters, use Cn
|
||||
{ "Lu" "Ll" "Lt" "Lm" "Lo"
|
||||
{ "Cn"
|
||||
"Lu" "Ll" "Lt" "Lm" "Lo"
|
||||
"Mn" "Mc" "Me"
|
||||
"Nd" "Nl" "No"
|
||||
"Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
|
||||
"Sm" "Sc" "Sk" "So"
|
||||
"Zs" "Zl" "Zp"
|
||||
"Cc" "Cf" "Cs" "Co" "Cn" } ;
|
||||
"Cc" "Cf" "Cs" "Co" } ;
|
||||
|
||||
: unicode-chars HEX: 2FA1E ;
|
||||
: num-chars HEX: 2FA1E ;
|
||||
! the maximum unicode char in the first 3 planes
|
||||
|
||||
: process-category ( data -- category-listing )
|
||||
2 swap (process-data)
|
||||
unicode-chars <byte-array> swap dupd swap [
|
||||
>r over unicode-chars >= [ r> 3drop ]
|
||||
[ categories index swap r> set-nth ] if
|
||||
] curry assoc-each ;
|
||||
: ?set-nth ( val index seq -- )
|
||||
2dup bounds-check? [ set-nth ] [ 3drop ] if ;
|
||||
|
||||
:: fill-ranges ( table -- table )
|
||||
name-map >alist sort-values keys
|
||||
[ [ "first>" tail? ] [ "last>" tail? ] or? ] filter
|
||||
2 group [
|
||||
[ name>char ] bi@ [ [a,b] ] [ table ?nth ] bi
|
||||
[ swap table ?set-nth ] curry each
|
||||
] assoc-each table ;
|
||||
|
||||
:: process-category ( data -- category-listing )
|
||||
[let | table [ num-chars <byte-array> ] |
|
||||
2 data (process-data) [| char cat |
|
||||
cat categories index char table ?set-nth
|
||||
] assoc-each table fill-ranges ] ;
|
||||
|
||||
: ascii-lower ( string -- lower )
|
||||
[ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
|
||||
|
@ -131,37 +163,18 @@ C: <code-point> code-point
|
|||
[ length 5 = ] filter
|
||||
[ [ set-code-point ] each ] H{ } make-assoc ;
|
||||
|
||||
VALUE: simple-lower
|
||||
VALUE: simple-upper
|
||||
VALUE: simple-title
|
||||
VALUE: canonical-map
|
||||
VALUE: combine-map
|
||||
VALUE: class-map
|
||||
VALUE: compatibility-map
|
||||
VALUE: category-map
|
||||
VALUE: name-map
|
||||
VALUE: special-casing
|
||||
VALUE: properties
|
||||
|
||||
: canonical-entry ( char -- seq ) canonical-map at ;
|
||||
: combine-chars ( a b -- char/f ) combine-map hash2 ;
|
||||
: compatibility-entry ( char -- seq ) compatibility-map at ;
|
||||
: combining-class ( char -- n ) class-map at ;
|
||||
: non-starter? ( char -- ? ) class-map key? ;
|
||||
: name>char ( string -- char ) name-map at ;
|
||||
: char>name ( char -- string ) name-map value-at ;
|
||||
: property? ( char property -- ? ) properties at interval-key? ;
|
||||
|
||||
load-data
|
||||
dup process-names \ name-map set-value
|
||||
13 over process-data \ simple-lower set-value
|
||||
12 over process-data tuck \ simple-upper set-value
|
||||
14 over process-data swapd assoc-union \ simple-title set-value
|
||||
dup process-combining \ class-map set-value
|
||||
dup process-canonical \ canonical-map set-value
|
||||
\ combine-map set-value
|
||||
dup process-compatibility \ compatibility-map set-value
|
||||
process-category \ category-map set-value
|
||||
load-data {
|
||||
[ process-names \ name-map set-value ]
|
||||
[ 13 swap process-data \ simple-lower set-value ]
|
||||
[ 12 swap process-data \ simple-upper set-value ]
|
||||
[ 14 swap process-data
|
||||
simple-upper assoc-union \ simple-title set-value ]
|
||||
[ process-combining \ class-map set-value ]
|
||||
[ process-canonical \ canonical-map set-value
|
||||
\ combine-map set-value ]
|
||||
[ process-compatibility \ compatibility-map set-value ]
|
||||
[ process-category \ category-map set-value ]
|
||||
} cleave
|
||||
|
||||
load-special-casing \ special-casing set-value
|
||||
|
||||
|
|
Loading…
Reference in New Issue