Fixing Unicode collation, except for Korean

db4
Daniel Ehrenberg 2008-05-25 02:51:27 -05:00
parent 8ea775f9b5
commit 5646d067e4
4 changed files with 99 additions and 49 deletions

View File

@ -5,3 +5,7 @@ USING: tools.test kernel unicode.categories words sequences unicode.syntax ;
printable? alpha? control? uncased? character? printable? alpha? control? uncased? character?
} [ execute ] with map ] unit-test } [ execute ] with map ] unit-test
[ "Nd" ] [ CHAR: 3 category ] unit-test [ "Nd" ] [ CHAR: 3 category ] unit-test
[ "Lo" ] [ HEX: 3400 category ] unit-test
[ "Lo" ] [ HEX: 3450 category ] unit-test
[ "Lo" ] [ HEX: 4DB5 category ] unit-test
[ "Cs" ] [ HEX: DD00 category ] unit-test

View File

@ -21,3 +21,5 @@ IN: unicode.collation.tests
: failures : failures
parse-test dup 2 <clumps> parse-test dup 2 <clumps>
[ string<=> +lt+ = not ] assoc-filter dup assoc-size ; [ string<=> +lt+ = not ] assoc-filter dup assoc-size ;
[ 7 ] [ failures 2nip ] unit-test

View File

@ -1,7 +1,9 @@
USING: sequences io.files io.encodings.ascii kernel values USING: sequences io.files io.encodings.ascii kernel values
splitting accessors math.parser ascii io assocs strings math splitting accessors math.parser ascii io assocs strings math
namespaces sorting combinators math.order arrays namespaces sorting combinators math.order arrays
unicode.normalize unicode.data combinators.lib locals ; unicode.normalize unicode.data combinators.lib locals
unicode.syntax macros sequences.deep words unicode.breaks
quotations ;
IN: unicode.collation IN: unicode.collation
VALUE: ducet VALUE: ducet
@ -27,9 +29,12 @@ TUPLE: weight primary secondary tertiary ignorable? ;
ascii <file-reader> parse-ducet \ ducet set-value ascii <file-reader> parse-ducet \ ducet set-value
: base ( char -- base ) : base ( char -- base )
dup "Unified_Ideograph" property? {
[ -16 shift zero? HEX: FB40 HEX: FB80 ? ] { [ dup HEX: 3400 HEX: 4DB5 between? ] [ drop HEX: FB80 ] } ! Extension A
[ drop HEX: FBC0 ] if ; { [ dup HEX: 20000 HEX: 2A6D6 between? ] [ drop HEX: FB80 ] } ! Extension B
{ [ dup HEX: 4E00 HEX: 9FC3 between? ] [ drop HEX: FB40 ] } ! CJK
[ drop HEX: FBC0 ] ! Other
} cond ;
: AAAA ( char -- weight ) : AAAA ( char -- weight )
[ base ] [ -15 shift ] bi + HEX: 20 2 f weight boa ; [ base ] [ -15 shift ] bi + HEX: 20 2 f weight boa ;
@ -37,8 +42,12 @@ ascii <file-reader> parse-ducet \ ducet set-value
: BBBB ( char -- weight ) : BBBB ( char -- weight )
HEX: 7FFF bitand HEX: 8000 bitor 0 0 f weight boa ; HEX: 7FFF bitand HEX: 8000 bitor 0 0 f weight boa ;
: illegal? ( char -- ? )
[ "Noncharacter_Code_Point" property? ]
[ category "Cs" = ] or? ;
: derive-weight ( char -- weights ) : derive-weight ( char -- weights )
first dup "Noncharacter_Code_Point" property? first dup illegal?
[ drop { } ] [ drop { } ]
[ [ AAAA ] [ BBBB ] bi 2array ] if ; [ [ AAAA ] [ BBBB ] bi 2array ] if ;
@ -67,12 +76,34 @@ ascii <file-reader> parse-ducet \ ducet set-value
[ drop ] [ 1string , ] if [ drop ] [ 1string , ] if
] if ; ] if ;
: terminator 1 0 0 f weight boa ;
MACRO: const ( seq -- seq )
[ dup word? [ execute ] when ] deep-map 1quotation ;
! : char, ( char -- )
! [
! building get peek [ first ] bi@ dup jamo? [
! over jamo? [
! [ grapheme-class ] bi@ swap 2array
! { { T L } { V L } { V T } } const
! member? [ terminator , ] when
! ] [ 2drop terminator , ] if
! ] [ 2drop ] if
! ] [ , ] bi ;
! : insert-terminators ( graphemes -- graphemes )
! Insert a terminator between hangul syllables
! [ unclip , [ char, ] each ] { } make ;
: string>graphemes ( string -- graphemes ) : string>graphemes ( string -- graphemes )
[ [ add ] each ] { } make ; [ [ add ] each ] { } make ; ! insert-terminators ;
: graphemes>weights ( graphemes -- weights ) : graphemes>weights ( graphemes -- weights )
[ dup ducet at [ ] [ derive-weight ] ?if ] [
{ } map-as concat ; dup weight? [ 1array ] ! From tailoring
[ dup ducet at [ ] [ derive-weight ] ?if ] if
] { } map-as concat ;
: append-weights ( weights quot -- ) : append-weights ( weights quot -- )
swap [ ignorable?>> not ] filter swap [ ignorable?>> not ] filter

View File

@ -2,9 +2,30 @@ USING: assocs math kernel sequences io.files hashtables
quotations splitting arrays math.parser hash2 math.order quotations splitting arrays math.parser hash2 math.order
byte-arrays words namespaces words compiler.units parser byte-arrays words namespaces words compiler.units parser
io.encodings.ascii values interval-maps ascii sets assocs.lib io.encodings.ascii values interval-maps ascii sets assocs.lib
combinators.lib ; combinators.lib combinators locals math.ranges sorting ;
IN: unicode.data IN: unicode.data
VALUE: simple-lower
VALUE: simple-upper
VALUE: simple-title
VALUE: canonical-map
VALUE: combine-map
VALUE: class-map
VALUE: compatibility-map
VALUE: category-map
VALUE: name-map
VALUE: special-casing
VALUE: properties
: canonical-entry ( char -- seq ) canonical-map at ;
: combine-chars ( a b -- char/f ) combine-map hash2 ;
: compatibility-entry ( char -- seq ) compatibility-map at ;
: combining-class ( char -- n ) class-map at ;
: non-starter? ( char -- ? ) class-map key? ;
: name>char ( string -- char ) name-map at ;
: char>name ( char -- string ) name-map value-at ;
: property? ( char property -- ? ) properties at interval-key? ;
! Convenience functions ! Convenience functions
: ?between? ( n/f from to -- ? ) : ?between? ( n/f from to -- ? )
pick [ between? ] [ 3drop f ] if ; pick [ between? ] [ 3drop f ] if ;
@ -69,23 +90,34 @@ IN: unicode.data
: categories ( -- names ) : categories ( -- names )
! For non-existent characters, use Cn ! For non-existent characters, use Cn
{ "Lu" "Ll" "Lt" "Lm" "Lo" { "Cn"
"Lu" "Ll" "Lt" "Lm" "Lo"
"Mn" "Mc" "Me" "Mn" "Mc" "Me"
"Nd" "Nl" "No" "Nd" "Nl" "No"
"Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po" "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
"Sm" "Sc" "Sk" "So" "Sm" "Sc" "Sk" "So"
"Zs" "Zl" "Zp" "Zs" "Zl" "Zp"
"Cc" "Cf" "Cs" "Co" "Cn" } ; "Cc" "Cf" "Cs" "Co" } ;
: unicode-chars HEX: 2FA1E ; : num-chars HEX: 2FA1E ;
! the maximum unicode char in the first 3 planes ! the maximum unicode char in the first 3 planes
: process-category ( data -- category-listing ) : ?set-nth ( val index seq -- )
2 swap (process-data) 2dup bounds-check? [ set-nth ] [ 3drop ] if ;
unicode-chars <byte-array> swap dupd swap [
>r over unicode-chars >= [ r> 3drop ] :: fill-ranges ( table -- table )
[ categories index swap r> set-nth ] if name-map >alist sort-values keys
] curry assoc-each ; [ [ "first>" tail? ] [ "last>" tail? ] or? ] filter
2 group [
[ name>char ] bi@ [ [a,b] ] [ table ?nth ] bi
[ swap table ?set-nth ] curry each
] assoc-each table ;
:: process-category ( data -- category-listing )
[let | table [ num-chars <byte-array> ] |
2 data (process-data) [| char cat |
cat categories index char table ?set-nth
] assoc-each table fill-ranges ] ;
: ascii-lower ( string -- lower ) : ascii-lower ( string -- lower )
[ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ; [ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
@ -131,37 +163,18 @@ C: <code-point> code-point
[ length 5 = ] filter [ length 5 = ] filter
[ [ set-code-point ] each ] H{ } make-assoc ; [ [ set-code-point ] each ] H{ } make-assoc ;
VALUE: simple-lower load-data {
VALUE: simple-upper [ process-names \ name-map set-value ]
VALUE: simple-title [ 13 swap process-data \ simple-lower set-value ]
VALUE: canonical-map [ 12 swap process-data \ simple-upper set-value ]
VALUE: combine-map [ 14 swap process-data
VALUE: class-map simple-upper assoc-union \ simple-title set-value ]
VALUE: compatibility-map [ process-combining \ class-map set-value ]
VALUE: category-map [ process-canonical \ canonical-map set-value
VALUE: name-map \ combine-map set-value ]
VALUE: special-casing [ process-compatibility \ compatibility-map set-value ]
VALUE: properties [ process-category \ category-map set-value ]
} cleave
: canonical-entry ( char -- seq ) canonical-map at ;
: combine-chars ( a b -- char/f ) combine-map hash2 ;
: compatibility-entry ( char -- seq ) compatibility-map at ;
: combining-class ( char -- n ) class-map at ;
: non-starter? ( char -- ? ) class-map key? ;
: name>char ( string -- char ) name-map at ;
: char>name ( char -- string ) name-map value-at ;
: property? ( char property -- ? ) properties at interval-key? ;
load-data
dup process-names \ name-map set-value
13 over process-data \ simple-lower set-value
12 over process-data tuck \ simple-upper set-value
14 over process-data swapd assoc-union \ simple-title set-value
dup process-combining \ class-map set-value
dup process-canonical \ canonical-map set-value
\ combine-map set-value
dup process-compatibility \ compatibility-map set-value
process-category \ category-map set-value
load-special-casing \ special-casing set-value load-special-casing \ special-casing set-value