Fixing Unicode collation, except for Korean

db4
Daniel Ehrenberg 2008-05-25 02:51:27 -05:00
parent 8ea775f9b5
commit 5646d067e4
4 changed files with 99 additions and 49 deletions

View File

@@ -5,3 +5,7 @@ USING: tools.test kernel unicode.categories words sequences unicode.syntax ;
 printable? alpha? control? uncased? character?
 } [ execute ] with map ] unit-test
 [ "Nd" ] [ CHAR: 3 category ] unit-test
+[ "Lo" ] [ HEX: 3400 category ] unit-test
+[ "Lo" ] [ HEX: 3450 category ] unit-test
+[ "Lo" ] [ HEX: 4DB5 category ] unit-test
+[ "Cs" ] [ HEX: DD00 category ] unit-test
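The new tests cover code points whose category was previously missing from the table: U+3400, U+3450 and U+4DB5 lie in CJK Unified Ideographs Extension A (general category Lo, now handled by the range filling added to unicode.data below), and U+DD00 is a surrogate (Cs). As an outside sanity check only, not part of this commit, the same expectations can be verified against CPython's unicodedata module, assuming a reasonably recent Unicode database:

    import unicodedata

    # Expected general categories, cross-checked against CPython's tables.
    for cp, expected in [(0x3400, "Lo"), (0x3450, "Lo"), (0x4DB5, "Lo")]:
        assert unicodedata.category(chr(cp)) == expected

    # U+DD00 is an unpaired surrogate; its general category is Cs.
    assert unicodedata.category("\udd00") == "Cs"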

View File

@@ -21,3 +21,5 @@ IN: unicode.collation.tests
 : failures
     parse-test dup 2 <clumps>
     [ string<=> +lt+ = not ] assoc-filter dup assoc-size ;
+
+[ 7 ] [ failures 2nip ] unit-test

View File

@@ -1,7 +1,9 @@
 USING: sequences io.files io.encodings.ascii kernel values
 splitting accessors math.parser ascii io assocs strings math
 namespaces sorting combinators math.order arrays
-unicode.normalize unicode.data combinators.lib locals ;
+unicode.normalize unicode.data combinators.lib locals
+unicode.syntax macros sequences.deep words unicode.breaks
+quotations ;
 IN: unicode.collation
 
 VALUE: ducet
@@ -27,9 +29,12 @@ TUPLE: weight primary secondary tertiary ignorable? ;
 ascii <file-reader> parse-ducet \ ducet set-value
 
 : base ( char -- base )
-    dup "Unified_Ideograph" property?
-    [ -16 shift zero? HEX: FB40 HEX: FB80 ? ]
-    [ drop HEX: FBC0 ] if ;
+    {
+        { [ dup HEX: 3400 HEX: 4DB5 between? ] [ drop HEX: FB80 ] } ! Extension A
+        { [ dup HEX: 20000 HEX: 2A6D6 between? ] [ drop HEX: FB80 ] } ! Extension B
+        { [ dup HEX: 4E00 HEX: 9FC3 between? ] [ drop HEX: FB40 ] } ! CJK
+        [ drop HEX: FBC0 ] ! Other
+    } cond ;
 
 : AAAA ( char -- weight )
     [ base ] [ -15 shift ] bi + HEX: 20 2 f weight boa ;
@@ -37,8 +42,12 @@ ascii <file-reader> parse-ducet \ ducet set-value
 : BBBB ( char -- weight )
     HEX: 7FFF bitand HEX: 8000 bitor 0 0 f weight boa ;
 
+: illegal? ( char -- ? )
+    [ "Noncharacter_Code_Point" property? ]
+    [ category "Cs" = ] or? ;
+
 : derive-weight ( char -- weights )
-    first dup "Noncharacter_Code_Point" property?
+    first dup illegal?
     [ drop { } ]
     [ [ AAAA ] [ BBBB ] bi 2array ] if ;
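Taken together, base, AAAA and BBBB implement the UCA's implicit (derived) weights for characters that have no DUCET entry: the primary weight is split across two collation elements, one carrying a per-block base plus the high bits of the code point, the other carrying the low 15 bits with the top bit set; illegal? now also discards surrogates, not only noncharacters. A minimal sketch of the same arithmetic in Python (illustrative only; the names are placeholders, not part of the vocabulary):

    def base(cp):
        # Per-block base for implicit weights (mirrors the cond above).
        if 0x3400 <= cp <= 0x4DB5:   return 0xFB80   # CJK Extension A
        if 0x20000 <= cp <= 0x2A6D6: return 0xFB80   # CJK Extension B
        if 0x4E00 <= cp <= 0x9FC3:   return 0xFB40   # CJK Unified Ideographs
        return 0xFBC0                                # everything else

    def derive_weights(cp):
        # Two collation elements: [AAAA 0020 0002] followed by [BBBB 0000 0000].
        aaaa = base(cp) + (cp >> 15)
        bbbb = (cp & 0x7FFF) | 0x8000
        return [(aaaa, 0x20, 0x02), (bbbb, 0x00, 0x00)]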
@@ -67,12 +76,34 @@ ascii <file-reader> parse-ducet \ ducet set-value
         [ drop ] [ 1string , ] if
     ] if ;
 
 : terminator 1 0 0 f weight boa ;
 
+MACRO: const ( seq -- seq )
+    [ dup word? [ execute ] when ] deep-map 1quotation ;
+
+! : char, ( char -- )
+!     [
+!         building get peek [ first ] bi@ dup jamo? [
+!             over jamo? [
+!                 [ grapheme-class ] bi@ swap 2array
+!                 { { T L } { V L } { V T } } const
+!                 member? [ terminator , ] when
+!             ] [ 2drop terminator , ] if
+!         ] [ 2drop ] if
+!     ] [ , ] bi ;
+
+! : insert-terminators ( graphemes -- graphemes )
+!     ! Insert a terminator between hangul syllables
+!     [ unclip , [ char, ] each ] { } make ;
+
 : string>graphemes ( string -- graphemes )
-    [ [ add ] each ] { } make ;
+    [ [ add ] each ] { } make ; ! insert-terminators ;
 
 : graphemes>weights ( graphemes -- weights )
-    [ dup ducet at [ ] [ derive-weight ] ?if ]
-    { } map-as concat ;
+    [
+        dup weight? [ 1array ] ! From tailoring
+        [ dup ducet at [ ] [ derive-weight ] ?if ] if
+    ] { } map-as concat ;
 
 : append-weights ( weights quot -- )
     swap [ ignorable?>> not ] filter
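graphemes>weights now lets pre-built weight objects (from tailoring) pass straight through, and only consults the DUCET or derives implicit weights for ordinary graphemes. The commented-out char, and insert-terminators words are the unfinished Korean piece from the commit message: the idea is to emit the low terminator element after each Hangul syllable so that a syllable with fewer jamo sorts before a longer one. A rough sketch of that terminator scheme in Python, with syllable segmentation assumed to happen elsewhere (names are placeholders):

    TERMINATOR = (0x0001, 0x0000, 0x0000)   # mirrors `terminator` above

    def weights_with_terminators(syllables, weights_for):
        # syllables: list of jamo sequences; weights_for: jamo sequence -> weights.
        out = []
        for syllable in syllables:
            out.extend(weights_for(syllable))
            out.append(TERMINATOR)          # close off the syllable
        return out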

View File

@@ -2,9 +2,30 @@ USING: assocs math kernel sequences io.files hashtables
 quotations splitting arrays math.parser hash2 math.order
 byte-arrays words namespaces words compiler.units parser
 io.encodings.ascii values interval-maps ascii sets assocs.lib
-combinators.lib ;
+combinators.lib combinators locals math.ranges sorting ;
 IN: unicode.data
 
+VALUE: simple-lower
+VALUE: simple-upper
+VALUE: simple-title
+VALUE: canonical-map
+VALUE: combine-map
+VALUE: class-map
+VALUE: compatibility-map
+VALUE: category-map
+VALUE: name-map
+VALUE: special-casing
+VALUE: properties
+
+: canonical-entry ( char -- seq ) canonical-map at ;
+: combine-chars ( a b -- char/f ) combine-map hash2 ;
+: compatibility-entry ( char -- seq ) compatibility-map at ;
+: combining-class ( char -- n ) class-map at ;
+: non-starter? ( char -- ? ) class-map key? ;
+: name>char ( string -- char ) name-map at ;
+: char>name ( char -- string ) name-map value-at ;
+: property? ( char property -- ? ) properties at interval-key? ;
+
 ! Convenience functions
 : ?between? ( n/f from to -- ? )
     pick [ between? ] [ 3drop f ] if ;
@@ -69,23 +90,34 @@ IN: unicode.data
 : categories ( -- names )
     ! For non-existent characters, use Cn
-    { "Lu" "Ll" "Lt" "Lm" "Lo"
+    { "Cn"
+      "Lu" "Ll" "Lt" "Lm" "Lo"
       "Mn" "Mc" "Me"
       "Nd" "Nl" "No"
       "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
       "Sm" "Sc" "Sk" "So"
       "Zs" "Zl" "Zp"
-      "Cc" "Cf" "Cs" "Co" "Cn" } ;
+      "Cc" "Cf" "Cs" "Co" } ;
 
-: unicode-chars HEX: 2FA1E ;
+: num-chars HEX: 2FA1E ;
 ! the maximum unicode char in the first 3 planes
 
-: process-category ( data -- category-listing )
-    2 swap (process-data)
-    unicode-chars <byte-array> swap dupd swap [
-        >r over unicode-chars >= [ r> 3drop ]
-        [ categories index swap r> set-nth ] if
-    ] curry assoc-each ;
+: ?set-nth ( val index seq -- )
+    2dup bounds-check? [ set-nth ] [ 3drop ] if ;
+
+:: fill-ranges ( table -- table )
+    name-map >alist sort-values keys
+    [ [ "first>" tail? ] [ "last>" tail? ] or? ] filter
+    2 group [
+        [ name>char ] bi@ [ [a,b] ] [ table ?nth ] bi
+        [ swap table ?set-nth ] curry each
+    ] assoc-each table ;
+
+:: process-category ( data -- category-listing )
+    [let | table [ num-chars <byte-array> ] |
+        2 data (process-data) [| char cat |
+            cat categories index char table ?set-nth
+        ] assoc-each table fill-ranges ] ;
 
 : ascii-lower ( string -- lower )
     [ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
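fill-ranges handles the blocks that UnicodeData.txt only lists as a "<..., First>" / "<..., Last>" pair (CJK ideographs, Hangul syllables, private use, surrogates): it pairs up those names, reads the category stored at the first code point, and copies it across the whole range, with ?set-nth silently ignoring code points beyond num-chars. A sketch of the same filling step in Python, with hypothetical names and assuming a mutable category table indexed by code point:

    def fill_ranges(table, name_to_char):
        # Names ending in "first>" / "last>" mark ranged entries in UnicodeData.txt.
        range_names = sorted(
            (n for n in name_to_char if n.endswith(("first>", "last>"))),
            key=name_to_char.get)
        for first, last in zip(range_names[::2], range_names[1::2]):
            lo, hi = name_to_char[first], name_to_char[last]
            category = table[lo]            # both endpoints carry the same category
            for cp in range(lo, hi + 1):
                if cp < len(table):         # like ?set-nth: ignore out-of-range
                    table[cp] = category
        return table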
@@ -131,37 +163,18 @@ C: <code-point> code-point
     [ length 5 = ] filter
     [ [ set-code-point ] each ] H{ } make-assoc ;
 
-VALUE: simple-lower
-VALUE: simple-upper
-VALUE: simple-title
-VALUE: canonical-map
-VALUE: combine-map
-VALUE: class-map
-VALUE: compatibility-map
-VALUE: category-map
-VALUE: name-map
-VALUE: special-casing
-VALUE: properties
-
-: canonical-entry ( char -- seq ) canonical-map at ;
-: combine-chars ( a b -- char/f ) combine-map hash2 ;
-: compatibility-entry ( char -- seq ) compatibility-map at ;
-: combining-class ( char -- n ) class-map at ;
-: non-starter? ( char -- ? ) class-map key? ;
-: name>char ( string -- char ) name-map at ;
-: char>name ( char -- string ) name-map value-at ;
-: property? ( char property -- ? ) properties at interval-key? ;
-
-load-data
-dup process-names \ name-map set-value
-13 over process-data \ simple-lower set-value
-12 over process-data tuck \ simple-upper set-value
-14 over process-data swapd assoc-union \ simple-title set-value
-dup process-combining \ class-map set-value
-dup process-canonical \ canonical-map set-value
-\ combine-map set-value
-dup process-compatibility \ compatibility-map set-value
-process-category \ category-map set-value
+load-data {
+    [ process-names \ name-map set-value ]
+    [ 13 swap process-data \ simple-lower set-value ]
+    [ 12 swap process-data \ simple-upper set-value ]
+    [ 14 swap process-data
+        simple-upper assoc-union \ simple-title set-value ]
+    [ process-combining \ class-map set-value ]
+    [ process-canonical \ canonical-map set-value
+        \ combine-map set-value ]
+    [ process-compatibility \ compatibility-map set-value ]
+    [ process-category \ category-map set-value ]
+} cleave
 
 load-special-casing \ special-casing set-value
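The load-data refactor at the end replaces a chain of dup/over/tuck stack shuffling with the cleave combinator, which applies each quotation in the array to the same parsed data. The same shape in Python, for readers unfamiliar with cleave (the step names are placeholders for the processing words above):

    def cleave(value, steps):
        # Apply every step to the same value, for its side effects.
        for step in steps:
            step(value)

    # cleave(load_data(), [process_names, process_combining, process_category, ...])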