From 5646d067e48eaedc01e5a527938aa0de431b85cb Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Sun, 25 May 2008 02:51:27 -0500 Subject: [PATCH] Fixing Unicode collation, except for Korean --- .../categories/categories-tests.factor | 4 + .../unicode/collation/collation-tests.factor | 2 + extra/unicode/collation/collation.factor | 47 +++++++-- extra/unicode/data/data.factor | 95 +++++++++++-------- 4 files changed, 99 insertions(+), 49 deletions(-) diff --git a/extra/unicode/categories/categories-tests.factor b/extra/unicode/categories/categories-tests.factor index 81868709e3..e16125b642 100644 --- a/extra/unicode/categories/categories-tests.factor +++ b/extra/unicode/categories/categories-tests.factor @@ -5,3 +5,7 @@ USING: tools.test kernel unicode.categories words sequences unicode.syntax ; printable? alpha? control? uncased? character? } [ execute ] with map ] unit-test [ "Nd" ] [ CHAR: 3 category ] unit-test +[ "Lo" ] [ HEX: 3400 category ] unit-test +[ "Lo" ] [ HEX: 3450 category ] unit-test +[ "Lo" ] [ HEX: 4DB5 category ] unit-test +[ "Cs" ] [ HEX: DD00 category ] unit-test diff --git a/extra/unicode/collation/collation-tests.factor b/extra/unicode/collation/collation-tests.factor index 33c27984a6..d9395f3fff 100755 --- a/extra/unicode/collation/collation-tests.factor +++ b/extra/unicode/collation/collation-tests.factor @@ -21,3 +21,5 @@ IN: unicode.collation.tests : failures parse-test dup 2 [ string<=> +lt+ = not ] assoc-filter dup assoc-size ; + +[ 7 ] [ failures 2nip ] unit-test diff --git a/extra/unicode/collation/collation.factor b/extra/unicode/collation/collation.factor index fd1466d1dd..a889aa863c 100755 --- a/extra/unicode/collation/collation.factor +++ b/extra/unicode/collation/collation.factor @@ -1,7 +1,9 @@ USING: sequences io.files io.encodings.ascii kernel values splitting accessors math.parser ascii io assocs strings math namespaces sorting combinators math.order arrays -unicode.normalize unicode.data combinators.lib locals ; +unicode.normalize unicode.data combinators.lib locals +unicode.syntax macros sequences.deep words unicode.breaks +quotations ; IN: unicode.collation VALUE: ducet @@ -27,9 +29,12 @@ TUPLE: weight primary secondary tertiary ignorable? ; ascii parse-ducet \ ducet set-value : base ( char -- base ) - dup "Unified_Ideograph" property? - [ -16 shift zero? HEX: FB40 HEX: FB80 ? ] - [ drop HEX: FBC0 ] if ; + { + { [ dup HEX: 3400 HEX: 4DB5 between? ] [ drop HEX: FB80 ] } ! Extension A + { [ dup HEX: 20000 HEX: 2A6D6 between? ] [ drop HEX: FB80 ] } ! Extension B + { [ dup HEX: 4E00 HEX: 9FC3 between? ] [ drop HEX: FB40 ] } ! CJK + [ drop HEX: FBC0 ] ! Other + } cond ; : AAAA ( char -- weight ) [ base ] [ -15 shift ] bi + HEX: 20 2 f weight boa ; @@ -37,8 +42,12 @@ ascii parse-ducet \ ducet set-value : BBBB ( char -- weight ) HEX: 7FFF bitand HEX: 8000 bitor 0 0 f weight boa ; +: illegal? ( char -- ? ) + [ "Noncharacter_Code_Point" property? ] + [ category "Cs" = ] or? ; + : derive-weight ( char -- weights ) - first dup "Noncharacter_Code_Point" property? + first dup illegal? [ drop { } ] [ [ AAAA ] [ BBBB ] bi 2array ] if ; @@ -67,12 +76,34 @@ ascii parse-ducet \ ducet set-value [ drop ] [ 1string , ] if ] if ; +: terminator 1 0 0 f weight boa ; + +MACRO: const ( seq -- seq ) + [ dup word? [ execute ] when ] deep-map 1quotation ; + +! : char, ( char -- ) + ! [ + ! building get peek [ first ] bi@ dup jamo? [ + ! over jamo? [ + ! [ grapheme-class ] bi@ swap 2array + ! { { T L } { V L } { V T } } const + ! member? [ terminator , ] when + ! ] [ 2drop terminator , ] if + ! ] [ 2drop ] if + ! ] [ , ] bi ; + +! : insert-terminators ( graphemes -- graphemes ) + ! Insert a terminator between hangul syllables +! [ unclip , [ char, ] each ] { } make ; + : string>graphemes ( string -- graphemes ) - [ [ add ] each ] { } make ; + [ [ add ] each ] { } make ; ! insert-terminators ; : graphemes>weights ( graphemes -- weights ) - [ dup ducet at [ ] [ derive-weight ] ?if ] - { } map-as concat ; + [ + dup weight? [ 1array ] ! From tailoring + [ dup ducet at [ ] [ derive-weight ] ?if ] if + ] { } map-as concat ; : append-weights ( weights quot -- ) swap [ ignorable?>> not ] filter diff --git a/extra/unicode/data/data.factor b/extra/unicode/data/data.factor index 706a8adef4..f9e5667947 100755 --- a/extra/unicode/data/data.factor +++ b/extra/unicode/data/data.factor @@ -2,9 +2,30 @@ USING: assocs math kernel sequences io.files hashtables quotations splitting arrays math.parser hash2 math.order byte-arrays words namespaces words compiler.units parser io.encodings.ascii values interval-maps ascii sets assocs.lib -combinators.lib ; +combinators.lib combinators locals math.ranges sorting ; IN: unicode.data +VALUE: simple-lower +VALUE: simple-upper +VALUE: simple-title +VALUE: canonical-map +VALUE: combine-map +VALUE: class-map +VALUE: compatibility-map +VALUE: category-map +VALUE: name-map +VALUE: special-casing +VALUE: properties + +: canonical-entry ( char -- seq ) canonical-map at ; +: combine-chars ( a b -- char/f ) combine-map hash2 ; +: compatibility-entry ( char -- seq ) compatibility-map at ; +: combining-class ( char -- n ) class-map at ; +: non-starter? ( char -- ? ) class-map key? ; +: name>char ( string -- char ) name-map at ; +: char>name ( char -- string ) name-map value-at ; +: property? ( char property -- ? ) properties at interval-key? ; + ! Convenience functions : ?between? ( n/f from to -- ? ) pick [ between? ] [ 3drop f ] if ; @@ -69,23 +90,34 @@ IN: unicode.data : categories ( -- names ) ! For non-existent characters, use Cn - { "Lu" "Ll" "Lt" "Lm" "Lo" + { "Cn" + "Lu" "Ll" "Lt" "Lm" "Lo" "Mn" "Mc" "Me" "Nd" "Nl" "No" "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po" "Sm" "Sc" "Sk" "So" "Zs" "Zl" "Zp" - "Cc" "Cf" "Cs" "Co" "Cn" } ; + "Cc" "Cf" "Cs" "Co" } ; -: unicode-chars HEX: 2FA1E ; +: num-chars HEX: 2FA1E ; ! the maximum unicode char in the first 3 planes -: process-category ( data -- category-listing ) - 2 swap (process-data) - unicode-chars swap dupd swap [ - >r over unicode-chars >= [ r> 3drop ] - [ categories index swap r> set-nth ] if - ] curry assoc-each ; +: ?set-nth ( val index seq -- ) + 2dup bounds-check? [ set-nth ] [ 3drop ] if ; + +:: fill-ranges ( table -- table ) + name-map >alist sort-values keys + [ [ "first>" tail? ] [ "last>" tail? ] or? ] filter + 2 group [ + [ name>char ] bi@ [ [a,b] ] [ table ?nth ] bi + [ swap table ?set-nth ] curry each + ] assoc-each table ; + +:: process-category ( data -- category-listing ) + [let | table [ num-chars ] | + 2 data (process-data) [| char cat | + cat categories index char table ?set-nth + ] assoc-each table fill-ranges ] ; : ascii-lower ( string -- lower ) [ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ; @@ -131,37 +163,18 @@ C: code-point [ length 5 = ] filter [ [ set-code-point ] each ] H{ } make-assoc ; -VALUE: simple-lower -VALUE: simple-upper -VALUE: simple-title -VALUE: canonical-map -VALUE: combine-map -VALUE: class-map -VALUE: compatibility-map -VALUE: category-map -VALUE: name-map -VALUE: special-casing -VALUE: properties - -: canonical-entry ( char -- seq ) canonical-map at ; -: combine-chars ( a b -- char/f ) combine-map hash2 ; -: compatibility-entry ( char -- seq ) compatibility-map at ; -: combining-class ( char -- n ) class-map at ; -: non-starter? ( char -- ? ) class-map key? ; -: name>char ( string -- char ) name-map at ; -: char>name ( char -- string ) name-map value-at ; -: property? ( char property -- ? ) properties at interval-key? ; - -load-data -dup process-names \ name-map set-value -13 over process-data \ simple-lower set-value -12 over process-data tuck \ simple-upper set-value -14 over process-data swapd assoc-union \ simple-title set-value -dup process-combining \ class-map set-value -dup process-canonical \ canonical-map set-value - \ combine-map set-value -dup process-compatibility \ compatibility-map set-value -process-category \ category-map set-value +load-data { + [ process-names \ name-map set-value ] + [ 13 swap process-data \ simple-lower set-value ] + [ 12 swap process-data \ simple-upper set-value ] + [ 14 swap process-data + simple-upper assoc-union \ simple-title set-value ] + [ process-combining \ class-map set-value ] + [ process-canonical \ canonical-map set-value + \ combine-map set-value ] + [ process-compatibility \ compatibility-map set-value ] + [ process-category \ category-map set-value ] +} cleave load-special-casing \ special-casing set-value