Fixing Unicode collation, except for Korean

2008-05-25 02:51:27 -05:00 · 2008-05-25 02:51:27 -05:00 · 5646d067e4
parent 8ea775f9b5
commit 5646d067e4
4 changed files with 99 additions and 49 deletions
--- a/extra/unicode/categories/categories-tests.factor
+++ b/extra/unicode/categories/categories-tests.factor
@ -5,3 +5,7 @@ USING: tools.test kernel unicode.categories words sequences unicode.syntax ;
    printable? alpha? control? uncased? character? 
 } [ execute ] with map ] unit-test
 [ "Nd" ] [ CHAR: 3 category ] unit-test
 [ "Lo" ] [ HEX: 3400 category ] unit-test
 [ "Lo" ] [ HEX: 3450 category ] unit-test
 [ "Lo" ] [ HEX: 4DB5 category ] unit-test
 [ "Cs" ] [ HEX: DD00 category ] unit-test
--- a/extra/unicode/collation/collation-tests.factor
+++ b/extra/unicode/collation/collation-tests.factor
@ -21,3 +21,5 @@ IN: unicode.collation.tests
 : failures
    parse-test dup 2 <clumps>
    [ string<=> +lt+ = not ] assoc-filter dup assoc-size ;
 [ 7 ] [ failures 2nip ] unit-test
--- a/extra/unicode/collation/collation.factor
+++ b/extra/unicode/collation/collation.factor
@ -1,7 +1,9 @@
 USING: sequences io.files io.encodings.ascii kernel values
 splitting accessors math.parser ascii io assocs strings math
 namespaces sorting combinators math.order arrays
-unicode.normalize unicode.data combinators.lib locals ;
+unicode.normalize unicode.data combinators.lib locals
 unicode.syntax macros sequences.deep words unicode.breaks
 quotations ;
 IN: unicode.collation
 VALUE: ducet
@ -27,9 +29,12 @@ TUPLE: weight primary secondary tertiary ignorable? ;
 ascii <file-reader> parse-ducet \ ducet set-value
 : base ( char -- base )
-    dup "Unified_Ideograph" property?
+    {
-    [ -16 shift zero? HEX: FB40 HEX: FB80 ? ]
+        { [ dup HEX: 3400 HEX:  4DB5 between? ] [ drop HEX: FB80 ] } ! Extension A
-    [ drop HEX: FBC0 ] if ;
+        { [ dup HEX: 20000 HEX: 2A6D6 between? ] [ drop HEX: FB80 ] } ! Extension B
        { [ dup HEX: 4E00 HEX: 9FC3 between? ] [ drop HEX: FB40 ] } ! CJK
        [ drop HEX: FBC0 ] ! Other
    } cond ;
 : AAAA ( char -- weight )
    [ base ] [ -15 shift ] bi + HEX: 20 2 f weight boa ;
@ -37,8 +42,12 @@ ascii <file-reader> parse-ducet \ ducet set-value
 : BBBB ( char -- weight )
    HEX: 7FFF bitand HEX: 8000 bitor 0 0 f weight boa ;
 : illegal? ( char -- ? )
    [ "Noncharacter_Code_Point" property? ]
    [ category "Cs" = ] or? ;
 : derive-weight ( char -- weights )
-    first dup "Noncharacter_Code_Point" property?
+    first dup illegal?
    [ drop { } ]
    [ [ AAAA ] [ BBBB ] bi 2array ] if ;
@ -67,12 +76,34 @@ ascii <file-reader> parse-ducet \ ducet set-value
        [ drop ] [ 1string , ] if
    ] if ;
 : terminator 1 0 0 f weight boa ;
 MACRO: const ( seq -- seq )
    [ dup word? [ execute ] when ] deep-map 1quotation ;
 ! : char, ( char -- )
    ! [
        ! building get peek [ first ] bi@ dup jamo? [
            ! over jamo? [
                ! [ grapheme-class ] bi@ swap 2array
                ! { { T L } { V L } { V T } } const
                ! member? [ terminator , ] when
            ! ] [ 2drop terminator , ] if
        ! ] [ 2drop ] if
    ! ] [ , ] bi ;
 ! : insert-terminators ( graphemes -- graphemes )
    ! Insert a terminator between hangul syllables
 !     [ unclip , [ char, ] each ] { } make ;
 : string>graphemes ( string -- graphemes )
-    [ [ add ] each ] { } make ;
+    [ [ add ] each ] { } make ; ! insert-terminators ;
 : graphemes>weights ( graphemes -- weights )
-    [ dup ducet at [ ] [ derive-weight ] ?if ]
+    [
-    { } map-as concat ;
+        dup weight? [ 1array ] ! From tailoring
        [ dup ducet at [ ] [ derive-weight ] ?if ] if
    ] { } map-as concat ;
 : append-weights ( weights quot -- )
    swap [ ignorable?>> not ] filter
--- a/extra/unicode/data/data.factor
+++ b/extra/unicode/data/data.factor
@ -2,9 +2,30 @@ USING: assocs math kernel sequences io.files hashtables
 quotations splitting arrays math.parser hash2 math.order
 byte-arrays words namespaces words compiler.units parser
 io.encodings.ascii values interval-maps ascii sets assocs.lib
-combinators.lib ;
+combinators.lib combinators locals math.ranges sorting ;
 IN: unicode.data
 VALUE: simple-lower
 VALUE: simple-upper
 VALUE: simple-title
 VALUE: canonical-map
 VALUE: combine-map
 VALUE: class-map
 VALUE: compatibility-map
 VALUE: category-map
 VALUE: name-map
 VALUE: special-casing
 VALUE: properties
 : canonical-entry ( char -- seq ) canonical-map at ;
 : combine-chars ( a b -- char/f ) combine-map hash2 ;
 : compatibility-entry ( char -- seq ) compatibility-map at  ;
 : combining-class ( char -- n ) class-map at ;
 : non-starter? ( char -- ? ) class-map key? ;
 : name>char ( string -- char ) name-map at ;
 : char>name ( char -- string ) name-map value-at ;
 : property? ( char property -- ? ) properties at interval-key? ;
 ! Convenience functions
 : ?between? ( n/f from to -- ? )
    pick [ between? ] [ 3drop f ] if ;
@ -69,23 +90,34 @@ IN: unicode.data
 : categories ( -- names )
    ! For non-existent characters, use Cn
-    { "Lu" "Ll" "Lt" "Lm" "Lo"
+    { "Cn"
      "Lu" "Ll" "Lt" "Lm" "Lo"
      "Mn" "Mc" "Me"
      "Nd" "Nl" "No"
      "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
      "Sm" "Sc" "Sk" "So"
      "Zs" "Zl" "Zp"
-      "Cc" "Cf" "Cs" "Co" "Cn" } ;
+      "Cc" "Cf" "Cs" "Co" } ;
-: unicode-chars HEX: 2FA1E ;
+: num-chars HEX: 2FA1E ;
 ! the maximum unicode char in the first 3 planes
-: process-category ( data -- category-listing )
+: ?set-nth ( val index seq -- )
-    2 swap (process-data)
+    2dup bounds-check? [ set-nth ] [ 3drop ] if ;
-    unicode-chars <byte-array> swap dupd swap [
+
-        >r over unicode-chars >= [ r> 3drop ]
+:: fill-ranges ( table -- table )
-        [ categories index swap r> set-nth ] if
+    name-map >alist sort-values keys
-    ] curry assoc-each ;
+    [ [ "first>" tail? ] [ "last>" tail? ] or? ] filter
    2 group [
        [ name>char ] bi@ [ [a,b] ] [ table ?nth ] bi
        [ swap table ?set-nth ] curry each
    ] assoc-each table ;
 :: process-category ( data -- category-listing )
    [let | table [ num-chars <byte-array> ] |
        2 data (process-data) [| char cat |
            cat categories index char table ?set-nth
        ] assoc-each table fill-ranges ] ;
 : ascii-lower ( string -- lower )
    [ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
@ -131,37 +163,18 @@ C: <code-point> code-point
    [ length 5 = ] filter
    [ [ set-code-point ] each ] H{ } make-assoc ;
-VALUE: simple-lower
+load-data {
-VALUE: simple-upper
+    [ process-names \ name-map set-value ]
-VALUE: simple-title
+    [ 13 swap process-data \ simple-lower set-value ]
-VALUE: canonical-map
+    [ 12 swap process-data \ simple-upper set-value ]
-VALUE: combine-map
+    [ 14 swap process-data
-VALUE: class-map
+        simple-upper assoc-union \ simple-title set-value ]
-VALUE: compatibility-map
+    [ process-combining \ class-map set-value ]
-VALUE: category-map
+    [ process-canonical \ canonical-map set-value
-VALUE: name-map
+        \ combine-map set-value ]
-VALUE: special-casing
+    [ process-compatibility \ compatibility-map set-value ]
-VALUE: properties
+    [ process-category \ category-map set-value ]
-
+} cleave
 : canonical-entry ( char -- seq ) canonical-map at ;
 : combine-chars ( a b -- char/f ) combine-map hash2 ;
 : compatibility-entry ( char -- seq ) compatibility-map at  ;
 : combining-class ( char -- n ) class-map at ;
 : non-starter? ( char -- ? ) class-map key? ;
 : name>char ( string -- char ) name-map at ;
 : char>name ( char -- string ) name-map value-at ;
 : property? ( char property -- ? ) properties at interval-key? ;
 load-data
 dup process-names \ name-map set-value
 13 over process-data \ simple-lower set-value
 12 over process-data tuck \ simple-upper set-value
 14 over process-data swapd assoc-union \ simple-title set-value
 dup process-combining \ class-map set-value
 dup process-canonical \ canonical-map set-value
    \ combine-map set-value
 dup process-compatibility \ compatibility-map set-value
 process-category \ category-map set-value
 load-special-casing \ special-casing set-value