Fixing Unicode collation, except for Korean

db4
Daniel Ehrenberg 2008-05-25 02:51:27 -05:00
parent 8ea775f9b5
commit 5646d067e4
4 changed files with 99 additions and 49 deletions

View File

@@ -5,3 +5,7 @@ USING: tools.test kernel unicode.categories words sequences unicode.syntax ;
 printable? alpha? control? uncased? character?
 } [ execute ] with map ] unit-test
 [ "Nd" ] [ CHAR: 3 category ] unit-test
+[ "Lo" ] [ HEX: 3400 category ] unit-test
+[ "Lo" ] [ HEX: 3450 category ] unit-test
+[ "Lo" ] [ HEX: 4DB5 category ] unit-test
+[ "Cs" ] [ HEX: DD00 category ] unit-test
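The new tests cover code points whose category was previously missing from the table: U+3400, U+3450 and U+4DB5 lie in CJK Unified Ideographs Extension A (general category Lo, now handled by the range filling added to unicode.data below), and U+DD00 is a surrogate (Cs). As an outside sanity check only, not part of this commit, the same expectations can be verified against CPython's unicodedata module, assuming a reasonably recent Unicode database:

    import unicodedata

    # Expected general categories, cross-checked against CPython's tables.
    for cp, expected in [(0x3400, "Lo"), (0x3450, "Lo"), (0x4DB5, "Lo")]:
        assert unicodedata.category(chr(cp)) == expected

    # U+DD00 is an unpaired surrogate; its general category is Cs.
    assert unicodedata.category("\udd00") == "Cs"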

View File

@@ -21,3 +21,5 @@ IN: unicode.collation.tests
 : failures
     parse-test dup 2 <clumps>
     [ string<=> +lt+ = not ] assoc-filter dup assoc-size ;
+
+[ 7 ] [ failures 2nip ] unit-test

View File

@@ -1,7 +1,9 @@
 USING: sequences io.files io.encodings.ascii kernel values
 splitting accessors math.parser ascii io assocs strings math
 namespaces sorting combinators math.order arrays
-unicode.normalize unicode.data combinators.lib locals ;
+unicode.normalize unicode.data combinators.lib locals
+unicode.syntax macros sequences.deep words unicode.breaks
+quotations ;
 IN: unicode.collation
 
 VALUE: ducet
@@ -27,9 +29,12 @@ TUPLE: weight primary secondary tertiary ignorable? ;
 ascii <file-reader> parse-ducet \ ducet set-value
 
 : base ( char -- base )
-    dup "Unified_Ideograph" property?
-    [ -16 shift zero? HEX: FB40 HEX: FB80 ? ]
-    [ drop HEX: FBC0 ] if ;
+    {
+        { [ dup HEX: 3400 HEX: 4DB5 between? ] [ drop HEX: FB80 ] } ! Extension A
+        { [ dup HEX: 20000 HEX: 2A6D6 between? ] [ drop HEX: FB80 ] } ! Extension B
+        { [ dup HEX: 4E00 HEX: 9FC3 between? ] [ drop HEX: FB40 ] } ! CJK
+        [ drop HEX: FBC0 ] ! Other
+    } cond ;
 
 : AAAA ( char -- weight )
     [ base ] [ -15 shift ] bi + HEX: 20 2 f weight boa ;
@@ -37,8 +42,12 @@ ascii <file-reader> parse-ducet \ ducet set-value
 : BBBB ( char -- weight )
     HEX: 7FFF bitand HEX: 8000 bitor 0 0 f weight boa ;
 
+: illegal? ( char -- ? )
+    [ "Noncharacter_Code_Point" property? ]
+    [ category "Cs" = ] or? ;
+
 : derive-weight ( char -- weights )
-    first dup "Noncharacter_Code_Point" property?
+    first dup illegal?
     [ drop { } ]
     [ [ AAAA ] [ BBBB ] bi 2array ] if ;
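Taken together, base, AAAA and BBBB implement the UCA's implicit (derived) weights for characters that have no DUCET entry: the primary weight is split across two collation elements, one carrying a per-block base plus the high bits of the code point, the other carrying the low 15 bits with the top bit set; illegal? now also discards surrogates, not only noncharacters. A minimal sketch of the same arithmetic in Python (illustrative only; the names are placeholders, not part of the vocabulary):

    def base(cp):
        # Per-block base for implicit weights (mirrors the cond above).
        if 0x3400 <= cp <= 0x4DB5:   return 0xFB80   # CJK Extension A
        if 0x20000 <= cp <= 0x2A6D6: return 0xFB80   # CJK Extension B
        if 0x4E00 <= cp <= 0x9FC3:   return 0xFB40   # CJK Unified Ideographs
        return 0xFBC0                                # everything else

    def derive_weights(cp):
        # Two collation elements: [AAAA 0020 0002] followed by [BBBB 0000 0000].
        aaaa = base(cp) + (cp >> 15)
        bbbb = (cp & 0x7FFF) | 0x8000
        return [(aaaa, 0x20, 0x02), (bbbb, 0x00, 0x00)]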
@@ -67,12 +76,34 @@ ascii <file-reader> parse-ducet \ ducet set-value
         [ drop ] [ 1string , ] if
     ] if ;
 
 : terminator 1 0 0 f weight boa ;
 
+MACRO: const ( seq -- seq )
+    [ dup word? [ execute ] when ] deep-map 1quotation ;
+
+! : char, ( char -- )
+!     [
+!         building get peek [ first ] bi@ dup jamo? [
+!             over jamo? [
+!                 [ grapheme-class ] bi@ swap 2array
+!                 { { T L } { V L } { V T } } const
+!                 member? [ terminator , ] when
+!             ] [ 2drop terminator , ] if
+!         ] [ 2drop ] if
+!     ] [ , ] bi ;
+
+! : insert-terminators ( graphemes -- graphemes )
+!     ! Insert a terminator between hangul syllables
+!     [ unclip , [ char, ] each ] { } make ;
+
 : string>graphemes ( string -- graphemes )
-    [ [ add ] each ] { } make ;
+    [ [ add ] each ] { } make ; ! insert-terminators ;
 
 : graphemes>weights ( graphemes -- weights )
-    [ dup ducet at [ ] [ derive-weight ] ?if ]
-    { } map-as concat ;
+    [
+        dup weight? [ 1array ] ! From tailoring
+        [ dup ducet at [ ] [ derive-weight ] ?if ] if
+    ] { } map-as concat ;
 
 : append-weights ( weights quot -- )
     swap [ ignorable?>> not ] filter
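graphemes>weights now lets pre-built weight objects (from tailoring) pass straight through, and only consults the DUCET or derives implicit weights for ordinary graphemes. The commented-out char, and insert-terminators words are the unfinished Korean piece from the commit message: the idea is to emit the low terminator element after each Hangul syllable so that a syllable with fewer jamo sorts before a longer one. A rough sketch of that terminator scheme in Python, with syllable segmentation assumed to happen elsewhere (names are placeholders):

    TERMINATOR = (0x0001, 0x0000, 0x0000)   # mirrors `terminator` above

    def weights_with_terminators(syllables, weights_for):
        # syllables: list of jamo sequences; weights_for: jamo sequence -> weights.
        out = []
        for syllable in syllables:
            out.extend(weights_for(syllable))
            out.append(TERMINATOR)          # close off the syllable
        return out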

View File

@@ -2,9 +2,30 @@ USING: assocs math kernel sequences io.files hashtables
 quotations splitting arrays math.parser hash2 math.order
 byte-arrays words namespaces words compiler.units parser
 io.encodings.ascii values interval-maps ascii sets assocs.lib
-combinators.lib ;
+combinators.lib combinators locals math.ranges sorting ;
 IN: unicode.data
 
+VALUE: simple-lower
+VALUE: simple-upper
+VALUE: simple-title
+VALUE: canonical-map
+VALUE: combine-map
+VALUE: class-map
+VALUE: compatibility-map
+VALUE: category-map
+VALUE: name-map
+VALUE: special-casing
+VALUE: properties
+
+: canonical-entry ( char -- seq ) canonical-map at ;
+: combine-chars ( a b -- char/f ) combine-map hash2 ;
+: compatibility-entry ( char -- seq ) compatibility-map at ;
+: combining-class ( char -- n ) class-map at ;
+: non-starter? ( char -- ? ) class-map key? ;
+: name>char ( string -- char ) name-map at ;
+: char>name ( char -- string ) name-map value-at ;
+: property? ( char property -- ? ) properties at interval-key? ;
+
 ! Convenience functions
 : ?between? ( n/f from to -- ? )
     pick [ between? ] [ 3drop f ] if ;
@@ -69,23 +90,34 @@ IN: unicode.data
 : categories ( -- names )
     ! For non-existent characters, use Cn
-    { "Lu" "Ll" "Lt" "Lm" "Lo"
+    { "Cn"
+      "Lu" "Ll" "Lt" "Lm" "Lo"
       "Mn" "Mc" "Me"
       "Nd" "Nl" "No"
       "Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"
       "Sm" "Sc" "Sk" "So"
       "Zs" "Zl" "Zp"
-      "Cc" "Cf" "Cs" "Co" "Cn" } ;
+      "Cc" "Cf" "Cs" "Co" } ;
 
-: unicode-chars HEX: 2FA1E ;
+: num-chars HEX: 2FA1E ;
 ! the maximum unicode char in the first 3 planes
 
-: process-category ( data -- category-listing )
-    2 swap (process-data)
-    unicode-chars <byte-array> swap dupd swap [
-        >r over unicode-chars >= [ r> 3drop ]
-        [ categories index swap r> set-nth ] if
-    ] curry assoc-each ;
+: ?set-nth ( val index seq -- )
+    2dup bounds-check? [ set-nth ] [ 3drop ] if ;
+
+:: fill-ranges ( table -- table )
+    name-map >alist sort-values keys
+    [ [ "first>" tail? ] [ "last>" tail? ] or? ] filter
+    2 group [
+        [ name>char ] bi@ [ [a,b] ] [ table ?nth ] bi
+        [ swap table ?set-nth ] curry each
+    ] assoc-each table ;
+
+:: process-category ( data -- category-listing )
+    [let | table [ num-chars <byte-array> ] |
+        2 data (process-data) [| char cat |
+            cat categories index char table ?set-nth
+        ] assoc-each table fill-ranges ] ;
 
 : ascii-lower ( string -- lower )
     [ dup CHAR: A CHAR: Z between? [ HEX: 20 + ] when ] map ;
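fill-ranges handles the blocks that UnicodeData.txt only lists as a "<..., First>" / "<..., Last>" pair (CJK ideographs, Hangul syllables, private use, surrogates): it pairs up those names, reads the category stored at the first code point, and copies it across the whole range, with ?set-nth silently ignoring code points beyond num-chars. A sketch of the same filling step in Python, with hypothetical names and assuming a mutable category table indexed by code point:

    def fill_ranges(table, name_to_char):
        # Names ending in "first>" / "last>" mark ranged entries in UnicodeData.txt.
        range_names = sorted(
            (n for n in name_to_char if n.endswith(("first>", "last>"))),
            key=name_to_char.get)
        for first, last in zip(range_names[::2], range_names[1::2]):
            lo, hi = name_to_char[first], name_to_char[last]
            category = table[lo]            # both endpoints carry the same category
            for cp in range(lo, hi + 1):
                if cp < len(table):         # like ?set-nth: ignore out-of-range
                    table[cp] = category
        return table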
@@ -131,37 +163,18 @@ C: <code-point> code-point
     [ length 5 = ] filter
     [ [ set-code-point ] each ] H{ } make-assoc ;
 
-VALUE: simple-lower
-VALUE: simple-upper
-VALUE: simple-title
-VALUE: canonical-map
-VALUE: combine-map
-VALUE: class-map
-VALUE: compatibility-map
-VALUE: category-map
-VALUE: name-map
-VALUE: special-casing
-VALUE: properties
-
-: canonical-entry ( char -- seq ) canonical-map at ;
-: combine-chars ( a b -- char/f ) combine-map hash2 ;
-: compatibility-entry ( char -- seq ) compatibility-map at ;
-: combining-class ( char -- n ) class-map at ;
-: non-starter? ( char -- ? ) class-map key? ;
-: name>char ( string -- char ) name-map at ;
-: char>name ( char -- string ) name-map value-at ;
-: property? ( char property -- ? ) properties at interval-key? ;
-
-load-data
-dup process-names \ name-map set-value
-13 over process-data \ simple-lower set-value
-12 over process-data tuck \ simple-upper set-value
-14 over process-data swapd assoc-union \ simple-title set-value
-dup process-combining \ class-map set-value
-dup process-canonical \ canonical-map set-value
-\ combine-map set-value
-dup process-compatibility \ compatibility-map set-value
-process-category \ category-map set-value
+load-data {
+    [ process-names \ name-map set-value ]
+    [ 13 swap process-data \ simple-lower set-value ]
+    [ 12 swap process-data \ simple-upper set-value ]
+    [ 14 swap process-data
+        simple-upper assoc-union \ simple-title set-value ]
+    [ process-combining \ class-map set-value ]
+    [ process-canonical \ canonical-map set-value
+        \ combine-map set-value ]
+    [ process-compatibility \ compatibility-map set-value ]
+    [ process-category \ category-map set-value ]
+} cleave
 
 load-special-casing \ special-casing set-value
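The load-data refactor at the end replaces a chain of dup/over/tuck stack shuffling with the cleave combinator, which applies each quotation in the array to the same parsed data. The same shape in Python, for readers unfamiliar with cleave (the step names are placeholders for the processing words above):

    def cleave(value, steps):
        # Apply every step to the same value, for its side effects.
        for step in steps:
            step(value)

    # cleave(load_data(), [process_names, process_combining, process_category, ...])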