Unicode collation done, except for tailoring. Normalization test suite

db4
Daniel Ehrenberg 2008-05-25 12:40:12 -05:00
parent 5646d067e4
commit 8ac9d9d9b2
5 changed files with 17864 additions and 24 deletions

@@ -10,16 +10,8 @@ IN: unicode.collation.tests
: test-two ( str1 str2 -- )
[ +lt+ ] -rot [ string<=> ] 2curry unit-test ;
: find-failure
parse-test dup 2 <clumps>
[ string<=> +lt+ = not ] assoc-find drop ;
: (find-failure)
dup 2 <clumps>
[ string<=> +lt+ = not ] assoc-find drop ;
: failures
parse-test dup 2 <clumps>
[ string<=> +lt+ = not ] assoc-filter dup assoc-size ;
[ 7 ] [ failures 2nip ] unit-test
parse-test 2 <clumps> [ test-two ] assoc-each
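
(Aside, not part of the commit: the line above works because 2 <clumps> yields
overlapping pairs, so each parsed test string is compared against its
successor, and test-two then requires string<=> to answer +lt+. A minimal
sketch of the pairing step, assuming the grouping vocabulary of a present-day
Factor and clump, the eager counterpart of <clumps>:)

USING: grouping prettyprint ;
{ "a" "ab" "b" } 2 clump .
! => { { "a" "ab" } { "ab" "b" } } -- each pair then feeds test-two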

@@ -76,28 +76,31 @@ ascii <file-reader> parse-ducet \ ducet set-value
[ drop ] [ 1string , ] if
] if ;
! The terminator method for Hangul syllables
: terminator 1 0 0 f weight boa ;
MACRO: const ( seq -- seq )
[ dup word? [ execute ] when ] deep-map 1quotation ;
! : char, ( char -- )
! [
! building get peek [ first ] bi@ dup jamo? [
! over jamo? [
! [ grapheme-class ] bi@ swap 2array
! { { T L } { V L } { V T } } const
! member? [ terminator , ] when
! ] [ 2drop terminator , ] if
! ] [ 2drop ] if
! ] [ , ] bi ;
: char, ( char -- )
[
building get peek [ first ] bi@ dup jamo? [
over jamo? [
[ grapheme-class ] bi@ swap 2array
{ { T L } { V L } { V T } } const
member? [ terminator , ] when
] [ 2drop terminator , ] if
] [ 2drop ] if
] [ , ] bi ;
! : insert-terminators ( graphemes -- graphemes )
: insert-terminators ( graphemes -- graphemes )
! Insert a terminator between hangul syllables
! [ unclip , [ char, ] each ] { } make ;
[ unclip , [ char, ] each ] { } make ;
! The above code for the terminator method is not used
! The test suite passes without it
: string>graphemes ( string -- graphemes )
[ [ add ] each ] { } make ; ! insert-terminators ;
[ [ add ] each ] { } make ;
: graphemes>weights ( graphemes -- weights )
[
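
(Aside, not from the commit: the disabled char, logic keys off the L/V/T
conjoining-jamo classes that jamo? and grapheme-class report. The sketch below
is a self-contained illustration of that classification by code point block,
using a hypothetical word name sketch-jamo-class and present-day 0x literal
syntax; it is not the vocabulary's own implementation.)

USING: combinators kernel math.order prettyprint ;

: sketch-jamo-class ( char -- class/f )
    {
        { [ dup 0x1100 0x115f between? ] [ drop "L" ] } ! leading consonants
        { [ dup 0x1160 0x11a7 between? ] [ drop "V" ] } ! vowels
        { [ dup 0x11a8 0x11ff between? ] [ drop "T" ] } ! trailing consonants
        [ drop f ]
    } cond ;

0x1112 sketch-jamo-class . ! "L" (HIEUH)
0x1161 sketch-jamo-class . ! "V" (A)
0x11ab sketch-jamo-class . ! "T" (NIEUN)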

File diff suppressed because it is too large

@@ -1,4 +1,7 @@
USING: unicode.normalize kernel tools.test sequences ;
USING: unicode.normalize kernel tools.test sequences
unicode.data io.encodings.utf8 io.files splitting math.parser
locals math quotations assocs combinators ;
IN: unicode.normalize.tests
[ "ab\u000323\u000302cd" ] [ "ab\u000302" "\u000323cd" string-append ] unit-test
@@ -16,3 +19,26 @@ USING: unicode.normalize kernel tools.test sequences ;
[ f ] [ 0 hangul? ] unit-test
[ "\u001112\u001161\u0011ab" ] [ "\u00d55c" nfd ] unit-test
[ "\u00d55c" ] [ "\u001112\u001161\u0011ab" nfc ] unit-test
: parse-test ( -- tests )
"resource:extra/unicode/normalize/NormalizationTest.txt"
utf8 file-lines filter-comments
[ ";" split 5 head [ " " split [ hex> ] "" map-as ] map ] map ;
:: assert= ( test spec quot -- )
spec [
[
[ 1- test nth ] bi@
[ 1quotation ] [ quot curry ] bi* unit-test
] with each
] assoc-each ;
: run-line ( test -- )
{
[ { { 2 { 1 2 3 } } { 4 { 4 5 } } } [ nfc ] assert= ]
[ { { 3 { 1 2 3 } } { 5 { 4 5 } } } [ nfd ] assert= ]
[ { { 4 { 1 2 3 4 5 } } } [ nfkc ] assert= ]
[ { { 5 { 1 2 3 4 5 } } } [ nfkd ] assert= ]
} cleave ;
! parse-test [ run-line ] each
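
(Aside, not part of the commit: each NormalizationTest.txt line carries five
columns c1-c5, and the spec assocs in run-line encode the invariants stated in
that file's header: c2 = NFC(c1) = NFC(c2) = NFC(c3) and c4 = NFC(c4) = NFC(c5);
c3 = NFD(c1) = NFD(c2) = NFD(c3) and c5 = NFD(c4) = NFD(c5); c4 = NFKC(cN) and
c5 = NFKD(cN) for every column. A hand-checked sketch for one representative
Part 1 line, written with the same \u escape convention used above:)

! "1E0A;1E0A;0044 0307;1E0A;0044 0307;" -- c1 = c2 = c4 = U+1E0A,
! c3 = c5 = "D" followed by U+0307
USING: tools.test unicode.normalize ;
[ "\u001e0a" ] [ "D\u000307" nfc ] unit-test  ! c2 = NFC(c3)
[ "D\u000307" ] [ "\u001e0a" nfd ] unit-test  ! c3 = NFD(c1)
[ "\u001e0a" ] [ "\u001e0a" nfkc ] unit-test  ! c4 = NFKC(c1)
[ "D\u000307" ] [ "\u001e0a" nfkd ] unit-test ! c5 = NFKD(c1)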

@@ -27,7 +27,7 @@ IN: unicode.normalize
hangul-base - final-count /mod final-base +
>r medial-count /mod medial-base +
>r initial-base + r> r>
dup zero? [ drop 2array ] [ 3array ] if ;
dup final-base = [ drop 2array ] [ 3array ] if ;
: jamo>hangul ( initial medial final -- hangul )
>r >r initial-base - medial-count *
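
(Aside, not part of the commit, but a hand-worked check of the changed
comparison above, assuming the standard Hangul constants hangul-base 0xAC00,
initial-base 0x1100, medial-base 0x1161, final-base 0x11A7, medial-count 21
and final-count 28:)

U+D55C: 0xD55C - 0xAC00 = 10588
    10588 28 /mod  ->  378 4    final   = 0x11A7 + 4  = 0x11AB
      378 21 /mod  ->   18 0    medial  = 0x1161 + 0  = 0x1161
                                initial = 0x1100 + 18 = 0x1112
    final is not final-base, so the 3array branch yields the three jamo
    { 0x1112 0x1161 0x11AB }, matching the "\u00d55c" nfd test earlier.

U+AC00: the offset is 0, so the final works out to 0x11A7 + 0, i.e. exactly
    final-base and never zero. The old zero? test therefore kept a spurious
    trailing U+11A7 on syllables with no final jamo; comparing against
    final-base drops it and returns the 2array instead.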