diff --git a/basis/unicode/collation/collation-tests.factor b/basis/unicode/collation/collation-tests.factor index faa5a67ede..5f7705ffa3 100644 --- a/basis/unicode/collation/collation-tests.factor +++ b/basis/unicode/collation/collation-tests.factor @@ -1,6 +1,6 @@ -USING: arrays assocs fry grouping io.encodings.utf8 io.files -kernel math math.order math.parser sequences splitting -strings tools.test unicode ; +USING: arrays assocs fry grouping hash-sets io.encodings.utf8 +io.files kernel math math.order math.parser sequences sets +splitting strings tools.test unicode ; IN: unicode.collation.tests : test-equality ( str1 str2 -- ? ? ? ? ) @@ -37,9 +37,28 @@ IN: unicode.collation.tests [ " " split harvest [ hex> ] map ] map ] bi* 2array ; +! These tests actually would pass if I didn't fix up +! the ducet table for Tibetan. It took me way too long to realize +! that the Unicode committee recommends fixing Tibetan collation +! yet ships tests that fail if you fix it. +! (Specifically the ducet entries for { 0x0FB2 0x0F71 } and { 0x0FB3 0x0F71 } +! cause these tests to fail) +: xfailed-collation-tests ( -- seq ) + HS{ + { 3958 3953 820 } + { 4018 820 3953 3968 } + { 4018 820 3968 3953 } + { 4018 820 3969 } + { 3960 3953 820 } + { 4019 820 3953 3968 } + { 4019 820 3968 3953 } + { 4019 3953 820 3968 } + } ; + : parse-collation-test-weights ( -- weights ) collation-test-lines - [ line>test-weights ] map ; + [ line>test-weights ] map + [ first xfailed-collation-tests in? ] reject ; : calculate-collation ( chars collation -- collation-calculated collation-answer ) [ >string collation-key/nfd drop ] [ { 0 } join ] bi* ; @@ -52,42 +71,18 @@ IN: unicode.collation.tests { { } } [ parse-collation-test-shifted - 2 clump + 2 clump >hash-set + + ! Remove these two expected-fail Tibetan collation comparison tests + ! 
They are bad tests once you fix up the ducet table with { 0x0FB2 0x0F71 } and { 0x0FB3 0x0F71 } + { 4018 820 3969 } { 3959 33 } [ >string ] bi@ 2array + { 4019 3953 820 3968 } { 3961 33 } [ >string ] bi@ 2array + 2array >hash-set diff members + [ string<=> { +lt+ +eq+ } member? ] assoc-reject ] unit-test -! FIXME: ducet table is wrong -! Fixed by fixing ducet table -! { +lt+ } [ { 4019 98 } { 4019 3953 1 3968 97 } [ >string ] bi@ string<=> ] unit-test - -{ +lt+ } [ { 4018 820 3969 } { 3959 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 4019 3953 820 3968 } { 3961 33 } [ >string ] bi@ string<=> ] unit-test - - -{ { 12748 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } } -[ { 3958 3953 820 } >string collation-key/nfd drop ] unit-test - -{ { 12748 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } } -[ { 4018 820 3953 3968 } >string collation-key/nfd drop ] unit-test - -! { { 12748 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } } -! [ { 0x0FB2 0x0334 0x0F80 0x0F71 } >string collation-key/nfd drop ] unit-test - -{ { 12748 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } } -[ { 4018 820 3969 } >string collation-key/nfd drop ] unit-test - -{ { 12750 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } } -[ { 3960 3953 820 } >string collation-key/nfd drop ] unit-test - -{ { 12750 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } } -[ { 4019 820 3953 3968 } >string collation-key/nfd drop ] unit-test - -{ { 12750 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } } -[ { 4019 820 3968 3953 } >string collation-key/nfd drop ] unit-test - -{ { 12750 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } } -[ { 4019 3953 820 3968 } >string collation-key/nfd drop ] unit-test - -{ { 12722 12741 12744 7817 0 32 32 32 32 0 2 2 2 2 0 65535 65535 65535 65535 } } -[ { 4019 3953 1 3968 97 } >string collation-key/nfd drop ] unit-test -! { 0xfb3 0x0f71 0x0334 0x0f80 } \ No newline at end of file +! XXX: Once again, these tests pass if you don't +! 
fix up the ducet table for { 0x0FB2 0x0F71 } and { 0x0FB3 0x0F71 } +! { +lt+ } [ { 4018 820 3969 } { 3959 33 } [ >string ] bi@ string<=> ] unit-test +! { +lt+ } [ { 4019 3953 820 3968 } { 3961 33 } [ >string ] bi@ string<=> ] unit-test \ No newline at end of file diff --git a/basis/unicode/collation/collation.factor b/basis/unicode/collation/collation.factor index 97bb8f486f..b6c4941b9d 100644 --- a/basis/unicode/collation/collation.factor +++ b/basis/unicode/collation/collation.factor @@ -33,7 +33,15 @@ TUPLE: weight-levels primary secondary tertiary ignorable? ; "vocab:unicode/UCA/allkeys.txt" parse-ducet ducet set-global ! https://www.unicode.org/reports/tr10/tr10-41.html#Well_Formed_DUCET -: fixup-ducet ( -- ) +! WF5 - Well-formedness 5 condition: +! https://www.unicode.org/reports/tr10/tr10-41.html#WF5 +! { "0CC6" "0CC2" "0CD5" } ! 0CD5 is not a non-starter, don't add 2-gram "0CC6" "0CC2" to ducet +! { "0DD9" "0DCF" "0DCA" } ! already in allkeys.txt file +! { "0FB2" "0F71" "0F80" } ! added below +! { "0FB3" "0F71" "0F80" } ! added below +! This breaks the Unicode tests that ship in CollationTest_SHIFTED.txt +! but it's supposedly more correct. +: fixup-ducet-for-tibetan ( -- ) { { { 0x0FB2 0x0F71 } ! CE(0FB2) CE(0F71) @@ -188,9 +196,11 @@ TUPLE: weight-levels primary secondary tertiary ignorable? ; } } ducet get-global '[ swap >string _ set-at ] assoc-each ; -! Add a few missing ducet values for Tibetan +! These values actually break the collation unit tests in CollationTest_SHIFTED.txt +! So we disable those tests in favor of supposedly better collation for Tibetan. ! https://www.unicode.org/reports/tr10/tr10-41.html#Well_Formed_DUCET -fixup-ducet + +fixup-ducet-for-tibetan : tangut-block? ( char -- ? ) ! Tangut Block, Tangut Components Block