From 2e03abaaf6c29f7678d0f405a08360878c647438 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Sun, 28 Jul 2019 12:57:23 -0500 Subject: [PATCH] unicode.collation: illegal? is now gone, do AAAA BBBB for every code point remove some tests that pass now --- .../unicode/collation/collation-tests.factor | 42 ----- basis/unicode/collation/collation.factor | 174 ++++++++++++++++-- 2 files changed, 159 insertions(+), 57 deletions(-) diff --git a/basis/unicode/collation/collation-tests.factor b/basis/unicode/collation/collation-tests.factor index ec160fc95b..d62597b61f 100644 --- a/basis/unicode/collation/collation-tests.factor +++ b/basis/unicode/collation/collation-tests.factor @@ -61,48 +61,6 @@ IN: unicode.collation.tests { +lt+ } [ { 111355 98 } { 19968 33 } [ >string ] bi@ string<=> ] unit-test { +lt+ } [ { 40943 98 } { 64014 33 } [ >string ] bi@ string<=> ] unit-test { +lt+ } [ { 191456 98 } { 888 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 19894 98 } { 55296 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 55296 98 } { 55297 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 55297 98 } { 55298 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 55298 98 } { 55299 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 55299 98 } { 56320 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 56320 98 } { 57343 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 63743 98 } { 64976 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 64976 98 } { 64977 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 64977 98 } { 64978 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 64978 98 } { 64979 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 65520 98 } { 65534 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 65534 98 } { 65535 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 65535 98 } { 131070 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 131070 98 } { 131071 33 } [ >string ] bi@ 
string<=> ] unit-test -{ +lt+ } [ { 191457 98 } { 196606 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 196606 98 } { 196607 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 196607 98 } { 262142 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 262142 98 } { 262143 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 262143 98 } { 327678 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 327678 98 } { 327679 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 327679 98 } { 393214 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 393214 98 } { 393215 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 393215 98 } { 458750 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 458750 98 } { 458751 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 458751 98 } { 524286 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 524286 98 } { 524287 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 524287 98 } { 589822 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 589822 98 } { 589823 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 589823 98 } { 655358 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 655358 98 } { 655359 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 655359 98 } { 720894 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 720894 98 } { 720895 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 720895 98 } { 786430 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 786430 98 } { 786431 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 786432 98 } { 851966 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 851966 98 } { 851967 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 851968 98 } { 917502 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 917502 98 } { 917503 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 917509 98 } { 983038 33 } [ >string ] bi@ string<=> ] unit-test -{ +lt+ } [ { 983038 98 } { 
983039 33 } [ >string ] bi@ string<=> ] unit-test
-{ +lt+ } [ { 1114109 98 } { 1114110 33 } [ >string ] bi@ string<=> ] unit-test
-{ +lt+ } [ { 1114110 98 } { 1114111 33 } [ >string ] bi@ string<=> ] unit-test
 
 
 { { 12748 12741 0 32 74 32 0 2 2 2 0 65535 65535 65535 } }
diff --git a/basis/unicode/collation/collation.factor b/basis/unicode/collation/collation.factor
index 13c2e68b0d..fe47628256 100644
--- a/basis/unicode/collation/collation.factor
+++ b/basis/unicode/collation/collation.factor
@@ -1,7 +1,7 @@
 ! Copyright (C) 2008 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays assocs combinators
-combinators.short-circuit combinators.smart kernel locals make
+combinators.short-circuit combinators.smart fry kernel locals make
 math math.order math.parser namespaces sequences simple-flat-file
 splitting strings unicode.data ;
 IN: unicode.collation
@@ -40,10 +40,164 @@ TUPLE: weight-levels primary secondary tertiary ignorable? ;
         [ swap set-at ] 2bi
     ] if ;
 
-: insert-helpers ( assoc -- )
-    dup keys [ length 3 >= ] filter [ help-one ] with each ;
+: fixup-ducet ( -- ) ! stores each { code-points weight-levels-pair } entry below into the global ducet, keyed by the code points as a string
+    {
+        {
+            { 0x0FB2 0x0F71 } ! CE(0FB2) CE(0F71)
+            {
+                T{ weight-levels
+                    { primary 12719 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+                T{ weight-levels
+                    { primary 12741 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+            }
+        }
+        {
+            { 0x0FB3 0x0F71 } ! CE(0FB3) CE(0F71)
+            {
+                T{ weight-levels
+                    { primary 12720 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+                T{ weight-levels
+                    { primary 12741 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+            }
+        }
 
-ducet get-global insert-helpers
+        ! FIXME: WRONG WEIGHTS - the second CE of every entry below still reuses CE(0F71) (12741) as a placeholder
+        {
+            { 0x0FB2 0x0F71 0x0F72 } ! CE(0FB2) CE(0F71 0F72)
+            {
+                T{ weight-levels
+                    { primary 12719 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+                T{ weight-levels
+                    { primary 12741 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+            }
+        }
+        {
+            { 0x0FB2 0x0F73 } ! CE(0FB2) CE(0F71 0F72)
+            {
+                T{ weight-levels
+                    { primary 12719 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+                T{ weight-levels
+                    { primary 12741 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+            }
+        }
+        {
+            { 0x0FB2 0x0F71 0x0F74 } ! CE(0FB2) CE(0F71 0F74)
+            {
+                T{ weight-levels
+                    { primary 12719 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+                T{ weight-levels
+                    { primary 12741 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+            }
+        }
+        {
+            { 0x0FB2 0x0F75 } ! CE(0FB2) CE(0F71 0F74)
+            {
+                T{ weight-levels
+                    { primary 12719 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+                T{ weight-levels
+                    { primary 12741 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+            }
+        }
+        {
+            { 0x0FB3 0x0F71 0x0F72 } ! CE(0FB3) CE(0F71 0F72)
+            {
+                T{ weight-levels
+                    { primary 12720 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+                T{ weight-levels
+                    { primary 12741 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+            }
+        }
+        {
+            { 0x0FB3 0x0F73 } ! CE(0FB3) CE(0F71 0F72)
+            {
+                T{ weight-levels
+                    { primary 12720 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+                T{ weight-levels
+                    { primary 12741 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+            }
+        }
+        {
+            { 0x0FB3 0x0F71 0x0F74 } ! CE(0FB3) CE(0F71 0F74)
+            {
+                T{ weight-levels
+                    { primary 12720 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+                T{ weight-levels
+                    { primary 12741 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+            }
+        }
+        {
+            { 0x0FB3 0x0F75 } ! CE(0FB3) CE(0F71 0F74)
+            {
+                T{ weight-levels
+                    { primary 12720 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+                T{ weight-levels
+                    { primary 12741 }
+                    { secondary 32 }
+                    { tertiary 2 }
+                }
+            }
+        }
+    } ducet get-global '[ swap >string _ set-at ] assoc-each ;
+
+! Add a few missing ducet values
+fixup-ducet
 
 : tangut-block? ( char -- ? )
     ! Tangut Block, Tangut Components Block
@@ -73,22 +227,12 @@ ducet get-global insert-helpers
 : BBBB ( char -- weight-levels )
     0x7FFF bitand 0x8000 bitor 0 0 ; inline
 
-: illegal? ( char -- ? )
-    {
-        [ "Noncharacter_Code_Point" property?
]
-        [ category "Cs" = ]
-    } 1|| ;
-
 : derive-weight ( 1string -- weight-levels-pair )
     first
     dup tangut-block? [
         [ tangut-AAAA ] [ tangut-BBBB ] bi 2array
     ] [
-        dup illegal? [
-            drop { }
-        ] [
-            [ AAAA ] [ BBBB ] bi 2array
-        ] if
+        [ AAAA ] [ BBBB ] bi 2array ! no illegal? guard any more: every char failing tangut-block? gets the AAAA/BBBB pair
     ] if ;
 
 : building-last ( -- char )