Unicode collation done, except for tailoring. Normalization test suite

db4
Daniel Ehrenberg 2008-05-25 12:40:12 -05:00
parent 5646d067e4
commit 8ac9d9d9b2
5 changed files with 17864 additions and 24 deletions

@@ -10,16 +10,8 @@ IN: unicode.collation.tests
: test-two ( str1 str2 -- )
[ +lt+ ] -rot [ string<=> ] 2curry unit-test ;
: find-failure
parse-test dup 2 <clumps>
[ string<=> +lt+ = not ] assoc-find drop ;
: (find-failure)
dup 2 <clumps>
[ string<=> +lt+ = not ] assoc-find drop ;
: failures
parse-test dup 2 <clumps>
[ string<=> +lt+ = not ] assoc-filter dup assoc-size ;
[ 7 ] [ failures 2nip ] unit-test
parse-test 2 <clumps> [ test-two ] assoc-each
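
(Aside, not part of the commit: the line above works because 2 <clumps> yields
overlapping pairs, so each parsed test string is compared against its
successor, and test-two then requires string<=> to answer +lt+. A minimal
sketch of the pairing step, assuming the grouping vocabulary of a present-day
Factor and clump, the eager counterpart of <clumps>:)

USING: grouping prettyprint ;
{ "a" "ab" "b" } 2 clump .
! => { { "a" "ab" } { "ab" "b" } } -- each pair then feeds test-two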

@@ -76,28 +76,31 @@ ascii <file-reader> parse-ducet \ ducet set-value
[ drop ] [ 1string , ] if
] if ;
! The terminator method for Hangul syllables
: terminator 1 0 0 f weight boa ;
MACRO: const ( seq -- seq )
[ dup word? [ execute ] when ] deep-map 1quotation ;
! : char, ( char -- )
! [
! building get peek [ first ] bi@ dup jamo? [
! over jamo? [
! [ grapheme-class ] bi@ swap 2array
! { { T L } { V L } { V T } } const
! member? [ terminator , ] when
! ] [ 2drop terminator , ] if
! ] [ 2drop ] if
! ] [ , ] bi ;
: char, ( char -- )
[
building get peek [ first ] bi@ dup jamo? [
over jamo? [
[ grapheme-class ] bi@ swap 2array
{ { T L } { V L } { V T } } const
member? [ terminator , ] when
] [ 2drop terminator , ] if
] [ 2drop ] if
] [ , ] bi ;
! : insert-terminators ( graphemes -- graphemes )
: insert-terminators ( graphemes -- graphemes )
! Insert a terminator between hangul syllables
! [ unclip , [ char, ] each ] { } make ;
[ unclip , [ char, ] each ] { } make ;
! The above code for the terminator method is not used
! The test suite passes without it
: string>graphemes ( string -- graphemes )
[ [ add ] each ] { } make ; ! insert-terminators ;
[ [ add ] each ] { } make ;
: graphemes>weights ( graphemes -- weights )
[
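
(Aside, not from the commit: the disabled char, logic keys off the L/V/T
conjoining-jamo classes that jamo? and grapheme-class report. The sketch below
is a self-contained illustration of that classification by code point block,
using a hypothetical word name sketch-jamo-class and present-day 0x literal
syntax; it is not the vocabulary's own implementation.)

USING: combinators kernel math.order prettyprint ;

: sketch-jamo-class ( char -- class/f )
    {
        { [ dup 0x1100 0x115f between? ] [ drop "L" ] } ! leading consonants
        { [ dup 0x1160 0x11a7 between? ] [ drop "V" ] } ! vowels
        { [ dup 0x11a8 0x11ff between? ] [ drop "T" ] } ! trailing consonants
        [ drop f ]
    } cond ;

0x1112 sketch-jamo-class . ! "L" (HIEUH)
0x1161 sketch-jamo-class . ! "V" (A)
0x11ab sketch-jamo-class . ! "T" (NIEUN)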

File diff suppressed because it is too large

@@ -1,4 +1,7 @@
USING: unicode.normalize kernel tools.test sequences ;
USING: unicode.normalize kernel tools.test sequences
unicode.data io.encodings.utf8 io.files splitting math.parser
locals math quotations assocs combinators ;
IN: unicode.normalize.tests
[ "ab\u000323\u000302cd" ] [ "ab\u000302" "\u000323cd" string-append ] unit-test
@@ -16,3 +19,26 @@ USING: unicode.normalize kernel tools.test sequences ;
[ f ] [ 0 hangul? ] unit-test
[ "\u001112\u001161\u0011ab" ] [ "\u00d55c" nfd ] unit-test
[ "\u00d55c" ] [ "\u001112\u001161\u0011ab" nfc ] unit-test
: parse-test ( -- tests )
"resource:extra/unicode/normalize/NormalizationTest.txt"
utf8 file-lines filter-comments
[ ";" split 5 head [ " " split [ hex> ] "" map-as ] map ] map ;
:: assert= ( test spec quot -- )
spec [
[
[ 1- test nth ] bi@
[ 1quotation ] [ quot curry ] bi* unit-test
] with each
] assoc-each ;
: run-line ( test -- )
{
[ { { 2 { 1 2 3 } } { 4 { 4 5 } } } [ nfc ] assert= ]
[ { { 3 { 1 2 3 } } { 5 { 4 5 } } } [ nfd ] assert= ]
[ { { 4 { 1 2 3 4 5 } } } [ nfkc ] assert= ]
[ { { 5 { 1 2 3 4 5 } } } [ nfkd ] assert= ]
} cleave ;
! parse-test [ run-line ] each
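
(Aside, not part of the commit: each NormalizationTest.txt line carries five
columns c1-c5, and the spec assocs in run-line encode the invariants stated in
that file's header: c2 = NFC(c1) = NFC(c2) = NFC(c3) and c4 = NFC(c4) = NFC(c5);
c3 = NFD(c1) = NFD(c2) = NFD(c3) and c5 = NFD(c4) = NFD(c5); c4 = NFKC(cN) and
c5 = NFKD(cN) for every column. A hand-checked sketch for one representative
Part 1 line, written with the same \u escape convention used above:)

! "1E0A;1E0A;0044 0307;1E0A;0044 0307;" -- c1 = c2 = c4 = U+1E0A,
! c3 = c5 = "D" followed by U+0307
USING: tools.test unicode.normalize ;
[ "\u001e0a" ] [ "D\u000307" nfc ] unit-test  ! c2 = NFC(c3)
[ "D\u000307" ] [ "\u001e0a" nfd ] unit-test  ! c3 = NFD(c1)
[ "\u001e0a" ] [ "\u001e0a" nfkc ] unit-test  ! c4 = NFKC(c1)
[ "D\u000307" ] [ "\u001e0a" nfkd ] unit-test ! c5 = NFKD(c1)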

@@ -27,7 +27,7 @@ IN: unicode.normalize
hangul-base - final-count /mod final-base +
>r medial-count /mod medial-base +
>r initial-base + r> r>
dup zero? [ drop 2array ] [ 3array ] if ;
dup final-base = [ drop 2array ] [ 3array ] if ;
: jamo>hangul ( initial medial final -- hangul )
>r >r initial-base - medial-count *
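
(Aside, not part of the commit, but a hand-worked check of the changed
comparison above, assuming the standard Hangul constants hangul-base 0xAC00,
initial-base 0x1100, medial-base 0x1161, final-base 0x11A7, medial-count 21
and final-count 28:)

U+D55C: 0xD55C - 0xAC00 = 10588
    10588 28 /mod  ->  378 4    final   = 0x11A7 + 4  = 0x11AB
      378 21 /mod  ->   18 0    medial  = 0x1161 + 0  = 0x1161
                                initial = 0x1100 + 18 = 0x1112
    final is not final-base, so the 3array branch yields the three jamo
    { 0x1112 0x1161 0x11AB }, matching the "\u00d55c" nfd test earlier.

U+AC00: the offset is 0, so the final works out to 0x11A7 + 0, i.e. exactly
    final-base and never zero. The old zero? test therefore kept a spurious
    trailing U+11A7 on syllables with no final jamo; comparing against
    final-base drops it and returns the 2array instead.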