| 
									
										
										
										
											2019-07-30 03:07:34 -04:00
										 |  |  | USING: arrays assocs fry grouping hash-sets io.encodings.utf8 | 
					
						
							|  |  |  | io.files kernel math math.order math.parser sequences sets | 
					
						
							|  |  |  | splitting strings tools.test unicode ;
 | 
					
						
							| 
									
										
										
										
											2015-06-29 19:43:15 -04:00
										 |  |  | IN: unicode.collation.tests | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : test-equality ( str1 str2 -- ? ? ? ? )
 | 
					
						
							|  |  |  |     { primary= secondary= tertiary= quaternary= } | 
					
						
							|  |  |  |     [ execute( a b -- ? ) ] 2with map
 | 
					
						
							|  |  |  |     first4 ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-07-03 12:39:59 -04:00
										 |  |  | { f f f f } [ "hello" "hi" test-equality ] unit-test | 
					
						
							|  |  |  | { t f f f } [ "hello" "h\u0000e9llo" test-equality ] unit-test | 
					
						
							|  |  |  | { t t f f } [ "hello" "HELLO" test-equality ] unit-test | 
					
						
							|  |  |  | { t t t f } [ "hello" "h e l l o." test-equality ] unit-test | 
					
						
							|  |  |  | { t t t t } [ "hello" "\0hello\0" test-equality ] unit-test | 
					
						
							|  |  |  | { { "good bye" "goodbye" "hello" "HELLO" } } | 
					
						
							| 
									
										
										
										
											2019-06-22 10:47:07 -04:00
										 |  |  | [ { "HELLO" "goodbye" "good bye" "hello" } sort-strings ] unit-test | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-07-28 15:45:51 -04:00
										 |  |  | : collation-test-lines ( -- lines )
 | 
					
						
							| 
									
										
										
										
											2019-06-22 10:47:07 -04:00
										 |  |  |     "vocab:unicode/UCA/CollationTest/CollationTest_SHIFTED.txt" utf8 file-lines | 
					
						
							| 
									
										
										
										
											2019-07-28 15:45:51 -04:00
										 |  |  |     [ "#" head? ] reject harvest ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : parse-collation-test-shifted ( -- lines )
 | 
					
						
							|  |  |  |     collation-test-lines | 
					
						
							|  |  |  |     [ ";" split first " " split [ hex> ] "" map-as ] map ;
 | 
					
						
							| 
									
										
										
										
											2019-06-22 10:47:07 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : tail-from-last ( string char -- string' )
 | 
					
						
							|  |  |  |     '[ _ = ] dupd find-last drop 1 + tail ; inline
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : line>test-weights ( string -- pair )
 | 
					
						
							|  |  |  |     ";" split1 [ | 
					
						
							|  |  |  |         " " split [ hex> ] map
 | 
					
						
							|  |  |  |     ] [ | 
					
						
							|  |  |  |         "#" split1 nip CHAR: [ tail-from-last | 
					
						
							|  |  |  |         "]" split1 drop
 | 
					
						
							|  |  |  |         "|" split 4 head
 | 
					
						
							|  |  |  |         [ " " split harvest [ hex> ] map ] map
 | 
					
						
							|  |  |  |     ] bi* 2array ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-07-30 03:07:34 -04:00
										 |  |  | ! These tests actually would pass if I didn't fix up | 
					
						
							|  |  |  | ! the ducet table for Tibetan. It took me way too long to realize | 
					
						
							|  |  |  | ! that the Unicode committee recommends fixing Tibetan collation | 
					
						
							|  |  |  | ! yet ships tests that collation fails if you fix it. | 
					
						
							|  |  |  | ! (Specifically the ducet entries for { 0x0FB2 0x0F71 } and { 0x0FB3 0x0F71 } | 
					
						
							|  |  |  | ! cause these tests to fail) | 
					
						
							|  |  |  | : xfailed-collation-tests ( -- seq )
 | 
					
						
							|  |  |  |     HS{ | 
					
						
							|  |  |  |         { 3958 3953 820 } | 
					
						
							|  |  |  |         { 4018 820 3953 3968 } | 
					
						
							|  |  |  |         { 4018 820 3968 3953 } | 
					
						
							|  |  |  |         { 4018 820 3969 } | 
					
						
							|  |  |  |         { 3960 3953 820 } | 
					
						
							|  |  |  |         { 4019 820 3953 3968 } | 
					
						
							|  |  |  |         { 4019 820 3968 3953 } | 
					
						
							|  |  |  |         { 4019 3953 820 3968 } | 
					
						
							|  |  |  |     } ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-06-22 10:47:07 -04:00
										 |  |  | : parse-collation-test-weights ( -- weights )
 | 
					
						
							| 
									
										
										
										
											2019-07-28 15:45:51 -04:00
										 |  |  |     collation-test-lines | 
					
						
							| 
									
										
										
										
											2019-07-30 03:07:34 -04:00
										 |  |  |     [ line>test-weights ] map
 | 
					
						
							|  |  |  |     [ first xfailed-collation-tests in? ] reject ;
 | 
					
						
							| 
									
										
										
										
											2019-06-22 10:47:07 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : calculate-collation ( chars collation -- collation-calculated collation-answer )
 | 
					
						
							|  |  |  |     [ >string collation-key/nfd drop ] [ { 0 } join ] bi* ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : find-bad-collations ( pairs -- seq )
 | 
					
						
							| 
									
										
										
										
											2019-07-28 15:45:51 -04:00
										 |  |  |     [ first2 calculate-collation sequence= ] reject ;
 | 
					
						
							| 
									
										
										
										
											2019-06-22 10:47:07 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | { { } } | 
					
						
							|  |  |  | [ parse-collation-test-weights find-bad-collations ] unit-test | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | { { } } [ | 
					
						
							|  |  |  |     parse-collation-test-shifted | 
					
						
							| 
									
										
										
										
											2019-07-30 03:07:34 -04:00
										 |  |  |     2 clump >hash-set | 
					
						
							| 
									
										
										
										
											2019-07-06 10:32:47 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-07-30 03:07:34 -04:00
										 |  |  |     ! Remove these two expected-fail Tibetan collation comparison tests | 
					
						
							|  |  |  |     ! They are bad tests once you fix up the ducet table with { 0x0FB2 0x0F71 } and { 0x0FB3 0x0F71 } | 
					
						
							|  |  |  |     { 4018 820 3969 } { 3959 33 } [ >string ] bi@ 2array
 | 
					
						
							|  |  |  |     { 4019 3953 820 3968 } { 3961 33 } [ >string ] bi@ 2array
 | 
					
						
							|  |  |  |     2array >hash-set diff members | 
					
						
							| 
									
										
										
										
											2019-07-06 10:32:47 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-07-30 03:07:34 -04:00
										 |  |  |     [ string<=> { +lt+ +eq+ } member? ] assoc-reject | 
					
						
							|  |  |  | ] unit-test | 
					
						
							| 
									
										
										
										
											2019-07-06 10:32:47 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-07-30 03:07:34 -04:00
										 |  |  | ! XXX: Once again, these tests pass if you don't | 
					
						
							|  |  |  | ! fix up the ducet table for { 0x0FB2 0x0F71 } and { 0x0FB3 0x0F71 } | 
					
						
							|  |  |  | ! { +lt+ } [ { 4018 820 3969 } { 3959 33 } [ >string ] bi@ string<=> ] unit-test | 
					
						
							|  |  |  | ! { +lt+ } [ { 4019 3953 820 3968 } { 3961 33 } [ >string ] bi@ string<=> ] unit-test |