| 
									
										
										
										
											2008-10-05 19:36:56 -04:00
										 |  |  | ! Copyright (C) 2008 Daniel Ehrenberg. | 
					
						
							|  |  |  | ! See http://factorcode.org/license.txt for BSD license. | 
					
						
							| 
									
										
										
										
											2008-06-25 04:33:58 -04:00
										 |  |  | USING: combinators.short-circuit sequences io.files | 
					
						
							|  |  |  | io.encodings.ascii kernel values splitting accessors math.parser | 
					
						
							| 
									
										
										
										
											2008-09-10 23:11:40 -04:00
										 |  |  | ascii io assocs strings math namespaces make sorting combinators | 
					
						
							| 
									
										
										
										
											2008-06-25 04:33:58 -04:00
										 |  |  | math.order arrays unicode.normalize unicode.data locals | 
					
						
							| 
									
										
										
										
											2008-05-25 03:51:27 -04:00
										 |  |  | unicode.syntax macros sequences.deep words unicode.breaks | 
					
						
							| 
									
										
										
										
											2009-01-11 20:41:48 -05:00
										 |  |  | quotations combinators.short-circuit ;
 | 
					
						
							| 
									
										
										
										
											2008-05-20 17:57:53 -04:00
										 |  |  | IN: unicode.collation | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-06-01 12:24:17 -04:00
										 |  |  | <PRIVATE
 | 
					
						
							| 
									
										
										
										
											2008-05-20 17:57:53 -04:00
										 |  |  | VALUE: ducet | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | TUPLE: weight primary secondary tertiary ignorable? ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : parse-weight ( string -- weight )
 | 
					
						
							|  |  |  |     "]" split but-last [ | 
					
						
							|  |  |  |         weight new swap rest unclip CHAR: * = swapd >>ignorable? | 
					
						
							|  |  |  |         swap "." split first3 [ hex> ] tri@
 | 
					
						
							|  |  |  |         [ >>primary ] [ >>secondary ] [ >>tertiary ] tri*
 | 
					
						
							|  |  |  |     ] map ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : parse-line ( line -- code-poing weight )
 | 
					
						
							|  |  |  |     ";" split1 [ [ blank? ] trim ] bi@
 | 
					
						
							|  |  |  |     [ " " split [ hex> ] "" map-as ] [ parse-weight ] bi* ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-02-15 21:45:06 -05:00
										 |  |  | : parse-ducet ( file -- ducet )
 | 
					
						
							|  |  |  |     ascii file-lines filter-comments | 
					
						
							| 
									
										
										
										
											2008-05-20 17:57:53 -04:00
										 |  |  |     [ parse-line ] H{ } map>assoc ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-02-15 21:45:06 -05:00
										 |  |  | "vocab:unicode/collation/allkeys.txt" parse-ducet to: ducet | 
					
						
							| 
									
										
										
										
											2008-05-20 17:57:53 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-05-25 16:41:26 -04:00
										 |  |  | ! Fix up table for long contractions | 
					
						
							|  |  |  | : help-one ( assoc key -- )
 | 
					
						
							|  |  |  |     ! Need to be more general? Not for DUCET, apparently | 
					
						
							|  |  |  |     2 head 2dup swap key? [ 2drop ] [ | 
					
						
							|  |  |  |         [ [ 1string swap at ] with { } map-as concat ] | 
					
						
							|  |  |  |         [ swap set-at ] 2bi
 | 
					
						
							|  |  |  |     ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : insert-helpers ( assoc -- )
 | 
					
						
							|  |  |  |     dup keys [ length 3 >= ] filter
 | 
					
						
							|  |  |  |     [ help-one ] with each ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ducet insert-helpers | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-05-24 18:34:01 -04:00
										 |  |  | : base ( char -- base )
 | 
					
						
							| 
									
										
										
										
											2008-05-25 03:51:27 -04:00
										 |  |  |     { | 
					
						
							|  |  |  |         { [ dup HEX: 3400 HEX:  4DB5 between? ] [ drop HEX: FB80 ] } ! Extension A | 
					
						
							|  |  |  |         { [ dup HEX: 20000 HEX: 2A6D6 between? ] [ drop HEX: FB80 ] } ! Extension B | 
					
						
							|  |  |  |         { [ dup HEX: 4E00 HEX: 9FC3 between? ] [ drop HEX: FB40 ] } ! CJK | 
					
						
							|  |  |  |         [ drop HEX: FBC0 ] ! Other | 
					
						
							|  |  |  |     } cond ;
 | 
					
						
							| 
									
										
										
										
											2008-05-24 18:34:01 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : AAAA ( char -- weight )
 | 
					
						
							|  |  |  |     [ base ] [ -15 shift ] bi + HEX: 20 2 f weight boa ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : BBBB ( char -- weight )
 | 
					
						
							|  |  |  |     HEX: 7FFF bitand HEX: 8000 bitor 0 0 f weight boa ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-05-25 03:51:27 -04:00
										 |  |  | : illegal? ( char -- ? )
 | 
					
						
							| 
									
										
										
										
											2008-06-13 02:51:46 -04:00
										 |  |  |     { [ "Noncharacter_Code_Point" property? ] [ category "Cs" = ] } 1|| ;
 | 
					
						
							| 
									
										
										
										
											2008-05-25 03:51:27 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-05-24 18:34:01 -04:00
										 |  |  | : derive-weight ( char -- weights )
 | 
					
						
							| 
									
										
										
										
											2008-05-25 03:51:27 -04:00
										 |  |  |     first dup illegal? | 
					
						
							| 
									
										
										
										
											2008-05-24 18:34:01 -04:00
										 |  |  |     [ drop { } ] | 
					
						
							|  |  |  |     [ [ AAAA ] [ BBBB ] bi 2array ] if ;
 | 
					
						
							| 
									
										
										
										
											2008-05-20 17:57:53 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-05-24 13:17:08 -04:00
										 |  |  | : last ( -- char )
 | 
					
						
							|  |  |  |     building get empty? [ 0 ] [ building get peek peek ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : blocked? ( char -- ? )
 | 
					
						
							| 
									
										
										
										
											2009-01-11 20:41:48 -05:00
										 |  |  |     combining-class dup { 0 f } member?
 | 
					
						
							|  |  |  |     [ drop last non-starter? ] | 
					
						
							|  |  |  |     [ last combining-class = ] if ;
 | 
					
						
							| 
									
										
										
										
											2008-05-24 13:17:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : possible-bases ( -- slice-of-building )
 | 
					
						
							| 
									
										
										
										
											2009-01-11 20:41:48 -05:00
										 |  |  |     building get dup [ first non-starter? not ] find-last
 | 
					
						
							| 
									
										
										
										
											2008-05-24 13:17:08 -04:00
										 |  |  |     drop [ 0 ] unless* tail-slice ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | :: ?combine ( char slice i -- ? )
 | 
					
						
							|  |  |  |     [let | str [ i slice nth char suffix ] | | 
					
						
							|  |  |  |         str ducet key? dup
 | 
					
						
							|  |  |  |         [ str i slice set-nth ] when
 | 
					
						
							|  |  |  |     ] ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : add ( char -- )
 | 
					
						
							|  |  |  |     dup blocked? [ 1string , ] [ | 
					
						
							|  |  |  |         dup possible-bases dup length
 | 
					
						
							| 
									
										
										
										
											2009-01-29 23:19:07 -05:00
										 |  |  |         [ ?combine ] with with any?
 | 
					
						
							| 
									
										
										
										
											2008-05-24 13:17:08 -04:00
										 |  |  |         [ drop ] [ 1string , ] if
 | 
					
						
							|  |  |  |     ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : string>graphemes ( string -- graphemes )
 | 
					
						
							| 
									
										
										
										
											2008-05-25 13:40:12 -04:00
										 |  |  |     [ [ add ] each ] { } make ;
 | 
					
						
							| 
									
										
										
										
											2008-05-24 13:17:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : graphemes>weights ( graphemes -- weights )
 | 
					
						
							| 
									
										
										
										
											2008-05-25 03:51:27 -04:00
										 |  |  |     [ | 
					
						
							|  |  |  |         dup weight? [ 1array ] ! From tailoring | 
					
						
							|  |  |  |         [ dup ducet at [ ] [ derive-weight ] ?if ] if
 | 
					
						
							|  |  |  |     ] { } map-as concat ;
 | 
					
						
							| 
									
										
										
										
											2008-05-20 17:57:53 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : append-weights ( weights quot -- )
 | 
					
						
							| 
									
										
										
										
											2008-10-05 19:26:24 -04:00
										 |  |  |     [ [ ignorable?>> not ] filter ] dip
 | 
					
						
							|  |  |  |     map [ zero? not ] filter % 0 , ; inline
 | 
					
						
							| 
									
										
										
										
											2008-05-20 17:57:53 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : variable-weight ( weight -- )
 | 
					
						
							|  |  |  |     dup ignorable?>> [ primary>> ] [ drop HEX: FFFF ] if , ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : weights>bytes ( weights -- byte-array )
 | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         { | 
					
						
							|  |  |  |             [ [ primary>> ] append-weights ] | 
					
						
							|  |  |  |             [ [ secondary>> ] append-weights ] | 
					
						
							|  |  |  |             [ [ tertiary>> ] append-weights ] | 
					
						
							|  |  |  |             [ [ variable-weight ] each ] | 
					
						
							|  |  |  |         } cleave
 | 
					
						
							|  |  |  |     ] { } make ;
 | 
					
						
							| 
									
										
										
										
											2008-06-01 12:24:17 -04:00
										 |  |  | PRIVATE>
 | 
					
						
							| 
									
										
										
										
											2008-05-20 17:57:53 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : completely-ignorable? ( weight -- ? )
 | 
					
						
							|  |  |  |     [ primary>> ] [ secondary>> ] [ tertiary>> ] tri
 | 
					
						
							|  |  |  |     [ zero? ] tri@ and and ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : filter-ignorable ( weights -- weights' )
 | 
					
						
							| 
									
										
										
										
											2008-12-04 11:19:18 -05:00
										 |  |  |     f swap [ | 
					
						
							| 
									
										
										
										
											2009-01-23 19:20:47 -05:00
										 |  |  |         [ nip ] [ primary>> zero? and ] 2bi
 | 
					
						
							| 
									
										
										
										
											2008-05-20 17:57:53 -04:00
										 |  |  |         [ swap ignorable?>> or ] | 
					
						
							|  |  |  |         [ swap completely-ignorable? or not ] 2bi
 | 
					
						
							|  |  |  |     ] filter nip ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : collation-key ( string -- key )
 | 
					
						
							| 
									
										
										
										
											2008-05-25 21:21:39 -04:00
										 |  |  |     nfd string>graphemes graphemes>weights | 
					
						
							|  |  |  |     filter-ignorable weights>bytes ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-06-01 12:24:17 -04:00
										 |  |  | <PRIVATE
 | 
					
						
							| 
									
										
										
										
											2008-05-25 21:21:39 -04:00
										 |  |  | : insensitive= ( str1 str2 levels-removed -- ? )
 | 
					
						
							|  |  |  |     [ | 
					
						
							| 
									
										
										
										
											2008-10-05 19:26:24 -04:00
										 |  |  |         [ collation-key ] dip
 | 
					
						
							| 
									
										
										
										
											2009-01-29 23:19:07 -05:00
										 |  |  |         [ [ 0 = not ] trim-tail but-last ] times
 | 
					
						
							| 
									
										
										
										
											2008-05-25 21:21:39 -04:00
										 |  |  |     ] curry bi@ = ;
 | 
					
						
							| 
									
										
										
										
											2008-06-01 12:24:17 -04:00
										 |  |  | PRIVATE>
 | 
					
						
							| 
									
										
										
										
											2008-05-25 16:41:26 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : primary= ( str1 str2 -- ? )
 | 
					
						
							| 
									
										
										
										
											2008-05-25 21:21:39 -04:00
										 |  |  |     3 insensitive= ;
 | 
					
						
							| 
									
										
										
										
											2008-05-25 16:41:26 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : secondary= ( str1 str2 -- ? )
 | 
					
						
							| 
									
										
										
										
											2008-05-25 21:21:39 -04:00
										 |  |  |     2 insensitive= ;
 | 
					
						
							| 
									
										
										
										
											2008-05-25 16:41:26 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : tertiary= ( str1 str2 -- ? )
 | 
					
						
							| 
									
										
										
										
											2008-05-25 21:21:39 -04:00
										 |  |  |     1 insensitive= ;
 | 
					
						
							| 
									
										
										
										
											2008-05-25 16:41:26 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : quaternary= ( str1 str2 -- ? )
 | 
					
						
							| 
									
										
										
										
											2008-05-25 21:21:39 -04:00
										 |  |  |     0 insensitive= ;
 | 
					
						
							| 
									
										
										
										
											2008-05-20 17:57:53 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-06-01 12:24:17 -04:00
										 |  |  | <PRIVATE
 | 
					
						
							| 
									
										
										
										
											2008-05-25 16:41:26 -04:00
										 |  |  | : w/collation-key ( str -- {str,key} )
 | 
					
						
							| 
									
										
										
										
											2008-06-01 14:50:12 -04:00
										 |  |  |     [ collation-key ] keep 2array ;
 | 
					
						
							| 
									
										
										
										
											2008-06-01 12:24:17 -04:00
										 |  |  | PRIVATE>
 | 
					
						
							| 
									
										
										
										
											2008-05-25 16:41:26 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-05-20 17:57:53 -04:00
										 |  |  | : sort-strings ( strings -- sorted )
 | 
					
						
							| 
									
										
										
										
											2008-10-05 19:26:24 -04:00
										 |  |  |     [ w/collation-key ] map natural-sort values ;
 | 
					
						
							| 
									
										
										
										
											2008-05-20 17:57:53 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : string<=> ( str1 str2 -- <=> )
 | 
					
						
							| 
									
										
										
										
											2008-06-01 14:50:12 -04:00
										 |  |  |     [ w/collation-key ] compare ;
 |