| 
									
										
										
										
											2008-10-05 19:36:56 -04:00
										 |  |  | ! Copyright (C) 2008 Daniel Ehrenberg. | 
					
						
							|  |  |  | ! See http://factorcode.org/license.txt for BSD license. | 
					
						
							| 
									
										
										
										
											2008-09-10 21:07:00 -04:00
										 |  |  | USING: sequences namespaces make unicode.data kernel math arrays | 
					
						
							| 
									
										
										
										
											2008-08-30 15:05:53 -04:00
										 |  |  | locals sorting.insertion accessors ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | IN: unicode.normalize | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ! Conjoining Jamo behavior | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : hangul-base HEX: ac00 ; inline
 | 
					
						
							|  |  |  | : hangul-end HEX: D7AF ; inline
 | 
					
						
							|  |  |  | : initial-base HEX: 1100 ; inline
 | 
					
						
							|  |  |  | : medial-base HEX: 1161 ; inline
 | 
					
						
							|  |  |  | : final-base HEX: 11a7 ; inline
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : initial-count 19 ; inline
 | 
					
						
							|  |  |  | : medial-count 21 ; inline
 | 
					
						
							|  |  |  | : final-count 28 ; inline
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : hangul? ( ch -- ? ) hangul-base hangul-end ?between? ;
 | 
					
						
							|  |  |  | : jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ! These numbers come from UAX 29 | 
					
						
							|  |  |  | : initial? ( ch -- ? )
 | 
					
						
							| 
									
										
										
										
											2008-04-05 08:57:26 -04:00
										 |  |  |     dup HEX: 1100 HEX: 1159 ?between? [ ] [ HEX: 115F = ] ?if ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | : medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ;
 | 
					
						
							|  |  |  | : final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : hangul>jamo ( hangul -- jamo-string )
 | 
					
						
							|  |  |  |     hangul-base - final-count /mod final-base +
 | 
					
						
							|  |  |  |     >r medial-count /mod medial-base +
 | 
					
						
							|  |  |  |     >r initial-base + r> r> | 
					
						
							| 
									
										
										
										
											2008-05-25 13:40:12 -04:00
										 |  |  |     dup final-base = [ drop 2array ] [ 3array ] if ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : jamo>hangul ( initial medial final -- hangul )
 | 
					
						
							|  |  |  |     >r >r initial-base - medial-count *
 | 
					
						
							|  |  |  |     r> medial-base - + final-count *
 | 
					
						
							|  |  |  |     r> final-base - + hangul-base + ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ! Normalization -- Decomposition  | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : reorder-slice ( string start -- slice done? )
 | 
					
						
							| 
									
										
										
										
											2008-04-26 00:17:08 -04:00
										 |  |  |     2dup swap [ non-starter? not ] find-from drop
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  |     [ [ over length ] unless* rot <slice> ] keep not ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : reorder-next ( string i -- new-i done? )
 | 
					
						
							| 
									
										
										
										
											2008-04-26 00:17:08 -04:00
										 |  |  |     over [ non-starter? ] find-from drop [ | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  |         reorder-slice | 
					
						
							| 
									
										
										
										
											2008-08-30 15:05:53 -04:00
										 |  |  |         >r dup [ combining-class ] insertion-sort to>> r> | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  |     ] [ length t ] if* ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : reorder-loop ( string start -- )
 | 
					
						
							|  |  |  |     dupd reorder-next [ 2drop ] [ reorder-loop ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : reorder ( string -- )
 | 
					
						
							|  |  |  |     0 reorder-loop ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : reorder-back ( string i -- )
 | 
					
						
							| 
									
										
										
										
											2008-04-26 00:17:08 -04:00
										 |  |  |     over [ non-starter? not ] find-last-from drop ?1+ reorder-next 2drop ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-05-24 18:34:01 -04:00
										 |  |  | :: decompose ( string quot -- decomposed )
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  |     ! When there are 8 and 32-bit strings, this'll be | 
					
						
							|  |  |  |     ! equivalent to clone on 8 and the contents of the last | 
					
						
							|  |  |  |     ! main quotation on 32. | 
					
						
							| 
									
										
										
										
											2008-05-24 18:34:01 -04:00
										 |  |  |     string [ 127 < ] all? [ string ] [ | 
					
						
							|  |  |  |         [ | 
					
						
							|  |  |  |             string [ | 
					
						
							|  |  |  |                 dup hangul? [ hangul>jamo % ] | 
					
						
							|  |  |  |                 [ dup quot call [ % ] [ , ] ?if ] if
 | 
					
						
							|  |  |  |             ] each
 | 
					
						
							|  |  |  |         ] "" make | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  |         dup reorder | 
					
						
							|  |  |  |     ] if ; inline
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : nfd ( string -- string )
 | 
					
						
							|  |  |  |     [ canonical-entry ] decompose ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : nfkd ( string -- string )
 | 
					
						
							| 
									
										
										
										
											2008-05-24 18:34:01 -04:00
										 |  |  |     [ compatibility-entry ] decompose ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : string-append ( s1 s2 -- string )
 | 
					
						
							|  |  |  |     ! This could be more optimized, | 
					
						
							|  |  |  |     ! but in practice, it'll almost always just be append | 
					
						
							|  |  |  |     [ append ] keep
 | 
					
						
							|  |  |  |     0 over ?nth non-starter? | 
					
						
							|  |  |  |     [ length dupd reorder-back ] [ drop ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ! Normalization -- Composition | 
					
						
							|  |  |  | SYMBOL: main-str | 
					
						
							|  |  |  | SYMBOL: ind | 
					
						
							|  |  |  | SYMBOL: after | 
					
						
							|  |  |  | SYMBOL: char | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : get-str ( i -- ch ) ind get + main-str get ?nth ;
 | 
					
						
							|  |  |  | : current ( -- ch ) 0 get-str ;
 | 
					
						
							|  |  |  | : to ( -- ) ind inc ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : initial-medial? ( -- ? )
 | 
					
						
							|  |  |  |     current initial? [ 1 get-str medial? ] [ f ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : --final? ( -- ? )
 | 
					
						
							|  |  |  |     2 get-str final? ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : imf, ( -- )
 | 
					
						
							|  |  |  |     current to current to current jamo>hangul , ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : im, ( -- )
 | 
					
						
							|  |  |  |     current to current 0 jamo>hangul , ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : compose-jamo ( -- )
 | 
					
						
							|  |  |  |     initial-medial? [ | 
					
						
							|  |  |  |         --final? [ imf, ] [ im, ] if
 | 
					
						
							|  |  |  |     ] when to current jamo? [ compose-jamo ] when ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : pass-combining ( -- )
 | 
					
						
							|  |  |  |     current non-starter? [ current , to pass-combining ] when ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : try-compose ( last-class char current-class -- )
 | 
					
						
							|  |  |  |     swapd = [ after get push ] [ | 
					
						
							|  |  |  |         char get over combine-chars | 
					
						
							|  |  |  |         [ nip char set ] [ after get push ] if*
 | 
					
						
							|  |  |  |     ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : compose-iter ( n -- )
 | 
					
						
							|  |  |  |     current [ | 
					
						
							|  |  |  |         dup combining-class dup
 | 
					
						
							|  |  |  |         [ [ try-compose ] keep to compose-iter ] [ 3drop ] if
 | 
					
						
							|  |  |  |     ] [ drop ] if* ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : ?new-after ( -- )
 | 
					
						
							|  |  |  |     after [ dup empty? [ drop SBUF" " clone ] unless ] change ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : (compose) ( -- )
 | 
					
						
							|  |  |  |     current [ | 
					
						
							|  |  |  |         dup jamo? [ drop compose-jamo ] [ | 
					
						
							|  |  |  |             char set to ?new-after | 
					
						
							|  |  |  |             0 compose-iter | 
					
						
							|  |  |  |             char get , after get % | 
					
						
							|  |  |  |             to | 
					
						
							|  |  |  |         ] if (compose) | 
					
						
							|  |  |  |     ] when* ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : compose ( str -- comp )
 | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         main-str set
 | 
					
						
							|  |  |  |         0 ind set
 | 
					
						
							|  |  |  |         SBUF" " clone after set
 | 
					
						
							|  |  |  |         pass-combining (compose) | 
					
						
							| 
									
										
										
										
											2008-01-29 14:33:14 -05:00
										 |  |  |     ] "" make ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : nfc ( string -- nfc )
 | 
					
						
							|  |  |  |     nfd compose ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : nfkc ( string -- nfkc )
 | 
					
						
							|  |  |  |     nfkc compose ;
 |