| 
									
										
										
										
											2008-10-05 19:36:56 -04:00
										 |  |  | ! Copyright (C) 2008 Daniel Ehrenberg. | 
					
						
							|  |  |  | ! See http://factorcode.org/license.txt for BSD license. | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  | USING: ascii sequences namespaces make unicode.data kernel math arrays | 
					
						
							| 
									
										
										
										
											2009-01-08 20:07:46 -05:00
										 |  |  | locals sorting.insertion accessors assocs math.order combinators | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  | unicode.syntax strings sbufs hints combinators.short-circuit vectors ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | IN: unicode.normalize | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-07 18:59:01 -05:00
										 |  |  | <PRIVATE
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | ! Conjoining Jamo behavior | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-08 00:13:04 -05:00
										 |  |  | CONSTANT: hangul-base HEX: ac00 | 
					
						
							|  |  |  | CONSTANT: hangul-end HEX: D7AF | 
					
						
							|  |  |  | CONSTANT: initial-base HEX: 1100
 | 
					
						
							|  |  |  | CONSTANT: medial-base HEX: 1161
 | 
					
						
							|  |  |  | CONSTANT: final-base HEX: 11a7 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-08 00:13:04 -05:00
										 |  |  | CONSTANT: initial-count 19
 | 
					
						
							|  |  |  | CONSTANT: medial-count 21
 | 
					
						
							|  |  |  | CONSTANT: final-count 28
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : ?between? ( n/f from to -- ? )
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  |     pick [ between? ] [ 3drop f ] if ; inline
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  | : hangul? ( ch -- ? ) hangul-base hangul-end ?between? ; inline
 | 
					
						
							|  |  |  | : jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ; inline
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | ! These numbers come from UAX 29 | 
					
						
							|  |  |  | : initial? ( ch -- ? )
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  |     dup HEX: 1100 HEX: 1159 ?between? [ ] [ HEX: 115F = ] ?if ; inline
 | 
					
						
							|  |  |  | : medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ; inline
 | 
					
						
							|  |  |  | : final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ; inline
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : hangul>jamo ( hangul -- jamo-string )
 | 
					
						
							|  |  |  |     hangul-base - final-count /mod final-base +
 | 
					
						
							| 
									
										
										
										
											2008-12-03 09:46:16 -05:00
										 |  |  |     [ | 
					
						
							|  |  |  |         medial-count /mod medial-base +
 | 
					
						
							|  |  |  |         [ initial-base + ] dip
 | 
					
						
							|  |  |  |     ] dip
 | 
					
						
							| 
									
										
										
										
											2008-05-25 13:40:12 -04:00
										 |  |  |     dup final-base = [ drop 2array ] [ 3array ] if ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : jamo>hangul ( initial medial final -- hangul )
 | 
					
						
							| 
									
										
										
										
											2008-12-03 09:46:16 -05:00
										 |  |  |     [ | 
					
						
							|  |  |  |         [ initial-base - medial-count * ] dip
 | 
					
						
							|  |  |  |         medial-base - + final-count *
 | 
					
						
							|  |  |  |     ] dip final-base - + hangul-base + ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | ! Normalization -- Decomposition  | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : reorder-slice ( string start -- slice done? )
 | 
					
						
							| 
									
										
										
										
											2008-04-26 00:17:08 -04:00
										 |  |  |     2dup swap [ non-starter? not ] find-from drop
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  |     [ [ over length ] unless* rot <slice> ] keep not ; inline
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : reorder-next ( string i -- new-i done? )
 | 
					
						
							| 
									
										
										
										
											2008-04-26 00:17:08 -04:00
										 |  |  |     over [ non-starter? ] find-from drop [ | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  |         reorder-slice | 
					
						
							| 
									
										
										
										
											2008-12-03 09:46:16 -05:00
										 |  |  |         [ dup [ combining-class ] insertion-sort to>> ] dip
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  |     ] [ length t ] if* ; inline
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : reorder-loop ( string start -- )
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  |     dupd reorder-next [ 2drop ] [ reorder-loop ] if ; inline recursive
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : reorder ( string -- )
 | 
					
						
							|  |  |  |     0 reorder-loop ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : reorder-back ( string i -- )
 | 
					
						
							| 
									
										
										
										
											2008-04-26 00:17:08 -04:00
										 |  |  |     over [ non-starter? not ] find-last-from drop ?1+ reorder-next 2drop ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-05-24 18:34:01 -04:00
										 |  |  | :: decompose ( string quot -- decomposed )
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  |     string length <sbuf> :> out | 
					
						
							|  |  |  |     string [ | 
					
						
							|  |  |  |         >fixnum dup ascii? [ out push ] [ | 
					
						
							| 
									
										
										
										
											2009-01-08 23:23:39 -05:00
										 |  |  |             dup hangul? [ hangul>jamo out push-all ] | 
					
						
							|  |  |  |             [ dup quot call [ out push-all ] [ out push ] ?if ] if
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  |         ] if
 | 
					
						
							|  |  |  |     ] each
 | 
					
						
							|  |  |  |     out "" like dup reorder ; inline
 | 
					
						
							| 
									
										
										
										
											2009-01-08 18:56:52 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : with-string ( str quot -- str )
 | 
					
						
							|  |  |  |     over aux>> [ call ] [ drop ] if ; inline
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : (nfd) ( string -- nfd )
 | 
					
						
							|  |  |  |     [ canonical-entry ] decompose ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  | HINTS: (nfd) string ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-08 18:56:52 -05:00
										 |  |  | : (nfkd) ( string -- nfkd )
 | 
					
						
							|  |  |  |     [ compatibility-entry ] decompose ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  | HINTS: (nfkd) string ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-07 18:59:01 -05:00
										 |  |  | PRIVATE>
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : nfd ( string -- nfd )
 | 
					
						
							| 
									
										
										
										
											2009-01-08 18:56:52 -05:00
										 |  |  |     [ (nfd) ] with-string ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-07 18:59:01 -05:00
										 |  |  | : nfkd ( string -- nfkd )
 | 
					
						
							| 
									
										
										
										
											2009-01-08 18:56:52 -05:00
										 |  |  |     [ (nfkd) ] with-string ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : string-append ( s1 s2 -- string )
 | 
					
						
							|  |  |  |     [ append ] keep
 | 
					
						
							|  |  |  |     0 over ?nth non-starter? | 
					
						
							|  |  |  |     [ length dupd reorder-back ] [ drop ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  | HINTS: string-append string string ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-07 18:59:01 -05:00
										 |  |  | <PRIVATE
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | ! Normalization -- Composition | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  | : initial-medial? ( str i -- ? )
 | 
					
						
							|  |  |  |     { [ swap nth initial? ] [ 1+ swap ?nth medial? ] } 2&& ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : --final? ( str i -- ? )
 | 
					
						
							|  |  |  |     2 + swap ?nth final? ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : imf, ( str i -- str i )
 | 
					
						
							|  |  |  |     [ tail-slice first3 jamo>hangul , ] | 
					
						
							|  |  |  |     [ 3 + ] 2bi ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : im, ( str i -- str i )
 | 
					
						
							|  |  |  |     [ tail-slice first2 final-base jamo>hangul , ] | 
					
						
							|  |  |  |     [ 2 + ] 2bi ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : compose-jamo ( str i -- str i )
 | 
					
						
							|  |  |  |     2dup initial-medial? [ | 
					
						
							|  |  |  |         2dup --final? [ imf, ] [ im, ] if
 | 
					
						
							|  |  |  |     ] [ 2dup swap nth , 1+ ] if ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : pass-combining ( str -- str i )
 | 
					
						
							|  |  |  |     dup [ non-starter? not ] find drop
 | 
					
						
							|  |  |  |     [ dup length ] unless*
 | 
					
						
							|  |  |  |     2dup head-slice % ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | TUPLE: compose-state i str char after last-class ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : get-str ( state i -- ch )
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |     swap [ i>> + ] [ str>> ] bi ?nth ; inline
 | 
					
						
							|  |  |  | : current ( state -- ch ) 0 get-str ; inline
 | 
					
						
							|  |  |  | : to ( state -- state ) [ 1+ ] change-i ; inline
 | 
					
						
							|  |  |  | : push-after ( ch state -- state ) [ ?push ] change-after ; inline
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | :: try-compose ( state new-char current-class -- state )
 | 
					
						
							|  |  |  |     state last-class>> current-class =
 | 
					
						
							|  |  |  |     [ new-char state push-after ] [ | 
					
						
							|  |  |  |         state char>> new-char combine-chars | 
					
						
							|  |  |  |         [ state swap >>char ] [ | 
					
						
							|  |  |  |             new-char state push-after | 
					
						
							|  |  |  |             current-class >>last-class | 
					
						
							|  |  |  |         ] if*
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |     ] if ; inline
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-06 17:25:12 -05:00
										 |  |  | DEFER: compose-iter | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-02-02 17:11:16 -05:00
										 |  |  | : try-noncombining ( state char -- state )
 | 
					
						
							|  |  |  |     [ drop ] [ [ char>> ] dip combine-chars ] 2bi
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |     [ >>char to f >>last-class compose-iter ] when* ; inline
 | 
					
						
							| 
									
										
										
										
											2009-01-06 17:25:12 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  | : compose-iter ( state -- state )
 | 
					
						
							|  |  |  |     dup current [ | 
					
						
							| 
									
										
										
										
											2009-01-08 18:56:52 -05:00
										 |  |  |         dup combining-class { | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  |             { f [ drop ] } | 
					
						
							|  |  |  |             { 0 [ | 
					
						
							|  |  |  |                 over last-class>> | 
					
						
							| 
									
										
										
										
											2009-02-02 17:11:16 -05:00
										 |  |  |                 [ drop ] [ try-noncombining ] if ] } | 
					
						
							| 
									
										
										
										
											2009-01-08 18:56:52 -05:00
										 |  |  |             [ try-compose to compose-iter ] | 
					
						
							|  |  |  |         } case
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |     ] when* ; inline recursive
 | 
					
						
							| 
									
										
										
										
											2009-01-08 18:56:52 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  | : compose-combining ( ch str i -- str i )
 | 
					
						
							|  |  |  |     compose-state new
 | 
					
						
							|  |  |  |         swap >>i | 
					
						
							|  |  |  |         swap >>str | 
					
						
							|  |  |  |         swap >>char | 
					
						
							|  |  |  |     compose-iter | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |     { [ char>> , ] [ after>> % ] [ str>> ] [ i>> ] } cleave ; inline
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | :: (compose) ( str i -- )
 | 
					
						
							|  |  |  |     i str ?nth [ | 
					
						
							|  |  |  |         dup jamo? [ drop str i compose-jamo ] [ | 
					
						
							|  |  |  |             i 1+ str ?nth combining-class | 
					
						
							|  |  |  |             [ str i 1+ compose-combining ] [ , str i 1+ ] if
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  |         ] if (compose) | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  |     ] when* ; inline recursive
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-08 00:54:19 -05:00
										 |  |  | : combine ( str -- comp )
 | 
					
						
							| 
									
										
										
										
											2009-01-09 15:03:33 -05:00
										 |  |  |     [ pass-combining (compose) ] "" make ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-09 16:53:35 -05:00
										 |  |  | HINTS: combine string ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-07 18:59:01 -05:00
										 |  |  | PRIVATE>
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | : nfc ( string -- nfc )
 | 
					
						
							| 
									
										
										
										
											2009-01-08 18:56:52 -05:00
										 |  |  |     [ (nfd) combine ] with-string ;
 | 
					
						
							| 
									
										
										
										
											2008-01-09 14:44:07 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : nfkc ( string -- nfkc )
 | 
					
						
							| 
									
										
										
										
											2009-01-08 18:56:52 -05:00
										 |  |  |     [ (nfkd) combine ] with-string ;
 |