2008-10-05 19:36:56 -04:00
|
|
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
|
|
|
! See http://factorcode.org/license.txt for BSD license.
|
2008-09-10 21:07:00 -04:00
|
|
|
USING: sequences namespaces make unicode.data kernel math arrays
|
2009-01-08 20:07:46 -05:00
|
|
|
locals sorting.insertion accessors assocs math.order combinators
|
2009-01-08 23:23:39 -05:00
|
|
|
unicode.syntax strings sbufs ;
|
2008-01-09 14:44:07 -05:00
|
|
|
IN: unicode.normalize
|
|
|
|
|
2009-01-07 18:59:01 -05:00
|
|
|
<PRIVATE
|
2008-01-09 14:44:07 -05:00
|
|
|
! Conjoining Jamo behavior
|
|
|
|
|
2009-01-08 00:13:04 -05:00
|
|
|
CONSTANT: hangul-base HEX: ac00
|
|
|
|
CONSTANT: hangul-end HEX: D7AF
|
|
|
|
CONSTANT: initial-base HEX: 1100
|
|
|
|
CONSTANT: medial-base HEX: 1161
|
|
|
|
CONSTANT: final-base HEX: 11a7
|
2008-01-09 14:44:07 -05:00
|
|
|
|
2009-01-08 00:13:04 -05:00
|
|
|
CONSTANT: initial-count 19
|
|
|
|
CONSTANT: medial-count 21
|
|
|
|
CONSTANT: final-count 28
|
|
|
|
|
|
|
|
: ?between? ( n/f from to -- ? )
|
|
|
|
pick [ between? ] [ 3drop f ] if ;
|
2008-01-09 14:44:07 -05:00
|
|
|
|
|
|
|
: hangul? ( ch -- ? ) hangul-base hangul-end ?between? ;
|
|
|
|
: jamo? ( ch -- ? ) HEX: 1100 HEX: 11FF ?between? ;
|
|
|
|
|
|
|
|
! These numbers come from UAX 29
|
|
|
|
: initial? ( ch -- ? )
|
2008-04-05 08:57:26 -04:00
|
|
|
dup HEX: 1100 HEX: 1159 ?between? [ ] [ HEX: 115F = ] ?if ;
|
2008-01-09 14:44:07 -05:00
|
|
|
: medial? ( ch -- ? ) HEX: 1160 HEX: 11A2 ?between? ;
|
|
|
|
: final? ( ch -- ? ) HEX: 11A8 HEX: 11F9 ?between? ;
|
|
|
|
|
|
|
|
: hangul>jamo ( hangul -- jamo-string )
|
|
|
|
hangul-base - final-count /mod final-base +
|
2008-12-03 09:46:16 -05:00
|
|
|
[
|
|
|
|
medial-count /mod medial-base +
|
|
|
|
[ initial-base + ] dip
|
|
|
|
] dip
|
2008-05-25 13:40:12 -04:00
|
|
|
dup final-base = [ drop 2array ] [ 3array ] if ;
|
2008-01-09 14:44:07 -05:00
|
|
|
|
|
|
|
: jamo>hangul ( initial medial final -- hangul )
|
2008-12-03 09:46:16 -05:00
|
|
|
[
|
|
|
|
[ initial-base - medial-count * ] dip
|
|
|
|
medial-base - + final-count *
|
|
|
|
] dip final-base - + hangul-base + ;
|
2008-01-09 14:44:07 -05:00
|
|
|
|
|
|
|
! Normalization -- Decomposition
|
|
|
|
|
|
|
|
: reorder-slice ( string start -- slice done? )
|
2008-04-26 00:17:08 -04:00
|
|
|
2dup swap [ non-starter? not ] find-from drop
|
2008-01-09 14:44:07 -05:00
|
|
|
[ [ over length ] unless* rot <slice> ] keep not ;
|
|
|
|
|
|
|
|
: reorder-next ( string i -- new-i done? )
|
2008-04-26 00:17:08 -04:00
|
|
|
over [ non-starter? ] find-from drop [
|
2008-01-09 14:44:07 -05:00
|
|
|
reorder-slice
|
2008-12-03 09:46:16 -05:00
|
|
|
[ dup [ combining-class ] insertion-sort to>> ] dip
|
2008-01-09 14:44:07 -05:00
|
|
|
] [ length t ] if* ;
|
|
|
|
|
|
|
|
: reorder-loop ( string start -- )
|
|
|
|
dupd reorder-next [ 2drop ] [ reorder-loop ] if ;
|
|
|
|
|
|
|
|
: reorder ( string -- )
|
|
|
|
0 reorder-loop ;
|
|
|
|
|
|
|
|
: reorder-back ( string i -- )
|
2008-04-26 00:17:08 -04:00
|
|
|
over [ non-starter? not ] find-last-from drop ?1+ reorder-next 2drop ;
|
2008-01-09 14:44:07 -05:00
|
|
|
|
2008-05-24 18:34:01 -04:00
|
|
|
:: decompose ( string quot -- decomposed )
|
2009-01-08 23:23:39 -05:00
|
|
|
[let | out [ string length <sbuf> ] |
|
2009-01-08 18:56:52 -05:00
|
|
|
string [
|
2009-01-08 23:23:39 -05:00
|
|
|
dup hangul? [ hangul>jamo out push-all ]
|
|
|
|
[ dup quot call [ out push-all ] [ out push ] ?if ] if
|
|
|
|
] each out >string
|
|
|
|
] dup reorder ;
|
2009-01-08 18:56:52 -05:00
|
|
|
|
|
|
|
: with-string ( str quot -- str )
|
|
|
|
over aux>> [ call ] [ drop ] if ; inline
|
|
|
|
|
|
|
|
: (nfd) ( string -- nfd )
|
|
|
|
[ canonical-entry ] decompose ;
|
|
|
|
|
|
|
|
: (nfkd) ( string -- nfkd )
|
|
|
|
[ compatibility-entry ] decompose ;
|
2008-01-09 14:44:07 -05:00
|
|
|
|
2009-01-07 18:59:01 -05:00
|
|
|
PRIVATE>
|
|
|
|
|
|
|
|
: nfd ( string -- nfd )
|
2009-01-08 18:56:52 -05:00
|
|
|
[ (nfd) ] with-string ;
|
2008-01-09 14:44:07 -05:00
|
|
|
|
2009-01-07 18:59:01 -05:00
|
|
|
: nfkd ( string -- nfkd )
|
2009-01-08 18:56:52 -05:00
|
|
|
[ (nfkd) ] with-string ;
|
2008-01-09 14:44:07 -05:00
|
|
|
|
|
|
|
: string-append ( s1 s2 -- string )
|
|
|
|
[ append ] keep
|
|
|
|
0 over ?nth non-starter?
|
|
|
|
[ length dupd reorder-back ] [ drop ] if ;
|
|
|
|
|
2009-01-07 18:59:01 -05:00
|
|
|
<PRIVATE
|
|
|
|
|
2008-01-09 14:44:07 -05:00
|
|
|
! Normalization -- Composition
|
|
|
|
SYMBOL: main-str
|
|
|
|
SYMBOL: ind
|
|
|
|
SYMBOL: after
|
|
|
|
SYMBOL: char
|
|
|
|
|
|
|
|
: get-str ( i -- ch ) ind get + main-str get ?nth ;
|
|
|
|
: current ( -- ch ) 0 get-str ;
|
|
|
|
: to ( -- ) ind inc ;
|
|
|
|
|
|
|
|
: initial-medial? ( -- ? )
|
|
|
|
current initial? [ 1 get-str medial? ] [ f ] if ;
|
|
|
|
|
|
|
|
: --final? ( -- ? )
|
|
|
|
2 get-str final? ;
|
|
|
|
|
|
|
|
: imf, ( -- )
|
|
|
|
current to current to current jamo>hangul , ;
|
|
|
|
|
|
|
|
: im, ( -- )
|
2009-01-06 17:25:12 -05:00
|
|
|
current to current final-base jamo>hangul , ;
|
2008-01-09 14:44:07 -05:00
|
|
|
|
|
|
|
: compose-jamo ( -- )
|
|
|
|
initial-medial? [
|
|
|
|
--final? [ imf, ] [ im, ] if
|
2009-01-06 17:25:12 -05:00
|
|
|
] [ current , ] if to ;
|
2008-01-09 14:44:07 -05:00
|
|
|
|
|
|
|
: pass-combining ( -- )
|
|
|
|
current non-starter? [ current , to pass-combining ] when ;
|
|
|
|
|
2009-01-05 23:19:14 -05:00
|
|
|
:: try-compose ( last-class new-char current-class -- new-class )
|
|
|
|
last-class current-class = [ new-char after get push last-class ] [
|
|
|
|
char get new-char combine-chars
|
|
|
|
[ char set last-class ]
|
|
|
|
[ new-char after get push current-class ] if*
|
2008-01-09 14:44:07 -05:00
|
|
|
] if ;
|
|
|
|
|
2009-01-06 17:25:12 -05:00
|
|
|
DEFER: compose-iter
|
|
|
|
|
|
|
|
: try-noncombining ( char -- )
|
|
|
|
char get swap combine-chars
|
|
|
|
[ char set to f compose-iter ] when* ;
|
|
|
|
|
2009-01-05 23:19:14 -05:00
|
|
|
: compose-iter ( last-class -- )
|
2008-01-09 14:44:07 -05:00
|
|
|
current [
|
2009-01-08 18:56:52 -05:00
|
|
|
dup combining-class {
|
|
|
|
{ f [ 2drop ] }
|
|
|
|
{ 0 [ swap [ drop ] [ try-noncombining ] if ] }
|
|
|
|
[ try-compose to compose-iter ]
|
|
|
|
} case
|
2008-01-09 14:44:07 -05:00
|
|
|
] [ drop ] if* ;
|
|
|
|
|
|
|
|
: ?new-after ( -- )
|
|
|
|
after [ dup empty? [ drop SBUF" " clone ] unless ] change ;
|
|
|
|
|
2009-01-08 18:56:52 -05:00
|
|
|
: compose-combining ( ch -- )
|
|
|
|
char set to ?new-after
|
|
|
|
f compose-iter
|
|
|
|
char get , after get % ;
|
|
|
|
|
2008-01-09 14:44:07 -05:00
|
|
|
: (compose) ( -- )
|
|
|
|
current [
|
|
|
|
dup jamo? [ drop compose-jamo ] [
|
2009-01-08 18:56:52 -05:00
|
|
|
1 get-str combining-class
|
|
|
|
[ compose-combining ] [ , to ] if
|
2008-01-09 14:44:07 -05:00
|
|
|
] if (compose)
|
|
|
|
] when* ;
|
|
|
|
|
2009-01-08 00:54:19 -05:00
|
|
|
: combine ( str -- comp )
|
2008-01-09 14:44:07 -05:00
|
|
|
[
|
|
|
|
main-str set
|
|
|
|
0 ind set
|
|
|
|
SBUF" " clone after set
|
|
|
|
pass-combining (compose)
|
2008-01-29 14:33:14 -05:00
|
|
|
] "" make ;
|
2008-01-09 14:44:07 -05:00
|
|
|
|
2009-01-07 18:59:01 -05:00
|
|
|
PRIVATE>
|
|
|
|
|
2008-01-09 14:44:07 -05:00
|
|
|
: nfc ( string -- nfc )
|
2009-01-08 18:56:52 -05:00
|
|
|
[ (nfd) combine ] with-string ;
|
2008-01-09 14:44:07 -05:00
|
|
|
|
|
|
|
: nfkc ( string -- nfkc )
|
2009-01-08 18:56:52 -05:00
|
|
|
[ (nfkd) combine ] with-string ;
|