Factoring out EUC code
parent
5aba91a0c5
commit
0bff96a990
|
@ -1,14 +1,10 @@
|
|||
! Copyright (C) 2009 Yun, Jonghyouk.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: help.syntax help.markup ;
|
||||
IN: io.encodings.korean
|
||||
IN: io.encodings.euc-kr
|
||||
|
||||
ARTICLE: "io.encodings.korean" "Korean text encodings"
|
||||
"The " { $vocab-link "io.encodings.korean" } " vocabulary implements encodings used for Korean text besides the standard UTF encodings for Unicode strings."
|
||||
{ $subsection cp949 } ;
|
||||
ABOUT: euc-kr
|
||||
|
||||
ABOUT: "io.encodings.korean"
|
||||
|
||||
HELP: cp949
|
||||
{ $class-description "This encoding class implements Microsoft's code page #949 encoding, also called Unified Hangul Code or ks_c_5601-1987, UHC. CP949 is extended version of EUC-KR and downward-compatibility to EUC-KR. " }
|
||||
HELP: euc-kr
|
||||
{ $class-description "This encoding class implements Microsoft's code page #949 encoding, also called Unified Hangul Code or ks_c_5601-1987, UHC. CP949 is extended version of EUC-KR and downward-compatibility to EUC-KR, in practice." }
|
||||
{ $see-also "encodings-introduction" } ;
|
|
@ -0,0 +1,8 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: io.encodings.iana io.encodings.euc ;
|
||||
IN: io.encodings.euc-kr
|
||||
|
||||
EUC: euc-kr "vocab:io/encodings/euc-kr/data/cp949.txt"
|
||||
|
||||
euc-kr "EUC-KR" register-encoding
|
|
@ -0,0 +1,50 @@
|
|||
! Copyright (C) 2009 Yun, Jonghyouk.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: arrays byte-arrays io io.encodings io.encodings.euc-kr assocs
|
||||
io.encodings.string io.streams.string io.encodings.euc.private words
|
||||
kernel locals multiline namespaces sequences strings tools.test ;
|
||||
IN: io.encodings.euc.tests
|
||||
|
||||
: euc-kr>unicode ( ch -- ch/f )
|
||||
euc-kr euc-table word-prop at ;
|
||||
|
||||
: unicode>euc-kr ( ch -- ch/f )
|
||||
euc-kr euc-table word-prop value-at ;
|
||||
|
||||
[ f ] [ HEX: 80 euc-kr>unicode ] unit-test
|
||||
[ f ] [ HEX: ff euc-kr>unicode ] unit-test
|
||||
[ HEX: ac02 ] [ HEX: 8141 euc-kr>unicode ] unit-test
|
||||
[ HEX: 7f ] [ HEX: 7f euc-kr>unicode ] unit-test
|
||||
[ HEX: c724 ] [ HEX: c0b1 euc-kr>unicode ] unit-test
|
||||
|
||||
[ HEX: 8141 ] [ HEX: ac02 unicode>euc-kr ] unit-test
|
||||
[ HEX: 7f ] [ HEX: 7f unicode>euc-kr ] unit-test
|
||||
[ HEX: c0b1 ] [ HEX: c724 unicode>euc-kr ] unit-test
|
||||
|
||||
: phrase-unicode ( -- s )
|
||||
"\u00b3d9\u00d574\u00bb3c\u00acfc \u00bc31\u00b450\u00c0b0\u00c774!" ;
|
||||
|
||||
: phrase-euc-kr ( -- s )
|
||||
{
|
||||
HEX: b5 HEX: bf HEX: c7 HEX: d8
|
||||
HEX: b9 HEX: b0 HEX: b0 HEX: fa
|
||||
HEX: 20 HEX: b9 HEX: e9 HEX: b5
|
||||
HEX: ce HEX: bb HEX: ea HEX: c0
|
||||
HEX: cc HEX: 21
|
||||
} ;
|
||||
|
||||
: phrase-unicode>euc-kr ( -- s )
|
||||
phrase-unicode euc-kr encode ;
|
||||
|
||||
: phrase-euc-kr>unicode ( -- s )
|
||||
phrase-euc-kr euc-kr decode ;
|
||||
|
||||
[ t ] [ phrase-unicode>euc-kr >array phrase-euc-kr = ] unit-test
|
||||
|
||||
[ t ] [ phrase-euc-kr>unicode phrase-unicode = ] unit-test
|
||||
|
||||
[ t ] [ phrase-euc-kr 1 head* euc-kr decode phrase-unicode 1 head* = ] unit-test
|
||||
|
||||
[ t ] [ phrase-euc-kr 3 head* euc-kr decode phrase-unicode 2 head* = ] unit-test
|
||||
|
||||
[ t ] [ phrase-euc-kr 2 head* euc-kr decode phrase-unicode 2 head* CHAR: replacement-character suffix = ] unit-test
|
|
@ -0,0 +1,66 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg, Jonghyouk Yun.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: kernel io.encodings accessors assocs sequences biassocs generic
|
||||
math.order simple-flat-file io io.binary byte-arrays locals combinators
|
||||
words classes.singleton fry classes.parser parser quotations ;
|
||||
IN: io.encodings.euc
|
||||
|
||||
TUPLE: euc { table biassoc } ;
|
||||
|
||||
<PRIVATE
|
||||
|
||||
: byte? ( ch -- ? )
|
||||
HEX: 0 HEX: ff between? ;
|
||||
|
||||
M: euc encode-char ( char stream encoding -- )
|
||||
swapd table>> value-at dup byte?
|
||||
[ swap stream-write1 ] [
|
||||
h>b/b swap 2byte-array
|
||||
swap stream-write
|
||||
] if ;
|
||||
|
||||
: euc-multibyte? ( ch -- ? )
|
||||
HEX: 81 HEX: fe between? ;
|
||||
|
||||
:: decode-multibyte ( ch stream encoding -- char )
|
||||
stream stream-read1
|
||||
[ ch swap 2byte-array be> encoding table>> at ]
|
||||
[ replacement-char ] if* ;
|
||||
|
||||
M:: euc decode-char ( stream encoding -- char/f )
|
||||
stream stream-read1
|
||||
{
|
||||
{ [ dup not ] [ drop f ] }
|
||||
{ [ dup euc-multibyte? ] [ stream encoding decode-multibyte ] }
|
||||
[ encoding table>> at ]
|
||||
} cond ;
|
||||
|
||||
: define-method ( class word definition -- )
|
||||
[ create-method ] dip define ;
|
||||
|
||||
SYMBOL: euc-table
|
||||
|
||||
: setup-euc ( word file-name -- singleton-class biassoc )
|
||||
[ dup define-singleton-class ]
|
||||
[ flat-file>biassoc ] bi* ;
|
||||
|
||||
:: define-recursive-methods ( class data words -- )
|
||||
words [| word |
|
||||
class word [ drop data word execute ] define-method
|
||||
] each ;
|
||||
|
||||
: euc-methods ( singleton-class biassoc -- )
|
||||
[ euc-table set-word-prop ] [
|
||||
euc boa
|
||||
{ <encoder> <decoder> }
|
||||
define-recursive-methods
|
||||
] 2bi ;
|
||||
|
||||
: define-euc ( word file-name -- )
|
||||
setup-euc euc-methods ;
|
||||
|
||||
PRIVATE>
|
||||
|
||||
: EUC:
|
||||
! EUC: euc-kr "vocab:io/encodings/korean/cp949.txt"
|
||||
CREATE-CLASS scan-object define-euc ; parsing
|
|
@ -1,46 +0,0 @@
|
|||
! Copyright (C) 2009 Yun, Jonghyouk.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: arrays byte-arrays io io.encodings io.encodings.korean
|
||||
io.encodings.korean.private io.encodings.string io.streams.string
|
||||
kernel locals multiline namespaces sequences strings tools.test ;
|
||||
IN: io.encodings.korean.tests
|
||||
|
||||
! convert cp949 <> unicode
|
||||
|
||||
[ f ] [ HEX: 80 cp949>unicode ] unit-test
|
||||
[ f ] [ HEX: ff cp949>unicode ] unit-test
|
||||
[ HEX: ac02 ] [ HEX: 8141 cp949>unicode ] unit-test
|
||||
[ HEX: 7f ] [ HEX: 7f cp949>unicode ] unit-test
|
||||
[ HEX: c724 ] [ HEX: c0b1 cp949>unicode ] unit-test
|
||||
|
||||
[ HEX: 8141 ] [ HEX: ac02 unicode>cp949 ] unit-test
|
||||
[ HEX: 7f ] [ HEX: 7f unicode>cp949 ] unit-test
|
||||
[ HEX: c0b1 ] [ HEX: c724 unicode>cp949 ] unit-test
|
||||
|
||||
: phrase-unicode ( -- s )
|
||||
"\u00b3d9\u00d574\u00bb3c\u00acfc \u00bc31\u00b450\u00c0b0\u00c774!" ;
|
||||
|
||||
: phrase-cp949 ( -- s )
|
||||
{
|
||||
HEX: b5 HEX: bf HEX: c7 HEX: d8
|
||||
HEX: b9 HEX: b0 HEX: b0 HEX: fa
|
||||
HEX: 20 HEX: b9 HEX: e9 HEX: b5
|
||||
HEX: ce HEX: bb HEX: ea HEX: c0
|
||||
HEX: cc HEX: 21
|
||||
} ;
|
||||
|
||||
: phrase-unicode>cp949 ( -- s )
|
||||
phrase-unicode cp949 encode ;
|
||||
|
||||
: phrase-cp949>unicode ( -- s )
|
||||
phrase-cp949 cp949 decode ;
|
||||
|
||||
[ t ] [ phrase-unicode>cp949 >array phrase-cp949 = ] unit-test
|
||||
|
||||
[ t ] [ phrase-cp949>unicode phrase-unicode = ] unit-test
|
||||
|
||||
[ t ] [ phrase-cp949 1 head* cp949 decode phrase-unicode 1 head* = ] unit-test
|
||||
|
||||
[ t ] [ phrase-cp949 3 head* cp949 decode phrase-unicode 2 head* = ] unit-test
|
||||
|
||||
[ t ] [ phrase-cp949 2 head* cp949 decode phrase-unicode 2 head* CHAR: replacement-character suffix = ] unit-test
|
|
@ -1,81 +0,0 @@
|
|||
! Copyright (C) 2009 Yun, Jonghyouk.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: assocs byte-arrays combinators io io.encodings
|
||||
io.encodings.ascii io.encodings.iana io.files kernel locals math
|
||||
math.order math.parser memoize multiline sequences splitting
|
||||
values hashtables io.binary ;
|
||||
IN: io.encodings.korean
|
||||
|
||||
! TODO: migrate to common code-table parser (by Dan).
|
||||
|
||||
SINGLETON: cp949
|
||||
|
||||
cp949 "EUC-KR" register-encoding
|
||||
|
||||
<PRIVATE
|
||||
|
||||
! parse cp949.txt > table
|
||||
|
||||
: cp949.txt-lines ( -- seq )
|
||||
! "cp949.txt" from ...
|
||||
! <http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT>
|
||||
"vocab:io/encodings/korean/data/cp949.txt"
|
||||
ascii file-lines ;
|
||||
|
||||
: drop-comments ( seq -- newseq )
|
||||
[ "#" split1 drop ] map harvest ;
|
||||
|
||||
: split-column ( line -- columns )
|
||||
"\t" split 2 head ;
|
||||
|
||||
: parse-hex ( s -- n )
|
||||
2 short tail hex> ;
|
||||
|
||||
: parse-line ( line -- code-unicode )
|
||||
split-column [ parse-hex ] map ;
|
||||
|
||||
: process-codetable-lines ( lines -- assoc )
|
||||
drop-comments [ parse-line ] map ;
|
||||
|
||||
! convert cp949 <> unicode
|
||||
|
||||
MEMO: cp949>unicode-table ( -- hashtable )
|
||||
cp949.txt-lines process-codetable-lines >hashtable ;
|
||||
|
||||
MEMO: unicode>cp949-table ( -- hashtable )
|
||||
cp949>unicode-table [ swap ] assoc-map ;
|
||||
|
||||
unicode>cp949-table drop
|
||||
|
||||
: cp949>unicode ( b -- u )
|
||||
cp949>unicode-table at ;
|
||||
|
||||
: unicode>cp949 ( u -- b )
|
||||
unicode>cp949-table at ;
|
||||
|
||||
: cp949-1st? ( n -- ? )
|
||||
dup [ HEX: 81 HEX: fe between? ] when ;
|
||||
|
||||
: byte? ( n -- ? )
|
||||
0 HEX: ff between? ;
|
||||
|
||||
M:: cp949 encode-char ( char stream encoding -- )
|
||||
char unicode>cp949 byte?
|
||||
[ char 1byte-array stream stream-write ] [
|
||||
char unicode>cp949
|
||||
h>b/b swap 2byte-array
|
||||
stream stream-write
|
||||
] if ;
|
||||
|
||||
: decode-char-step2 ( c stream -- char )
|
||||
stream-read1
|
||||
[ 2byte-array be> cp949>unicode ]
|
||||
[ drop replacement-char ] if* ;
|
||||
|
||||
M:: cp949 decode-char ( stream encoding -- char/f )
|
||||
stream stream-read1
|
||||
{
|
||||
{ [ dup not ] [ drop f ] }
|
||||
{ [ dup cp949-1st? ] [ stream decode-char-step2 ] }
|
||||
[ ]
|
||||
} cond ;
|
Loading…
Reference in New Issue