Factoring out EUC code

db4
Daniel Ehrenberg 2009-03-02 18:22:55 -06:00
parent 5aba91a0c5
commit 0bff96a990
9 changed files with 128 additions and 135 deletions

View File

@ -1,14 +1,10 @@
! Copyright (C) 2009 Yun, Jonghyouk.
! See http://factorcode.org/license.txt for BSD license.
USING: help.syntax help.markup ;
IN: io.encodings.korean
IN: io.encodings.euc-kr
ARTICLE: "io.encodings.korean" "Korean text encodings"
"The " { $vocab-link "io.encodings.korean" } " vocabulary implements encodings used for Korean text besides the standard UTF encodings for Unicode strings."
{ $subsection cp949 } ;
ABOUT: euc-kr
ABOUT: "io.encodings.korean"
HELP: cp949
{ $class-description "This encoding class implements Microsoft's code page #949 encoding, also called Unified Hangul Code or ks_c_5601-1987, UHC. CP949 is extended version of EUC-KR and downward-compatibility to EUC-KR. " }
HELP: euc-kr
{ $class-description "This encoding class implements Microsoft's code page #949 encoding, also called Unified Hangul Code or ks_c_5601-1987, UHC. CP949 is extended version of EUC-KR and downward-compatibility to EUC-KR, in practice." }
{ $see-also "encodings-introduction" } ;

View File

@ -0,0 +1,8 @@
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: io.encodings.iana io.encodings.euc ;
IN: io.encodings.euc-kr
EUC: euc-kr "vocab:io/encodings/euc-kr/data/cp949.txt"
euc-kr "EUC-KR" register-encoding

View File

@ -0,0 +1,50 @@
! Copyright (C) 2009 Yun, Jonghyouk.
! See http://factorcode.org/license.txt for BSD license.
USING: arrays byte-arrays io io.encodings io.encodings.euc-kr assocs
io.encodings.string io.streams.string io.encodings.euc.private words
kernel locals multiline namespaces sequences strings tools.test ;
IN: io.encodings.euc.tests
: euc-kr>unicode ( ch -- ch/f )
euc-kr euc-table word-prop at ;
: unicode>euc-kr ( ch -- ch/f )
euc-kr euc-table word-prop value-at ;
[ f ] [ HEX: 80 euc-kr>unicode ] unit-test
[ f ] [ HEX: ff euc-kr>unicode ] unit-test
[ HEX: ac02 ] [ HEX: 8141 euc-kr>unicode ] unit-test
[ HEX: 7f ] [ HEX: 7f euc-kr>unicode ] unit-test
[ HEX: c724 ] [ HEX: c0b1 euc-kr>unicode ] unit-test
[ HEX: 8141 ] [ HEX: ac02 unicode>euc-kr ] unit-test
[ HEX: 7f ] [ HEX: 7f unicode>euc-kr ] unit-test
[ HEX: c0b1 ] [ HEX: c724 unicode>euc-kr ] unit-test
: phrase-unicode ( -- s )
"\u00b3d9\u00d574\u00bb3c\u00acfc \u00bc31\u00b450\u00c0b0\u00c774!" ;
: phrase-euc-kr ( -- s )
{
HEX: b5 HEX: bf HEX: c7 HEX: d8
HEX: b9 HEX: b0 HEX: b0 HEX: fa
HEX: 20 HEX: b9 HEX: e9 HEX: b5
HEX: ce HEX: bb HEX: ea HEX: c0
HEX: cc HEX: 21
} ;
: phrase-unicode>euc-kr ( -- s )
phrase-unicode euc-kr encode ;
: phrase-euc-kr>unicode ( -- s )
phrase-euc-kr euc-kr decode ;
[ t ] [ phrase-unicode>euc-kr >array phrase-euc-kr = ] unit-test
[ t ] [ phrase-euc-kr>unicode phrase-unicode = ] unit-test
[ t ] [ phrase-euc-kr 1 head* euc-kr decode phrase-unicode 1 head* = ] unit-test
[ t ] [ phrase-euc-kr 3 head* euc-kr decode phrase-unicode 2 head* = ] unit-test
[ t ] [ phrase-euc-kr 2 head* euc-kr decode phrase-unicode 2 head* CHAR: replacement-character suffix = ] unit-test

View File

@ -0,0 +1,66 @@
! Copyright (C) 2009 Daniel Ehrenberg, Jonghyouk Yun.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel io.encodings accessors assocs sequences biassocs generic
math.order simple-flat-file io io.binary byte-arrays locals combinators
words classes.singleton fry classes.parser parser quotations ;
IN: io.encodings.euc
TUPLE: euc { table biassoc } ;
<PRIVATE
: byte? ( ch -- ? )
HEX: 0 HEX: ff between? ;
M: euc encode-char ( char stream encoding -- )
swapd table>> value-at dup byte?
[ swap stream-write1 ] [
h>b/b swap 2byte-array
swap stream-write
] if ;
: euc-multibyte? ( ch -- ? )
HEX: 81 HEX: fe between? ;
:: decode-multibyte ( ch stream encoding -- char )
stream stream-read1
[ ch swap 2byte-array be> encoding table>> at ]
[ replacement-char ] if* ;
M:: euc decode-char ( stream encoding -- char/f )
stream stream-read1
{
{ [ dup not ] [ drop f ] }
{ [ dup euc-multibyte? ] [ stream encoding decode-multibyte ] }
[ encoding table>> at ]
} cond ;
: define-method ( class word definition -- )
[ create-method ] dip define ;
SYMBOL: euc-table
: setup-euc ( word file-name -- singleton-class biassoc )
[ dup define-singleton-class ]
[ flat-file>biassoc ] bi* ;
:: define-recursive-methods ( class data words -- )
words [| word |
class word [ drop data word execute ] define-method
] each ;
: euc-methods ( singleton-class biassoc -- )
[ euc-table set-word-prop ] [
euc boa
{ <encoder> <decoder> }
define-recursive-methods
] 2bi ;
: define-euc ( word file-name -- )
setup-euc euc-methods ;
PRIVATE>
: EUC:
! EUC: euc-kr "vocab:io/encodings/korean/cp949.txt"
CREATE-CLASS scan-object define-euc ; parsing

View File

@ -1,46 +0,0 @@
! Copyright (C) 2009 Yun, Jonghyouk.
! See http://factorcode.org/license.txt for BSD license.
USING: arrays byte-arrays io io.encodings io.encodings.korean
io.encodings.korean.private io.encodings.string io.streams.string
kernel locals multiline namespaces sequences strings tools.test ;
IN: io.encodings.korean.tests
! convert cp949 <> unicode
[ f ] [ HEX: 80 cp949>unicode ] unit-test
[ f ] [ HEX: ff cp949>unicode ] unit-test
[ HEX: ac02 ] [ HEX: 8141 cp949>unicode ] unit-test
[ HEX: 7f ] [ HEX: 7f cp949>unicode ] unit-test
[ HEX: c724 ] [ HEX: c0b1 cp949>unicode ] unit-test
[ HEX: 8141 ] [ HEX: ac02 unicode>cp949 ] unit-test
[ HEX: 7f ] [ HEX: 7f unicode>cp949 ] unit-test
[ HEX: c0b1 ] [ HEX: c724 unicode>cp949 ] unit-test
: phrase-unicode ( -- s )
"\u00b3d9\u00d574\u00bb3c\u00acfc \u00bc31\u00b450\u00c0b0\u00c774!" ;
: phrase-cp949 ( -- s )
{
HEX: b5 HEX: bf HEX: c7 HEX: d8
HEX: b9 HEX: b0 HEX: b0 HEX: fa
HEX: 20 HEX: b9 HEX: e9 HEX: b5
HEX: ce HEX: bb HEX: ea HEX: c0
HEX: cc HEX: 21
} ;
: phrase-unicode>cp949 ( -- s )
phrase-unicode cp949 encode ;
: phrase-cp949>unicode ( -- s )
phrase-cp949 cp949 decode ;
[ t ] [ phrase-unicode>cp949 >array phrase-cp949 = ] unit-test
[ t ] [ phrase-cp949>unicode phrase-unicode = ] unit-test
[ t ] [ phrase-cp949 1 head* cp949 decode phrase-unicode 1 head* = ] unit-test
[ t ] [ phrase-cp949 3 head* cp949 decode phrase-unicode 2 head* = ] unit-test
[ t ] [ phrase-cp949 2 head* cp949 decode phrase-unicode 2 head* CHAR: replacement-character suffix = ] unit-test

View File

@ -1,81 +0,0 @@
! Copyright (C) 2009 Yun, Jonghyouk.
! See http://factorcode.org/license.txt for BSD license.
USING: assocs byte-arrays combinators io io.encodings
io.encodings.ascii io.encodings.iana io.files kernel locals math
math.order math.parser memoize multiline sequences splitting
values hashtables io.binary ;
IN: io.encodings.korean
! TODO: migrate to common code-table parser (by Dan).
SINGLETON: cp949
cp949 "EUC-KR" register-encoding
<PRIVATE
! parse cp949.txt > table
: cp949.txt-lines ( -- seq )
! "cp949.txt" from ...
! <http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT>
"vocab:io/encodings/korean/data/cp949.txt"
ascii file-lines ;
: drop-comments ( seq -- newseq )
[ "#" split1 drop ] map harvest ;
: split-column ( line -- columns )
"\t" split 2 head ;
: parse-hex ( s -- n )
2 short tail hex> ;
: parse-line ( line -- code-unicode )
split-column [ parse-hex ] map ;
: process-codetable-lines ( lines -- assoc )
drop-comments [ parse-line ] map ;
! convert cp949 <> unicode
MEMO: cp949>unicode-table ( -- hashtable )
cp949.txt-lines process-codetable-lines >hashtable ;
MEMO: unicode>cp949-table ( -- hashtable )
cp949>unicode-table [ swap ] assoc-map ;
unicode>cp949-table drop
: cp949>unicode ( b -- u )
cp949>unicode-table at ;
: unicode>cp949 ( u -- b )
unicode>cp949-table at ;
: cp949-1st? ( n -- ? )
dup [ HEX: 81 HEX: fe between? ] when ;
: byte? ( n -- ? )
0 HEX: ff between? ;
M:: cp949 encode-char ( char stream encoding -- )
char unicode>cp949 byte?
[ char 1byte-array stream stream-write ] [
char unicode>cp949
h>b/b swap 2byte-array
stream stream-write
] if ;
: decode-char-step2 ( c stream -- char )
stream-read1
[ 2byte-array be> cp949>unicode ]
[ drop replacement-char ] if* ;
M:: cp949 decode-char ( stream encoding -- char/f )
stream stream-read1
{
{ [ dup not ] [ drop f ] }
{ [ dup cp949-1st? ] [ stream decode-char-step2 ] }
[ ]
} cond ;