diff --git a/basis/io/encodings/korean/korean-tests.factor b/basis/io/encodings/korean/korean-tests.factor index d8acaf71a8..b39aa866d1 100644 --- a/basis/io/encodings/korean/korean-tests.factor +++ b/basis/io/encodings/korean/korean-tests.factor @@ -5,33 +5,22 @@ io.encodings.korean.private io.encodings.string io.streams.string kernel locals multiline namespaces sequences strings tools.test ; IN: io.encodings.korean.tests +! convert cp949 <> unicode +[ f ] [ HEX: 80 cp949>unicode ] unit-test +[ f ] [ HEX: ff cp949>unicode ] unit-test +[ HEX: ac02 ] [ HEX: 8141 cp949>unicode ] unit-test +[ HEX: 7f ] [ HEX: 7f cp949>unicode ] unit-test +[ HEX: c724 ] [ HEX: c0b1 cp949>unicode ] unit-test -! convert cp949 <-> unicode +[ HEX: 8141 ] [ HEX: ac02 unicode>cp949 ] unit-test +[ HEX: 7f ] [ HEX: 7f unicode>cp949 ] unit-test +[ HEX: c0b1 ] [ HEX: c724 unicode>cp949 ] unit-test -[ f ] [ HEX: 80 (cp949->unicode) ] unit-test -[ f ] [ HEX: ff (cp949->unicode) ] unit-test -[ HEX: ac02 ] [ HEX: 8141 (cp949->unicode) ] unit-test -[ HEX: 7f ] [ HEX: 7f (cp949->unicode) ] unit-test -[ HEX: c724 ] [ HEX: c0b1 (cp949->unicode) ] unit-test - -[ HEX: 8141 ] [ HEX: ac02 (unicode->cp949) ] unit-test -[ HEX: 7f ] [ HEX: 7f (unicode->cp949) ] unit-test -[ HEX: c0b1 ] [ HEX: c724 (unicode->cp949) ] unit-test - - -! byte manip. -[ HEX: beaf ] [ HEX: be HEX: af (2b->1mb) ] unit-test -[ HEX: be ] [ HEX: beaf (1mb->1st) ] unit-test -[ HEX: af ] [ HEX: beaf (1mb->2nd) ] unit-test -[ HEX: be HEX: af ] [ HEX: beaf (1mb->2b) ] unit-test - - -! -: (t-phrase-unicode) ( -- s ) +: phrase-unicode ( -- s ) "\u00b3d9\u00d574\u00bb3c\u00acfc \u00bc31\u00b450\u00c0b0\u00c774!" ; -: (t-phrase-cp949) ( -- s ) +: phrase-cp949 ( -- s ) { HEX: b5 HEX: bf HEX: c7 HEX: d8 HEX: b9 HEX: b0 HEX: b0 HEX: fa @@ -40,19 +29,18 @@ IN: io.encodings.korean.tests HEX: cc HEX: 21 } ; -: (t-phrase-unicode->cp949) ( -- s ) - (t-phrase-unicode) cp949 encode ; +: phrase-unicode>cp949 ( -- s ) + phrase-unicode cp949 encode ; -: (t-phrase-cp949->unicode) ( -- s ) - (t-phrase-cp949) cp949 decode ; +: phrase-cp949>unicode ( -- s ) + phrase-cp949 cp949 decode ; +[ t ] [ phrase-unicode>cp949 >array phrase-cp949 = ] unit-test -[ t ] [ (t-phrase-unicode->cp949) >array (t-phrase-cp949) = ] unit-test +[ t ] [ phrase-cp949>unicode phrase-unicode = ] unit-test -[ t ] [ (t-phrase-cp949->unicode) (t-phrase-unicode) = ] unit-test +[ t ] [ phrase-cp949 1 head* cp949 decode phrase-unicode 1 head* = ] unit-test +[ t ] [ phrase-cp949 3 head* cp949 decode phrase-unicode 2 head* = ] unit-test - - - -! EOF +[ t ] [ phrase-cp949 2 head* cp949 decode phrase-unicode 2 head* CHAR: replacement-character suffix = ] unit-test diff --git a/basis/io/encodings/korean/korean.factor b/basis/io/encodings/korean/korean.factor index ab77c79f89..4f387d8987 100644 --- a/basis/io/encodings/korean/korean.factor +++ b/basis/io/encodings/korean/korean.factor @@ -3,125 +3,77 @@ USING: assocs byte-arrays combinators io io.encodings io.encodings.ascii io.encodings.iana io.files kernel locals math math.order math.parser memoize multiline sequences splitting -values hashtables ; +values hashtables io.binary ; IN: io.encodings.korean - SINGLETON: cp949 -ALIAS: ms949 cp949 -ALIAS: euc-kr cp949 -ALIAS: euckr cp949 - cp949 "EUC-KR" register-encoding - - table +! parse cp949.txt > table -: (cp949.txt-lines) ( -- seq ) +: cp949.txt-lines ( -- seq ) ! "cp949.txt" from ... ! "resource:basis/io/encodings/korean/data/cp949.txt" ascii file-lines ; -: (PCL-drop-comments) ( seq -- newseq ) +: drop-comments ( seq -- newseq ) [ "#" split1 drop ] map harvest ; -: (PCL-split-column) ( line -- columns ) +: split-column ( line -- columns ) "\t" split 2 head ; -: (PCL-parse-hex) ( s -- n ) +: parse-hex ( s -- n ) 2 short tail hex> ; -: (PCL-parse-line) ( line -- code-unicode ) - (PCL-split-column) - [ (PCL-parse-hex) ] map ; +: parse-line ( line -- code-unicode ) + split-column [ parse-hex ] map ; -: (process-codetable-lines) ( lines -- assoc ) - (PCL-drop-comments) - [ (PCL-parse-line) ] map ; +: process-codetable-lines ( lines -- assoc ) + drop-comments [ parse-line ] map ; +! convert cp949 <> unicode -! convert cp949 <-> unicode +MEMO: cp949>unicode-table ( -- hashtable ) + cp949.txt-lines process-codetable-lines >hashtable ; -: (cp949.txt>alist) ( -- alist ) - (cp949.txt-lines) (process-codetable-lines) ; +MEMO: unicode>cp949-table ( -- hashtable ) + cp949>unicode-table [ swap ] assoc-map ; -: (make-cp949->unicode-table) ( alist -- h ) - >hashtable ; +unicode>cp949-table drop -: (make-unicode->cp949-table) ( alist -- h ) - [ reverse ] map >hashtable ; +: cp949>unicode ( b -- u ) + cp949>unicode-table at ; -VALUE: cp949->unicode-table -VALUE: unicode->cp949-table +: unicode>cp949 ( u -- b ) + unicode>cp949-table at ; -(cp949.txt>alist) dup -(make-cp949->unicode-table) to: cp949->unicode-table -(make-unicode->cp949-table) to: unicode->cp949-table +: cp949-1st? ( n -- ? ) + dup [ HEX: 81 HEX: fe between? ] when ; - -MEMO: (cp949->unicode) ( b -- u ) - cp949->unicode-table at ; - -MEMO: (unicode->cp949) ( u -- b ) - unicode->cp949-table at ; - -:: (2b->1mb) ( c1 c2 -- mb ) - c1 8 shift c2 + ; - -:: (1mb->1st) ( mb -- c1 ) - mb HEX: ff00 bitand -8 shift ; - -:: (1mb->2nd) ( mb -- c2 ) - mb HEX: ff bitand ; - -:: (1mb->2b) ( mb -- c1 c2 ) - mb (1mb->1st) - mb (1mb->2nd) ; - -: (cp949-1st?) ( n -- ? ) - dup f = not - [ HEX: 81 HEX: fe between? ] when ; - -: (1byte-unicode?) ( n -- ? ) +: byte? ( n -- ? ) 0 HEX: ff between? ; - - M:: cp949 encode-char ( char stream encoding -- ) - char (unicode->cp949) (1byte-unicode?) - [ char 1byte-array - stream stream-write ] - [ char (unicode->cp949) - (1mb->2b) 2byte-array - stream stream-write ] - if ; + char unicode>cp949 byte? + [ char 1byte-array stream stream-write ] [ + char unicode>cp949 + h>b/b swap 2byte-array + stream stream-write + ] if ; - -: (eof?) ( n -- ? ) 0 = ; - -: (decode-char-step2) ( c stream -- char/f ) - stream-read1 (2b->1mb) (cp949->unicode) ; +: decode-char-step2 ( c stream -- char ) + stream-read1 + [ 2byte-array be> cp949>unicode ] + [ drop replacement-char ] if* ; M:: cp949 decode-char ( stream encoding -- char/f ) stream stream-read1 { - { [ dup f = ] [ drop f ] } - { [ dup (eof?) ] [ drop replacement-char ] } - { [ dup (cp949-1st?) ] [ stream (decode-char-step2) ] } + { [ dup not ] [ drop f ] } + { [ dup cp949-1st? ] [ stream decode-char-step2 ] } [ ] } cond ; - - -! TODO: - -! TODO: - - - - -! EOF