Cleanup and bug fix in io.encodings.korean
							parent
							
								
									66c2efb378
								
							
						
					
					
						commit
						523e0d993d
					
				| 
						 | 
				
			
			@ -5,33 +5,22 @@ io.encodings.korean.private io.encodings.string io.streams.string
 | 
			
		|||
kernel locals multiline namespaces sequences strings tools.test ;
 | 
			
		||||
IN: io.encodings.korean.tests
 | 
			
		||||
 | 
			
		||||
! convert cp949 <> unicode
 | 
			
		||||
 | 
			
		||||
[ f ] [ HEX: 80 cp949>unicode ] unit-test
 | 
			
		||||
[ f ] [ HEX: ff cp949>unicode ] unit-test
 | 
			
		||||
[ HEX: ac02 ] [ HEX: 8141 cp949>unicode ] unit-test
 | 
			
		||||
[ HEX: 7f ] [ HEX: 7f cp949>unicode ] unit-test
 | 
			
		||||
[ HEX: c724 ] [ HEX: c0b1 cp949>unicode ] unit-test
 | 
			
		||||
 | 
			
		||||
! convert cp949 <-> unicode
 | 
			
		||||
[ HEX: 8141 ] [ HEX: ac02 unicode>cp949 ] unit-test
 | 
			
		||||
[ HEX: 7f ] [ HEX: 7f unicode>cp949 ] unit-test
 | 
			
		||||
[ HEX: c0b1 ] [ HEX: c724 unicode>cp949 ] unit-test
 | 
			
		||||
 | 
			
		||||
[ f ] [ HEX: 80 (cp949->unicode) ] unit-test
 | 
			
		||||
[ f ] [ HEX: ff (cp949->unicode) ] unit-test
 | 
			
		||||
[ HEX: ac02 ] [ HEX: 8141 (cp949->unicode) ] unit-test
 | 
			
		||||
[ HEX: 7f ] [ HEX: 7f (cp949->unicode) ] unit-test
 | 
			
		||||
[ HEX: c724 ] [ HEX: c0b1 (cp949->unicode) ] unit-test
 | 
			
		||||
 | 
			
		||||
[ HEX: 8141 ] [ HEX: ac02 (unicode->cp949) ] unit-test
 | 
			
		||||
[ HEX: 7f ] [ HEX: 7f (unicode->cp949) ] unit-test
 | 
			
		||||
[ HEX: c0b1 ] [ HEX: c724 (unicode->cp949) ] unit-test
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
! byte manip.
 | 
			
		||||
[ HEX: beaf ] [ HEX: be HEX: af (2b->1mb) ] unit-test
 | 
			
		||||
[ HEX: be ] [ HEX: beaf (1mb->1st) ] unit-test
 | 
			
		||||
[ HEX: af ] [ HEX: beaf (1mb->2nd) ] unit-test
 | 
			
		||||
[ HEX: be HEX: af ] [ HEX: beaf (1mb->2b) ] unit-test
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
!
 | 
			
		||||
: (t-phrase-unicode) ( -- s )
 | 
			
		||||
: phrase-unicode ( -- s )
 | 
			
		||||
    "\u00b3d9\u00d574\u00bb3c\u00acfc \u00bc31\u00b450\u00c0b0\u00c774!" ;
 | 
			
		||||
 | 
			
		||||
: (t-phrase-cp949) ( -- s )
 | 
			
		||||
: phrase-cp949 ( -- s )
 | 
			
		||||
    {
 | 
			
		||||
        HEX: b5 HEX: bf HEX: c7 HEX: d8
 | 
			
		||||
        HEX: b9 HEX: b0 HEX: b0 HEX: fa
 | 
			
		||||
| 
						 | 
				
			
			@ -40,19 +29,18 @@ IN: io.encodings.korean.tests
 | 
			
		|||
        HEX: cc HEX: 21
 | 
			
		||||
    } ;
 | 
			
		||||
 | 
			
		||||
: (t-phrase-unicode->cp949) ( -- s )
 | 
			
		||||
    (t-phrase-unicode) cp949 encode ;
 | 
			
		||||
: phrase-unicode>cp949 ( -- s )
 | 
			
		||||
    phrase-unicode cp949 encode ;
 | 
			
		||||
 | 
			
		||||
: (t-phrase-cp949->unicode) ( -- s )
 | 
			
		||||
    (t-phrase-cp949) cp949 decode ;
 | 
			
		||||
: phrase-cp949>unicode ( -- s )
 | 
			
		||||
    phrase-cp949 cp949 decode ;
 | 
			
		||||
 | 
			
		||||
[ t ] [ phrase-unicode>cp949 >array phrase-cp949 = ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ (t-phrase-unicode->cp949) >array (t-phrase-cp949) = ] unit-test
 | 
			
		||||
[ t ]  [ phrase-cp949>unicode phrase-unicode = ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ]  [ (t-phrase-cp949->unicode) (t-phrase-unicode) = ] unit-test
 | 
			
		||||
[ t ] [ phrase-cp949 1 head* cp949 decode phrase-unicode 1 head* = ] unit-test
 | 
			
		||||
 | 
			
		||||
[ t ] [ phrase-cp949 3 head* cp949 decode phrase-unicode 2 head* = ] unit-test
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
! EOF
 | 
			
		||||
[ t ] [ phrase-cp949 2 head* cp949 decode phrase-unicode 2 head* CHAR: replacement-character suffix = ] unit-test
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3,125 +3,77 @@
 | 
			
		|||
USING: assocs byte-arrays combinators io io.encodings
 | 
			
		||||
io.encodings.ascii io.encodings.iana io.files kernel locals math
 | 
			
		||||
math.order math.parser memoize multiline sequences splitting
 | 
			
		||||
values hashtables ;
 | 
			
		||||
values hashtables io.binary ;
 | 
			
		||||
IN: io.encodings.korean
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
SINGLETON: cp949
 | 
			
		||||
 | 
			
		||||
ALIAS: ms949 cp949
 | 
			
		||||
ALIAS: euc-kr cp949
 | 
			
		||||
ALIAS: euckr cp949
 | 
			
		||||
 | 
			
		||||
cp949 "EUC-KR" register-encoding
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
<PRIVATE
 | 
			
		||||
 | 
			
		||||
! parse cp949.txt -> table
 | 
			
		||||
! parse cp949.txt > table
 | 
			
		||||
 | 
			
		||||
: (cp949.txt-lines) ( -- seq )
 | 
			
		||||
: cp949.txt-lines ( -- seq )
 | 
			
		||||
    ! "cp949.txt" from ...
 | 
			
		||||
    ! <http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT>
 | 
			
		||||
    "resource:basis/io/encodings/korean/data/cp949.txt"
 | 
			
		||||
    ascii file-lines ;
 | 
			
		||||
 | 
			
		||||
: (PCL-drop-comments) ( seq -- newseq )
 | 
			
		||||
: drop-comments ( seq -- newseq )
 | 
			
		||||
    [ "#" split1 drop ] map harvest ;
 | 
			
		||||
 | 
			
		||||
: (PCL-split-column) ( line -- columns )
 | 
			
		||||
: split-column ( line -- columns )
 | 
			
		||||
    "\t" split 2 head ;
 | 
			
		||||
 | 
			
		||||
: (PCL-parse-hex) ( s -- n )
 | 
			
		||||
: parse-hex ( s -- n )
 | 
			
		||||
    2 short tail hex> ;
 | 
			
		||||
 | 
			
		||||
: (PCL-parse-line) ( line -- code-unicode )
 | 
			
		||||
    (PCL-split-column)
 | 
			
		||||
    [ (PCL-parse-hex) ] map ;
 | 
			
		||||
: parse-line ( line -- code-unicode )
 | 
			
		||||
    split-column [ parse-hex ] map ;
 | 
			
		||||
 | 
			
		||||
: (process-codetable-lines) ( lines -- assoc )
 | 
			
		||||
    (PCL-drop-comments)
 | 
			
		||||
    [ (PCL-parse-line) ] map ;
 | 
			
		||||
: process-codetable-lines ( lines -- assoc )
 | 
			
		||||
    drop-comments [ parse-line ] map ; 
 | 
			
		||||
 | 
			
		||||
! convert cp949 <> unicode
 | 
			
		||||
 | 
			
		||||
! convert cp949 <-> unicode
 | 
			
		||||
MEMO: cp949>unicode-table ( -- hashtable )
 | 
			
		||||
    cp949.txt-lines process-codetable-lines >hashtable ;
 | 
			
		||||
 | 
			
		||||
: (cp949.txt>alist) ( -- alist )
 | 
			
		||||
    (cp949.txt-lines) (process-codetable-lines) ;
 | 
			
		||||
MEMO: unicode>cp949-table ( -- hashtable )
 | 
			
		||||
    cp949>unicode-table [ swap ] assoc-map ;
 | 
			
		||||
 | 
			
		||||
: (make-cp949->unicode-table) ( alist -- h )
 | 
			
		||||
    >hashtable ;
 | 
			
		||||
unicode>cp949-table drop
 | 
			
		||||
 | 
			
		||||
: (make-unicode->cp949-table) ( alist -- h )
 | 
			
		||||
    [ reverse ] map >hashtable ;
 | 
			
		||||
: cp949>unicode ( b -- u )
 | 
			
		||||
    cp949>unicode-table at ;
 | 
			
		||||
 | 
			
		||||
VALUE: cp949->unicode-table
 | 
			
		||||
VALUE: unicode->cp949-table
 | 
			
		||||
: unicode>cp949 ( u -- b )
 | 
			
		||||
    unicode>cp949-table at ;
 | 
			
		||||
 | 
			
		||||
(cp949.txt>alist) dup
 | 
			
		||||
(make-cp949->unicode-table) to: cp949->unicode-table
 | 
			
		||||
(make-unicode->cp949-table) to: unicode->cp949-table
 | 
			
		||||
: cp949-1st? ( n -- ? )
 | 
			
		||||
    dup [ HEX: 81 HEX: fe between? ] when ;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
MEMO: (cp949->unicode) ( b -- u )
 | 
			
		||||
    cp949->unicode-table at ;
 | 
			
		||||
 | 
			
		||||
MEMO: (unicode->cp949) ( u -- b )
 | 
			
		||||
    unicode->cp949-table at ;
 | 
			
		||||
 | 
			
		||||
:: (2b->1mb) ( c1 c2 -- mb )
 | 
			
		||||
    c1 8 shift c2 + ;
 | 
			
		||||
 | 
			
		||||
:: (1mb->1st) ( mb -- c1 )
 | 
			
		||||
    mb HEX: ff00 bitand -8 shift ;
 | 
			
		||||
 | 
			
		||||
:: (1mb->2nd) ( mb -- c2 )
 | 
			
		||||
    mb HEX: ff bitand ;
 | 
			
		||||
 | 
			
		||||
:: (1mb->2b) ( mb -- c1 c2 )
 | 
			
		||||
    mb (1mb->1st)
 | 
			
		||||
    mb (1mb->2nd) ;
 | 
			
		||||
 | 
			
		||||
: (cp949-1st?) ( n -- ? )
 | 
			
		||||
    dup f = not
 | 
			
		||||
    [ HEX: 81 HEX: fe between? ] when ;
 | 
			
		||||
 | 
			
		||||
: (1byte-unicode?) ( n -- ? )
 | 
			
		||||
: byte? ( n -- ? )
 | 
			
		||||
    0 HEX: ff between? ;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
M:: cp949 encode-char ( char stream encoding -- )
 | 
			
		||||
    char (unicode->cp949) (1byte-unicode?)
 | 
			
		||||
        [ char 1byte-array
 | 
			
		||||
        stream stream-write ]
 | 
			
		||||
    [ char (unicode->cp949)
 | 
			
		||||
        (1mb->2b) 2byte-array
 | 
			
		||||
        stream stream-write ]
 | 
			
		||||
        if ;
 | 
			
		||||
    char unicode>cp949 byte?
 | 
			
		||||
    [ char 1byte-array stream stream-write ] [
 | 
			
		||||
        char unicode>cp949
 | 
			
		||||
        h>b/b swap 2byte-array
 | 
			
		||||
        stream stream-write
 | 
			
		||||
    ] if ;
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
: (eof?) ( n -- ? ) 0 = ;
 | 
			
		||||
 | 
			
		||||
: (decode-char-step2) ( c stream -- char/f )
 | 
			
		||||
    stream-read1 (2b->1mb) (cp949->unicode) ;
 | 
			
		||||
: decode-char-step2 ( c stream -- char )
 | 
			
		||||
    stream-read1
 | 
			
		||||
    [ 2byte-array be> cp949>unicode ]
 | 
			
		||||
    [ drop replacement-char ] if* ;
 | 
			
		||||
 | 
			
		||||
M:: cp949 decode-char ( stream encoding -- char/f )
 | 
			
		||||
    stream stream-read1
 | 
			
		||||
    {
 | 
			
		||||
        { [ dup f = ] [ drop f ] }
 | 
			
		||||
        { [ dup (eof?) ] [ drop replacement-char ] }
 | 
			
		||||
        { [ dup (cp949-1st?) ] [ stream (decode-char-step2) ] }
 | 
			
		||||
        { [ dup not ] [ drop f ] }
 | 
			
		||||
        { [ dup cp949-1st? ] [ stream decode-char-step2 ] }
 | 
			
		||||
        [ ]
 | 
			
		||||
    } cond ;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
! TODO: <encoder>
 | 
			
		||||
 | 
			
		||||
! TODO: <decoder>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
! EOF
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue