initial 'io.encodings.korean' commit

2009-02-14 10:42:31 +09:00 · 2009-02-14 10:42:31 +09:00 · eef49a82b5
parent f3db8fdc4a
commit eef49a82b5
7 changed files with 17510 additions and 0 deletions
--- a/basis/io/encodings/korean/authors.txt
+++ b/basis/io/encodings/korean/authors.txt
@ -0,0 +1 @@
+Yun, Jonghyouk
--- a/basis/io/encodings/korean/data/cp949.txt
+++ b/basis/io/encodings/korean/data/cp949.txt
--- a/basis/io/encodings/korean/korean-docs.factor
+++ b/basis/io/encodings/korean/korean-docs.factor
--- a/basis/io/encodings/korean/korean-tests.factor
+++ b/basis/io/encodings/korean/korean-tests.factor
@ -0,0 +1,58 @@
+! Copyright (C) 2009 Yun, Jonghyouk.
+! See http://factorcode.org/license.txt for BSD license.
+USING: arrays byte-arrays io io.encodings io.encodings.korean
+io.encodings.korean.private io.encodings.string io.streams.string
+kernel locals multiline namespaces sequences strings tools.test ;
+IN: io.encodings.korean.tests
+
+
+
+! convert cp949 <-> unicode
+
+[ f ] [ HEX: 80 (cp949->unicode) ] unit-test
+[ f ] [ HEX: ff (cp949->unicode) ] unit-test
+[ HEX: ac02 ] [ HEX: 8141 (cp949->unicode) ] unit-test
+[ HEX: 7f ] [ HEX: 7f (cp949->unicode) ] unit-test
+[ HEX: c724 ] [ HEX: c0b1 (cp949->unicode) ] unit-test
+
+[ HEX: 8141 ] [ HEX: ac02 (unicode->cp949) ] unit-test
+[ HEX: 7f ] [ HEX: 7f (unicode->cp949) ] unit-test
+[ HEX: c0b1 ] [ HEX: c724 (unicode->cp949) ] unit-test
+
+
+! byte manip.
+[ HEX: beaf ] [ HEX: be HEX: af (2b->1mb) ] unit-test
+[ HEX: be ] [ HEX: beaf (1mb->1st) ] unit-test
+[ HEX: af ] [ HEX: beaf (1mb->2nd) ] unit-test
+[ HEX: be HEX: af ] [ HEX: beaf (1mb->2b) ] unit-test
+
+
+!
+: (t-phrase-unicode) ( -- s )
+    "\u00b3d9\u00d574\u00bb3c\u00acfc \u00bc31\u00b450\u00c0b0\u00c774!" ;
+
+: (t-phrase-cp949) ( -- s )
+    {
+        HEX: b5 HEX: bf HEX: c7 HEX: d8
+        HEX: b9 HEX: b0 HEX: b0 HEX: fa
+        HEX: 20 HEX: b9 HEX: e9 HEX: b5
+        HEX: ce HEX: bb HEX: ea HEX: c0
+        HEX: cc HEX: 21
+    } ;
+
+: (t-phrase-unicode->cp949) ( -- s )
+    (t-phrase-unicode) cp949 encode ;
+
+: (t-phrase-cp949->unicode) ( -- s )
+    (t-phrase-cp949) cp949 decode ;
+
+
+[ t ] [ (t-phrase-unicode->cp949) >array (t-phrase-cp949) = ] unit-test
+
+[ t ]  [ (t-phrase-cp949->unicode) (t-phrase-unicode) = ] unit-test
+
+
+
+
+
+! EOF
--- a/basis/io/encodings/korean/korean.factor
+++ b/basis/io/encodings/korean/korean.factor
@ -0,0 +1,127 @@
+! Copyright (C) 2009 Yun, Jonghyouk.
+! See http://factorcode.org/license.txt for BSD license.
+USING: assocs byte-arrays combinators io io.encodings
+io.encodings.ascii io.encodings.iana io.files kernel locals math
+math.order math.parser memoize multiline sequences splitting
+values hashtables ;
+IN: io.encodings.korean
+
+
+SINGLETON: cp949
+
+ALIAS: ms949 cp949
+ALIAS: euc-kr cp949
+ALIAS: euckr cp949
+
+cp949 "EUC-KR" register-encoding
+
+
+
+<PRIVATE
+
+! parse cp949.txt -> table
+
+: (cp949.txt-lines) ( -- seq )
+    ! "cp949.txt" from ...
+    ! <http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT>
+    "resource:work/io/encodings/korean/data/cp949.txt"
+    ascii file-lines ;
+
+: (PCL-drop-comments) ( seq -- newseq )
+    [ "#" split1 drop ] map harvest ;
+
+: (PCL-split-column) ( line -- columns )
+    "\t" split 2 head ;
+
+: (PCL-parse-hex) ( s -- n )
+    2 short tail hex> ;
+
+: (PCL-parse-line) ( line -- code-unicode )
+    (PCL-split-column)
+    [ (PCL-parse-hex) ] map ;
+
+: (process-codetable-lines) ( lines -- assoc )
+    (PCL-drop-comments)
+    [ (PCL-parse-line) ] map ;
+
+
+! convert cp949 <-> unicode
+
+: (cp949.txt>alist) ( -- alist )
+    (cp949.txt-lines) (process-codetable-lines) ;
+
+: (make-cp949->unicode-table) ( alist -- h )
+    >hashtable ;
+
+: (make-unicode->cp949-table) ( alist -- h )
+    [ reverse ] map >hashtable ;
+
+VALUE: cp949->unicode-table
+VALUE: unicode->cp949-table
+
+(cp949.txt>alist) dup
+(make-cp949->unicode-table) to: cp949->unicode-table
+(make-unicode->cp949-table) to: unicode->cp949-table
+
+
+MEMO: (cp949->unicode) ( b -- u )
+    cp949->unicode-table at ;
+
+MEMO: (unicode->cp949) ( u -- b )
+    unicode->cp949-table at ;
+
+:: (2b->1mb) ( c1 c2 -- mb )
+    c1 8 shift c2 + ;
+
+:: (1mb->1st) ( mb -- c1 )
+    mb HEX: ff00 bitand -8 shift ;
+
+:: (1mb->2nd) ( mb -- c2 )
+    mb HEX: ff bitand ;
+
+:: (1mb->2b) ( mb -- c1 c2 )
+    mb (1mb->1st)
+    mb (1mb->2nd) ;
+
+: (cp949-1st?) ( n -- ? )
+    dup f = not
+    [ HEX: 81 HEX: fe between? ] when ;
+
+: (1byte-unicode?) ( n -- ? )
+    0 HEX: ff between? ;
+
+
+
+M:: cp949 encode-char ( char stream encoding -- )
+    char (unicode->cp949) (1byte-unicode?)
+        [ char 1byte-array
+        stream stream-write ]
+    [ char (unicode->cp949)
+        (1mb->2b) 2byte-array
+        stream stream-write ]
+        if ;
+
+    
+: (eof?) ( n -- ? ) 0 = ;
+
+: (decode-char-step2) ( c stream -- char/f )
+    stream-read1 (2b->1mb) (cp949->unicode) ;
+
+M:: cp949 decode-char ( stream encoding -- char/f )
+    stream stream-read1
+    {
+        { [ dup f = ] [ drop f ] }
+        { [ dup (eof?) ] [ drop replacement-char ] }
+        { [ dup (cp949-1st?) ] [ stream (decode-char-step2) ] }
+        [ ]
+    } cond ;
+
+
+! TODO: <encoder>
+
+! TODO: <decoder>
+
+
+
+
+! EOF
--- a/basis/io/encodings/korean/summary.txt
+++ b/basis/io/encodings/korean/summary.txt
@ -0,0 +1 @@
+Korean text encodings
--- a/basis/io/encodings/korean/tags.txt
+++ b/basis/io/encodings/korean/tags.txt
@ -0,0 +1 @@
+text