Big5 encoding; simple-flat-file vocab abstracts parsing out; EUC code factored out; Asian encoding vocabs renamed

Merge branch 'master' of git://github.com/ageldama/factor Conflicts: basis/io/encodings/korean/korean-docs.factor basis/io/encodings/korean/korean-tests.factor basis/io/encodings/korean/korean.factor
2009-03-02 23:19:06 -06:00 · 2009-03-02 23:19:06 -06:00 · 8c61791364
parent 0bff96a990 43fc1d7029
commit 8c61791364
33 changed files with 55993 additions and 54 deletions
--- a/basis/io/encodings/8-bit/8-bit.factor
+++ b/basis/io/encodings/8-bit/8-bit.factor
@ -4,7 +4,7 @@ USING: math.parser arrays io.encodings sequences kernel assocs
 hashtables io.encodings.ascii generic parser classes.tuple words
 words.symbol io io.files splitting namespaces math
 compiler.units accessors classes.singleton classes.mixin
-io.encodings.iana fry ;
+io.encodings.iana fry simple-flat-file ;
 IN: io.encodings.8-bit

 <PRIVATE
@ -35,36 +35,22 @@ CONSTANT: mappings {
 : encoding-file ( file-name -- stream )
    "vocab:io/encodings/8-bit/" ".TXT" surround ;

-: process-contents ( lines -- assoc )
-    [ "#" split1 drop ] map harvest
-    [ "\t" split 2 head [ 2 short tail hex> ] map ] map ;
-
-: byte>ch ( assoc -- array )
-    256 replacement-char <array>
-    [ '[ swap _ set-nth ] assoc-each ] keep ;
-
-: ch>byte ( assoc -- newassoc )
-    [ swap ] assoc-map >hashtable ;
-
-: parse-file ( path -- byte>ch ch>byte )
-    ascii file-lines process-contents
-    [ byte>ch ] [ ch>byte ] bi ;
-
 SYMBOL: 8-bit-encodings

-TUPLE: 8-bit decode encode ;
+TUPLE: 8-bit biassoc ;

 : encode-8-bit ( char stream assoc -- )
-    swapd at*
-    [ swap stream-write1 ] [ nip encode-error ] if ; inline
+    swapd value-at
+    [ swap stream-write1 ] [ encode-error ] if* ; inline

-M: 8-bit encode-char encode>> encode-8-bit ;
+M: 8-bit encode-char biassoc>> encode-8-bit ;

-: decode-8-bit ( stream array -- char/f )
-    swap stream-read1 dup
-    [ swap nth [ replacement-char ] unless* ] [ 2drop f ] if ; inline
+: decode-8-bit ( stream assoc -- char/f )
+    swap stream-read1
+    [ swap at [ replacement-char ] unless* ]
+    [ drop f ] if* ; inline

-M: 8-bit decode-char decode>> decode-8-bit ;
+M: 8-bit decode-char biassoc>> decode-8-bit ;

 MIXIN: 8-bit-encoding

@ -87,7 +73,7 @@ PRIVATE>
        first3
        [ create-encoding ]
        [ dupd register-encoding ]
-        [ encoding-file parse-file 8-bit boa ]
+        [ encoding-file flat-file>biassoc 8-bit boa ]
        tri*
    ] H{ } map>assoc
    8-bit-encodings set-global
--- a/basis/io/encodings/big5/CP950.TXT
+++ b/basis/io/encodings/big5/CP950.TXT
--- a/basis/io/encodings/big5/big5.factor
+++ b/basis/io/encodings/big5/big5.factor
@ -0,0 +1,9 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: io.encodings.iana io.encodings.euc ;
+IN: io.encodings.big5
+
+EUC: big5 "vocab:io/encodings/big5/CP950.TXT" ! extension case must match the bundled CP950.TXT
+
+big5 "Big5" register-encoding
+
--- a/basis/io/encodings/japanese/authors.txt
+++ b/basis/io/encodings/japanese/authors.txt
--- a/basis/io/encodings/euc-jp/euc-jp-2000-std.txt
+++ b/basis/io/encodings/euc-jp/euc-jp-2000-std.txt
--- a/basis/io/encodings/euc-jp/euc-jp.factor
+++ b/basis/io/encodings/euc-jp/euc-jp.factor
@ -0,0 +1,8 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: io.encodings.euc io.encodings.iana ;
+IN: io.encodings.euc-jp
+
+EUC: euc-jp "vocab:io/encodings/euc-jp/euc-jp-2000-std.txt"
+
+euc-jp "Extended_UNIX_Code_Packed_Format_for_Japanese" register-encoding
--- a/basis/io/encodings/euc-jp/summary.txt
+++ b/basis/io/encodings/euc-jp/summary.txt
@ -0,0 +1 @@
+EUC-JP text encoding
--- a/basis/io/encodings/gb18030/authors.txt
+++ b/basis/io/encodings/gb18030/authors.txt
@ -0,0 +1 @@
+Daniel Ehrenberg
--- a/basis/io/encodings/gb18030/gb-18030-2000.xml
+++ b/basis/io/encodings/gb18030/gb-18030-2000.xml
--- a/basis/io/encodings/gb18030/gb18030-docs.factor
+++ b/basis/io/encodings/gb18030/gb18030-docs.factor
@ -1,13 +1,13 @@
 ! Copyright (C) 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
 USING: help.syntax help.markup ;
-IN: io.encodings.chinese
+IN: io.encodings.gb18030

-ARTICLE: "io.encodings.chinese" "Chinese text encodings"
-"The " { $vocab-link "io.encodings.chinese" } " vocabulary implements encodings used for Chinese text besides the standard UTF encodings for Unicode strings."
+ARTICLE: "io.encodings.gb18030" "GB 18030"
+"The " { $vocab-link "io.encodings.gb18030" } " vocabulary implements GB18030, a commonly used encoding for Chinese text besides the standard UTF encodings for Unicode strings."
 { $subsection gb18030 } ;

-ABOUT: "io.encodings.chinese"
+ABOUT: "io.encodings.gb18030"

 HELP: gb18030
 { $class-description "The encoding descriptor for GB 18030, a Chinese national standard for text encoding. GB 18030 consists of a unique encoding for each Unicode code point, and for this reason has been described as a UTF. It is backwards compatible with the earlier encodings GB 2312 and GBK." }
--- a/basis/io/encodings/gb18030/gb18030-tests.factor
+++ b/basis/io/encodings/gb18030/gb18030-tests.factor
@ -1,7 +1,7 @@
 ! Copyright (C) 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
-USING: io.encodings.chinese io.encodings.string strings tools.test arrays ;
-IN: io.encodings.chinese.tests
+USING: io.encodings.gb18030 io.encodings.string strings tools.test arrays ;
+IN: io.encodings.gb18030.tests

 [ "hello" ] [ "hello" gb18030 encode >string ] unit-test
 [ "hello" ] [ "hello" gb18030 decode ] unit-test
--- a/basis/io/encodings/gb18030/gb18030.factor
+++ b/basis/io/encodings/gb18030/gb18030.factor
@ -4,7 +4,7 @@ USING: xml xml.data kernel io io.encodings interval-maps splitting fry
 math.parser sequences combinators assocs locals accessors math arrays
 byte-arrays values io.encodings.ascii ascii io.files biassocs
 math.order combinators.short-circuit io.binary io.encodings.iana ;
-IN: io.encodings.chinese
+IN: io.encodings.gb18030

 SINGLETON: gb18030

@ -80,7 +80,7 @@ VALUE: gb>u
 VALUE: u>gb
 VALUE: mapping

-"vocab:io/encodings/chinese/gb-18030-2000.xml"
+"vocab:io/encodings/gb18030/gb-18030-2000.xml"
 ascii <file-reader> xml>gb-data
 [ ranges-u>gb to: u>gb ] [ ranges-gb>u to: gb>u ] bi
 >biassoc to: mapping
--- a/basis/io/encodings/gb18030/summary.txt
+++ b/basis/io/encodings/gb18030/summary.txt
--- a/basis/io/encodings/johab/authors.txt
+++ b/basis/io/encodings/johab/authors.txt
@ -0,0 +1 @@
+Daniel Ehrenberg
--- a/basis/io/encodings/johab/johab-docs.factor
+++ b/basis/io/encodings/johab/johab-docs.factor
@ -0,0 +1,9 @@
+! Copyright (C) 2009 Yun, Jonghyouk.
+! See http://factorcode.org/license.txt for BSD license.
+USING: help.syntax help.markup ;
+IN: io.encodings.johab
+
+ABOUT: johab
+
+HELP: johab
+{ $class-description "Korean Johab encoding (KSC5601-1992). This encoding is not commonly used anymore." } ;
--- a/basis/io/encodings/johab/johab-tests.factor
+++ b/basis/io/encodings/johab/johab-tests.factor
@ -0,0 +1,34 @@
+! Copyright (C) 2009 Yun, Jonghyouk.
+! See http://factorcode.org/license.txt for BSD license.
+USING: arrays byte-arrays io io.encodings io.encodings.euc-kr
+io.encodings.johab assocs io.encodings.string io.streams.string
+io.encodings.euc.private words kernel locals multiline namespaces
+sequences strings tools.test ;
+IN: io.encodings.johab.tests
+
+! Look up a single Johab code in the table stored on the johab word.
+: johab>unicode ( ch -- ch/f )
+    johab euc-table word-prop at ;
+
+! Reverse lookup: Unicode code point to Johab code.
+: unicode>johab ( ch -- ch/f )
+    johab euc-table word-prop value-at ;
+
+! johab encodings
+[ HEX: 20 ] [ HEX: 20 johab>unicode ] unit-test
+[ HEX: 3133 ] [ HEX: 8444 johab>unicode ] unit-test
+[ HEX: 8A5D ] [ HEX: AD4F unicode>johab ] unit-test
+
+
+! Sample phrase as Johab bytes: eight two-byte Hangul syllables,
+! an ASCII space after the fourth, and a trailing "!".
+: phrase-johab ( -- s )
+    B{
+        149 183 208 129 162 137 137 193 32 164 130 150 129 172 101
+        183 161 33
+    } ;
+
+! The same phrase as Unicode code points, one per Johab code above.
+: phrase-unicode ( -- s )
+    {
+        HEX: B3D9 HEX: D574 HEX: BB3C HEX: ACFC HEX: 20
+        HEX: BC31 HEX: B450 HEX: C0B0 HEX: C774 HEX: 21
+    } >string ;
+
+: phrase-johab>unicode ( -- s )
+    phrase-johab johab decode ;
+
+: phrase-unicode>johab ( -- s )
+    phrase-unicode johab encode ;
+
+! Round trip in both directions.
+[ t ] [ phrase-johab>unicode phrase-unicode = ] unit-test
+[ t ] [ phrase-unicode>johab phrase-johab = ] unit-test
+
--- a/basis/io/encodings/johab/johab.factor
+++ b/basis/io/encodings/johab/johab.factor
@ -0,0 +1,7 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USE: io.encodings.euc
+IN: io.encodings.johab
+
+EUC: johab "vocab:io/encodings/johab/johab.txt" 
+
--- a/basis/io/encodings/johab/johab.txt
+++ b/basis/io/encodings/johab/johab.txt
--- a/basis/io/encodings/johab/summary.txt
+++ b/basis/io/encodings/johab/summary.txt
@ -0,0 +1 @@
+Johab Korean text encoding
--- a/basis/io/encodings/shift-jis/CP932.txt
+++ b/basis/io/encodings/shift-jis/CP932.txt
--- a/basis/io/encodings/shift-jis/authors.txt
+++ b/basis/io/encodings/shift-jis/authors.txt
@ -0,0 +1 @@
+Daniel Ehrenberg
--- a/basis/io/encodings/shift-jis/euc-0201.txt
+++ b/basis/io/encodings/shift-jis/euc-0201.txt
@ -0,0 +1,208 @@
+#
+#	Name:             JIS X 0201 (1976) to Unicode 1.1 Table
+#	Unicode version:  1.1
+#	Table version:    0.9
+#	Table format:     Format A
+#	Date:             8 March 1994
+#
+#	Copyright (c) 1991-1994 Unicode, Inc.  All Rights reserved.
+#
+#	This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
+#	No claims are made as to fitness for any particular purpose.  No
+#	warranties of any kind are expressed or implied.  The recipient
+#	agrees to determine applicability of information provided.  If this
+#	file has been provided on magnetic media by Unicode, Inc., the sole
+#	remedy for any claim will be exchange of defective media within 90
+#	days of receipt.
+#
+#	Recipient is granted the right to make copies in any form for
+#	internal distribution and to freely use the information supplied
+#	in the creation of products supporting Unicode.  Unicode, Inc.
+#	specifically excludes the right to re-distribute this file directly
+#	to third parties or other organizations whether for profit or not.
+#
+#	General notes:
+#
+#
+# This table contains one set of mappings from JIS X 0201 into Unicode.
+# Note that these data are *possible* mappings only and may not be the
+# same as those used by actual products, nor may they be the best suited
+# for all uses.  For more information on the mappings between various code
+# pages incorporating the repertoire of JIS X 0201 and Unicode, consult the
+# VENDORS mapping data.  Normative information on the mapping between
+# JIS X 0201 and Unicode may be found in the Unihan.txt file in the
+# latest Unicode Character Database.
+#
+# If you have carefully considered the fact that the mappings in
+# this table are only one possible set of mappings between JIS X 0201 and
+# Unicode and have no normative status, but still feel that you
+# have located an error in the table that requires fixing, you may
+# report any such error to errata@unicode.org.
+#
+#
+#	Format:  Three tab-separated columns
+#		Column #1 is the shift JIS code (in hex as 0xXX)
+#		Column #2 is the Unicode (in hex as 0xXXXX)
+#		Column #3 the Unicode (ISO 10646) name (follows a comment sign)
+#
+#	The entries are in JIS order
+#
+#
+0x20	0x0020	# SPACE
+0x21	0x0021	# EXCLAMATION MARK
+0x22	0x0022	# QUOTATION MARK
+0x23	0x0023	# NUMBER SIGN
+0x24	0x0024	# DOLLAR SIGN
+0x25	0x0025	# PERCENT SIGN
+0x26	0x0026	# AMPERSAND
+0x27	0x0027	# APOSTROPHE
+0x28	0x0028	# LEFT PARENTHESIS
+0x29	0x0029	# RIGHT PARENTHESIS
+0x2A	0x002A	# ASTERISK
+0x2B	0x002B	# PLUS SIGN
+0x2C	0x002C	# COMMA
+0x2D	0x002D	# HYPHEN-MINUS
+0x2E	0x002E	# FULL STOP
+0x2F	0x002F	# SOLIDUS
+0x30	0x0030	# DIGIT ZERO
+0x31	0x0031	# DIGIT ONE
+0x32	0x0032	# DIGIT TWO
+0x33	0x0033	# DIGIT THREE
+0x34	0x0034	# DIGIT FOUR
+0x35	0x0035	# DIGIT FIVE
+0x36	0x0036	# DIGIT SIX
+0x37	0x0037	# DIGIT SEVEN
+0x38	0x0038	# DIGIT EIGHT
+0x39	0x0039	# DIGIT NINE
+0x3A	0x003A	# COLON
+0x3B	0x003B	# SEMICOLON
+0x3C	0x003C	# LESS-THAN SIGN
+0x3D	0x003D	# EQUALS SIGN
+0x3E	0x003E	# GREATER-THAN SIGN
+0x3F	0x003F	# QUESTION MARK
+0x40	0x0040	# COMMERCIAL AT
+0x41	0x0041	# LATIN CAPITAL LETTER A
+0x42	0x0042	# LATIN CAPITAL LETTER B
+0x43	0x0043	# LATIN CAPITAL LETTER C
+0x44	0x0044	# LATIN CAPITAL LETTER D
+0x45	0x0045	# LATIN CAPITAL LETTER E
+0x46	0x0046	# LATIN CAPITAL LETTER F
+0x47	0x0047	# LATIN CAPITAL LETTER G
+0x48	0x0048	# LATIN CAPITAL LETTER H
+0x49	0x0049	# LATIN CAPITAL LETTER I
+0x4A	0x004A	# LATIN CAPITAL LETTER J
+0x4B	0x004B	# LATIN CAPITAL LETTER K
+0x4C	0x004C	# LATIN CAPITAL LETTER L
+0x4D	0x004D	# LATIN CAPITAL LETTER M
+0x4E	0x004E	# LATIN CAPITAL LETTER N
+0x4F	0x004F	# LATIN CAPITAL LETTER O
+0x50	0x0050	# LATIN CAPITAL LETTER P
+0x51	0x0051	# LATIN CAPITAL LETTER Q
+0x52	0x0052	# LATIN CAPITAL LETTER R
+0x53	0x0053	# LATIN CAPITAL LETTER S
+0x54	0x0054	# LATIN CAPITAL LETTER T
+0x55	0x0055	# LATIN CAPITAL LETTER U
+0x56	0x0056	# LATIN CAPITAL LETTER V
+0x57	0x0057	# LATIN CAPITAL LETTER W
+0x58	0x0058	# LATIN CAPITAL LETTER X
+0x59	0x0059	# LATIN CAPITAL LETTER Y
+0x5A	0x005A	# LATIN CAPITAL LETTER Z
+0x5B	0x005B	# LEFT SQUARE BRACKET
+0x5C	0x00A5	# YEN SIGN
+0x5D	0x005D	# RIGHT SQUARE BRACKET
+0x5E	0x005E	# CIRCUMFLEX ACCENT
+0x5F	0x005F	# LOW LINE
+0x60	0x0060	# GRAVE ACCENT
+0x61	0x0061	# LATIN SMALL LETTER A
+0x62	0x0062	# LATIN SMALL LETTER B
+0x63	0x0063	# LATIN SMALL LETTER C
+0x64	0x0064	# LATIN SMALL LETTER D
+0x65	0x0065	# LATIN SMALL LETTER E
+0x66	0x0066	# LATIN SMALL LETTER F
+0x67	0x0067	# LATIN SMALL LETTER G
+0x68	0x0068	# LATIN SMALL LETTER H
+0x69	0x0069	# LATIN SMALL LETTER I
+0x6A	0x006A	# LATIN SMALL LETTER J
+0x6B	0x006B	# LATIN SMALL LETTER K
+0x6C	0x006C	# LATIN SMALL LETTER L
+0x6D	0x006D	# LATIN SMALL LETTER M
+0x6E	0x006E	# LATIN SMALL LETTER N
+0x6F	0x006F	# LATIN SMALL LETTER O
+0x70	0x0070	# LATIN SMALL LETTER P
+0x71	0x0071	# LATIN SMALL LETTER Q
+0x72	0x0072	# LATIN SMALL LETTER R
+0x73	0x0073	# LATIN SMALL LETTER S
+0x74	0x0074	# LATIN SMALL LETTER T
+0x75	0x0075	# LATIN SMALL LETTER U
+0x76	0x0076	# LATIN SMALL LETTER V
+0x77	0x0077	# LATIN SMALL LETTER W
+0x78	0x0078	# LATIN SMALL LETTER X
+0x79	0x0079	# LATIN SMALL LETTER Y
+0x7A	0x007A	# LATIN SMALL LETTER Z
+0x7B	0x007B	# LEFT CURLY BRACKET
+0x7C	0x007C	# VERTICAL LINE
+0x7D	0x007D	# RIGHT CURLY BRACKET
+0x7E	0x203E	# OVERLINE
+0xA1	0xFF61	# HALFWIDTH IDEOGRAPHIC FULL STOP
+0xA2	0xFF62	# HALFWIDTH LEFT CORNER BRACKET
+0xA3	0xFF63	# HALFWIDTH RIGHT CORNER BRACKET
+0xA4	0xFF64	# HALFWIDTH IDEOGRAPHIC COMMA
+0xA5	0xFF65	# HALFWIDTH KATAKANA MIDDLE DOT
+0xA6	0xFF66	# HALFWIDTH KATAKANA LETTER WO
+0xA7	0xFF67	# HALFWIDTH KATAKANA LETTER SMALL A
+0xA8	0xFF68	# HALFWIDTH KATAKANA LETTER SMALL I
+0xA9	0xFF69	# HALFWIDTH KATAKANA LETTER SMALL U
+0xAA	0xFF6A	# HALFWIDTH KATAKANA LETTER SMALL E
+0xAB	0xFF6B	# HALFWIDTH KATAKANA LETTER SMALL O
+0xAC	0xFF6C	# HALFWIDTH KATAKANA LETTER SMALL YA
+0xAD	0xFF6D	# HALFWIDTH KATAKANA LETTER SMALL YU
+0xAE	0xFF6E	# HALFWIDTH KATAKANA LETTER SMALL YO
+0xAF	0xFF6F	# HALFWIDTH KATAKANA LETTER SMALL TU
+0xB0	0xFF70	# HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
+0xB1	0xFF71	# HALFWIDTH KATAKANA LETTER A
+0xB2	0xFF72	# HALFWIDTH KATAKANA LETTER I
+0xB3	0xFF73	# HALFWIDTH KATAKANA LETTER U
+0xB4	0xFF74	# HALFWIDTH KATAKANA LETTER E
+0xB5	0xFF75	# HALFWIDTH KATAKANA LETTER O
+0xB6	0xFF76	# HALFWIDTH KATAKANA LETTER KA
+0xB7	0xFF77	# HALFWIDTH KATAKANA LETTER KI
+0xB8	0xFF78	# HALFWIDTH KATAKANA LETTER KU
+0xB9	0xFF79	# HALFWIDTH KATAKANA LETTER KE
+0xBA	0xFF7A	# HALFWIDTH KATAKANA LETTER KO
+0xBB	0xFF7B	# HALFWIDTH KATAKANA LETTER SA
+0xBC	0xFF7C	# HALFWIDTH KATAKANA LETTER SI
+0xBD	0xFF7D	# HALFWIDTH KATAKANA LETTER SU
+0xBE	0xFF7E	# HALFWIDTH KATAKANA LETTER SE
+0xBF	0xFF7F	# HALFWIDTH KATAKANA LETTER SO
+0xC0	0xFF80	# HALFWIDTH KATAKANA LETTER TA
+0xC1	0xFF81	# HALFWIDTH KATAKANA LETTER TI
+0xC2	0xFF82	# HALFWIDTH KATAKANA LETTER TU
+0xC3	0xFF83	# HALFWIDTH KATAKANA LETTER TE
+0xC4	0xFF84	# HALFWIDTH KATAKANA LETTER TO
+0xC5	0xFF85	# HALFWIDTH KATAKANA LETTER NA
+0xC6	0xFF86	# HALFWIDTH KATAKANA LETTER NI
+0xC7	0xFF87	# HALFWIDTH KATAKANA LETTER NU
+0xC8	0xFF88	# HALFWIDTH KATAKANA LETTER NE
+0xC9	0xFF89	# HALFWIDTH KATAKANA LETTER NO
+0xCA	0xFF8A	# HALFWIDTH KATAKANA LETTER HA
+0xCB	0xFF8B	# HALFWIDTH KATAKANA LETTER HI
+0xCC	0xFF8C	# HALFWIDTH KATAKANA LETTER HU
+0xCD	0xFF8D	# HALFWIDTH KATAKANA LETTER HE
+0xCE	0xFF8E	# HALFWIDTH KATAKANA LETTER HO
+0xCF	0xFF8F	# HALFWIDTH KATAKANA LETTER MA
+0xD0	0xFF90	# HALFWIDTH KATAKANA LETTER MI
+0xD1	0xFF91	# HALFWIDTH KATAKANA LETTER MU
+0xD2	0xFF92	# HALFWIDTH KATAKANA LETTER ME
+0xD3	0xFF93	# HALFWIDTH KATAKANA LETTER MO
+0xD4	0xFF94	# HALFWIDTH KATAKANA LETTER YA
+0xD5	0xFF95	# HALFWIDTH KATAKANA LETTER YU
+0xD6	0xFF96	# HALFWIDTH KATAKANA LETTER YO
+0xD7	0xFF97	# HALFWIDTH KATAKANA LETTER RA
+0xD8	0xFF98	# HALFWIDTH KATAKANA LETTER RI
+0xD9	0xFF99	# HALFWIDTH KATAKANA LETTER RU
+0xDA	0xFF9A	# HALFWIDTH KATAKANA LETTER RE
+0xDB	0xFF9B	# HALFWIDTH KATAKANA LETTER RO
+0xDC	0xFF9C	# HALFWIDTH KATAKANA LETTER WA
+0xDD	0xFF9D	# HALFWIDTH KATAKANA LETTER N
+0xDE	0xFF9E	# HALFWIDTH KATAKANA VOICED SOUND MARK
+0xDF	0xFF9F	# HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
--- a/basis/io/encodings/shift-jis/euc-0208.txt
+++ b/basis/io/encodings/shift-jis/euc-0208.txt
--- a/basis/io/encodings/shift-jis/euc-0212.txt
+++ b/basis/io/encodings/shift-jis/euc-0212.txt
--- a/basis/io/encodings/shift-jis/shift-jis-docs.factor
+++ b/basis/io/encodings/shift-jis/shift-jis-docs.factor
@ -1,14 +1,14 @@
 ! Copyright (C) 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
 USING: help.markup help.syntax ;
-IN: io.encodings.japanese
+IN: io.encodings.shift-jis

-ARTICLE: "io.encodings.japanese" "Japanese text encodings"
-"Several encodings are used for Japanese text besides the standard UTF encodings for Unicode strings. These are mostly based on the character set defined in the JIS X 208 standard. Current coverage of encodings is incomplete."
+ARTICLE: "io.encodings.shift-jis" "Shift JIS"
+"Shift JIS is a text encoding for Japanese. There are multiple versions, depending on whether the official standard or the modified Microsoft version is required."
 { $subsection shift-jis }
 { $subsection windows-31j } ;

-ABOUT: "io.encodings.japanese"
+ABOUT: "io.encodings.shift-jis"

 HELP: windows-31j
 { $class-description "The encoding descriptor Windows-31J, which is sometimes informally called Shift JIS. This is based on Code Page 932." }
--- a/basis/io/encodings/shift-jis/shift-jis-tests.factor
+++ b/basis/io/encodings/shift-jis/shift-jis-tests.factor
@ -1,7 +1,7 @@
 ! Copyright (C) 2009 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
-USING: io.encodings.japanese tools.test io.encodings.string arrays strings ;
-IN: io.encodings.japanese.tests
+USING: io.encodings.shift-jis tools.test io.encodings.string arrays strings ;
+IN: io.encodings.shift-jis.tests

 [ { CHAR: replacement-character } ] [ { 141 } shift-jis decode >array ] unit-test
 [ "" ] [ "" shift-jis decode >string ] unit-test
--- a/basis/io/encodings/shift-jis/shift-jis.factor
+++ b/basis/io/encodings/shift-jis/shift-jis.factor
@ -3,8 +3,9 @@
 USING: sequences kernel io io.files combinators.short-circuit
 math.order values assocs io.encodings io.binary fry strings math
 io.encodings.ascii arrays byte-arrays accessors splitting
-math.parser biassocs io.encodings.iana ;
-IN: io.encodings.japanese
+math.parser biassocs io.encodings.iana
+locals multiline combinators simple-flat-file ;
+IN: io.encodings.shift-jis

 SINGLETON: shift-jis

@ -28,21 +29,11 @@ M: windows-31j <decoder> drop windows-31j-table <decoder> ;

 TUPLE: jis assoc ;

-: <jis> ( assoc -- jis )
-    [ nip ] assoc-filter
-    >biassoc jis boa ;
-
 : ch>jis ( ch tuple -- jis ) assoc>> value-at [ encode-error ] unless* ;
 : jis>ch ( jis tuple -- string ) assoc>> at replacement-char or ;

-: process-jis ( lines -- assoc )
-    [ "#" split1 drop ] map harvest [
-        "\t" split 2 head
-        [ 2 short tail hex> ] map
-    ] map ;
-
 : make-jis ( filename -- jis )
-    ascii file-lines process-jis <jis> ;
+    flat-file>biassoc [ nip ] assoc-filter jis boa ;
 
 "vocab:io/encodings/shift-jis/CP932.txt"
 make-jis to: windows-31j-table
@ -71,5 +62,3 @@ M: jis decode-char
            [ 2drop replacement-char ] if*
        ] if
    ] [ 2drop f ] if* ;
-
-PRIVATE>
--- a/basis/io/encodings/shift-jis/sjis-0208-1997-std.txt
+++ b/basis/io/encodings/shift-jis/sjis-0208-1997-std.txt
--- a/basis/io/encodings/shift-jis/summary.txt
+++ b/basis/io/encodings/shift-jis/summary.txt
--- a/basis/io/encodings/utf16n/authors.txt
+++ b/basis/io/encodings/utf16n/authors.txt
@ -0,0 +1 @@
+Daniel Ehrenberg
--- a/basis/simple-flat-file/simple-flat-file-docs.factor
+++ b/basis/simple-flat-file/simple-flat-file-docs.factor
@ -0,0 +1,8 @@
+USING: help.syntax help.markup strings ;
+IN: simple-flat-file
+
+ABOUT: "simple-flat-file"
+
+ARTICLE: "simple-flat-file" "Parsing simple flat files"
+"The " { $vocab-link "simple-flat-file" } " vocabulary provides words for loading and parsing simple flat files in a particular format which is common for encoding tasks."
+{ $subsection flat-file>biassoc } ;
--- a/basis/simple-flat-file/simple-flat-file-tests.factor
+++ b/basis/simple-flat-file/simple-flat-file-tests.factor
@ -0,0 +1,23 @@
+! Copyright (C) 2009 Yun, Jonghyouk.
+! See http://factorcode.org/license.txt for BSD license.
+USING: simple-flat-file tools.test memoize assocs ;
+IN: simple-flat-file.tests
+
+! Memoized so the fixture file is only read once for all tests below.
+
+MEMO: <test1> ( -- code-table )
+    "vocab:simple-flat-file/test1.txt" flat-file>biassoc ;
+
+! Each pair checks one mapping forwards (at) and backwards (value-at).
+
+[ 0 ] [ 0 <test1> at ] unit-test
+[ 0 ] [ 0 <test1> value-at ] unit-test
+
+[ 3 ] [ 3 <test1> at ] unit-test
+[ 3 ] [ 3 <test1> value-at ] unit-test
+
+[ HEX: AD2A ] [ HEX: 8253 <test1> at ] unit-test
+[ HEX: 8253 ] [ HEX: AD2A <test1> value-at ] unit-test
+
+[ HEX: AD31 ] [ HEX: 8258 <test1> at ] unit-test
+[ HEX: 8258 ] [ HEX: AD31 <test1> value-at ] unit-test
+
+
--- a/basis/simple-flat-file/simple-flat-file.factor
+++ b/basis/simple-flat-file/simple-flat-file.factor
@ -0,0 +1,23 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: sequences splitting kernel math.parser io.files io.encodings.ascii biassocs ;
+IN: simple-flat-file
+
+! Keep only the text before the first "#" on each line, then
+! discard the lines that end up empty.
+: drop-comments ( seq -- newseq )
+    [ "#" split1 drop ] map harvest ;
+
+! Split a line on tab characters and keep at most the first two
+! fields; any further columns are ignored.
+: split-column ( line -- columns )
+    "\t" split 2 head ;
+
+! Drop the first two characters of the field and parse the rest as
+! hexadecimal. Presumably the dropped characters are a "0x" prefix,
+! as in the mapping tables shipped with the encodings -- nothing
+! here checks that.
+: parse-hex ( s -- n )
+    2 short tail hex> ;
+
+! Turn one data line into a sequence of (up to) two parsed numbers.
+: parse-line ( line -- code-unicode )
+    split-column [ parse-hex ] map ;
+
+! Strip comments and blank lines, then parse every remaining line.
+: process-codetable-lines ( lines -- assoc )
+    drop-comments [ parse-line ] map ; 
+
+! Read the file at filename as ASCII lines and build a biassoc from
+! the parsed pairs, so the table can be queried in both directions.
+: flat-file>biassoc ( filename -- biassoc )
+    ascii file-lines process-codetable-lines >biassoc ;
+