ISO 2022-JP-1 implementation

db4
Daniel Ehrenberg 2009-03-20 16:15:26 -05:00
parent b6e5bac483
commit 9de4083fc0
8 changed files with 13472 additions and 0 deletions

View File

@ -0,0 +1,208 @@
#
# Name: JIS X 0201 (1976) to Unicode 1.1 Table
# Unicode version: 1.1
# Table version: 0.9
# Table format: Format A
# Date: 8 March 1994
#
# Copyright (c) 1991-1994 Unicode, Inc. All Rights reserved.
#
# This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
# No claims are made as to fitness for any particular purpose. No
# warranties of any kind are expressed or implied. The recipient
# agrees to determine applicability of information provided. If this
# file has been provided on magnetic media by Unicode, Inc., the sole
# remedy for any claim will be exchange of defective media within 90
# days of receipt.
#
# Recipient is granted the right to make copies in any form for
# internal distribution and to freely use the information supplied
# in the creation of products supporting Unicode. Unicode, Inc.
# specifically excludes the right to re-distribute this file directly
# to third parties or other organizations whether for profit or not.
#
# General notes:
#
#
# This table contains one set of mappings from JIS X 0201 into Unicode.
# Note that these data are *possible* mappings only and may not be the
# same as those used by actual products, nor may they be the best suited
# for all uses. For more information on the mappings between various code
# pages incorporating the repertoire of JIS X 0201 and Unicode, consult the
# VENDORS mapping data. Normative information on the mapping between
# JIS X 0201 and Unicode may be found in the Unihan.txt file in the
# latest Unicode Character Database.
#
# If you have carefully considered the fact that the mappings in
# this table are only one possible set of mappings between JIS X 0201 and
# Unicode and have no normative status, but still feel that you
# have located an error in the table that requires fixing, you may
# report any such error to errata@unicode.org.
#
#
# Format: Three tab-separated columns
# Column #1 is the shift JIS code (in hex as 0xXX)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 the Unicode (ISO 10646) name (follows a comment sign)
#
# The entries are in JIS order
#
#
0x20 0x0020 # SPACE
0x21 0x0021 # EXCLAMATION MARK
0x22 0x0022 # QUOTATION MARK
0x23 0x0023 # NUMBER SIGN
0x24 0x0024 # DOLLAR SIGN
0x25 0x0025 # PERCENT SIGN
0x26 0x0026 # AMPERSAND
0x27 0x0027 # APOSTROPHE
0x28 0x0028 # LEFT PARENTHESIS
0x29 0x0029 # RIGHT PARENTHESIS
0x2A 0x002A # ASTERISK
0x2B 0x002B # PLUS SIGN
0x2C 0x002C # COMMA
0x2D 0x002D # HYPHEN-MINUS
0x2E 0x002E # FULL STOP
0x2F 0x002F # SOLIDUS
0x30 0x0030 # DIGIT ZERO
0x31 0x0031 # DIGIT ONE
0x32 0x0032 # DIGIT TWO
0x33 0x0033 # DIGIT THREE
0x34 0x0034 # DIGIT FOUR
0x35 0x0035 # DIGIT FIVE
0x36 0x0036 # DIGIT SIX
0x37 0x0037 # DIGIT SEVEN
0x38 0x0038 # DIGIT EIGHT
0x39 0x0039 # DIGIT NINE
0x3A 0x003A # COLON
0x3B 0x003B # SEMICOLON
0x3C 0x003C # LESS-THAN SIGN
0x3D 0x003D # EQUALS SIGN
0x3E 0x003E # GREATER-THAN SIGN
0x3F 0x003F # QUESTION MARK
0x40 0x0040 # COMMERCIAL AT
0x41 0x0041 # LATIN CAPITAL LETTER A
0x42 0x0042 # LATIN CAPITAL LETTER B
0x43 0x0043 # LATIN CAPITAL LETTER C
0x44 0x0044 # LATIN CAPITAL LETTER D
0x45 0x0045 # LATIN CAPITAL LETTER E
0x46 0x0046 # LATIN CAPITAL LETTER F
0x47 0x0047 # LATIN CAPITAL LETTER G
0x48 0x0048 # LATIN CAPITAL LETTER H
0x49 0x0049 # LATIN CAPITAL LETTER I
0x4A 0x004A # LATIN CAPITAL LETTER J
0x4B 0x004B # LATIN CAPITAL LETTER K
0x4C 0x004C # LATIN CAPITAL LETTER L
0x4D 0x004D # LATIN CAPITAL LETTER M
0x4E 0x004E # LATIN CAPITAL LETTER N
0x4F 0x004F # LATIN CAPITAL LETTER O
0x50 0x0050 # LATIN CAPITAL LETTER P
0x51 0x0051 # LATIN CAPITAL LETTER Q
0x52 0x0052 # LATIN CAPITAL LETTER R
0x53 0x0053 # LATIN CAPITAL LETTER S
0x54 0x0054 # LATIN CAPITAL LETTER T
0x55 0x0055 # LATIN CAPITAL LETTER U
0x56 0x0056 # LATIN CAPITAL LETTER V
0x57 0x0057 # LATIN CAPITAL LETTER W
0x58 0x0058 # LATIN CAPITAL LETTER X
0x59 0x0059 # LATIN CAPITAL LETTER Y
0x5A 0x005A # LATIN CAPITAL LETTER Z
0x5B 0x005B # LEFT SQUARE BRACKET
0x5C 0x00A5 # YEN SIGN
0x5D 0x005D # RIGHT SQUARE BRACKET
0x5E 0x005E # CIRCUMFLEX ACCENT
0x5F 0x005F # LOW LINE
0x60 0x0060 # GRAVE ACCENT
0x61 0x0061 # LATIN SMALL LETTER A
0x62 0x0062 # LATIN SMALL LETTER B
0x63 0x0063 # LATIN SMALL LETTER C
0x64 0x0064 # LATIN SMALL LETTER D
0x65 0x0065 # LATIN SMALL LETTER E
0x66 0x0066 # LATIN SMALL LETTER F
0x67 0x0067 # LATIN SMALL LETTER G
0x68 0x0068 # LATIN SMALL LETTER H
0x69 0x0069 # LATIN SMALL LETTER I
0x6A 0x006A # LATIN SMALL LETTER J
0x6B 0x006B # LATIN SMALL LETTER K
0x6C 0x006C # LATIN SMALL LETTER L
0x6D 0x006D # LATIN SMALL LETTER M
0x6E 0x006E # LATIN SMALL LETTER N
0x6F 0x006F # LATIN SMALL LETTER O
0x70 0x0070 # LATIN SMALL LETTER P
0x71 0x0071 # LATIN SMALL LETTER Q
0x72 0x0072 # LATIN SMALL LETTER R
0x73 0x0073 # LATIN SMALL LETTER S
0x74 0x0074 # LATIN SMALL LETTER T
0x75 0x0075 # LATIN SMALL LETTER U
0x76 0x0076 # LATIN SMALL LETTER V
0x77 0x0077 # LATIN SMALL LETTER W
0x78 0x0078 # LATIN SMALL LETTER X
0x79 0x0079 # LATIN SMALL LETTER Y
0x7A 0x007A # LATIN SMALL LETTER Z
0x7B 0x007B # LEFT CURLY BRACKET
0x7C 0x007C # VERTICAL LINE
0x7D 0x007D # RIGHT CURLY BRACKET
0x7E 0x203E # OVERLINE
0xA1 0xFF61 # HALFWIDTH IDEOGRAPHIC FULL STOP
0xA2 0xFF62 # HALFWIDTH LEFT CORNER BRACKET
0xA3 0xFF63 # HALFWIDTH RIGHT CORNER BRACKET
0xA4 0xFF64 # HALFWIDTH IDEOGRAPHIC COMMA
0xA5 0xFF65 # HALFWIDTH KATAKANA MIDDLE DOT
0xA6 0xFF66 # HALFWIDTH KATAKANA LETTER WO
0xA7 0xFF67 # HALFWIDTH KATAKANA LETTER SMALL A
0xA8 0xFF68 # HALFWIDTH KATAKANA LETTER SMALL I
0xA9 0xFF69 # HALFWIDTH KATAKANA LETTER SMALL U
0xAA 0xFF6A # HALFWIDTH KATAKANA LETTER SMALL E
0xAB 0xFF6B # HALFWIDTH KATAKANA LETTER SMALL O
0xAC 0xFF6C # HALFWIDTH KATAKANA LETTER SMALL YA
0xAD 0xFF6D # HALFWIDTH KATAKANA LETTER SMALL YU
0xAE 0xFF6E # HALFWIDTH KATAKANA LETTER SMALL YO
0xAF 0xFF6F # HALFWIDTH KATAKANA LETTER SMALL TU
0xB0 0xFF70 # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
0xB1 0xFF71 # HALFWIDTH KATAKANA LETTER A
0xB2 0xFF72 # HALFWIDTH KATAKANA LETTER I
0xB3 0xFF73 # HALFWIDTH KATAKANA LETTER U
0xB4 0xFF74 # HALFWIDTH KATAKANA LETTER E
0xB5 0xFF75 # HALFWIDTH KATAKANA LETTER O
0xB6 0xFF76 # HALFWIDTH KATAKANA LETTER KA
0xB7 0xFF77 # HALFWIDTH KATAKANA LETTER KI
0xB8 0xFF78 # HALFWIDTH KATAKANA LETTER KU
0xB9 0xFF79 # HALFWIDTH KATAKANA LETTER KE
0xBA 0xFF7A # HALFWIDTH KATAKANA LETTER KO
0xBB 0xFF7B # HALFWIDTH KATAKANA LETTER SA
0xBC 0xFF7C # HALFWIDTH KATAKANA LETTER SI
0xBD 0xFF7D # HALFWIDTH KATAKANA LETTER SU
0xBE 0xFF7E # HALFWIDTH KATAKANA LETTER SE
0xBF 0xFF7F # HALFWIDTH KATAKANA LETTER SO
0xC0 0xFF80 # HALFWIDTH KATAKANA LETTER TA
0xC1 0xFF81 # HALFWIDTH KATAKANA LETTER TI
0xC2 0xFF82 # HALFWIDTH KATAKANA LETTER TU
0xC3 0xFF83 # HALFWIDTH KATAKANA LETTER TE
0xC4 0xFF84 # HALFWIDTH KATAKANA LETTER TO
0xC5 0xFF85 # HALFWIDTH KATAKANA LETTER NA
0xC6 0xFF86 # HALFWIDTH KATAKANA LETTER NI
0xC7 0xFF87 # HALFWIDTH KATAKANA LETTER NU
0xC8 0xFF88 # HALFWIDTH KATAKANA LETTER NE
0xC9 0xFF89 # HALFWIDTH KATAKANA LETTER NO
0xCA 0xFF8A # HALFWIDTH KATAKANA LETTER HA
0xCB 0xFF8B # HALFWIDTH KATAKANA LETTER HI
0xCC 0xFF8C # HALFWIDTH KATAKANA LETTER HU
0xCD 0xFF8D # HALFWIDTH KATAKANA LETTER HE
0xCE 0xFF8E # HALFWIDTH KATAKANA LETTER HO
0xCF 0xFF8F # HALFWIDTH KATAKANA LETTER MA
0xD0 0xFF90 # HALFWIDTH KATAKANA LETTER MI
0xD1 0xFF91 # HALFWIDTH KATAKANA LETTER MU
0xD2 0xFF92 # HALFWIDTH KATAKANA LETTER ME
0xD3 0xFF93 # HALFWIDTH KATAKANA LETTER MO
0xD4 0xFF94 # HALFWIDTH KATAKANA LETTER YA
0xD5 0xFF95 # HALFWIDTH KATAKANA LETTER YU
0xD6 0xFF96 # HALFWIDTH KATAKANA LETTER YO
0xD7 0xFF97 # HALFWIDTH KATAKANA LETTER RA
0xD8 0xFF98 # HALFWIDTH KATAKANA LETTER RI
0xD9 0xFF99 # HALFWIDTH KATAKANA LETTER RU
0xDA 0xFF9A # HALFWIDTH KATAKANA LETTER RE
0xDB 0xFF9B # HALFWIDTH KATAKANA LETTER RO
0xDC 0xFF9C # HALFWIDTH KATAKANA LETTER WA
0xDD 0xFF9D # HALFWIDTH KATAKANA LETTER N
0xDE 0xFF9E # HALFWIDTH KATAKANA VOICED SOUND MARK
0xDF 0xFF9F # HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
Daniel Ehrenberg

View File

@ -0,0 +1,13 @@
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: help.syntax help.markup ;
IN: io.encodings.iso2022
HELP: iso2022
{ $class-description "This encoding class implements ISO 2022-JP-1, a Japanese text encoding commonly used for email." }
{ $see-also "encodings-introduction" } ;
ARTICLE: "io.encodings.iso2022" "ISO 2022-JP-1 encoding"
{ $subsection iso2022 } ;
ABOUT: "io.encodings.iso2022"

View File

@ -0,0 +1,36 @@
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: io.encodings.string io.encodings.iso2022 tools.test
io.encodings.iso2022.private literals strings byte-arrays ;
IN: io.encodings.iso2022
[ "hello" ] [ "hello" >byte-array iso2022 decode ] unit-test
[ "hello" ] [ "hello" iso2022 encode >string ] unit-test
[ "hi" ] [ B{ CHAR: h $ ESC CHAR: ( CHAR: B CHAR: i } iso2022 decode ] unit-test
[ "hi" ] [ B{ CHAR: h CHAR: i $ ESC CHAR: ( CHAR: B } iso2022 decode ] unit-test
[ "hi\u00fffd" ] [ B{ CHAR: h CHAR: i $ ESC CHAR: ( } iso2022 decode ] unit-test
[ "hi\u00fffd" ] [ B{ CHAR: h CHAR: i $ ESC } iso2022 decode ] unit-test
[ B{ CHAR: h $ ESC CHAR: ( CHAR: J HEX: D8 } ] [ "h\u00ff98" iso2022 encode ] unit-test
[ "h\u00ff98" ] [ B{ CHAR: h $ ESC CHAR: ( CHAR: J HEX: D8 } iso2022 decode ] unit-test
[ "hi" ] [ B{ CHAR: h $ ESC CHAR: ( CHAR: J CHAR: i } iso2022 decode ] unit-test
[ "h" ] [ B{ CHAR: h $ ESC CHAR: ( CHAR: J } iso2022 decode ] unit-test
[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: ( CHAR: J HEX: 80 } iso2022 decode ] unit-test
[ B{ CHAR: h $ ESC CHAR: $ CHAR: B HEX: 3E HEX: 47 } ] [ "h\u007126" iso2022 encode ] unit-test
[ "h\u007126" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: B HEX: 3E HEX: 47 } iso2022 decode ] unit-test
[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: B HEX: 3E } iso2022 decode ] unit-test
[ "h" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: B } iso2022 decode ] unit-test
[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: $ } iso2022 decode ] unit-test
[ "h\u00fffd" ] [ B{ CHAR: h $ ESC } iso2022 decode ] unit-test
[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: B HEX: 80 HEX: 80 } iso2022 decode ] unit-test
[ B{ CHAR: h $ ESC CHAR: $ CHAR: ( CHAR: D HEX: 38 HEX: 54 } ] [ "h\u0058ce" iso2022 encode ] unit-test
[ "h\u0058ce" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: ( CHAR: D HEX: 38 HEX: 54 } iso2022 decode ] unit-test
[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: ( CHAR: D HEX: 38 } iso2022 decode ] unit-test
[ "h" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: ( CHAR: D } iso2022 decode ] unit-test
[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: ( } iso2022 decode ] unit-test
[ "h\u00fffd" ] [ B{ CHAR: h $ ESC CHAR: $ CHAR: ( CHAR: D HEX: 70 HEX: 70 } iso2022 decode ] unit-test
[ "\u{syriac-music}" iso2022 encode ] must-fail

View File

@ -0,0 +1,107 @@
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: io.encodings kernel sequences io simple-flat-file sets math
combinators.short-circuit io.binary values arrays assocs
locals accessors combinators literals biassocs byte-arrays ;
IN: io.encodings.iso2022
SINGLETON: iso2022
<PRIVATE
VALUE: jis201
VALUE: jis208
VALUE: jis212
"vocab:io/encodings/iso2022/201.txt" flat-file>biassoc to: jis201
"vocab:io/encodings/iso2022/208.txt" flat-file>biassoc to: jis208
"vocab:io/encodings/iso2022/212.txt" flat-file>biassoc to: jis212
VALUE: ascii
128 unique >biassoc to: ascii
TUPLE: iso2022-state type ;
: make-iso-coder ( encoding -- state )
drop ascii iso2022-state boa ;
M: iso2022 <encoder>
make-iso-coder <encoder> ;
M: iso2022 <decoder>
make-iso-coder <decoder> ;
CONSTANT: ESC HEX: 16
CONSTANT: switch-ascii B{ $ ESC CHAR: ( CHAR: B }
CONSTANT: switch-jis201 B{ $ ESC CHAR: ( CHAR: J }
CONSTANT: switch-jis208 B{ $ ESC CHAR: $ CHAR: B }
CONSTANT: switch-jis212 B{ $ ESC CHAR: $ CHAR: ( CHAR: D }
: find-type ( char -- code type )
{
{ [ dup ascii value? ] [ drop switch-ascii ascii ] }
{ [ dup jis201 value? ] [ drop switch-jis201 jis201 ] }
{ [ dup jis208 value? ] [ drop switch-jis208 jis208 ] }
{ [ dup jis212 value? ] [ drop switch-jis212 jis212 ] }
[ encode-error ]
} cond ;
: stream-write-num ( num stream -- )
over 256 >=
[ [ h>b/b swap 2byte-array ] dip stream-write ]
[ stream-write1 ] if ;
M:: iso2022-state encode-char ( char stream encoding -- )
char encoding type>> value? [
char find-type
[ stream stream-write ]
[ encoding (>>type) ] bi*
] unless
char encoding type>> value-at stream stream-write-num ;
: read-escape ( stream -- type/f )
dup stream-read1 {
{ CHAR: ( [
stream-read1 {
{ CHAR: B [ ascii ] }
{ CHAR: J [ jis201 ] }
[ drop f ]
} case
] }
{ CHAR: $ [
dup stream-read1 {
{ CHAR: @ [ drop jis208 ] } ! want: JIS X 0208-1978
{ CHAR: B [ drop jis208 ] }
{ CHAR: ( [
stream-read1 CHAR: D = jis212 f ?
] }
[ 2drop f ]
} case
] }
[ 2drop f ]
} case ;
: double-width? ( type -- ? )
{ [ jis208 eq? ] [ jis212 eq? ] } 1|| ;
: finish-decode ( num encoding -- char )
type>> at replacement-char or ;
M:: iso2022-state decode-char ( stream encoding -- char )
stream stream-read1 {
{ ESC [
stream read-escape [
encoding (>>type)
stream encoding decode-char
] [ replacement-char ] if*
] }
{ f [ f ] }
[
encoding type>> double-width? [
stream stream-read1
[ 2byte-array be> encoding finish-decode ]
[ drop replacement-char ] if*
] [ encoding finish-decode ] if
]
} case ;

View File

@ -0,0 +1 @@
ISO-2022-JP-1 text encoding