io.encodings.japanese eucjp ...
parent
ac27696bf9
commit
7903319263
|
@ -0,0 +1,208 @@
|
|||
#
|
||||
# Name: JIS X 0201 (1976) to Unicode 1.1 Table
|
||||
# Unicode version: 1.1
|
||||
# Table version: 0.9
|
||||
# Table format: Format A
|
||||
# Date: 8 March 1994
|
||||
#
|
||||
# Copyright (c) 1991-1994 Unicode, Inc. All Rights reserved.
|
||||
#
|
||||
# This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
|
||||
# No claims are made as to fitness for any particular purpose. No
|
||||
# warranties of any kind are expressed or implied. The recipient
|
||||
# agrees to determine applicability of information provided. If this
|
||||
# file has been provided on magnetic media by Unicode, Inc., the sole
|
||||
# remedy for any claim will be exchange of defective media within 90
|
||||
# days of receipt.
|
||||
#
|
||||
# Recipient is granted the right to make copies in any form for
|
||||
# internal distribution and to freely use the information supplied
|
||||
# in the creation of products supporting Unicode. Unicode, Inc.
|
||||
# specifically excludes the right to re-distribute this file directly
|
||||
# to third parties or other organizations whether for profit or not.
|
||||
#
|
||||
# General notes:
|
||||
#
|
||||
#
|
||||
# This table contains one set of mappings from JIS X 0201 into Unicode.
|
||||
# Note that these data are *possible* mappings only and may not be the
|
||||
# same as those used by actual products, nor may they be the best suited
|
||||
# for all uses. For more information on the mappings between various code
|
||||
# pages incorporating the repertoire of JIS X 0201 and Unicode, consult the
|
||||
# VENDORS mapping data. Normative information on the mapping between
|
||||
# JIS X 0201 and Unicode may be found in the Unihan.txt file in the
|
||||
# latest Unicode Character Database.
|
||||
#
|
||||
# If you have carefully considered the fact that the mappings in
|
||||
# this table are only one possible set of mappings between JIS X 0201 and
|
||||
# Unicode and have no normative status, but still feel that you
|
||||
# have located an error in the table that requires fixing, you may
|
||||
# report any such error to errata@unicode.org.
|
||||
#
|
||||
#
|
||||
# Format: Three tab-separated columns
|
||||
# Column #1 is the shift JIS code (in hex as 0xXX)
|
||||
# Column #2 is the Unicode (in hex as 0xXXXX)
|
||||
# Column #3 is the Unicode (ISO 10646) name (follows a comment sign, '#')
|
||||
#
|
||||
# The entries are in JIS order
|
||||
#
|
||||
#
|
||||
0x20 0x0020 # SPACE
|
||||
0x21 0x0021 # EXCLAMATION MARK
|
||||
0x22 0x0022 # QUOTATION MARK
|
||||
0x23 0x0023 # NUMBER SIGN
|
||||
0x24 0x0024 # DOLLAR SIGN
|
||||
0x25 0x0025 # PERCENT SIGN
|
||||
0x26 0x0026 # AMPERSAND
|
||||
0x27 0x0027 # APOSTROPHE
|
||||
0x28 0x0028 # LEFT PARENTHESIS
|
||||
0x29 0x0029 # RIGHT PARENTHESIS
|
||||
0x2A 0x002A # ASTERISK
|
||||
0x2B 0x002B # PLUS SIGN
|
||||
0x2C 0x002C # COMMA
|
||||
0x2D 0x002D # HYPHEN-MINUS
|
||||
0x2E 0x002E # FULL STOP
|
||||
0x2F 0x002F # SOLIDUS
|
||||
0x30 0x0030 # DIGIT ZERO
|
||||
0x31 0x0031 # DIGIT ONE
|
||||
0x32 0x0032 # DIGIT TWO
|
||||
0x33 0x0033 # DIGIT THREE
|
||||
0x34 0x0034 # DIGIT FOUR
|
||||
0x35 0x0035 # DIGIT FIVE
|
||||
0x36 0x0036 # DIGIT SIX
|
||||
0x37 0x0037 # DIGIT SEVEN
|
||||
0x38 0x0038 # DIGIT EIGHT
|
||||
0x39 0x0039 # DIGIT NINE
|
||||
0x3A 0x003A # COLON
|
||||
0x3B 0x003B # SEMICOLON
|
||||
0x3C 0x003C # LESS-THAN SIGN
|
||||
0x3D 0x003D # EQUALS SIGN
|
||||
0x3E 0x003E # GREATER-THAN SIGN
|
||||
0x3F 0x003F # QUESTION MARK
|
||||
0x40 0x0040 # COMMERCIAL AT
|
||||
0x41 0x0041 # LATIN CAPITAL LETTER A
|
||||
0x42 0x0042 # LATIN CAPITAL LETTER B
|
||||
0x43 0x0043 # LATIN CAPITAL LETTER C
|
||||
0x44 0x0044 # LATIN CAPITAL LETTER D
|
||||
0x45 0x0045 # LATIN CAPITAL LETTER E
|
||||
0x46 0x0046 # LATIN CAPITAL LETTER F
|
||||
0x47 0x0047 # LATIN CAPITAL LETTER G
|
||||
0x48 0x0048 # LATIN CAPITAL LETTER H
|
||||
0x49 0x0049 # LATIN CAPITAL LETTER I
|
||||
0x4A 0x004A # LATIN CAPITAL LETTER J
|
||||
0x4B 0x004B # LATIN CAPITAL LETTER K
|
||||
0x4C 0x004C # LATIN CAPITAL LETTER L
|
||||
0x4D 0x004D # LATIN CAPITAL LETTER M
|
||||
0x4E 0x004E # LATIN CAPITAL LETTER N
|
||||
0x4F 0x004F # LATIN CAPITAL LETTER O
|
||||
0x50 0x0050 # LATIN CAPITAL LETTER P
|
||||
0x51 0x0051 # LATIN CAPITAL LETTER Q
|
||||
0x52 0x0052 # LATIN CAPITAL LETTER R
|
||||
0x53 0x0053 # LATIN CAPITAL LETTER S
|
||||
0x54 0x0054 # LATIN CAPITAL LETTER T
|
||||
0x55 0x0055 # LATIN CAPITAL LETTER U
|
||||
0x56 0x0056 # LATIN CAPITAL LETTER V
|
||||
0x57 0x0057 # LATIN CAPITAL LETTER W
|
||||
0x58 0x0058 # LATIN CAPITAL LETTER X
|
||||
0x59 0x0059 # LATIN CAPITAL LETTER Y
|
||||
0x5A 0x005A # LATIN CAPITAL LETTER Z
|
||||
0x5B 0x005B # LEFT SQUARE BRACKET
|
||||
0x5C 0x00A5 # YEN SIGN
|
||||
0x5D 0x005D # RIGHT SQUARE BRACKET
|
||||
0x5E 0x005E # CIRCUMFLEX ACCENT
|
||||
0x5F 0x005F # LOW LINE
|
||||
0x60 0x0060 # GRAVE ACCENT
|
||||
0x61 0x0061 # LATIN SMALL LETTER A
|
||||
0x62 0x0062 # LATIN SMALL LETTER B
|
||||
0x63 0x0063 # LATIN SMALL LETTER C
|
||||
0x64 0x0064 # LATIN SMALL LETTER D
|
||||
0x65 0x0065 # LATIN SMALL LETTER E
|
||||
0x66 0x0066 # LATIN SMALL LETTER F
|
||||
0x67 0x0067 # LATIN SMALL LETTER G
|
||||
0x68 0x0068 # LATIN SMALL LETTER H
|
||||
0x69 0x0069 # LATIN SMALL LETTER I
|
||||
0x6A 0x006A # LATIN SMALL LETTER J
|
||||
0x6B 0x006B # LATIN SMALL LETTER K
|
||||
0x6C 0x006C # LATIN SMALL LETTER L
|
||||
0x6D 0x006D # LATIN SMALL LETTER M
|
||||
0x6E 0x006E # LATIN SMALL LETTER N
|
||||
0x6F 0x006F # LATIN SMALL LETTER O
|
||||
0x70 0x0070 # LATIN SMALL LETTER P
|
||||
0x71 0x0071 # LATIN SMALL LETTER Q
|
||||
0x72 0x0072 # LATIN SMALL LETTER R
|
||||
0x73 0x0073 # LATIN SMALL LETTER S
|
||||
0x74 0x0074 # LATIN SMALL LETTER T
|
||||
0x75 0x0075 # LATIN SMALL LETTER U
|
||||
0x76 0x0076 # LATIN SMALL LETTER V
|
||||
0x77 0x0077 # LATIN SMALL LETTER W
|
||||
0x78 0x0078 # LATIN SMALL LETTER X
|
||||
0x79 0x0079 # LATIN SMALL LETTER Y
|
||||
0x7A 0x007A # LATIN SMALL LETTER Z
|
||||
0x7B 0x007B # LEFT CURLY BRACKET
|
||||
0x7C 0x007C # VERTICAL LINE
|
||||
0x7D 0x007D # RIGHT CURLY BRACKET
|
||||
0x7E 0x203E # OVERLINE
|
||||
0xA1 0xFF61 # HALFWIDTH IDEOGRAPHIC FULL STOP
|
||||
0xA2 0xFF62 # HALFWIDTH LEFT CORNER BRACKET
|
||||
0xA3 0xFF63 # HALFWIDTH RIGHT CORNER BRACKET
|
||||
0xA4 0xFF64 # HALFWIDTH IDEOGRAPHIC COMMA
|
||||
0xA5 0xFF65 # HALFWIDTH KATAKANA MIDDLE DOT
|
||||
0xA6 0xFF66 # HALFWIDTH KATAKANA LETTER WO
|
||||
0xA7 0xFF67 # HALFWIDTH KATAKANA LETTER SMALL A
|
||||
0xA8 0xFF68 # HALFWIDTH KATAKANA LETTER SMALL I
|
||||
0xA9 0xFF69 # HALFWIDTH KATAKANA LETTER SMALL U
|
||||
0xAA 0xFF6A # HALFWIDTH KATAKANA LETTER SMALL E
|
||||
0xAB 0xFF6B # HALFWIDTH KATAKANA LETTER SMALL O
|
||||
0xAC 0xFF6C # HALFWIDTH KATAKANA LETTER SMALL YA
|
||||
0xAD 0xFF6D # HALFWIDTH KATAKANA LETTER SMALL YU
|
||||
0xAE 0xFF6E # HALFWIDTH KATAKANA LETTER SMALL YO
|
||||
0xAF 0xFF6F # HALFWIDTH KATAKANA LETTER SMALL TU
|
||||
0xB0 0xFF70 # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
0xB1 0xFF71 # HALFWIDTH KATAKANA LETTER A
|
||||
0xB2 0xFF72 # HALFWIDTH KATAKANA LETTER I
|
||||
0xB3 0xFF73 # HALFWIDTH KATAKANA LETTER U
|
||||
0xB4 0xFF74 # HALFWIDTH KATAKANA LETTER E
|
||||
0xB5 0xFF75 # HALFWIDTH KATAKANA LETTER O
|
||||
0xB6 0xFF76 # HALFWIDTH KATAKANA LETTER KA
|
||||
0xB7 0xFF77 # HALFWIDTH KATAKANA LETTER KI
|
||||
0xB8 0xFF78 # HALFWIDTH KATAKANA LETTER KU
|
||||
0xB9 0xFF79 # HALFWIDTH KATAKANA LETTER KE
|
||||
0xBA 0xFF7A # HALFWIDTH KATAKANA LETTER KO
|
||||
0xBB 0xFF7B # HALFWIDTH KATAKANA LETTER SA
|
||||
0xBC 0xFF7C # HALFWIDTH KATAKANA LETTER SI
|
||||
0xBD 0xFF7D # HALFWIDTH KATAKANA LETTER SU
|
||||
0xBE 0xFF7E # HALFWIDTH KATAKANA LETTER SE
|
||||
0xBF 0xFF7F # HALFWIDTH KATAKANA LETTER SO
|
||||
0xC0 0xFF80 # HALFWIDTH KATAKANA LETTER TA
|
||||
0xC1 0xFF81 # HALFWIDTH KATAKANA LETTER TI
|
||||
0xC2 0xFF82 # HALFWIDTH KATAKANA LETTER TU
|
||||
0xC3 0xFF83 # HALFWIDTH KATAKANA LETTER TE
|
||||
0xC4 0xFF84 # HALFWIDTH KATAKANA LETTER TO
|
||||
0xC5 0xFF85 # HALFWIDTH KATAKANA LETTER NA
|
||||
0xC6 0xFF86 # HALFWIDTH KATAKANA LETTER NI
|
||||
0xC7 0xFF87 # HALFWIDTH KATAKANA LETTER NU
|
||||
0xC8 0xFF88 # HALFWIDTH KATAKANA LETTER NE
|
||||
0xC9 0xFF89 # HALFWIDTH KATAKANA LETTER NO
|
||||
0xCA 0xFF8A # HALFWIDTH KATAKANA LETTER HA
|
||||
0xCB 0xFF8B # HALFWIDTH KATAKANA LETTER HI
|
||||
0xCC 0xFF8C # HALFWIDTH KATAKANA LETTER HU
|
||||
0xCD 0xFF8D # HALFWIDTH KATAKANA LETTER HE
|
||||
0xCE 0xFF8E # HALFWIDTH KATAKANA LETTER HO
|
||||
0xCF 0xFF8F # HALFWIDTH KATAKANA LETTER MA
|
||||
0xD0 0xFF90 # HALFWIDTH KATAKANA LETTER MI
|
||||
0xD1 0xFF91 # HALFWIDTH KATAKANA LETTER MU
|
||||
0xD2 0xFF92 # HALFWIDTH KATAKANA LETTER ME
|
||||
0xD3 0xFF93 # HALFWIDTH KATAKANA LETTER MO
|
||||
0xD4 0xFF94 # HALFWIDTH KATAKANA LETTER YA
|
||||
0xD5 0xFF95 # HALFWIDTH KATAKANA LETTER YU
|
||||
0xD6 0xFF96 # HALFWIDTH KATAKANA LETTER YO
|
||||
0xD7 0xFF97 # HALFWIDTH KATAKANA LETTER RA
|
||||
0xD8 0xFF98 # HALFWIDTH KATAKANA LETTER RI
|
||||
0xD9 0xFF99 # HALFWIDTH KATAKANA LETTER RU
|
||||
0xDA 0xFF9A # HALFWIDTH KATAKANA LETTER RE
|
||||
0xDB 0xFF9B # HALFWIDTH KATAKANA LETTER RO
|
||||
0xDC 0xFF9C # HALFWIDTH KATAKANA LETTER WA
|
||||
0xDD 0xFF9D # HALFWIDTH KATAKANA LETTER N
|
||||
0xDE 0xFF9E # HALFWIDTH KATAKANA VOICED SOUND MARK
|
||||
0xDF 0xFF9F # HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -3,7 +3,8 @@
|
|||
USING: sequences kernel io io.files combinators.short-circuit
|
||||
math.order values assocs io.encodings io.binary fry strings math
|
||||
io.encodings.ascii arrays byte-arrays accessors splitting
|
||||
math.parser biassocs io.encodings.iana ;
|
||||
math.parser biassocs io.encodings.iana io.encodings.asian
|
||||
locals ;
|
||||
IN: io.encodings.japanese
|
||||
|
||||
SINGLETON: shift-jis
|
||||
|
@ -14,6 +15,11 @@ SINGLETON: windows-31j
|
|||
|
||||
windows-31j "Windows-31J" register-encoding
|
||||
|
||||
! Encoding descriptor for EUC-JP. The encode-char and decode-char
! methods further down in this file dispatch on this singleton.
SINGLETON: eucjp
|
||||
|
||||
! Registration under an IANA charset name is currently commented out,
! so this encoding cannot be looked up by name.
! NOTE(review): the IANA registry lists this charset as "EUC-JP", not
! "EUCJP" -- confirm the name before re-enabling the line below.
! eucjp "EUCJP" register-encoding
|
||||
|
||||
|
||||
<PRIVATE
|
||||
|
||||
VALUE: shift-jis-table
|
||||
|
@ -26,6 +32,8 @@ VALUE: windows-31j-table
|
|||
M: windows-31j <encoder> drop windows-31j-table <encoder> ;
|
||||
M: windows-31j <decoder> drop windows-31j-table <decoder> ;
|
||||
|
||||
|
||||
|
||||
TUPLE: jis assoc ;
|
||||
|
||||
: <jis> ( assoc -- jis )
|
||||
|
@ -47,9 +55,11 @@ TUPLE: jis assoc ;
|
|||
"vocab:io/encodings/japanese/CP932.txt"
|
||||
make-jis to: windows-31j-table
|
||||
|
||||
|
||||
"vocab:io/encodings/japanese/sjis-0208-1997-std.txt"
|
||||
make-jis to: shift-jis-table
|
||||
|
||||
|
||||
: small? ( char -- ? )
    ! True when char is a code that is written as one byte: either the
    ! ASCII range (0x00-0x7F) or the single-byte halfwidth katakana
    ! range (0xA1-0xDF). Everything else is treated as a two-byte code.
    dup 0 HEX: 7F between?
    [ drop t ]
    [ HEX: A1 HEX: DF between? ] if ;
|
||||
|
@ -72,4 +82,73 @@ M: jis decode-char
|
|||
] if
|
||||
] [ 2drop f ] if* ;
|
||||
|
||||
|
||||
! EUC-JP
|
||||
|
||||
! Code tables consulted by unicode>eucjp and eucjp>unicode.
! Judging by the file names these presumably cover JIS X 0201,
! JIS X 0208 and JIS X 0212 respectively -- TODO confirm.
VALUE: euc-0201-table
|
||||
|
||||
VALUE: euc-0208-table
|
||||
|
||||
VALUE: euc-0212-table
|
||||
|
||||
! Each statement below reads one mapping file from this vocabulary's
! directory with <code-table>* and stores the result in the matching VALUE.
"vocab:io/encodings/japanese/euc-0201.txt" <code-table>* to: euc-0201-table
|
||||
|
||||
"vocab:io/encodings/japanese/euc-0208.txt" <code-table>* to: euc-0208-table
|
||||
|
||||
"vocab:io/encodings/japanese/euc-0212.txt" <code-table>* to: euc-0212-table
|
||||
|
||||
|
||||
: unicode>eucjp ( u -- n )
    ! Translate the Unicode code point u into its EUC-JP code n.
    !
    ! The 0201, 0208 and 0212 tables are tried in that order and the
    ! first table that knows u supplies the result; later tables are
    ! not consulted. (Assumes u>n yields f on a miss, as the previous
    ! harvest-based code did.)
    !
    ! Throws encode-error when no table contains u. The earlier version
    ! called first on an empty array in that case, which surfaced as an
    ! unrelated bounds error.
    {
        [ euc-0201-table u>n ]
        [ euc-0208-table u>n ]
        [ euc-0212-table u>n ]
    } 1|| [ encode-error ] unless* ;
|
||||
|
||||
: eucjp>unicode ( n -- u )
    ! Translate the EUC-JP code n into a Unicode code point.
    !
    ! The 0201, 0208 and 0212 tables are tried in that order and the
    ! first table that knows n supplies the result; later tables are
    ! not consulted. (Assumes n>u yields f on a miss.)
    !
    ! Returns replacement-char when no table contains n. Lookup results
    ! are tested directly instead of being collected and passed through
    ! harvest, which filters on sequence emptiness rather than on f.
    {
        [ euc-0201-table n>u ]
        [ euc-0208-table n>u ]
        [ euc-0212-table n>u ]
    } 1|| [ replacement-char ] unless* ;
|
||||
|
||||
|
||||
M: eucjp encode-char ( c stream encoding -- )
    ! Write the EUC-JP form of the code point c to stream.
    !
    ! c is looked up once with unicode>eucjp. If the resulting code
    ! satisfies small? it is written as a single byte; otherwise it is
    ! written as two bytes, high byte first, matching the be> used by
    ! decode-char.
    !
    ! The single-byte branch writes the looked-up code, not c itself:
    ! the two differ whenever the table remaps a character (for example
    ! a code point above 0xFF whose code is a single byte).
    drop [ unicode>eucjp ] dip          ! ( n stream )
    over small?
    [ stream-write1 ]
    [ [ 2 >be ] dip stream-write ] if ;
|
||||
|
||||
M: eucjp decode-char ( stream encoding -- char/f )
    ! Read one character from stream.
    !
    ! Returns f when the stream is already at its end.
    ! A byte satisfying small? is returned unchanged.
    ! Any other byte is combined with the following byte (big-endian)
    ! and translated with eucjp>unicode; if the stream ends before that
    ! second byte arrives, replacement-char is returned.
    !
    ! NOTE(review): single bytes are returned without a table lookup,
    ! whereas encode-char goes through unicode>eucjp -- confirm the two
    ! directions agree for remapped codes.
    ! NOTE(review): in standard EUC-JP, bytes 0xA1-0xFE lead a two-byte
    ! character and halfwidth katakana are prefixed by 0x8E, while
    ! small? treats 0xA1-0xDF as complete characters -- verify against
    ! the format of the euc-*.txt tables.
    drop dup stream-read1 dup           ! ( stream c1 c1 )
    [
        dup small?
        [ nip ]
        [
            swap stream-read1           ! ( c1 c2/f )
            [ 2byte-array be> eucjp>unicode ]
            [ drop replacement-char ] if*
        ] if
    ] [ nip ] if ;
|
||||
|
||||
|
||||
|
||||
PRIVATE>
|
||||
|
|
Loading…
Reference in New Issue