Shift-JIS support; minor cleanup of io.binary
parent
f660c7e7e8
commit
3c75dfd2e0
|
|
@ -45,7 +45,7 @@ IN: io.encodings.8-bit
|
||||||
: ch>byte ( assoc -- newassoc )
|
: ch>byte ( assoc -- newassoc )
|
||||||
[ swap ] assoc-map >hashtable ;
|
[ swap ] assoc-map >hashtable ;
|
||||||
|
|
||||||
: parse-file ( path -- byte>ch ch>byte )
|
: parse-file ( stream -- byte>ch ch>byte )
|
||||||
lines process-contents
|
lines process-contents
|
||||||
[ byte>ch ] [ ch>byte ] bi ;
|
[ byte>ch ] [ ch>byte ] bi ;
|
||||||
|
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1 @@
|
||||||
|
Daniel Ehrenberg
|
||||||
|
|
@ -0,0 +1,19 @@
|
||||||
|
! Copyright (C) 2009 Daniel Ehrenberg
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: help.markup help.syntax ;
|
||||||
|
IN: io.encodings.japanese
|
||||||
|
|
||||||
|
ARTICLE: "io.encodings.japanese" "Japanese text encodings"
|
||||||
|
"Several encodings are used for Japanese text besides the standard UTF encodings for Unicode strings. These are mostly based on the character set defined in the JIS X 208 standard. Current coverage of encodings is incomplete."
|
||||||
|
{ $subsection shift-jis }
|
||||||
|
{ $subsection windows-31j } ;
|
||||||
|
|
||||||
|
ABOUT: "io.encodings.japanese"
|
||||||
|
|
||||||
|
HELP: windows-31j
|
||||||
|
{ $class-description "The encoding descriptor Windows-31J, which is sometimes informally called Shift JIS. This is based on Code Page 932." }
|
||||||
|
{ $see-also "encodings-introduction" shift-jis } ;
|
||||||
|
|
||||||
|
HELP: shift-jis
|
||||||
|
{ $class-description "The encoding descriptor for Shift JIS, or JIS X 208:1997 Appendix 1. Microsoft extensions are not included." }
|
||||||
|
{ $see-also "encodings-introduction" windows-31j } ;
|
||||||
|
|
@ -0,0 +1,17 @@
|
||||||
|
! Copyright (C) 2009 Daniel Ehrenberg
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: io.encodings.japanese tools.test io.encodings.string arrays strings ;
|
||||||
|
IN: io.encodings.japanese.tests
|
||||||
|
|
||||||
|
[ { CHAR: replacement-character } ] [ { 141 } shift-jis decode >array ] unit-test
|
||||||
|
[ "" ] [ "" shift-jis decode >string ] unit-test
|
||||||
|
[ "" ] [ "" shift-jis encode >string ] unit-test
|
||||||
|
[ { CHAR: replacement-character } shift-jis encode ] must-fail
|
||||||
|
[ "ab¥ィ" ] [ { CHAR: a CHAR: b HEX: 5C HEX: A8 } shift-jis decode ] unit-test
|
||||||
|
[ { CHAR: a CHAR: b HEX: 5C HEX: A8 } ] [ "ab¥ィ" shift-jis encode >array ] unit-test
|
||||||
|
[ "ab\\ィ" ] [ { CHAR: a CHAR: b HEX: 5C HEX: A8 } windows-31j decode ] unit-test
|
||||||
|
[ { CHAR: a CHAR: b HEX: 5C HEX: A8 } ] [ "ab\\ィ" windows-31j encode >array ] unit-test
|
||||||
|
[ "\u000081\u0000c8" ] [ CHAR: logical-and 1string windows-31j encode >string ] unit-test
|
||||||
|
[ "\u000081\u0000c8" ] [ CHAR: logical-and 1string shift-jis encode >string ] unit-test
|
||||||
|
[ { CHAR: logical-and } ] [ "\u000081\u0000c8" windows-31j decode >array ] unit-test
|
||||||
|
[ { CHAR: logical-and } ] [ "\u000081\u0000c8" shift-jis decode >array ] unit-test
|
||||||
|
|
@ -0,0 +1,65 @@
|
||||||
|
! Copyright (C) 2009 Daniel Ehrenberg
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: sequences kernel io io.files combinators.short-circuit
|
||||||
|
math.order values assocs io.encodings io.binary fry strings
|
||||||
|
math io.encodings.utf8 arrays accessors splitting math.parser ;
|
||||||
|
IN: io.encodings.japanese
|
||||||
|
|
||||||
|
! The code page used is Microsoft Code Page 932,
|
||||||
|
! which is a set of extensions to JIS X 0208:1997
|
||||||
|
|
||||||
|
VALUE: shift-jis
|
||||||
|
|
||||||
|
VALUE: windows-31j
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
|
TUPLE: jis jis>ch ch>jis ;
|
||||||
|
|
||||||
|
: <jis> ( assoc -- jis )
|
||||||
|
[ nip ] assoc-filter
|
||||||
|
[ H{ } assoc-like ]
|
||||||
|
[ [ swap ] H{ } assoc-map-as ] bi
|
||||||
|
jis boa ;
|
||||||
|
|
||||||
|
: ch>jis ( ch tuple -- jis ) ch>jis>> at [ encode-error ] unless* ;
|
||||||
|
: jis>ch ( jis tuple -- string ) jis>ch>> at replacement-char or ;
|
||||||
|
|
||||||
|
: process-jis ( lines -- assoc )
|
||||||
|
[ "#" split1 drop ] map harvest [
|
||||||
|
"\t" split 2 head
|
||||||
|
[ 2 short tail hex> ] map
|
||||||
|
] map ;
|
||||||
|
|
||||||
|
: make-jis ( filename -- jis )
|
||||||
|
utf8 file-lines process-jis <jis> ;
|
||||||
|
|
||||||
|
"resource:basis/io/encodings/japanese/CP932.txt"
|
||||||
|
make-jis to: windows-31j
|
||||||
|
|
||||||
|
"resource:basis/io/encodings/japanese/sjis-0208-1997-std.txt"
|
||||||
|
make-jis to: shift-jis
|
||||||
|
|
||||||
|
: small? ( char -- ? )
|
||||||
|
! ASCII range or single-byte halfwidth katakana
|
||||||
|
{ [ 0 HEX: 7F between? ] [ HEX: A1 HEX: DF between? ] } 1|| ;
|
||||||
|
|
||||||
|
: write-halfword ( stream halfword -- )
|
||||||
|
h>b/b swap 2array >string swap stream-write ;
|
||||||
|
|
||||||
|
M: jis encode-char
|
||||||
|
swapd ch>jis
|
||||||
|
dup small?
|
||||||
|
[ swap stream-write1 ]
|
||||||
|
[ write-halfword ] if ;
|
||||||
|
|
||||||
|
M: jis decode-char
|
||||||
|
swap dup stream-read1 [
|
||||||
|
dup small? [ nip swap jis>ch ] [
|
||||||
|
swap stream-read1
|
||||||
|
[ 2array be> swap jis>ch ]
|
||||||
|
[ 2drop replacement-char ] if*
|
||||||
|
] if
|
||||||
|
] [ 2drop f ] if* ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1 @@
|
||||||
|
Japanese text encodings
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
text
|
||||||
|
|
@ -10,3 +10,7 @@ IN: io.binary.tests
|
||||||
[ 1234 ] [ 1234 4 >le le> ] unit-test
|
[ 1234 ] [ 1234 4 >le le> ] unit-test
|
||||||
|
|
||||||
[ fixnum ] [ B{ 0 0 0 0 0 0 0 0 0 0 } be> class ] unit-test
|
[ fixnum ] [ B{ 0 0 0 0 0 0 0 0 0 0 } be> class ] unit-test
|
||||||
|
|
||||||
|
[ HEX: 56780000 HEX: 12340000 ] [ HEX: 1234000056780000 d>w/w ] unit-test
|
||||||
|
[ HEX: 5678 HEX: 1234 ] [ HEX: 12345678 w>h/h ] unit-test
|
||||||
|
[ HEX: 34 HEX: 12 ] [ HEX: 1234 h>b/b ] unit-test
|
||||||
|
|
|
||||||
|
|
@ -14,13 +14,13 @@ IN: io.binary
|
||||||
: >be ( x n -- byte-array ) >le dup reverse-here ;
|
: >be ( x n -- byte-array ) >le dup reverse-here ;
|
||||||
|
|
||||||
: d>w/w ( d -- w1 w2 )
|
: d>w/w ( d -- w1 w2 )
|
||||||
dup HEX: ffffffff bitand
|
[ HEX: ffffffff bitand ]
|
||||||
swap -32 shift HEX: ffffffff bitand ;
|
[ -32 shift HEX: ffffffff bitand ] bi ;
|
||||||
|
|
||||||
: w>h/h ( w -- h1 h2 )
|
: w>h/h ( w -- h1 h2 )
|
||||||
dup HEX: ffff bitand
|
[ HEX: ffff bitand ]
|
||||||
swap -16 shift HEX: ffff bitand ;
|
[ -16 shift HEX: ffff bitand ] bi ;
|
||||||
|
|
||||||
: h>b/b ( h -- b1 b2 )
|
: h>b/b ( h -- b1 b2 )
|
||||||
dup mask-byte
|
[ mask-byte ]
|
||||||
swap -8 shift mask-byte ;
|
[ -8 shift mask-byte ] bi ;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue