UTF-32 encoding/decoding
parent
632c17ea35
commit
ec22af4dbd
|
@ -1,3 +1,5 @@
|
||||||
|
! Copyright (C) 2008 Daniel Ehrenberg
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: help.markup help.syntax io.encodings strings ;
|
USING: help.markup help.syntax io.encodings strings ;
|
||||||
IN: io.encodings.utf16
|
IN: io.encodings.utf16
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: kernel tools.test io.encodings.utf16 arrays sbufs
|
USING: kernel tools.test io.encodings.utf16 arrays sbufs
|
||||||
io.streams.byte-array sequences io.encodings io
|
io.streams.byte-array sequences io.encodings io
|
||||||
bootstrap.unicode
|
|
||||||
io.encodings.string alien.c-types alien.strings accessors classes ;
|
io.encodings.string alien.c-types alien.strings accessors classes ;
|
||||||
IN: io.encodings.utf16.tests
|
IN: io.encodings.utf16.tests
|
||||||
|
|
||||||
|
@ -15,7 +16,6 @@ IN: io.encodings.utf16.tests
|
||||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
|
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
|
||||||
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode >array ] unit-test
|
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode >array ] unit-test
|
||||||
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode >array ] unit-test
|
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode >array ] unit-test
|
||||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
|
|
||||||
|
|
||||||
[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode >array ] unit-test
|
[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode >array ] unit-test
|
||||||
|
|
||||||
|
|
|
@ -105,10 +105,6 @@ M: utf16le encode-char ( char stream encoding -- )
|
||||||
|
|
||||||
: bom-be B{ HEX: fe HEX: ff } ; inline
|
: bom-be B{ HEX: fe HEX: ff } ; inline
|
||||||
|
|
||||||
: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
|
|
||||||
|
|
||||||
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
|
|
||||||
|
|
||||||
: bom>le/be ( bom -- le/be )
|
: bom>le/be ( bom -- le/be )
|
||||||
dup bom-le sequence= [ drop utf16le ] [
|
dup bom-le sequence= [ drop utf16le ] [
|
||||||
bom-be sequence= [ utf16be ] [ missing-bom ] if
|
bom-be sequence= [ utf16be ] [ missing-bom ] if
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
Daniel Ehrenberg
|
|
@ -0,0 +1 @@
|
||||||
|
UTF32 encoding/decoding
|
|
@ -0,0 +1 @@
|
||||||
|
text
|
|
@ -0,0 +1,27 @@
|
||||||
|
! Copyright (C) 2009 Daniel Ehrenberg
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: help.markup help.syntax io.encodings strings ;
|
||||||
|
IN: io.encodings.utf32
|
||||||
|
|
||||||
|
ARTICLE: "io.encodings.utf32" "UTF-32 encoding"
|
||||||
|
"The UTF-32 encoding is a fixed-width encoding. Unicode code points are encoded as 4 byte sequences. There are three encoding descriptor classes for working with UTF-32, depending on endianness or the presence of a BOM:"
|
||||||
|
{ $subsection utf32 }
|
||||||
|
{ $subsection utf32le }
|
||||||
|
{ $subsection utf32be } ;
|
||||||
|
|
||||||
|
ABOUT: "io.encodings.utf32"
|
||||||
|
|
||||||
|
HELP: utf32le
|
||||||
|
{ $class-description "The encoding descriptor for UTF-32LE, that is, UTF-32 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." }
|
||||||
|
{ $see-also "encodings-introduction" } ;
|
||||||
|
|
||||||
|
HELP: utf32be
|
||||||
|
{ $class-description "The encoding descriptor for UTF-32BE, that is, UTF-32 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." }
|
||||||
|
{ $see-also "encodings-introduction" } ;
|
||||||
|
|
||||||
|
HELP: utf32
|
||||||
|
{ $class-description "The encoding descriptor for UTF-32, that is, UTF-32 with a byte order mark. This is the most useful for general input and output in UTF-32. Streams can be made which read or write wth this encoding." }
|
||||||
|
{ $see-also "encodings-introduction" } ;
|
||||||
|
|
||||||
|
{ utf32 utf32le utf32be } related-words
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
! Copyright (C) 2009 Daniel Ehrenberg.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: kernel tools.test io.encodings.utf32 arrays sbufs
|
||||||
|
io.streams.byte-array sequences io.encodings io
|
||||||
|
io.encodings.string alien.c-types alien.strings accessors classes ;
|
||||||
|
IN: io.encodings.utf32.tests
|
||||||
|
|
||||||
|
[ { CHAR: x } ] [ { 0 0 0 CHAR: x } utf32be decode >array ] unit-test
|
||||||
|
[ { HEX: 1D11E } ] [ { 0 1 HEX: D1 HEX: 1E } utf32be decode >array ] unit-test
|
||||||
|
[ { CHAR: replacement-character } ] [ { 0 1 HEX: D1 } utf32be decode >array ] unit-test
|
||||||
|
[ { CHAR: replacement-character } ] [ { 0 1 } utf32be decode >array ] unit-test
|
||||||
|
[ { CHAR: replacement-character } ] [ { 0 } utf32be decode >array ] unit-test
|
||||||
|
[ { } ] [ { } utf32be decode >array ] unit-test
|
||||||
|
|
||||||
|
[ { 0 0 0 CHAR: x 0 1 HEX: D1 HEX: 1E } ] [ { CHAR: x HEX: 1d11e } utf32be encode >array ] unit-test
|
||||||
|
|
||||||
|
[ { CHAR: x } ] [ { CHAR: x 0 0 0 } utf32le decode >array ] unit-test
|
||||||
|
[ { HEX: 1d11e } ] [ { HEX: 1e HEX: d1 1 0 } utf32le decode >array ] unit-test
|
||||||
|
[ { CHAR: replacement-character } ] [ { HEX: 1e HEX: d1 1 } utf32le decode >array ] unit-test
|
||||||
|
[ { CHAR: replacement-character } ] [ { HEX: 1e HEX: d1 } utf32le decode >array ] unit-test
|
||||||
|
[ { CHAR: replacement-character } ] [ { HEX: 1e } utf32le decode >array ] unit-test
|
||||||
|
[ { } ] [ { } utf32le decode >array ] unit-test
|
||||||
|
|
||||||
|
[ { 120 0 0 0 HEX: 1e HEX: d1 1 0 } ] [ { CHAR: x HEX: 1d11e } utf32le encode >array ] unit-test
|
||||||
|
|
||||||
|
[ { CHAR: x } ] [ { HEX: ff HEX: fe 0 0 CHAR: x 0 0 0 } utf32 decode >array ] unit-test
|
||||||
|
[ { CHAR: x } ] [ { 0 0 HEX: fe HEX: ff 0 0 0 CHAR: x } utf32 decode >array ] unit-test
|
||||||
|
|
||||||
|
[ { HEX: ff HEX: fe 0 0 120 0 0 0 HEX: 1e HEX: d1 1 0 } ] [ { CHAR: x HEX: 1d11e } utf32 encode >array ] unit-test
|
||||||
|
|
|
@ -0,0 +1,87 @@
|
||||||
|
! Copyright (C) 2009 Daniel Ehrenberg.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: math kernel io.encodings combinators io io.encodings.utf16
|
||||||
|
generalizations sequences ;
|
||||||
|
IN: io.encodings.utf32
|
||||||
|
|
||||||
|
SINGLETON: utf32be
|
||||||
|
|
||||||
|
SINGLETON: utf32le
|
||||||
|
|
||||||
|
SINGLETON: utf32
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
|
: 4spin ( a b c d -- b c d a )
|
||||||
|
4 nrev ; inline
|
||||||
|
|
||||||
|
! Decoding
|
||||||
|
|
||||||
|
: stream-read4 ( stream -- a b c d )
|
||||||
|
{
|
||||||
|
[ stream-read1 ]
|
||||||
|
[ stream-read1 ]
|
||||||
|
[ stream-read1 ]
|
||||||
|
[ stream-read1 ]
|
||||||
|
} cleave ;
|
||||||
|
|
||||||
|
: with-replacement ( _ _ _ ch quot -- new-ch )
|
||||||
|
[ 3drop replacement-char ] if* ; inline
|
||||||
|
|
||||||
|
: >char ( d c b a -- abcd )
|
||||||
|
[
|
||||||
|
24 shift -roll [
|
||||||
|
16 shift -rot [
|
||||||
|
8 shift swap [
|
||||||
|
bitor bitor bitor
|
||||||
|
] with-replacement
|
||||||
|
] with-replacement
|
||||||
|
] with-replacement
|
||||||
|
] with-replacement ;
|
||||||
|
|
||||||
|
M: utf32be decode-char
|
||||||
|
drop stream-read4 4spin
|
||||||
|
[ >char ] [ 3drop f ] if* ;
|
||||||
|
|
||||||
|
M: utf32le decode-char
|
||||||
|
drop stream-read4 4 npick
|
||||||
|
[ >char ] [ 2drop 2drop f ] if ;
|
||||||
|
|
||||||
|
! Encoding
|
||||||
|
|
||||||
|
: split-off ( ab -- b a )
|
||||||
|
[ HEX: FF bitand ] [ -8 shift ] bi ;
|
||||||
|
|
||||||
|
: char> ( abcd -- d b c a )
|
||||||
|
split-off split-off split-off ;
|
||||||
|
|
||||||
|
: stream-write4 ( d c b a stream -- )
|
||||||
|
{
|
||||||
|
[ stream-write1 ]
|
||||||
|
[ stream-write1 ]
|
||||||
|
[ stream-write1 ]
|
||||||
|
[ stream-write1 ]
|
||||||
|
} cleave ;
|
||||||
|
|
||||||
|
M: utf32be encode-char
|
||||||
|
drop [ char> ] dip stream-write4 ;
|
||||||
|
|
||||||
|
M: utf32le encode-char
|
||||||
|
drop [ char> 4spin ] dip stream-write4 ;
|
||||||
|
|
||||||
|
! UTF-32
|
||||||
|
|
||||||
|
: bom-le B{ HEX: ff HEX: fe 0 0 } ; inline
|
||||||
|
|
||||||
|
: bom-be B{ 0 0 HEX: fe HEX: ff } ; inline
|
||||||
|
|
||||||
|
: bom>le/be ( bom -- le/be )
|
||||||
|
dup bom-le sequence= [ drop utf32le ] [
|
||||||
|
bom-be sequence= [ utf32be ] [ missing-bom ] if
|
||||||
|
] if ;
|
||||||
|
|
||||||
|
M: utf32 <decoder> ( stream utf32 -- decoder )
|
||||||
|
drop 4 over stream-read bom>le/be <decoder> ;
|
||||||
|
|
||||||
|
M: utf32 <encoder> ( stream utf32 -- encoder )
|
||||||
|
drop bom-le over stream-write utf32le <encoder> ;
|
Loading…
Reference in New Issue