UTF-32 encoding/decoding
parent
632c17ea35
commit
ec22af4dbd
|
@ -1,3 +1,5 @@
|
|||
! Copyright (C) 2008 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: help.markup help.syntax io.encodings strings ;
|
||||
IN: io.encodings.utf16
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: kernel tools.test io.encodings.utf16 arrays sbufs
|
||||
io.streams.byte-array sequences io.encodings io
|
||||
bootstrap.unicode
|
||||
io.encodings.string alien.c-types alien.strings accessors classes ;
|
||||
IN: io.encodings.utf16.tests
|
||||
|
||||
|
@ -15,7 +16,6 @@ IN: io.encodings.utf16.tests
|
|||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode >array ] unit-test
|
||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
|
||||
|
||||
[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode >array ] unit-test
|
||||
|
||||
|
|
|
@ -105,10 +105,6 @@ M: utf16le encode-char ( char stream encoding -- )
|
|||
|
||||
: bom-be B{ HEX: fe HEX: ff } ; inline
|
||||
|
||||
: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
|
||||
|
||||
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
|
||||
|
||||
: bom>le/be ( bom -- le/be )
|
||||
dup bom-le sequence= [ drop utf16le ] [
|
||||
bom-be sequence= [ utf16be ] [ missing-bom ] if
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
Daniel Ehrenberg
|
|
@ -0,0 +1 @@
|
|||
UTF32 encoding/decoding
|
|
@ -0,0 +1 @@
|
|||
text
|
|
@ -0,0 +1,27 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: help.markup help.syntax io.encodings strings ;
|
||||
IN: io.encodings.utf32
|
||||
|
||||
ARTICLE: "io.encodings.utf32" "UTF-32 encoding"
|
||||
"The UTF-32 encoding is a fixed-width encoding. Unicode code points are encoded as 4 byte sequences. There are three encoding descriptor classes for working with UTF-32, depending on endianness or the presence of a BOM:"
|
||||
{ $subsection utf32 }
|
||||
{ $subsection utf32le }
|
||||
{ $subsection utf32be } ;
|
||||
|
||||
ABOUT: "io.encodings.utf32"
|
||||
|
||||
HELP: utf32le
|
||||
{ $class-description "The encoding descriptor for UTF-32LE, that is, UTF-32 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." }
|
||||
{ $see-also "encodings-introduction" } ;
|
||||
|
||||
HELP: utf32be
|
||||
{ $class-description "The encoding descriptor for UTF-32BE, that is, UTF-32 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." }
|
||||
{ $see-also "encodings-introduction" } ;
|
||||
|
||||
HELP: utf32
|
||||
{ $class-description "The encoding descriptor for UTF-32, that is, UTF-32 with a byte order mark. This is the most useful for general input and output in UTF-32. Streams can be made which read or write wth this encoding." }
|
||||
{ $see-also "encodings-introduction" } ;
|
||||
|
||||
{ utf32 utf32le utf32be } related-words
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: kernel tools.test io.encodings.utf32 arrays sbufs
|
||||
io.streams.byte-array sequences io.encodings io
|
||||
io.encodings.string alien.c-types alien.strings accessors classes ;
|
||||
IN: io.encodings.utf32.tests
|
||||
|
||||
[ { CHAR: x } ] [ { 0 0 0 CHAR: x } utf32be decode >array ] unit-test
|
||||
[ { HEX: 1D11E } ] [ { 0 1 HEX: D1 HEX: 1E } utf32be decode >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { 0 1 HEX: D1 } utf32be decode >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { 0 1 } utf32be decode >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { 0 } utf32be decode >array ] unit-test
|
||||
[ { } ] [ { } utf32be decode >array ] unit-test
|
||||
|
||||
[ { 0 0 0 CHAR: x 0 1 HEX: D1 HEX: 1E } ] [ { CHAR: x HEX: 1d11e } utf32be encode >array ] unit-test
|
||||
|
||||
[ { CHAR: x } ] [ { CHAR: x 0 0 0 } utf32le decode >array ] unit-test
|
||||
[ { HEX: 1d11e } ] [ { HEX: 1e HEX: d1 1 0 } utf32le decode >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { HEX: 1e HEX: d1 1 } utf32le decode >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { HEX: 1e HEX: d1 } utf32le decode >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { HEX: 1e } utf32le decode >array ] unit-test
|
||||
[ { } ] [ { } utf32le decode >array ] unit-test
|
||||
|
||||
[ { 120 0 0 0 HEX: 1e HEX: d1 1 0 } ] [ { CHAR: x HEX: 1d11e } utf32le encode >array ] unit-test
|
||||
|
||||
[ { CHAR: x } ] [ { HEX: ff HEX: fe 0 0 CHAR: x 0 0 0 } utf32 decode >array ] unit-test
|
||||
[ { CHAR: x } ] [ { 0 0 HEX: fe HEX: ff 0 0 0 CHAR: x } utf32 decode >array ] unit-test
|
||||
|
||||
[ { HEX: ff HEX: fe 0 0 120 0 0 0 HEX: 1e HEX: d1 1 0 } ] [ { CHAR: x HEX: 1d11e } utf32 encode >array ] unit-test
|
||||
|
|
@ -0,0 +1,87 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: math kernel io.encodings combinators io io.encodings.utf16
|
||||
generalizations sequences ;
|
||||
IN: io.encodings.utf32
|
||||
|
||||
SINGLETON: utf32be
|
||||
|
||||
SINGLETON: utf32le
|
||||
|
||||
SINGLETON: utf32
|
||||
|
||||
<PRIVATE
|
||||
|
||||
: 4spin ( a b c d -- b c d a )
|
||||
4 nrev ; inline
|
||||
|
||||
! Decoding
|
||||
|
||||
: stream-read4 ( stream -- a b c d )
|
||||
{
|
||||
[ stream-read1 ]
|
||||
[ stream-read1 ]
|
||||
[ stream-read1 ]
|
||||
[ stream-read1 ]
|
||||
} cleave ;
|
||||
|
||||
: with-replacement ( _ _ _ ch quot -- new-ch )
|
||||
[ 3drop replacement-char ] if* ; inline
|
||||
|
||||
: >char ( d c b a -- abcd )
|
||||
[
|
||||
24 shift -roll [
|
||||
16 shift -rot [
|
||||
8 shift swap [
|
||||
bitor bitor bitor
|
||||
] with-replacement
|
||||
] with-replacement
|
||||
] with-replacement
|
||||
] with-replacement ;
|
||||
|
||||
M: utf32be decode-char
|
||||
drop stream-read4 4spin
|
||||
[ >char ] [ 3drop f ] if* ;
|
||||
|
||||
M: utf32le decode-char
|
||||
drop stream-read4 4 npick
|
||||
[ >char ] [ 2drop 2drop f ] if ;
|
||||
|
||||
! Encoding
|
||||
|
||||
: split-off ( ab -- b a )
|
||||
[ HEX: FF bitand ] [ -8 shift ] bi ;
|
||||
|
||||
: char> ( abcd -- d b c a )
|
||||
split-off split-off split-off ;
|
||||
|
||||
: stream-write4 ( d c b a stream -- )
|
||||
{
|
||||
[ stream-write1 ]
|
||||
[ stream-write1 ]
|
||||
[ stream-write1 ]
|
||||
[ stream-write1 ]
|
||||
} cleave ;
|
||||
|
||||
M: utf32be encode-char
|
||||
drop [ char> ] dip stream-write4 ;
|
||||
|
||||
M: utf32le encode-char
|
||||
drop [ char> 4spin ] dip stream-write4 ;
|
||||
|
||||
! UTF-32
|
||||
|
||||
: bom-le B{ HEX: ff HEX: fe 0 0 } ; inline
|
||||
|
||||
: bom-be B{ 0 0 HEX: fe HEX: ff } ; inline
|
||||
|
||||
: bom>le/be ( bom -- le/be )
|
||||
dup bom-le sequence= [ drop utf32le ] [
|
||||
bom-be sequence= [ utf32be ] [ missing-bom ] if
|
||||
] if ;
|
||||
|
||||
M: utf32 <decoder> ( stream utf32 -- decoder )
|
||||
drop 4 over stream-read bom>le/be <decoder> ;
|
||||
|
||||
M: utf32 <encoder> ( stream utf32 -- encoder )
|
||||
drop bom-le over stream-write utf32le <encoder> ;
|
Loading…
Reference in New Issue