UTF-32 encoding/decoding

db4
Daniel Ehrenberg 2009-02-01 15:36:07 -06:00
parent 632c17ea35
commit ec22af4dbd
9 changed files with 151 additions and 6 deletions

View File

@ -1,3 +1,5 @@
! Copyright (C) 2008 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: help.markup help.syntax io.encodings strings ;
IN: io.encodings.utf16

View File

@ -1,6 +1,7 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel tools.test io.encodings.utf16 arrays sbufs
io.streams.byte-array sequences io.encodings io
bootstrap.unicode
io.encodings.string alien.c-types alien.strings accessors classes ;
IN: io.encodings.utf16.tests
@ -15,7 +16,6 @@ IN: io.encodings.utf16.tests
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode >array ] unit-test
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode >array ] unit-test

View File

@ -105,10 +105,6 @@ M: utf16le encode-char ( char stream encoding -- )
: bom-be B{ HEX: fe HEX: ff } ; inline
: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
: bom>le/be ( bom -- le/be )
dup bom-le sequence= [ drop utf16le ] [
bom-be sequence= [ utf16be ] [ missing-bom ] if

View File

@ -0,0 +1 @@
Daniel Ehrenberg

View File

@ -0,0 +1 @@
UTF32 encoding/decoding

View File

@ -0,0 +1 @@
text

View File

@ -0,0 +1,27 @@
! Copyright (C) 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: help.markup help.syntax io.encodings strings ;
IN: io.encodings.utf32
ARTICLE: "io.encodings.utf32" "UTF-32 encoding"
"The UTF-32 encoding is a fixed-width encoding. Unicode code points are encoded as 4 byte sequences. There are three encoding descriptor classes for working with UTF-32, depending on endianness or the presence of a BOM:"
{ $subsection utf32 }
{ $subsection utf32le }
{ $subsection utf32be } ;
ABOUT: "io.encodings.utf32"
HELP: utf32le
{ $class-description "The encoding descriptor for UTF-32LE, that is, UTF-32 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." }
{ $see-also "encodings-introduction" } ;
HELP: utf32be
{ $class-description "The encoding descriptor for UTF-32BE, that is, UTF-32 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." }
{ $see-also "encodings-introduction" } ;
HELP: utf32
{ $class-description "The encoding descriptor for UTF-32, that is, UTF-32 with a byte order mark. This is the most useful for general input and output in UTF-32. Streams can be made which read or write wth this encoding." }
{ $see-also "encodings-introduction" } ;
{ utf32 utf32le utf32be } related-words

View File

@ -0,0 +1,30 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: kernel tools.test io.encodings.utf32 arrays sbufs
io.streams.byte-array sequences io.encodings io
io.encodings.string alien.c-types alien.strings accessors classes ;
IN: io.encodings.utf32.tests
[ { CHAR: x } ] [ { 0 0 0 CHAR: x } utf32be decode >array ] unit-test
[ { HEX: 1D11E } ] [ { 0 1 HEX: D1 HEX: 1E } utf32be decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { 0 1 HEX: D1 } utf32be decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { 0 1 } utf32be decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { 0 } utf32be decode >array ] unit-test
[ { } ] [ { } utf32be decode >array ] unit-test
[ { 0 0 0 CHAR: x 0 1 HEX: D1 HEX: 1E } ] [ { CHAR: x HEX: 1d11e } utf32be encode >array ] unit-test
[ { CHAR: x } ] [ { CHAR: x 0 0 0 } utf32le decode >array ] unit-test
[ { HEX: 1d11e } ] [ { HEX: 1e HEX: d1 1 0 } utf32le decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { HEX: 1e HEX: d1 1 } utf32le decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { HEX: 1e HEX: d1 } utf32le decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { HEX: 1e } utf32le decode >array ] unit-test
[ { } ] [ { } utf32le decode >array ] unit-test
[ { 120 0 0 0 HEX: 1e HEX: d1 1 0 } ] [ { CHAR: x HEX: 1d11e } utf32le encode >array ] unit-test
[ { CHAR: x } ] [ { HEX: ff HEX: fe 0 0 CHAR: x 0 0 0 } utf32 decode >array ] unit-test
[ { CHAR: x } ] [ { 0 0 HEX: fe HEX: ff 0 0 0 CHAR: x } utf32 decode >array ] unit-test
[ { HEX: ff HEX: fe 0 0 120 0 0 0 HEX: 1e HEX: d1 1 0 } ] [ { CHAR: x HEX: 1d11e } utf32 encode >array ] unit-test

View File

@ -0,0 +1,87 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel io.encodings combinators io io.encodings.utf16
generalizations sequences ;
IN: io.encodings.utf32
SINGLETON: utf32be
SINGLETON: utf32le
SINGLETON: utf32
<PRIVATE
: 4spin ( a b c d -- b c d a )
4 nrev ; inline
! Decoding
: stream-read4 ( stream -- a b c d )
{
[ stream-read1 ]
[ stream-read1 ]
[ stream-read1 ]
[ stream-read1 ]
} cleave ;
: with-replacement ( _ _ _ ch quot -- new-ch )
[ 3drop replacement-char ] if* ; inline
: >char ( d c b a -- abcd )
[
24 shift -roll [
16 shift -rot [
8 shift swap [
bitor bitor bitor
] with-replacement
] with-replacement
] with-replacement
] with-replacement ;
M: utf32be decode-char
drop stream-read4 4spin
[ >char ] [ 3drop f ] if* ;
M: utf32le decode-char
drop stream-read4 4 npick
[ >char ] [ 2drop 2drop f ] if ;
! Encoding
: split-off ( ab -- b a )
[ HEX: FF bitand ] [ -8 shift ] bi ;
: char> ( abcd -- d b c a )
split-off split-off split-off ;
: stream-write4 ( d c b a stream -- )
{
[ stream-write1 ]
[ stream-write1 ]
[ stream-write1 ]
[ stream-write1 ]
} cleave ;
M: utf32be encode-char
drop [ char> ] dip stream-write4 ;
M: utf32le encode-char
drop [ char> 4spin ] dip stream-write4 ;
! UTF-32
: bom-le B{ HEX: ff HEX: fe 0 0 } ; inline
: bom-be B{ 0 0 HEX: fe HEX: ff } ; inline
: bom>le/be ( bom -- le/be )
dup bom-le sequence= [ drop utf32le ] [
bom-be sequence= [ utf32be ] [ missing-bom ] if
] if ;
M: utf32 <decoder> ( stream utf32 -- decoder )
drop 4 over stream-read bom>le/be <decoder> ;
M: utf32 <encoder> ( stream utf32 -- encoder )
drop bom-le over stream-write utf32le <encoder> ;