UTF-32 encoding/decoding

2009-02-01 15:36:07 -06:00 · 2009-02-01 15:36:07 -06:00 · ec22af4dbd
parent 632c17ea35
commit ec22af4dbd
9 changed files with 151 additions and 6 deletions
--- a/basis/io/encodings/utf16/utf16-docs.factor
+++ b/basis/io/encodings/utf16/utf16-docs.factor
@ -1,3 +1,5 @@
+! Copyright (C) 2008 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
 USING: help.markup help.syntax io.encodings strings ;
 IN: io.encodings.utf16

--- a/basis/io/encodings/utf16/utf16-tests.factor
+++ b/basis/io/encodings/utf16/utf16-tests.factor
@ -1,6 +1,7 @@
+! Copyright (C) 2008 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
 USING: kernel tools.test io.encodings.utf16 arrays sbufs
 io.streams.byte-array sequences io.encodings io
-bootstrap.unicode
 io.encodings.string alien.c-types alien.strings accessors classes ;
 IN: io.encodings.utf16.tests

@ -15,7 +16,6 @@ IN: io.encodings.utf16.tests
 [ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
 [ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode >array ] unit-test
 [ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode >array ] unit-test
-[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test

 [ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode >array ] unit-test

--- a/basis/io/encodings/utf16/utf16.factor
+++ b/basis/io/encodings/utf16/utf16.factor
@ -105,10 +105,6 @@ M: utf16le encode-char ( char stream encoding -- )

 : bom-be B{ HEX: fe HEX: ff } ; inline

-: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
-
-: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
-
 : bom>le/be ( bom -- le/be )
    dup bom-le sequence= [ drop utf16le ] [
        bom-be sequence= [ utf16be ] [ missing-bom ] if
--- a/basis/io/encodings/utf32/authors.txt
+++ b/basis/io/encodings/utf32/authors.txt
@ -0,0 +1 @@
+Daniel Ehrenberg
--- a/basis/io/encodings/utf32/summary.txt
+++ b/basis/io/encodings/utf32/summary.txt
@ -0,0 +1 @@
+UTF32 encoding/decoding
--- a/basis/io/encodings/utf32/tags.txt
+++ b/basis/io/encodings/utf32/tags.txt
@ -0,0 +1 @@
+text
--- a/basis/io/encodings/utf32/utf32-docs.factor
+++ b/basis/io/encodings/utf32/utf32-docs.factor
@ -0,0 +1,27 @@
+! Copyright (C) 2009 Daniel Ehrenberg
+! See http://factorcode.org/license.txt for BSD license.
+USING: help.markup help.syntax io.encodings strings ;
+IN: io.encodings.utf32
+
+ARTICLE: "io.encodings.utf32" "UTF-32 encoding"
+"The UTF-32 encoding is a fixed-width encoding. Unicode code points are encoded as 4 byte sequences. There are three encoding descriptor classes for working with UTF-32, depending on endianness or the presence of a BOM:"
+{ $subsection utf32 }
+{ $subsection utf32le }
+{ $subsection utf32be } ;
+
+ABOUT: "io.encodings.utf32"
+
+HELP: utf32le
+{ $class-description "The encoding descriptor for UTF-32LE, that is, UTF-32 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." }
+{ $see-also "encodings-introduction" } ;
+
+HELP: utf32be
+{ $class-description "The encoding descriptor for UTF-32BE, that is, UTF-32 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." }
+{ $see-also "encodings-introduction" } ;
+
+HELP: utf32
+{ $class-description "The encoding descriptor for UTF-32, that is, UTF-32 with a byte order mark. This is the most useful for general input and output in UTF-32. Streams can be made which read or write wth this encoding." }
+{ $see-also "encodings-introduction" } ;
+
+{ utf32 utf32le utf32be } related-words
+
--- a/basis/io/encodings/utf32/utf32-tests.factor
+++ b/basis/io/encodings/utf32/utf32-tests.factor
@ -0,0 +1,30 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: kernel tools.test io.encodings.utf32 arrays sbufs
+io.streams.byte-array sequences io.encodings io
+io.encodings.string alien.c-types alien.strings accessors classes ;
+IN: io.encodings.utf32.tests
+
+[ { CHAR: x } ] [ { 0 0 0 CHAR: x } utf32be decode >array ] unit-test
+[ { HEX: 1D11E } ] [ { 0 1 HEX: D1 HEX: 1E } utf32be decode >array ] unit-test
+[ { CHAR: replacement-character } ] [ { 0 1 HEX: D1 } utf32be decode >array ] unit-test
+[ { CHAR: replacement-character } ] [ { 0 1 } utf32be decode >array ] unit-test
+[ { CHAR: replacement-character } ] [ { 0 } utf32be decode >array ] unit-test
+[ { } ] [ { } utf32be decode >array ] unit-test
+
+[ { 0 0 0 CHAR: x 0 1 HEX: D1 HEX: 1E } ] [ { CHAR: x HEX: 1d11e } utf32be encode >array ] unit-test
+
+[ { CHAR: x } ] [ { CHAR: x 0 0 0 } utf32le decode >array ] unit-test
+[ { HEX: 1d11e } ] [ { HEX: 1e HEX: d1 1 0 } utf32le decode >array ] unit-test
+[ { CHAR: replacement-character } ] [ { HEX: 1e HEX: d1 1 } utf32le decode >array ] unit-test
+[ { CHAR: replacement-character } ] [ { HEX: 1e HEX: d1 } utf32le decode >array ] unit-test
+[ { CHAR: replacement-character } ] [ { HEX: 1e } utf32le decode >array ] unit-test
+[ { } ] [ { } utf32le decode >array ] unit-test
+
+[ { 120 0 0 0 HEX: 1e HEX: d1 1 0 } ] [ { CHAR: x HEX: 1d11e } utf32le encode >array ] unit-test
+
+[ { CHAR: x } ] [ { HEX: ff HEX: fe 0 0 CHAR: x 0 0 0 } utf32 decode >array ] unit-test
+[ { CHAR: x } ] [ { 0 0 HEX: fe HEX: ff 0 0 0 CHAR: x } utf32 decode >array ] unit-test
+
+[ { HEX: ff HEX: fe 0 0 120 0 0 0 HEX: 1e HEX: d1 1 0 } ] [ { CHAR: x HEX: 1d11e } utf32 encode >array ] unit-test
+
--- a/basis/io/encodings/utf32/utf32.factor
+++ b/basis/io/encodings/utf32/utf32.factor
@ -0,0 +1,87 @@
+! Copyright (C) 2009 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: math kernel io.encodings combinators io io.encodings.utf16
+generalizations sequences ;
+IN: io.encodings.utf32
+
+SINGLETON: utf32be
+
+SINGLETON: utf32le
+
+SINGLETON: utf32
+
+<PRIVATE
+
+: 4spin ( a b c d -- b c d a )
+    4 nrev ; inline
+
+! Decoding
+
+: stream-read4 ( stream -- a b c d )
+    {
+        [ stream-read1 ]
+        [ stream-read1 ]
+        [ stream-read1 ]
+        [ stream-read1 ]
+    } cleave ;
+
+: with-replacement ( _ _ _ ch quot -- new-ch )
+    [ 3drop replacement-char ] if* ; inline
+
+: >char ( d c b a -- abcd )
+    [
+        24 shift -roll [
+            16 shift -rot [
+                8 shift swap [
+                    bitor bitor bitor
+                ] with-replacement
+            ] with-replacement
+        ] with-replacement
+    ] with-replacement ;
+
+M: utf32be decode-char
+    drop stream-read4 4spin
+    [ >char ] [ 3drop f ] if* ;
+
+M: utf32le decode-char
+    drop stream-read4 4 npick
+    [ >char ] [ 2drop 2drop f ] if ;
+
+! Encoding
+
+: split-off ( ab -- b a )
+    [ HEX: FF bitand ] [ -8 shift ] bi ;
+
+: char> ( abcd -- d b c a )
+    split-off split-off split-off ;
+
+: stream-write4 ( d c b a stream -- )
+    {
+        [ stream-write1 ]
+        [ stream-write1 ]
+        [ stream-write1 ]
+        [ stream-write1 ]
+    } cleave ;
+
+M: utf32be encode-char
+    drop [ char> ] dip stream-write4 ;
+
+M: utf32le encode-char
+    drop [ char> 4spin ] dip stream-write4 ;
+
+! UTF-32
+
+: bom-le B{ HEX: ff HEX: fe 0 0 } ; inline
+
+: bom-be B{ 0 0 HEX: fe HEX: ff } ; inline
+
+: bom>le/be ( bom -- le/be )
+    dup bom-le sequence= [ drop utf32le ] [
+        bom-be sequence= [ utf32be ] [ missing-bom ] if
+    ] if ;
+
+M: utf32 <decoder> ( stream utf32 -- decoder )
+    drop 4 over stream-read bom>le/be <decoder> ;
+
+M: utf32 <encoder> ( stream utf32 -- encoder )
+    drop bom-le over stream-write utf32le <encoder> ;