diff --git a/basis/io/encodings/utf16/utf16-docs.factor b/basis/io/encodings/utf16/utf16-docs.factor index dc499b5ed4..9622200a68 100644 --- a/basis/io/encodings/utf16/utf16-docs.factor +++ b/basis/io/encodings/utf16/utf16-docs.factor @@ -1,3 +1,5 @@ +! Copyright (C) 2008 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. USING: help.markup help.syntax io.encodings strings ; IN: io.encodings.utf16 diff --git a/basis/io/encodings/utf16/utf16-tests.factor b/basis/io/encodings/utf16/utf16-tests.factor index bde92a260d..230612cc77 100644 --- a/basis/io/encodings/utf16/utf16-tests.factor +++ b/basis/io/encodings/utf16/utf16-tests.factor @@ -1,6 +1,7 @@ +! Copyright (C) 2008 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. USING: kernel tools.test io.encodings.utf16 arrays sbufs io.streams.byte-array sequences io.encodings io -bootstrap.unicode io.encodings.string alien.c-types alien.strings accessors classes ; IN: io.encodings.utf16.tests @@ -15,7 +16,6 @@ IN: io.encodings.utf16.tests [ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test [ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode >array ] unit-test [ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode >array ] unit-test -[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test [ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode >array ] unit-test diff --git a/basis/io/encodings/utf16/utf16.factor b/basis/io/encodings/utf16/utf16.factor index 167d7534d1..d8c553684e 100644 --- a/basis/io/encodings/utf16/utf16.factor +++ b/basis/io/encodings/utf16/utf16.factor @@ -105,10 +105,6 @@ M: utf16le encode-char ( char stream encoding -- ) : bom-be B{ HEX: fe HEX: ff } ; inline -: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ; - -: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ; - : bom>le/be ( bom -- le/be ) dup bom-le sequence= [ drop utf16le ] [ bom-be sequence= [ utf16be ] [ missing-bom ] if diff --git a/basis/io/encodings/utf32/authors.txt b/basis/io/encodings/utf32/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/basis/io/encodings/utf32/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/basis/io/encodings/utf32/summary.txt b/basis/io/encodings/utf32/summary.txt new file mode 100644 index 0000000000..1266c8c520 --- /dev/null +++ b/basis/io/encodings/utf32/summary.txt @@ -0,0 +1 @@ +UTF32 encoding/decoding diff --git a/basis/io/encodings/utf32/tags.txt b/basis/io/encodings/utf32/tags.txt new file mode 100644 index 0000000000..8e27be7d61 --- /dev/null +++ b/basis/io/encodings/utf32/tags.txt @@ -0,0 +1 @@ +text diff --git a/basis/io/encodings/utf32/utf32-docs.factor b/basis/io/encodings/utf32/utf32-docs.factor new file mode 100644 index 0000000000..8cdb2ef1e5 --- /dev/null +++ b/basis/io/encodings/utf32/utf32-docs.factor @@ -0,0 +1,27 @@ +! Copyright (C) 2009 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: help.markup help.syntax io.encodings strings ; +IN: io.encodings.utf32 + +ARTICLE: "io.encodings.utf32" "UTF-32 encoding" +"The UTF-32 encoding is a fixed-width encoding. Unicode code points are encoded as 4 byte sequences. There are three encoding descriptor classes for working with UTF-32, depending on endianness or the presence of a BOM:" +{ $subsection utf32 } +{ $subsection utf32le } +{ $subsection utf32be } ; + +ABOUT: "io.encodings.utf32" + +HELP: utf32le +{ $class-description "The encoding descriptor for UTF-32LE, that is, UTF-32 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." } +{ $see-also "encodings-introduction" } ; + +HELP: utf32be +{ $class-description "The encoding descriptor for UTF-32BE, that is, UTF-32 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." } +{ $see-also "encodings-introduction" } ; + +HELP: utf32 +{ $class-description "The encoding descriptor for UTF-32, that is, UTF-32 with a byte order mark. This is the most useful for general input and output in UTF-32. Streams can be made which read or write wth this encoding." } +{ $see-also "encodings-introduction" } ; + +{ utf32 utf32le utf32be } related-words + diff --git a/basis/io/encodings/utf32/utf32-tests.factor b/basis/io/encodings/utf32/utf32-tests.factor new file mode 100644 index 0000000000..be1111e242 --- /dev/null +++ b/basis/io/encodings/utf32/utf32-tests.factor @@ -0,0 +1,30 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: kernel tools.test io.encodings.utf32 arrays sbufs +io.streams.byte-array sequences io.encodings io +io.encodings.string alien.c-types alien.strings accessors classes ; +IN: io.encodings.utf32.tests + +[ { CHAR: x } ] [ { 0 0 0 CHAR: x } utf32be decode >array ] unit-test +[ { HEX: 1D11E } ] [ { 0 1 HEX: D1 HEX: 1E } utf32be decode >array ] unit-test +[ { CHAR: replacement-character } ] [ { 0 1 HEX: D1 } utf32be decode >array ] unit-test +[ { CHAR: replacement-character } ] [ { 0 1 } utf32be decode >array ] unit-test +[ { CHAR: replacement-character } ] [ { 0 } utf32be decode >array ] unit-test +[ { } ] [ { } utf32be decode >array ] unit-test + +[ { 0 0 0 CHAR: x 0 1 HEX: D1 HEX: 1E } ] [ { CHAR: x HEX: 1d11e } utf32be encode >array ] unit-test + +[ { CHAR: x } ] [ { CHAR: x 0 0 0 } utf32le decode >array ] unit-test +[ { HEX: 1d11e } ] [ { HEX: 1e HEX: d1 1 0 } utf32le decode >array ] unit-test +[ { CHAR: replacement-character } ] [ { HEX: 1e HEX: d1 1 } utf32le decode >array ] unit-test +[ { CHAR: replacement-character } ] [ { HEX: 1e HEX: d1 } utf32le decode >array ] unit-test +[ { CHAR: replacement-character } ] [ { HEX: 1e } utf32le decode >array ] unit-test +[ { } ] [ { } utf32le decode >array ] unit-test + +[ { 120 0 0 0 HEX: 1e HEX: d1 1 0 } ] [ { CHAR: x HEX: 1d11e } utf32le encode >array ] unit-test + +[ { CHAR: x } ] [ { HEX: ff HEX: fe 0 0 CHAR: x 0 0 0 } utf32 decode >array ] unit-test +[ { CHAR: x } ] [ { 0 0 HEX: fe HEX: ff 0 0 0 CHAR: x } utf32 decode >array ] unit-test + +[ { HEX: ff HEX: fe 0 0 120 0 0 0 HEX: 1e HEX: d1 1 0 } ] [ { CHAR: x HEX: 1d11e } utf32 encode >array ] unit-test + diff --git a/basis/io/encodings/utf32/utf32.factor b/basis/io/encodings/utf32/utf32.factor new file mode 100644 index 0000000000..68fb6cd2f6 --- /dev/null +++ b/basis/io/encodings/utf32/utf32.factor @@ -0,0 +1,87 @@ +! Copyright (C) 2009 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: math kernel io.encodings combinators io io.encodings.utf16 +generalizations sequences ; +IN: io.encodings.utf32 + +SINGLETON: utf32be + +SINGLETON: utf32le + +SINGLETON: utf32 + +char ( d c b a -- abcd ) + [ + 24 shift -roll [ + 16 shift -rot [ + 8 shift swap [ + bitor bitor bitor + ] with-replacement + ] with-replacement + ] with-replacement + ] with-replacement ; + +M: utf32be decode-char + drop stream-read4 4spin + [ >char ] [ 3drop f ] if* ; + +M: utf32le decode-char + drop stream-read4 4 npick + [ >char ] [ 2drop 2drop f ] if ; + +! Encoding + +: split-off ( ab -- b a ) + [ HEX: FF bitand ] [ -8 shift ] bi ; + +: char> ( abcd -- d b c a ) + split-off split-off split-off ; + +: stream-write4 ( d c b a stream -- ) + { + [ stream-write1 ] + [ stream-write1 ] + [ stream-write1 ] + [ stream-write1 ] + } cleave ; + +M: utf32be encode-char + drop [ char> ] dip stream-write4 ; + +M: utf32le encode-char + drop [ char> 4spin ] dip stream-write4 ; + +! UTF-32 + +: bom-le B{ HEX: ff HEX: fe 0 0 } ; inline + +: bom-be B{ 0 0 HEX: fe HEX: ff } ; inline + +: bom>le/be ( bom -- le/be ) + dup bom-le sequence= [ drop utf32le ] [ + bom-be sequence= [ utf32be ] [ missing-bom ] if + ] if ; + +M: utf32 ( stream utf32 -- decoder ) + drop 4 over stream-read bom>le/be ; + +M: utf32 ( stream utf32 -- encoder ) + drop bom-le over stream-write utf32le ;