io/utf8 and io/utf16 were moved to core/io/encodings

db4
Daniel Ehrenberg 2008-02-11 17:44:14 -06:00
parent 99ff43b404
commit 8bbc144ce7
10 changed files with 0 additions and 286 deletions

View File

@ -1 +0,0 @@
Daniel Ehrenberg

View File

@ -1 +0,0 @@
UTF16 encoding/decoding

View File

@ -1,45 +0,0 @@
USING: help.markup help.syntax io.encodings strings ;
IN: io.utf16
! Main help article for the io.utf16 vocabulary. It lists the
! endian-specific words first, then the pair that handles a byte order mark.
ARTICLE: "io.utf16" "Working with UTF16-encoded data"
"The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences."
{ $subsection encode-utf16le }
{ $subsection encode-utf16be }
{ $subsection decode-utf16le }
{ $subsection decode-utf16be }
"Support for UTF16 data with a byte order mark:"
{ $subsection encode-utf16 }
{ $subsection decode-utf16 } ;
! Names the article above as the documentation entry point of the vocabulary.
ABOUT: "io.utf16"
! Help for the three UTF16 decoders. All take a byte sequence and
! produce a string.
! NOTE(review): the io.utf16 decoders push a replacement character for bad
! surrogate sequences instead of throwing; confirm which malformed inputs
! actually raise decode-error before relying on the $errors text below.
HELP: decode-utf16
{ $values { "seq" "a sequence of bytes" } { "str" string } }
{ $description "Decodes a sequence of bytes representing a Unicode string in UTF16 format. The bytes must begin with a UTF16 byte order mark, which determines if the input is in little or big endian. To decode data without a byte order mark, use " { $link decode-utf16le } " or " { $link decode-utf16be } "." }
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
! Big endian variant; input must not carry a byte order mark.
HELP: decode-utf16be
{ $values { "seq" "a sequence of bytes" } { "str" string } }
{ $description "Decodes a sequence of bytes representing a Unicode string in big endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
! Little endian variant; input must not carry a byte order mark.
HELP: decode-utf16le
{ $values { "seq" "a sequence of bytes" } { "str" string } }
{ $description "Decodes a sequence of bytes representing a Unicode string in little endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
! Cross-link the three decoders in each other's help pages.
{ decode-utf16 decode-utf16le decode-utf16be } related-words
! Help for the three UTF16 encoders. All take a string and produce a
! byte sequence.
HELP: encode-utf16be
{ $values { "str" string } { "seq" "a sequence of bytes" } }
{ $description "Encodes a Unicode string as a sequence of bytes in big endian UTF16 format." } ;
HELP: encode-utf16le
{ $values { "str" string } { "seq" "a sequence of bytes" } }
{ $description "Encodes a Unicode string as a sequence of bytes in little endian UTF16 format." } ;
! The byte-order-mark variant; the implementation in io.utf16 emits the
! little endian mark followed by little endian data.
HELP: encode-utf16
{ $values { "str" string } { "seq" "a sequence of bytes" } }
{ $description "Encodes a Unicode string as a sequence of bytes in UTF16 format with a byte order mark." } ;
! Cross-link the three encoders in each other's help pages.
{ encode-utf16 encode-utf16be encode-utf16le } related-words

View File

@ -1,15 +0,0 @@
USING: tools.test io.utf16 arrays unicode.syntax ;
! --- big endian ---
! A single 16-bit unit (high byte first) decodes to one character.
[ { CHAR: x } ] [ { 0 CHAR: x } decode-utf16be >array ] unit-test
! Surrogate pair D834 DD1E decodes to U+1D11E.
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test
! A unit whose first byte has the low-surrogate prefix (110111xx) cannot
! start a character; one replacement character is produced.
[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test
! A high surrogate followed by another high surrogate yields a single
! replacement character for all four bytes.
[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test
! Encoding: one BMP character (2 bytes) and one supplementary character (4 bytes).
[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test
! --- little endian (same cases with the bytes of each unit swapped) ---
[ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test
! 119070 is U+1D11E.
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test
! Lone low surrogate (unit DF00).
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test
! High surrogate (unit DB00) followed by a unit that is not a low surrogate.
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test
[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test

View File

@ -1,116 +0,0 @@
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors namespaces io.binary
io.encodings combinators splitting ;
IN: io.utf16
! Decoder states used by the UTF16 state machines below, in addition to
! the `begin` state that the dispatch words also match on.
! double: one byte of a 16-bit unit has been read; the next byte completes it.
SYMBOL: double
! quad1: (big endian) the lead byte of a high surrogate has been read.
SYMBOL: quad1
! quad2: the high surrogate is complete; the next byte belongs to the
! low surrogate.
SYMBOL: quad2
! quad3: one byte of the low surrogate is still outstanding.
SYMBOL: quad3
! ignore: (big endian) the next byte is discarded and a replacement
! character is pushed in its place.
SYMBOL: ignore
! Leave a zero placeholder character and the `ignore` state on the stack.
: do-ignore ( -- ch state )
    0 ignore ;
! Combine two bytes into one number: `ch` becomes the high-order part
! (shifted left by 8 bits) and `byte` the low-order 8 bits.
: append-nums ( byte ch -- ch )
    8 shift
    bitor ;
! Finish a 16-bit unit: merge the new byte below the accumulated value
! (accumulated value shifted up by 8 bits) and hand the result to `decoded`.
: end-multibyte ( buf byte ch -- buf ch state )
    8 shift bitor
    decoded ;
! Classify the first (high) byte of a big endian 16-bit unit.
!   110110xx -> lead byte of a high surrogate: keep the low two bits, go to quad1
!   110111xx -> low surrogate where none is allowed: go to ignore
!   otherwise -> ordinary unit: the byte itself becomes ch, go to double
: begin-utf16be ( buf byte -- buf ch state )
    dup -3 shift BIN: 11011 number= [
        dup 4 bitand 0 number= [
            3 bitand quad1
        ] [
            drop 0 ignore
        ] if
    ] [
        double
    ] if ;
! Third byte of a big endian surrogate pair. It must carry the
! low-surrogate prefix 110111xx; its low two bits are appended below the
! accumulated value. Any other byte moves the decoder to `ignore`.
: handle-quad2be ( byte ch -- ch state )
    over -2 shift HEX: 37 number= [
        2 shift swap 3 bitand bitor quad3
    ] [
        2drop 0 ignore
    ] if ;
! One step of the big endian decoder: consume `byte` according to the
! current `state` and return the next accumulated value and state.
: (decode-utf16be) ( buf byte ch state -- buf ch state )
    {
        { begin  [ drop begin-utf16be ] }
        { double [ append-nums decoded ] }
        { quad1  [ append-nums quad2 ] }
        { quad2  [ handle-quad2be ] }
        ! 65536 is HEX: 10000, the offset of the supplementary planes
        { quad3  [ append-nums 65536 + decoded ] }
        { ignore [ 2drop push-replacement ] }
    } case ;
! Decode big endian UTF16 bytes (no byte order mark) into a string.
! The quotation passed to `decode` moves the incoming byte beneath the
! ch/state pair before running one step of the state machine.
: decode-utf16be ( seq -- str )
    [
        -rot (decode-utf16be)
    ] decode ;
! Second (high) byte of a little endian 16-bit unit; `ch` holds the low byte.
!   110110xx -> high surrogate: keep its ten payload bits, go to quad2
!   110111xx -> low surrogate with no high surrogate before it: replacement
!   otherwise -> ordinary unit: high byte shifted up by 8, merged with low byte
: handle-double ( buf byte ch -- buf ch state )
    swap dup -3 shift HEX: 1B = [
        dup 4 bitand zero? [
            3 bitand 8 shift bitor quad2
        ] [
            2drop push-replacement
        ] if
    ] [
        8 shift bitor decoded
    ] if ;
! Final (high) byte of the low surrogate in little endian input. It must
! have the prefix 110111xx; its low two bits are merged in above the
! previously read low byte and the supplementary-plane offset is added.
! Any other byte produces a replacement character.
: handle-quad3le ( buf byte ch -- buf ch state )
    swap dup -2 shift HEX: 37 = [
        3 bitand 8 shift bitor
        65536 + decoded
    ] [
        2drop push-replacement
    ] if ;
! One step of the little endian decoder: consume `byte` according to the
! current `state` and return the next accumulated value and state.
!
! There is no `quad1` branch: no little endian path ever produces that
! state (begin -> double, handle-double -> quad2, quad2 -> quad3), so the
! branch that used to be here was unreachable.
: (decode-utf16le) ( buf byte ch state -- buf ch state )
    {
        ! low byte of a unit: it becomes ch
        { begin  [ drop double ] }
        { double [ handle-double ] }
        ! low byte of the low surrogate goes beneath the ten high-surrogate bits
        { quad2  [ 10 shift bitor quad3 ] }
        { quad3  [ handle-quad3le ] }
    } case ;
! Decode little endian UTF16 bytes (no byte order mark) into a string.
! The quotation passed to `decode` moves the incoming byte beneath the
! ch/state pair before running one step of the state machine.
: decode-utf16le ( seq -- str )
    [
        -rot (decode-utf16le)
    ] decode ;
! Split off the high surrogate of a supplementary character.
! `n` is the code point with HEX: 10000 already subtracted (that is how
! both callers in this file invoke it). `hi` is the lead byte 110110xx
! carrying bits 19-18 of n, `lo` carries bits 17-10.
: encode-first ( n -- hi lo )
    -10 shift
    dup -8 shift BIN: 11011000 bitor
    swap HEX: FF bitand ;
! Split off the low surrogate of a supplementary character.
! Only the low ten bits of `n` are used. `hi` is the lead byte 110111xx
! carrying bits 9-8, `lo` carries bits 7-0.
: encode-second ( n -- hi lo )
    BIN: 1111111111 bitand
    dup -8 shift BIN: 11011100 bitor
    swap BIN: 11111111 bitand ;
! Emit one character as big endian UTF16 into the sequence being built
! by the enclosing `make`. Code points up to HEX: FFFF become one unit;
! larger ones become a surrogate pair. `,` emits the top of the stack
! first, hence the swaps that bring each lead byte to the top.
: char>utf16be ( char -- )
    dup HEX: FFFF <= [
        h>b/b , ,
    ] [
        HEX: 10000 -
        dup encode-first swap , ,
        encode-second swap , ,
    ] if ;
! Encode a string as big endian UTF16 (no byte order mark), collecting
! the bytes into a byte array.
: encode-utf16be ( str -- seq )
    [
        [ char>utf16be ] each
    ] B{ } make ;
! Emit one character as little endian UTF16 into the sequence being built
! by the enclosing `make`. Code points up to HEX: FFFF become one unit;
! larger ones become a surrogate pair. `,` emits the top of the stack
! first, so the low byte of each unit goes out before its lead byte.
: char>utf16le ( char -- )
    dup HEX: FFFF <= [
        h>b/b swap , ,
    ] [
        HEX: 10000 -
        dup encode-first , ,
        encode-second , ,
    ] if ;
! Encode a string as little endian UTF16 (no byte order mark), collecting
! the bytes into a byte array.
: encode-utf16le ( str -- seq )
    [
        [ char>utf16le ] each
    ] B{ } make ;
! Byte order mark as it appears in little endian data: FF FE.
: bom-le ( -- bom ) B{ HEX: ff HEX: fe } ; inline
! Byte order mark as it appears in big endian data: FE FF.
: bom-be ( -- bom ) B{ HEX: fe HEX: ff } ; inline
! Encode a string as UTF16 with a byte order mark. The output is always
! little endian: the little endian mark followed by the little endian bytes.
: encode-utf16 ( str -- seq )
    bom-le swap
    encode-utf16le append ;
! Test for a leading little endian byte order mark using `?head`, which
! also yields the sequence to continue with.
: utf16le? ( seq1 -- seq2 ? )
    bom-le ?head ;
! Test for a leading big endian byte order mark using `?head`, which
! also yields the sequence to continue with.
: utf16be? ( seq1 -- seq2 ? )
    bom-be ?head ;
! Decode UTF16 bytes that start with a byte order mark. The mark selects
! the little or big endian decoder; input with neither mark goes to
! decode-error.
: decode-utf16 ( seq -- str )
    utf16le? [
        decode-utf16le
    ] [
        utf16be? [
            decode-utf16be
        ] [
            decode-error
        ] if
    ] if ;

View File

@ -1 +0,0 @@
Daniel Ehrenberg

View File

@ -1 +0,0 @@
UTF8 encoding/decoding

View File

@ -1,18 +0,0 @@
USING: help.markup help.syntax io.encodings strings ;
IN: io.utf8
! Main help article for the io.utf8 vocabulary, listing its two public words.
ARTICLE: "io.utf8" "Working with UTF8-encoded data"
"The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences."
{ $subsection encode-utf8 }
{ $subsection decode-utf8 } ;
! Names the article above as the documentation entry point of the vocabulary.
ABOUT: "io.utf8"
! Help for the decoder: byte sequence in, string out.
! NOTE(review): the io.utf8 decoder pushes a replacement character for a
! bad lead or continuation byte instead of throwing; confirm which
! malformed inputs actually raise decode-error.
HELP: decode-utf8
{ $values { "seq" "a sequence of bytes" } { "str" string } }
{ $description "Decodes a sequence of bytes representing a Unicode string in UTF8 format." }
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
! Help for the encoder: string in, byte sequence out.
HELP: encode-utf8
{ $values { "str" string } { "seq" "a sequence of bytes" } }
{ $description "Encodes a Unicode string as a sequence of bytes in UTF8 format." } ;

View File

@ -1,16 +0,0 @@
USING: io.utf8 tools.test strings arrays unicode.syntax ;
! Four-byte lead and two continuation bytes, but the last byte (11111111)
! is not a continuation byte: one replacement character.
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test
! Well-formed four-byte sequence. The expected value (HEX: 17F03F) lies
! above U+10FFFF, so this pins that the decoder performs no range check.
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
! ASCII passes through unchanged (a string is used as the byte sequence).
[ "x" ] [ "x" decode-utf8 >string ] unit-test
! Two-byte sequence.
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test
! A continuation byte with no lead byte before it: replacement character.
[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test
! Three-byte sequence.
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
! Encoding one character of each length: 4, 3, 2 and 1 bytes.
[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test

View File

@ -1,72 +0,0 @@
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors
namespaces io.encodings combinators ;
IN: io.utf8
! Decoder states used by the UTF8 state machine below, in addition to the
! `begin` state that the dispatch word also matches on. Each state names
! how many continuation bytes are still expected.
! double: 2-byte lead seen, one continuation byte pending.
SYMBOL: double
! triple: 3-byte lead seen, two continuation bytes pending.
SYMBOL: triple
! triple2: 3-byte sequence, one continuation byte pending.
SYMBOL: triple2
! quad: 4-byte lead seen, three continuation bytes pending.
SYMBOL: quad
! quad2: 4-byte sequence, two continuation bytes pending.
SYMBOL: quad2
! quad3: 4-byte sequence, one continuation byte pending.
SYMBOL: quad3
! True when the byte has the continuation-byte prefix 10xxxxxx.
: starts-2? ( char -- ? )
    -6 shift
    2 number= ;
! Merge one continuation byte (`bottom`) into the accumulated value (`top`).
! If the byte has the 10xxxxxx prefix, its six payload bits are appended
! below the accumulated value and `state-out` is returned as the new state.
! Otherwise the byte, the accumulated value and `state-out` are all
! discarded and a replacement character is pushed.
: append-nums ( buf bottom top state-out -- buf num state )
    >r over -6 shift 2 number= [
        6 shift swap HEX: 3F bitand bitor r>
    ] [
        2drop r> drop push-replacement
    ] if ;
! Classify the first byte of a UTF8 sequence.
!   0xxxxxxx -> complete character
!   110xxxxx -> keep 5 payload bits, one continuation byte follows
!   1110xxxx -> keep 4 payload bits, two continuation bytes follow
!   11110xxx -> keep 3 payload bits, three continuation bytes follow
!   anything else -> replacement character
: begin-utf8 ( buf byte -- buf ch state )
    {
        { [ dup -7 shift zero? ] [ decoded ] }
        { [ dup -5 shift 6 number= ] [ HEX: 1F bitand double ] }
        { [ dup -4 shift HEX: E number= ] [ HEX: F bitand triple ] }
        { [ dup -3 shift HEX: 1E number= ] [ 7 bitand quad ] }
        { [ t ] [ drop push-replacement ] }
    } cond ;
! Consume the last continuation byte of a sequence. append-nums is given
! `f` as its state-out, so a successful merge leaves `f` on top and the
! quotation runs `decoded`; on a bad byte the state returned by
! push-replacement is kept as is.
: end-multibyte ( buf byte ch -- buf ch state )
    f append-nums
    [ decoded ] unless* ;
! One step of the UTF8 decoder: consume `byte` according to the current
! `state` and return the next accumulated value and state.
: (decode-utf8) ( buf byte ch state -- buf ch state )
    {
        { begin   [ drop begin-utf8 ] }
        { double  [ end-multibyte ] }
        { triple  [ triple2 append-nums ] }
        { triple2 [ end-multibyte ] }
        { quad    [ quad2 append-nums ] }
        { quad2   [ quad3 append-nums ] }
        { quad3   [ end-multibyte ] }
    } case ;
! Decode UTF8 bytes into a string. The quotation passed to `decode`
! moves the incoming byte beneath the ch/state pair before running one
! step of the state machine.
: decode-utf8 ( seq -- str )
    [
        -rot (decode-utf8)
    ] decode ;
! Emit one continuation byte: the low six bits of `char` under the
! 10xxxxxx prefix, added to the sequence being built by `make`.
: encoded ( char -- )
    HEX: 3F bitand
    HEX: 80 bitor , ;
! Emit one character as UTF8 into the sequence being built by the
! enclosing `make`. The sequence length is chosen by how many bits the
! code point needs: up to 7 bits -> 1 byte, up to 11 -> 2 bytes, up to
! 16 -> 3 bytes, anything larger -> 4 bytes. Every byte after the lead
! byte is produced by `encoded`, which keeps only the low six bits.
: char>utf8 ( char -- )
    {
        { [ dup -7 shift zero? ] [ , ] }
        { [ dup -11 shift zero? ] [
            dup -6 shift HEX: C0 bitor ,
            encoded
        ] }
        { [ dup -16 shift zero? ] [
            dup -12 shift HEX: E0 bitor ,
            dup -6 shift encoded
            encoded
        ] }
        { [ t ] [
            dup -18 shift HEX: F0 bitor ,
            dup -12 shift encoded
            dup -6 shift encoded
            encoded
        ] }
    } cond ;
! Encode a string as UTF8, collecting the bytes into a byte array.
: encode-utf8 ( str -- seq )
    [
        [ char>utf8 ] each
    ] B{ } make ;