Working on unicode encodings

db4
Daniel Ehrenberg 2008-02-15 19:44:35 -06:00
parent aa8769c29d
commit 4678da8861
5 changed files with 72 additions and 38 deletions

View File

@ -53,27 +53,17 @@ GENERIC: decode-step ( buf byte ch state encoding -- buf ch state )
>r swap start-decoding r>
decode-read-loop ;
GENERIC: init-decoding ( stream encoding -- decoded-stream )
: <decoding> ( stream decoding-class -- decoded-stream )
construct-empty init-decoding <line-reader> ;
GENERIC: init-encoding ( stream encoding -- encoded-stream )
construct-delegate <line-reader> ;
: <encoding> ( stream encoding-class -- encoded-stream )
construct-empty init-encoding <plain-writer> ;
construct-delegate <plain-writer> ;
GENERIC: encode-string ( string encoding -- byte-array )
M: tuple-class encode-string construct-empty encode-string ;
MIXIN: encoding-stream
M: encoding-stream init-decoding ( stream encoding-stream -- encoding-stream )
tuck set-delegate ;
M: encoding-stream init-encoding ( stream encoding-stream -- encoding-stream )
tuck set-delegate ;
M: encoding-stream stream-read1 1 swap stream-read ;
M: encoding-stream stream-read
@ -93,3 +83,13 @@ M: encoding-stream stream-write
[ encode-string ] keep delegate stream-write ;
M: encoding-stream dispose delegate dispose ;
GENERIC: underlying-stream ( encoded-stream -- delegate )
M: encoding-stream underlying-stream delegate ;
GENERIC: set-underlying-stream ( new-underlying stream -- )
M: encoding-stream set-underlying-stream set-delegate ;
: set-encoding ( encoding stream -- ) ! This doesn't work now
[ underlying-stream swap construct-delegate ] keep
set-underlying-stream ;

View File

@ -1,15 +1,28 @@
USING: tools.test io.utf16 arrays unicode.syntax ;
USING: kernel tools.test io.encodings.utf16 arrays sbufs sequences io.encodings
io unicode.syntax ;
[ { CHAR: x } ] [ { 0 CHAR: x } decode-utf16be >array ] unit-test
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test
[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test
[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test
: decode-w/stream ( array encoding -- newarray )
>r >sbuf dup reverse-here r> <decoding> contents >array ;
[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test
: encode-w/stream ( array encoding -- newarray )
>r SBUF" " clone tuck r> <encoding> stream-write >array ;
[ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test
[ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode-w/stream ] unit-test
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode-w/stream ] unit-test
[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } utf16be decode-w/stream ] unit-test
[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } utf16be decode-w/stream ] unit-test
[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test
[ { 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } utf16be encode-w/stream ] unit-test
[ { CHAR: x } ] [ { CHAR: x 0 } utf16le decode-w/stream ] unit-test
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode-w/stream ] unit-test
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode-w/stream ] unit-test
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test
[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode-w/stream ] unit-test
[ { CHAR: x } ] [ { HEX: ff HEX: fe CHAR: x 0 } utf16 decode-w/stream ] unit-test
[ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode-w/stream ] unit-test
[ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode-w/stream ] unit-test

View File

@ -1,8 +1,8 @@
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors namespaces io.binary
io.encodings combinators splitting ;
IN: io.utf16
io.encodings combinators splitting io byte-arrays ;
IN: io.encodings.utf16
SYMBOL: double
SYMBOL: quad1
@ -104,27 +104,49 @@ SYMBOL: ignore
: encode-utf16 ( str -- seq )
encode-utf16le bom-le swap append ;
: utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
: utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
: decode-utf16 ( seq -- str )
{
{ [ utf16le? ] [ decode-utf16le ] }
{ [ utf16be? ] [ decode-utf16be ] }
{ [ start-utf16le? ] [ decode-utf16le ] }
{ [ start-utf16be? ] [ decode-utf16be ] }
{ [ t ] [ decode-error ] }
} cond ;
TUPLE: utf16le ;
: <utf16le> utf16le construct-delegate ;
INSTANCE: utf16le encoding-stream
M: utf16le encode-string drop encode-utf16le ;
M: utf16le decode-step drop decode-utf16le-step ;
TUPLE: utf16be ;
: <utf16be> utf16be construct-delegate ;
INSTANCE: utf16be encoding-stream
M: utf16be encode-string drop encode-utf16be ;
M: utf16le decode-step drop decode-utf16be-step ;
M: utf16be decode-step drop decode-utf16be-step ;
TUPLE: utf16 encoding ;
INSTANCE: utf16 encoding-stream
M: utf16 underlying-stream delegate dup delegate [ ] [ ] ?if ; ! necessary?
M: utf16 set-underlying-stream delegate set-delegate ; ! necessary?
M: utf16 encode-string
>r encode-utf16le r>
dup utf16-encoding [ drop ]
[ t swap set-utf16-encoding bom-le swap append ] if ;
: bom>le/be ( bom -- le/be )
dup bom-le sequence= [ drop utf16le ] [
bom-be sequence= [ utf16be ] [ decode-error ] if
] if ;
: read-bom ( utf16 -- encoding )
2 over delegate stream-read bom>le/be construct-empty
[ swap set-utf16-encoding ] keep ;
M: utf16 decode-step
! inefficient: checks if bom is done many times
! This should transform itself into utf16be or utf16le after reading BOM
dup utf16-encoding [ ] [ read-bom ] ?if decode-step ;

View File

@ -1,11 +1,11 @@
USING: io.encodings.utf8 tools.test sbufs kernel io
USING: io.encodings.utf8 tools.test sbufs kernel io io.encodings
sequences strings arrays unicode.syntax ;
: decode-utf8-w/stream ( array -- newarray )
>sbuf dup reverse-here <utf8> contents >array ;
>sbuf dup reverse-here utf8 <decoding> contents ;
: encode-utf8-w/stream ( array -- newarray )
SBUF" " clone tuck <utf8> write >array ;
SBUF" " clone tuck utf8 <encoding> stream-write >array ;
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test
@ -19,5 +19,5 @@ sequences strings arrays unicode.syntax ;
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test
[ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8-w/stream ] unit-test

View File

@ -78,7 +78,6 @@ SYMBOL: quad3
! Interface for streams
TUPLE: utf8 ;
: <utf8> utf8 construct-delegate ;
INSTANCE: utf8 encoding-stream
M: utf8 encode-string drop encode-utf8 ;