Working on unicode encodings
parent
aa8769c29d
commit
4678da8861
core/io/encodings
|
@ -53,27 +53,17 @@ GENERIC: decode-step ( buf byte ch state encoding -- buf ch state )
|
|||
>r swap start-decoding r>
|
||||
decode-read-loop ;
|
||||
|
||||
GENERIC: init-decoding ( stream encoding -- decoded-stream )
|
||||
|
||||
: <decoding> ( stream decoding-class -- decoded-stream )
|
||||
construct-empty init-decoding <line-reader> ;
|
||||
|
||||
GENERIC: init-encoding ( stream encoding -- encoded-stream )
|
||||
construct-delegate <line-reader> ;
|
||||
|
||||
: <encoding> ( stream encoding-class -- encoded-stream )
|
||||
construct-empty init-encoding <plain-writer> ;
|
||||
construct-delegate <plain-writer> ;
|
||||
|
||||
GENERIC: encode-string ( string encoding -- byte-array )
|
||||
M: tuple-class encode-string construct-empty encode-string ;
|
||||
|
||||
MIXIN: encoding-stream
|
||||
|
||||
M: encoding-stream init-decoding ( stream encoding-stream -- encoding-stream )
|
||||
tuck set-delegate ;
|
||||
|
||||
M: encoding-stream init-encoding ( stream encoding-stream -- encoding-stream )
|
||||
tuck set-delegate ;
|
||||
|
||||
M: encoding-stream stream-read1 1 swap stream-read ;
|
||||
|
||||
M: encoding-stream stream-read
|
||||
|
@ -93,3 +83,13 @@ M: encoding-stream stream-write
|
|||
[ encode-string ] keep delegate stream-write ;
|
||||
|
||||
M: encoding-stream dispose delegate dispose ;
|
||||
|
||||
GENERIC: underlying-stream ( encoded-stream -- delegate )
|
||||
M: encoding-stream underlying-stream delegate ;
|
||||
|
||||
GENERIC: set-underlying-stream ( new-underlying stream -- )
|
||||
M: encoding-stream set-underlying-stream set-delegate ;
|
||||
|
||||
: set-encoding ( encoding stream -- ) ! This doesn't work now
|
||||
[ underlying-stream swap construct-delegate ] keep
|
||||
set-underlying-stream ;
|
||||
|
|
|
@ -1,15 +1,28 @@
|
|||
USING: tools.test io.utf16 arrays unicode.syntax ;
|
||||
USING: kernel tools.test io.encodings.utf16 arrays sbufs sequences io.encodings
|
||||
io unicode.syntax ;
|
||||
|
||||
[ { CHAR: x } ] [ { 0 CHAR: x } decode-utf16be >array ] unit-test
|
||||
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test
|
||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test
|
||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test
|
||||
: decode-w/stream ( array encoding -- newarray )
|
||||
>r >sbuf dup reverse-here r> <decoding> contents >array ;
|
||||
|
||||
[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test
|
||||
: encode-w/stream ( array encoding -- newarray )
|
||||
>r SBUF" " clone tuck r> <encoding> stream-write >array ;
|
||||
|
||||
[ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test
|
||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test
|
||||
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test
|
||||
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test
|
||||
[ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode-w/stream ] unit-test
|
||||
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode-w/stream ] unit-test
|
||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } utf16be decode-w/stream ] unit-test
|
||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } utf16be decode-w/stream ] unit-test
|
||||
|
||||
[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test
|
||||
[ { 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } utf16be encode-w/stream ] unit-test
|
||||
|
||||
[ { CHAR: x } ] [ { CHAR: x 0 } utf16le decode-w/stream ] unit-test
|
||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test
|
||||
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode-w/stream ] unit-test
|
||||
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode-w/stream ] unit-test
|
||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test
|
||||
|
||||
[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode-w/stream ] unit-test
|
||||
|
||||
[ { CHAR: x } ] [ { HEX: ff HEX: fe CHAR: x 0 } utf16 decode-w/stream ] unit-test
|
||||
[ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode-w/stream ] unit-test
|
||||
|
||||
[ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode-w/stream ] unit-test
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: math kernel sequences sbufs vectors namespaces io.binary
|
||||
io.encodings combinators splitting ;
|
||||
IN: io.utf16
|
||||
io.encodings combinators splitting io byte-arrays ;
|
||||
IN: io.encodings.utf16
|
||||
|
||||
SYMBOL: double
|
||||
SYMBOL: quad1
|
||||
|
@ -104,27 +104,49 @@ SYMBOL: ignore
|
|||
: encode-utf16 ( str -- seq )
|
||||
encode-utf16le bom-le swap append ;
|
||||
|
||||
: utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
|
||||
: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
|
||||
|
||||
: utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
|
||||
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
|
||||
|
||||
: decode-utf16 ( seq -- str )
|
||||
{
|
||||
{ [ utf16le? ] [ decode-utf16le ] }
|
||||
{ [ utf16be? ] [ decode-utf16be ] }
|
||||
{ [ start-utf16le? ] [ decode-utf16le ] }
|
||||
{ [ start-utf16be? ] [ decode-utf16be ] }
|
||||
{ [ t ] [ decode-error ] }
|
||||
} cond ;
|
||||
|
||||
TUPLE: utf16le ;
|
||||
: <utf16le> utf16le construct-delegate ;
|
||||
INSTANCE: utf16le encoding-stream
|
||||
|
||||
M: utf16le encode-string drop encode-utf16le ;
|
||||
M: utf16le decode-step drop decode-utf16le-step ;
|
||||
|
||||
TUPLE: utf16be ;
|
||||
: <utf16be> utf16be construct-delegate ;
|
||||
INSTANCE: utf16be encoding-stream
|
||||
|
||||
M: utf16be encode-string drop encode-utf16be ;
|
||||
M: utf16le decode-step drop decode-utf16be-step ;
|
||||
M: utf16be decode-step drop decode-utf16be-step ;
|
||||
|
||||
TUPLE: utf16 encoding ;
|
||||
INSTANCE: utf16 encoding-stream
|
||||
M: utf16 underlying-stream delegate dup delegate [ ] [ ] ?if ; ! necessary?
|
||||
M: utf16 set-underlying-stream delegate set-delegate ; ! necessary?
|
||||
|
||||
M: utf16 encode-string
|
||||
>r encode-utf16le r>
|
||||
dup utf16-encoding [ drop ]
|
||||
[ t swap set-utf16-encoding bom-le swap append ] if ;
|
||||
|
||||
: bom>le/be ( bom -- le/be )
|
||||
dup bom-le sequence= [ drop utf16le ] [
|
||||
bom-be sequence= [ utf16be ] [ decode-error ] if
|
||||
] if ;
|
||||
|
||||
: read-bom ( utf16 -- encoding )
|
||||
2 over delegate stream-read bom>le/be construct-empty
|
||||
[ swap set-utf16-encoding ] keep ;
|
||||
|
||||
M: utf16 decode-step
|
||||
! inefficient: checks if bom is done many times
|
||||
! This should transform itself into utf16be or utf16le after reading BOM
|
||||
dup utf16-encoding [ ] [ read-bom ] ?if decode-step ;
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
USING: io.encodings.utf8 tools.test sbufs kernel io
|
||||
USING: io.encodings.utf8 tools.test sbufs kernel io io.encodings
|
||||
sequences strings arrays unicode.syntax ;
|
||||
|
||||
: decode-utf8-w/stream ( array -- newarray )
|
||||
>sbuf dup reverse-here <utf8> contents >array ;
|
||||
>sbuf dup reverse-here utf8 <decoding> contents ;
|
||||
|
||||
: encode-utf8-w/stream ( array -- newarray )
|
||||
SBUF" " clone tuck <utf8> write >array ;
|
||||
SBUF" " clone tuck utf8 <encoding> stream-write >array ;
|
||||
|
||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test
|
||||
|
||||
|
@ -19,5 +19,5 @@ sequences strings arrays unicode.syntax ;
|
|||
|
||||
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
|
||||
|
||||
[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
|
||||
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test
|
||||
[ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
|
||||
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8-w/stream ] unit-test
|
||||
|
|
|
@ -78,7 +78,6 @@ SYMBOL: quad3
|
|||
! Interface for streams
|
||||
|
||||
TUPLE: utf8 ;
|
||||
: <utf8> utf8 construct-delegate ;
|
||||
INSTANCE: utf8 encoding-stream
|
||||
|
||||
M: utf8 encode-string drop encode-utf8 ;
|
||||
|
|
Loading…
Reference in New Issue