Modifications to the encoding protocol for later optimization
parent
d8858ef924
commit
ee9b940bc6
|
@ -46,23 +46,23 @@ ARTICLE: "encodings-protocol" "Encoding protocol"
|
|||
"An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again."
|
||||
{ $subsection decode-step }
|
||||
{ $subsection init-decoder }
|
||||
{ $subsection encode-string } ;
|
||||
{ $subsection stream-write-encoded } ;
|
||||
|
||||
HELP: decode-step ( buf char encoding -- )
|
||||
{ $values { "buf" "A string buffer which characters can be pushed to" }
|
||||
{ "char" "An octet which is read from a stream" }
|
||||
{ "encoding" "An encoding descriptor tuple" } }
|
||||
{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change." } ;
|
||||
{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change. This should not be used directly." } ;
|
||||
|
||||
HELP: encode-string ( string encoding -- byte-array )
|
||||
HELP: stream-write-encoded ( string stream encoding -- )
|
||||
{ $values { "string" "a string" }
|
||||
{ "encoding" "an encoding descriptor" }
|
||||
{ "byte-array" "an encoded byte-array" } }
|
||||
{ $description "Encodes the string with the given encoding descriptor, outputting the result to a byte-array." } ;
|
||||
{ "stream" "an output stream" }
|
||||
{ "encoding" "an encoding descriptor" } }
|
||||
{ $description "Encodes the string with the given encoding descriptor, outputing the result to the given stream. This should not be used directly." } ;
|
||||
|
||||
HELP: init-decoder ( stream encoding -- encoding )
|
||||
{ $values { "stream" "an input stream" }
|
||||
{ "encoding" "an encoding descriptor" } }
|
||||
{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM." } ;
|
||||
{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM. This should not be used directly." } ;
|
||||
|
||||
{ init-decoder decode-step encode-string } related-words
|
||||
{ init-decoder decode-step stream-write-encoded } related-words
|
||||
|
|
|
@ -15,9 +15,8 @@ GENERIC: init-decoder ( stream encoding -- encoding )
|
|||
M: tuple-class init-decoder construct-empty init-decoder ;
|
||||
M: object init-decoder nip ;
|
||||
|
||||
GENERIC: encode-string ( string encoding -- byte-array )
|
||||
M: tuple-class encode-string construct-empty encode-string ;
|
||||
M: object encode-string drop >byte-array ;
|
||||
GENERIC: stream-write-encoded ( string stream encoding -- byte-array )
|
||||
M: object stream-write-encoded drop stream-write ;
|
||||
|
||||
! Decoding
|
||||
|
||||
|
@ -136,7 +135,7 @@ M: encoder stream-write1
|
|||
>r 1string r> stream-write ;
|
||||
|
||||
M: encoder stream-write
|
||||
[ encoder-code encode-string ] keep delegate stream-write ;
|
||||
{ delegate encoder-code } get-slots stream-write-encoded ;
|
||||
|
||||
M: encoder dispose delegate dispose ;
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: help.markup help.syntax byte-arrays strings ;
|
||||
IN: io.encodings.string
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: strings io.encodings.utf8 io.encodings.utf16
|
||||
io.encodings.string tools.test ;
|
||||
IN: io.encodings.string.tests
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: io io.streams.byte-array ;
|
||||
IN: io.encodings.string
|
||||
|
||||
|
|
|
@ -1,21 +1,20 @@
|
|||
USING: io.encodings.utf8 tools.test sbufs kernel io io.encodings
|
||||
sequences strings arrays unicode io.streams.byte-array ;
|
||||
USING: io.encodings.utf8 tools.test io.encodings.string strings arrays ;
|
||||
|
||||
: decode-utf8-w/stream ( array -- newarray )
|
||||
utf8 <byte-reader> contents >array ;
|
||||
utf8 decode >array ;
|
||||
|
||||
: encode-utf8-w/stream ( array -- newarray )
|
||||
utf8 [ write ] with-byte-writer >array ;
|
||||
utf8 encode >array ;
|
||||
|
||||
[ { CHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream ] unit-test
|
||||
|
||||
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
|
||||
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream ] unit-test
|
||||
|
||||
[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test
|
||||
|
||||
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
|
||||
|
||||
[ { CHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream ] unit-test
|
||||
|
||||
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
|
||||
|
||||
|
|
|
@ -60,29 +60,28 @@ M: utf8 init-decoder nip begin over set-utf8-state ;
|
|||
! Encoding UTF-8
|
||||
|
||||
: encoded ( char -- )
|
||||
BIN: 111111 bitand BIN: 10000000 bitor , ;
|
||||
BIN: 111111 bitand BIN: 10000000 bitor write1 ;
|
||||
|
||||
: char>utf8 ( char -- )
|
||||
{
|
||||
{ [ dup -7 shift zero? ] [ , ] }
|
||||
{ [ dup -7 shift zero? ] [ write1 ] }
|
||||
{ [ dup -11 shift zero? ] [
|
||||
dup -6 shift BIN: 11000000 bitor ,
|
||||
dup -6 shift BIN: 11000000 bitor write1
|
||||
encoded
|
||||
] }
|
||||
{ [ dup -16 shift zero? ] [
|
||||
dup -12 shift BIN: 11100000 bitor ,
|
||||
dup -12 shift BIN: 11100000 bitor write1
|
||||
dup -6 shift encoded
|
||||
encoded
|
||||
] }
|
||||
{ [ t ] [
|
||||
dup -18 shift BIN: 11110000 bitor ,
|
||||
dup -18 shift BIN: 11110000 bitor write1
|
||||
dup -12 shift encoded
|
||||
dup -6 shift encoded
|
||||
encoded
|
||||
] }
|
||||
} cond ;
|
||||
|
||||
: encode-utf8 ( str -- seq )
|
||||
[ [ char>utf8 ] each ] B{ } make ;
|
||||
|
||||
M: utf8 encode-string drop encode-utf8 ;
|
||||
M: utf8 stream-write-encoded
|
||||
! For efficiency, this should be modified to avoid variable reads
|
||||
drop [ [ char>utf8 ] each ] with-stream* ;
|
||||
|
|
|
@ -3,13 +3,13 @@
|
|||
USING: io io.encodings strings kernel math sequences byte-arrays io.encodings ;
|
||||
IN: io.encodings.ascii
|
||||
|
||||
: encode-check<= ( string max -- byte-array )
|
||||
dupd [ <= ] curry all? [ >byte-array ] [ encode-error ] if ;
|
||||
: encode-check<= ( string stream max -- )
|
||||
[ pick > [ encode-error ] [ stream-write1 ] if ] 2curry each ;
|
||||
|
||||
TUPLE: ascii ;
|
||||
|
||||
M: ascii encode-string
|
||||
M: ascii stream-write-encoded ( string stream encoding -- )
|
||||
drop 127 encode-check<= ;
|
||||
|
||||
M: ascii decode-step
|
||||
drop dup 128 >= [ encode-error ] [ swap push ] if ;
|
||||
drop dup 128 >= [ decode-error ] [ swap push ] if ;
|
||||
|
|
|
@ -5,8 +5,8 @@ IN: io.encodings.latin1
|
|||
|
||||
TUPLE: latin1 ;
|
||||
|
||||
M: latin1 encode-string
|
||||
M: latin1 stream-write-encoded
|
||||
drop 255 encode-check<= ;
|
||||
|
||||
M: latin1 decode-step
|
||||
drop dup 256 >= [ encode-error ] [ swap push ] if ;
|
||||
drop dup 256 >= [ decode-error ] [ swap push ] if ;
|
||||
|
|
|
@ -1,28 +1,22 @@
|
|||
USING: kernel tools.test io.encodings.utf16 arrays sbufs
|
||||
sequences io.encodings io unicode io.streams.byte-array ;
|
||||
sequences io.encodings io unicode io.encodings.string ;
|
||||
|
||||
: decode-w/stream ( array encoding -- newarray )
|
||||
<byte-reader> contents >array ;
|
||||
[ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode >array ] unit-test
|
||||
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } utf16be decode >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } utf16be decode >array ] unit-test
|
||||
|
||||
: encode-w/stream ( array encoding -- newarray )
|
||||
[ write ] with-byte-writer >array ;
|
||||
[ { 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } utf16be encode >array ] unit-test
|
||||
|
||||
[ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode-w/stream ] unit-test
|
||||
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode-w/stream ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } utf16be decode-w/stream ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } utf16be decode-w/stream ] unit-test
|
||||
[ { CHAR: x } ] [ { CHAR: x 0 } utf16le decode >array ] unit-test
|
||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode >array ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode >array ] unit-test
|
||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
|
||||
|
||||
[ { 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } utf16be encode-w/stream ] unit-test
|
||||
[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode >array ] unit-test
|
||||
|
||||
[ { CHAR: x } ] [ { CHAR: x 0 } utf16le decode-w/stream ] unit-test
|
||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode-w/stream ] unit-test
|
||||
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode-w/stream ] unit-test
|
||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test
|
||||
[ { CHAR: x } ] [ { HEX: ff HEX: fe CHAR: x 0 } utf16 decode >array ] unit-test
|
||||
[ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode >array ] unit-test
|
||||
|
||||
[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode-w/stream ] unit-test
|
||||
|
||||
[ { CHAR: x } ] [ { HEX: ff HEX: fe CHAR: x 0 } utf16 decode-w/stream ] unit-test
|
||||
[ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode-w/stream ] unit-test
|
||||
|
||||
[ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode-w/stream ] unit-test
|
||||
[ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode >array ] unit-test
|
||||
|
|
|
@ -106,25 +106,28 @@ M: utf16le init-decoder nip begin over set-utf16le-state ;
|
|||
: char>utf16be ( char -- )
|
||||
dup HEX: FFFF > [
|
||||
HEX: 10000 -
|
||||
dup encode-first swap , ,
|
||||
encode-second swap , ,
|
||||
] [ h>b/b , , ] if ;
|
||||
dup encode-first swap write1 write1
|
||||
encode-second swap write1 write1
|
||||
] [ h>b/b write1 write1 ] if ;
|
||||
|
||||
: encode-utf16be ( str -- seq )
|
||||
[ [ char>utf16be ] each ] B{ } make ;
|
||||
: stream-write-utf16be ( string stream -- )
|
||||
[ [ char>utf16be ] each ] with-stream* ;
|
||||
|
||||
M: utf16be stream-write-encoded ( string stream encoding -- )
|
||||
drop stream-write-utf16be ;
|
||||
|
||||
: char>utf16le ( char -- )
|
||||
dup HEX: FFFF > [
|
||||
HEX: 10000 -
|
||||
dup encode-first , ,
|
||||
encode-second , ,
|
||||
] [ h>b/b swap , , ] if ;
|
||||
dup encode-first write1 write1
|
||||
encode-second write1 write1
|
||||
] [ h>b/b swap write1 write1 ] if ;
|
||||
|
||||
: encode-utf16le ( str -- seq )
|
||||
[ [ char>utf16le ] each ] B{ } make ;
|
||||
: stream-write-utf16le ( string stream -- )
|
||||
[ [ char>utf16le ] each ] with-stream* ;
|
||||
|
||||
M: utf16le encode-string drop encode-utf16le ;
|
||||
M: utf16be encode-string drop encode-utf16be ;
|
||||
M: utf16le stream-write-encoded ( string stream encoding -- )
|
||||
drop stream-write-utf16le ;
|
||||
|
||||
! UTF-16
|
||||
|
||||
|
@ -132,19 +135,16 @@ M: utf16be encode-string drop encode-utf16be ;
|
|||
|
||||
: bom-be B{ HEX: fe HEX: ff } ; inline
|
||||
|
||||
: encode-utf16 ( str -- seq )
|
||||
encode-utf16le bom-le swap append ;
|
||||
|
||||
: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
|
||||
|
||||
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
|
||||
|
||||
TUPLE: utf16 started? ;
|
||||
|
||||
M: utf16 encode-string
|
||||
>r encode-utf16le r>
|
||||
M: utf16 stream-write-encoded
|
||||
dup utf16-started? [ drop ]
|
||||
[ t swap set-utf16-started? bom-le swap append ] if ;
|
||||
[ t swap set-utf16-started? bom-le over stream-write ] if
|
||||
stream-write-utf16le ;
|
||||
|
||||
: bom>le/be ( bom -- le/be )
|
||||
dup bom-le sequence= [ drop utf16le ] [
|
||||
|
|
Loading…
Reference in New Issue