Modifications to the encoding protocol for later optimization

db4
Daniel Ehrenberg 2008-03-06 00:23:38 -06:00
parent d8858ef924
commit ee9b940bc6
11 changed files with 70 additions and 73 deletions

View File

@ -46,23 +46,23 @@ ARTICLE: "encodings-protocol" "Encoding protocol"
"An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again."
{ $subsection decode-step }
{ $subsection init-decoder }
{ $subsection encode-string } ;
{ $subsection stream-write-encoded } ;
HELP: decode-step ( buf char encoding -- )
{ $values { "buf" "A string buffer which characters can be pushed to" }
{ "char" "An octet which is read from a stream" }
{ "encoding" "An encoding descriptor tuple" } }
{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change." } ;
{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change. This should not be used directly." } ;
HELP: encode-string ( string encoding -- byte-array )
HELP: stream-write-encoded ( string stream encoding -- )
{ $values { "string" "a string" }
{ "encoding" "an encoding descriptor" }
{ "byte-array" "an encoded byte-array" } }
{ $description "Encodes the string with the given encoding descriptor, outputting the result to a byte-array." } ;
{ "stream" "an output stream" }
{ "encoding" "an encoding descriptor" } }
{ $description "Encodes the string with the given encoding descriptor, outputing the result to the given stream. This should not be used directly." } ;
HELP: init-decoder ( stream encoding -- encoding )
{ $values { "stream" "an input stream" }
{ "encoding" "an encoding descriptor" } }
{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM." } ;
{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM. This should not be used directly." } ;
{ init-decoder decode-step encode-string } related-words
{ init-decoder decode-step stream-write-encoded } related-words

View File

@ -15,9 +15,8 @@ GENERIC: init-decoder ( stream encoding -- encoding )
M: tuple-class init-decoder construct-empty init-decoder ;
M: object init-decoder nip ;
GENERIC: encode-string ( string encoding -- byte-array )
M: tuple-class encode-string construct-empty encode-string ;
M: object encode-string drop >byte-array ;
GENERIC: stream-write-encoded ( string stream encoding -- byte-array )
M: object stream-write-encoded drop stream-write ;
! Decoding
@ -136,7 +135,7 @@ M: encoder stream-write1
>r 1string r> stream-write ;
M: encoder stream-write
[ encoder-code encode-string ] keep delegate stream-write ;
{ delegate encoder-code } get-slots stream-write-encoded ;
M: encoder dispose delegate dispose ;

View File

@ -1,3 +1,5 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: help.markup help.syntax byte-arrays strings ;
IN: io.encodings.string

View File

@ -1,3 +1,5 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: strings io.encodings.utf8 io.encodings.utf16
io.encodings.string tools.test ;
IN: io.encodings.string.tests

View File

@ -1,3 +1,5 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: io io.streams.byte-array ;
IN: io.encodings.string

View File

@ -1,21 +1,20 @@
USING: io.encodings.utf8 tools.test sbufs kernel io io.encodings
sequences strings arrays unicode io.streams.byte-array ;
USING: io.encodings.utf8 tools.test io.encodings.string strings arrays ;
: decode-utf8-w/stream ( array -- newarray )
utf8 <byte-reader> contents >array ;
utf8 decode >array ;
: encode-utf8-w/stream ( array -- newarray )
utf8 [ write ] with-byte-writer >array ;
utf8 encode >array ;
[ { CHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test
[ { CHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream ] unit-test
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream ] unit-test
[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
[ { CHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
[ { CHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream ] unit-test
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test

View File

@ -60,29 +60,28 @@ M: utf8 init-decoder nip begin over set-utf8-state ;
! Encoding UTF-8
: encoded ( char -- )
BIN: 111111 bitand BIN: 10000000 bitor , ;
BIN: 111111 bitand BIN: 10000000 bitor write1 ;
: char>utf8 ( char -- )
{
{ [ dup -7 shift zero? ] [ , ] }
{ [ dup -7 shift zero? ] [ write1 ] }
{ [ dup -11 shift zero? ] [
dup -6 shift BIN: 11000000 bitor ,
dup -6 shift BIN: 11000000 bitor write1
encoded
] }
{ [ dup -16 shift zero? ] [
dup -12 shift BIN: 11100000 bitor ,
dup -12 shift BIN: 11100000 bitor write1
dup -6 shift encoded
encoded
] }
{ [ t ] [
dup -18 shift BIN: 11110000 bitor ,
dup -18 shift BIN: 11110000 bitor write1
dup -12 shift encoded
dup -6 shift encoded
encoded
] }
} cond ;
: encode-utf8 ( str -- seq )
[ [ char>utf8 ] each ] B{ } make ;
M: utf8 encode-string drop encode-utf8 ;
M: utf8 stream-write-encoded
! For efficiency, this should be modified to avoid variable reads
drop [ [ char>utf8 ] each ] with-stream* ;

View File

@ -3,13 +3,13 @@
USING: io io.encodings strings kernel math sequences byte-arrays io.encodings ;
IN: io.encodings.ascii
: encode-check<= ( string max -- byte-array )
dupd [ <= ] curry all? [ >byte-array ] [ encode-error ] if ;
: encode-check<= ( string stream max -- )
[ pick > [ encode-error ] [ stream-write1 ] if ] 2curry each ;
TUPLE: ascii ;
M: ascii encode-string
M: ascii stream-write-encoded ( string stream encoding -- )
drop 127 encode-check<= ;
M: ascii decode-step
drop dup 128 >= [ encode-error ] [ swap push ] if ;
drop dup 128 >= [ decode-error ] [ swap push ] if ;

View File

@ -5,8 +5,8 @@ IN: io.encodings.latin1
TUPLE: latin1 ;
M: latin1 encode-string
M: latin1 stream-write-encoded
drop 255 encode-check<= ;
M: latin1 decode-step
drop dup 256 >= [ encode-error ] [ swap push ] if ;
drop dup 256 >= [ decode-error ] [ swap push ] if ;

View File

@ -1,28 +1,22 @@
USING: kernel tools.test io.encodings.utf16 arrays sbufs
sequences io.encodings io unicode io.streams.byte-array ;
sequences io.encodings io unicode io.encodings.string ;
: decode-w/stream ( array encoding -- newarray )
<byte-reader> contents >array ;
[ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode >array ] unit-test
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } utf16be decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } utf16be decode >array ] unit-test
: encode-w/stream ( array encoding -- newarray )
[ write ] with-byte-writer >array ;
[ { 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } utf16be encode >array ] unit-test
[ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode-w/stream ] unit-test
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode-w/stream ] unit-test
[ { CHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } utf16be decode-w/stream ] unit-test
[ { CHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } utf16be decode-w/stream ] unit-test
[ { CHAR: x } ] [ { CHAR: x 0 } utf16le decode >array ] unit-test
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode >array ] unit-test
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
[ { 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } utf16be encode-w/stream ] unit-test
[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode >array ] unit-test
[ { CHAR: x } ] [ { CHAR: x 0 } utf16le decode-w/stream ] unit-test
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode-w/stream ] unit-test
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode-w/stream ] unit-test
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test
[ { CHAR: x } ] [ { HEX: ff HEX: fe CHAR: x 0 } utf16 decode >array ] unit-test
[ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode >array ] unit-test
[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode-w/stream ] unit-test
[ { CHAR: x } ] [ { HEX: ff HEX: fe CHAR: x 0 } utf16 decode-w/stream ] unit-test
[ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode-w/stream ] unit-test
[ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode-w/stream ] unit-test
[ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode >array ] unit-test

View File

@ -106,25 +106,28 @@ M: utf16le init-decoder nip begin over set-utf16le-state ;
: char>utf16be ( char -- )
dup HEX: FFFF > [
HEX: 10000 -
dup encode-first swap , ,
encode-second swap , ,
] [ h>b/b , , ] if ;
dup encode-first swap write1 write1
encode-second swap write1 write1
] [ h>b/b write1 write1 ] if ;
: encode-utf16be ( str -- seq )
[ [ char>utf16be ] each ] B{ } make ;
: stream-write-utf16be ( string stream -- )
[ [ char>utf16be ] each ] with-stream* ;
M: utf16be stream-write-encoded ( string stream encoding -- )
drop stream-write-utf16be ;
: char>utf16le ( char -- )
dup HEX: FFFF > [
HEX: 10000 -
dup encode-first , ,
encode-second , ,
] [ h>b/b swap , , ] if ;
dup encode-first write1 write1
encode-second write1 write1
] [ h>b/b swap write1 write1 ] if ;
: encode-utf16le ( str -- seq )
[ [ char>utf16le ] each ] B{ } make ;
: stream-write-utf16le ( string stream -- )
[ [ char>utf16le ] each ] with-stream* ;
M: utf16le encode-string drop encode-utf16le ;
M: utf16be encode-string drop encode-utf16be ;
M: utf16le stream-write-encoded ( string stream encoding -- )
drop stream-write-utf16le ;
! UTF-16
@ -132,19 +135,16 @@ M: utf16be encode-string drop encode-utf16be ;
: bom-be B{ HEX: fe HEX: ff } ; inline
: encode-utf16 ( str -- seq )
encode-utf16le bom-le swap append ;
: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
TUPLE: utf16 started? ;
M: utf16 encode-string
>r encode-utf16le r>
M: utf16 stream-write-encoded
dup utf16-started? [ drop ]
[ t swap set-utf16-started? bom-le swap append ] if ;
[ t swap set-utf16-started? bom-le over stream-write ] if
stream-write-utf16le ;
: bom>le/be ( bom -- le/be )
dup bom-le sequence= [ drop utf16le ] [