Modifications to the encoding protocol for later optimization
parent
d8858ef924
commit
ee9b940bc6
|
@ -46,23 +46,23 @@ ARTICLE: "encodings-protocol" "Encoding protocol"
|
||||||
"An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again."
|
"An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again."
|
||||||
{ $subsection decode-step }
|
{ $subsection decode-step }
|
||||||
{ $subsection init-decoder }
|
{ $subsection init-decoder }
|
||||||
{ $subsection encode-string } ;
|
{ $subsection stream-write-encoded } ;
|
||||||
|
|
||||||
HELP: decode-step ( buf char encoding -- )
|
HELP: decode-step ( buf char encoding -- )
|
||||||
{ $values { "buf" "A string buffer which characters can be pushed to" }
|
{ $values { "buf" "A string buffer which characters can be pushed to" }
|
||||||
{ "char" "An octet which is read from a stream" }
|
{ "char" "An octet which is read from a stream" }
|
||||||
{ "encoding" "An encoding descriptor tuple" } }
|
{ "encoding" "An encoding descriptor tuple" } }
|
||||||
{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change." } ;
|
{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change. This should not be used directly." } ;
|
||||||
|
|
||||||
HELP: encode-string ( string encoding -- byte-array )
|
HELP: stream-write-encoded ( string stream encoding -- )
|
||||||
{ $values { "string" "a string" }
|
{ $values { "string" "a string" }
|
||||||
{ "encoding" "an encoding descriptor" }
|
{ "stream" "an output stream" }
|
||||||
{ "byte-array" "an encoded byte-array" } }
|
{ "encoding" "an encoding descriptor" } }
|
||||||
{ $description "Encodes the string with the given encoding descriptor, outputting the result to a byte-array." } ;
|
{ $description "Encodes the string with the given encoding descriptor, outputting the result to the given stream. This should not be used directly." } ;
|
||||||
|
|
||||||
HELP: init-decoder ( stream encoding -- encoding )
|
HELP: init-decoder ( stream encoding -- encoding )
|
||||||
{ $values { "stream" "an input stream" }
|
{ $values { "stream" "an input stream" }
|
||||||
{ "encoding" "an encoding descriptor" } }
|
{ "encoding" "an encoding descriptor" } }
|
||||||
{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM." } ;
|
{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM. This should not be used directly." } ;
|
||||||
|
|
||||||
{ init-decoder decode-step encode-string } related-words
|
{ init-decoder decode-step stream-write-encoded } related-words
|
||||||
|
|
|
@ -15,9 +15,8 @@ GENERIC: init-decoder ( stream encoding -- encoding )
|
||||||
M: tuple-class init-decoder construct-empty init-decoder ;
|
M: tuple-class init-decoder construct-empty init-decoder ;
|
||||||
M: object init-decoder nip ;
|
M: object init-decoder nip ;
|
||||||
|
|
||||||
GENERIC: encode-string ( string encoding -- byte-array )
|
! Protocol word: writes string to stream using the given encoding descriptor; leaves nothing on the stack.
GENERIC: stream-write-encoded ( string stream encoding -- )
|
||||||
M: tuple-class encode-string construct-empty encode-string ;
|
M: object stream-write-encoded drop stream-write ;
|
||||||
M: object encode-string drop >byte-array ;
|
|
||||||
|
|
||||||
! Decoding
|
! Decoding
|
||||||
|
|
||||||
|
@ -136,7 +135,7 @@ M: encoder stream-write1
|
||||||
>r 1string r> stream-write ;
|
>r 1string r> stream-write ;
|
||||||
|
|
||||||
M: encoder stream-write
|
M: encoder stream-write
|
||||||
[ encoder-code encode-string ] keep delegate stream-write ;
|
{ delegate encoder-code } get-slots stream-write-encoded ;
|
||||||
|
|
||||||
M: encoder dispose delegate dispose ;
|
M: encoder dispose delegate dispose ;
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: help.markup help.syntax byte-arrays strings ;
|
USING: help.markup help.syntax byte-arrays strings ;
|
||||||
IN: io.encodings.string
|
IN: io.encodings.string
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: strings io.encodings.utf8 io.encodings.utf16
|
USING: strings io.encodings.utf8 io.encodings.utf16
|
||||||
io.encodings.string tools.test ;
|
io.encodings.string tools.test ;
|
||||||
IN: io.encodings.string.tests
|
IN: io.encodings.string.tests
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: io io.streams.byte-array ;
|
USING: io io.streams.byte-array ;
|
||||||
IN: io.encodings.string
|
IN: io.encodings.string
|
||||||
|
|
||||||
|
|
|
@ -1,21 +1,20 @@
|
||||||
USING: io.encodings.utf8 tools.test sbufs kernel io io.encodings
|
USING: io.encodings.utf8 tools.test io.encodings.string strings arrays ;
|
||||||
sequences strings arrays unicode io.streams.byte-array ;
|
|
||||||
|
|
||||||
: decode-utf8-w/stream ( array -- newarray )
|
: decode-utf8-w/stream ( array -- newarray )
|
||||||
utf8 <byte-reader> contents >array ;
|
utf8 decode >array ;
|
||||||
|
|
||||||
: encode-utf8-w/stream ( array -- newarray )
|
: encode-utf8-w/stream ( array -- newarray )
|
||||||
utf8 [ write ] with-byte-writer >array ;
|
utf8 encode >array ;
|
||||||
|
|
||||||
[ { CHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test
|
[ { CHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream ] unit-test
|
||||||
|
|
||||||
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
|
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream ] unit-test
|
||||||
|
|
||||||
[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test
|
[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test
|
||||||
|
|
||||||
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
|
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
|
||||||
|
|
||||||
[ { CHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
|
[ { CHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream ] unit-test
|
||||||
|
|
||||||
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
|
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
|
||||||
|
|
||||||
|
|
|
@ -60,29 +60,28 @@ M: utf8 init-decoder nip begin over set-utf8-state ;
|
||||||
! Encoding UTF-8
|
! Encoding UTF-8
|
||||||
|
|
||||||
: encoded ( char -- )
|
: encoded ( char -- )
|
||||||
BIN: 111111 bitand BIN: 10000000 bitor , ;
|
BIN: 111111 bitand BIN: 10000000 bitor write1 ;
|
||||||
|
|
||||||
: char>utf8 ( char -- )
|
: char>utf8 ( char -- )
|
||||||
{
|
{
|
||||||
{ [ dup -7 shift zero? ] [ , ] }
|
{ [ dup -7 shift zero? ] [ write1 ] }
|
||||||
{ [ dup -11 shift zero? ] [
|
{ [ dup -11 shift zero? ] [
|
||||||
dup -6 shift BIN: 11000000 bitor ,
|
dup -6 shift BIN: 11000000 bitor write1
|
||||||
encoded
|
encoded
|
||||||
] }
|
] }
|
||||||
{ [ dup -16 shift zero? ] [
|
{ [ dup -16 shift zero? ] [
|
||||||
dup -12 shift BIN: 11100000 bitor ,
|
dup -12 shift BIN: 11100000 bitor write1
|
||||||
dup -6 shift encoded
|
dup -6 shift encoded
|
||||||
encoded
|
encoded
|
||||||
] }
|
] }
|
||||||
{ [ t ] [
|
{ [ t ] [
|
||||||
dup -18 shift BIN: 11110000 bitor ,
|
dup -18 shift BIN: 11110000 bitor write1
|
||||||
dup -12 shift encoded
|
dup -12 shift encoded
|
||||||
dup -6 shift encoded
|
dup -6 shift encoded
|
||||||
encoded
|
encoded
|
||||||
] }
|
] }
|
||||||
} cond ;
|
} cond ;
|
||||||
|
|
||||||
: encode-utf8 ( str -- seq )
|
M: utf8 stream-write-encoded
|
||||||
[ [ char>utf8 ] each ] B{ } make ;
|
! For efficiency, this should be modified to avoid variable reads
|
||||||
|
drop [ [ char>utf8 ] each ] with-stream* ;
|
||||||
M: utf8 encode-string drop encode-utf8 ;
|
|
||||||
|
|
|
@ -3,13 +3,13 @@
|
||||||
USING: io io.encodings strings kernel math sequences byte-arrays io.encodings ;
|
USING: io io.encodings strings kernel math sequences byte-arrays io.encodings ;
|
||||||
IN: io.encodings.ascii
|
IN: io.encodings.ascii
|
||||||
|
|
||||||
: encode-check<= ( string max -- byte-array )
|
: encode-check<= ( string stream max -- )
|
||||||
dupd [ <= ] curry all? [ >byte-array ] [ encode-error ] if ;
|
! Inside the quotation the stack is ( char stream max ): signal an error only when max < char, otherwise write the char.
[ pick < [ encode-error ] [ stream-write1 ] if ] 2curry each ;
|
||||||
|
|
||||||
TUPLE: ascii ;
|
TUPLE: ascii ;
|
||||||
|
|
||||||
M: ascii encode-string
|
M: ascii stream-write-encoded ( string stream encoding -- )
|
||||||
drop 127 encode-check<= ;
|
drop 127 encode-check<= ;
|
||||||
|
|
||||||
M: ascii decode-step
|
M: ascii decode-step
|
||||||
drop dup 128 >= [ encode-error ] [ swap push ] if ;
|
drop dup 128 >= [ decode-error ] [ swap push ] if ;
|
||||||
|
|
|
@ -5,8 +5,8 @@ IN: io.encodings.latin1
|
||||||
|
|
||||||
TUPLE: latin1 ;
|
TUPLE: latin1 ;
|
||||||
|
|
||||||
M: latin1 encode-string
|
M: latin1 stream-write-encoded
|
||||||
drop 255 encode-check<= ;
|
drop 255 encode-check<= ;
|
||||||
|
|
||||||
M: latin1 decode-step
|
M: latin1 decode-step
|
||||||
drop dup 256 >= [ encode-error ] [ swap push ] if ;
|
drop dup 256 >= [ decode-error ] [ swap push ] if ;
|
||||||
|
|
|
@ -1,28 +1,22 @@
|
||||||
USING: kernel tools.test io.encodings.utf16 arrays sbufs
|
USING: kernel tools.test io.encodings.utf16 arrays sbufs
|
||||||
sequences io.encodings io unicode io.streams.byte-array ;
|
sequences io.encodings io unicode io.encodings.string ;
|
||||||
|
|
||||||
: decode-w/stream ( array encoding -- newarray )
|
[ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode >array ] unit-test
|
||||||
<byte-reader> contents >array ;
|
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode >array ] unit-test
|
||||||
|
[ { CHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } utf16be decode >array ] unit-test
|
||||||
|
[ { CHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } utf16be decode >array ] unit-test
|
||||||
|
|
||||||
: encode-w/stream ( array encoding -- newarray )
|
[ { 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } utf16be encode >array ] unit-test
|
||||||
[ write ] with-byte-writer >array ;
|
|
||||||
|
|
||||||
[ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode-w/stream ] unit-test
|
[ { CHAR: x } ] [ { CHAR: x 0 } utf16le decode >array ] unit-test
|
||||||
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode-w/stream ] unit-test
|
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
|
||||||
[ { CHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } utf16be decode-w/stream ] unit-test
|
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode >array ] unit-test
|
||||||
[ { CHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } utf16be decode-w/stream ] unit-test
|
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode >array ] unit-test
|
||||||
|
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
|
||||||
|
|
||||||
[ { 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } utf16be encode-w/stream ] unit-test
|
[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode >array ] unit-test
|
||||||
|
|
||||||
[ { CHAR: x } ] [ { CHAR: x 0 } utf16le decode-w/stream ] unit-test
|
[ { CHAR: x } ] [ { HEX: ff HEX: fe CHAR: x 0 } utf16 decode >array ] unit-test
|
||||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test
|
[ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode >array ] unit-test
|
||||||
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode-w/stream ] unit-test
|
|
||||||
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode-w/stream ] unit-test
|
|
||||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test
|
|
||||||
|
|
||||||
[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode-w/stream ] unit-test
|
[ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode >array ] unit-test
|
||||||
|
|
||||||
[ { CHAR: x } ] [ { HEX: ff HEX: fe CHAR: x 0 } utf16 decode-w/stream ] unit-test
|
|
||||||
[ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode-w/stream ] unit-test
|
|
||||||
|
|
||||||
[ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode-w/stream ] unit-test
|
|
||||||
|
|
|
@ -106,25 +106,28 @@ M: utf16le init-decoder nip begin over set-utf16le-state ;
|
||||||
: char>utf16be ( char -- )
|
: char>utf16be ( char -- )
|
||||||
dup HEX: FFFF > [
|
dup HEX: FFFF > [
|
||||||
HEX: 10000 -
|
HEX: 10000 -
|
||||||
dup encode-first swap , ,
|
dup encode-first swap write1 write1
|
||||||
encode-second swap , ,
|
encode-second swap write1 write1
|
||||||
] [ h>b/b , , ] if ;
|
] [ h>b/b write1 write1 ] if ;
|
||||||
|
|
||||||
: encode-utf16be ( str -- seq )
|
: stream-write-utf16be ( string stream -- )
|
||||||
[ [ char>utf16be ] each ] B{ } make ;
|
[ [ char>utf16be ] each ] with-stream* ;
|
||||||
|
|
||||||
|
M: utf16be stream-write-encoded ( string stream encoding -- )
|
||||||
|
drop stream-write-utf16be ;
|
||||||
|
|
||||||
: char>utf16le ( char -- )
|
: char>utf16le ( char -- )
|
||||||
dup HEX: FFFF > [
|
dup HEX: FFFF > [
|
||||||
HEX: 10000 -
|
HEX: 10000 -
|
||||||
dup encode-first , ,
|
dup encode-first write1 write1
|
||||||
encode-second , ,
|
encode-second write1 write1
|
||||||
] [ h>b/b swap , , ] if ;
|
] [ h>b/b swap write1 write1 ] if ;
|
||||||
|
|
||||||
: encode-utf16le ( str -- seq )
|
: stream-write-utf16le ( string stream -- )
|
||||||
[ [ char>utf16le ] each ] B{ } make ;
|
[ [ char>utf16le ] each ] with-stream* ;
|
||||||
|
|
||||||
M: utf16le encode-string drop encode-utf16le ;
|
M: utf16le stream-write-encoded ( string stream encoding -- )
|
||||||
M: utf16be encode-string drop encode-utf16be ;
|
drop stream-write-utf16le ;
|
||||||
|
|
||||||
! UTF-16
|
! UTF-16
|
||||||
|
|
||||||
|
@ -132,19 +135,16 @@ M: utf16be encode-string drop encode-utf16be ;
|
||||||
|
|
||||||
: bom-be B{ HEX: fe HEX: ff } ; inline
|
: bom-be B{ HEX: fe HEX: ff } ; inline
|
||||||
|
|
||||||
: encode-utf16 ( str -- seq )
|
|
||||||
encode-utf16le bom-le swap append ;
|
|
||||||
|
|
||||||
: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
|
: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
|
||||||
|
|
||||||
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
|
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
|
||||||
|
|
||||||
TUPLE: utf16 started? ;
|
TUPLE: utf16 started? ;
|
||||||
|
|
||||||
M: utf16 encode-string
|
M: utf16 stream-write-encoded
|
||||||
>r encode-utf16le r>
|
|
||||||
dup utf16-started? [ drop ]
|
dup utf16-started? [ drop ]
|
||||||
[ t swap set-utf16-started? bom-le swap append ] if ;
|
[ t swap set-utf16-started? bom-le over stream-write ] if
|
||||||
|
stream-write-utf16le ;
|
||||||
|
|
||||||
: bom>le/be ( bom -- le/be )
|
: bom>le/be ( bom -- le/be )
|
||||||
dup bom-le sequence= [ drop utf16le ] [
|
dup bom-le sequence= [ drop utf16le ] [
|
||||||
|
|
Loading…
Reference in New Issue