Modifications to the encoding protocol for later optimization

db4
Daniel Ehrenberg 2008-03-06 00:23:38 -06:00
parent d8858ef924
commit ee9b940bc6
11 changed files with 70 additions and 73 deletions

View File

@ -46,23 +46,23 @@ ARTICLE: "encodings-protocol" "Encoding protocol"
"An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again." "An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again."
{ $subsection decode-step } { $subsection decode-step }
{ $subsection init-decoder } { $subsection init-decoder }
{ $subsection encode-string } ; { $subsection stream-write-encoded } ;
HELP: decode-step ( buf char encoding -- ) HELP: decode-step ( buf char encoding -- )
{ $values { "buf" "A string buffer which characters can be pushed to" } { $values { "buf" "A string buffer which characters can be pushed to" }
{ "char" "An octet which is read from a stream" } { "char" "An octet which is read from a stream" }
{ "encoding" "An encoding descriptor tuple" } } { "encoding" "An encoding descriptor tuple" } }
{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change." } ; { $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change. This should not be used directly." } ;
HELP: encode-string ( string encoding -- byte-array ) HELP: stream-write-encoded ( string stream encoding -- )
{ $values { "string" "a string" } { $values { "string" "a string" }
{ "encoding" "an encoding descriptor" } { "stream" "an output stream" }
{ "byte-array" "an encoded byte-array" } } { "encoding" "an encoding descriptor" } }
{ $description "Encodes the string with the given encoding descriptor, outputting the result to a byte-array." } ; { $description "Encodes the string with the given encoding descriptor, outputting the result to the given stream. This should not be used directly." } ;
HELP: init-decoder ( stream encoding -- encoding ) HELP: init-decoder ( stream encoding -- encoding )
{ $values { "stream" "an input stream" } { $values { "stream" "an input stream" }
{ "encoding" "an encoding descriptor" } } { "encoding" "an encoding descriptor" } }
{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM." } ; { $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, e.g. for a BOM. This should not be used directly." } ;
{ init-decoder decode-step encode-string } related-words { init-decoder decode-step stream-write-encoded } related-words

View File

@ -15,9 +15,8 @@ GENERIC: init-decoder ( stream encoding -- encoding )
M: tuple-class init-decoder construct-empty init-decoder ; M: tuple-class init-decoder construct-empty init-decoder ;
M: object init-decoder nip ; M: object init-decoder nip ;
GENERIC: encode-string ( string encoding -- byte-array ) GENERIC: stream-write-encoded ( string stream encoding -- byte-array )
M: tuple-class encode-string construct-empty encode-string ; M: object stream-write-encoded drop stream-write ;
M: object encode-string drop >byte-array ;
! Decoding ! Decoding
@ -136,7 +135,7 @@ M: encoder stream-write1
>r 1string r> stream-write ; >r 1string r> stream-write ;
M: encoder stream-write M: encoder stream-write
[ encoder-code encode-string ] keep delegate stream-write ; { delegate encoder-code } get-slots stream-write-encoded ;
M: encoder dispose delegate dispose ; M: encoder dispose delegate dispose ;

View File

@ -1,3 +1,5 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: help.markup help.syntax byte-arrays strings ; USING: help.markup help.syntax byte-arrays strings ;
IN: io.encodings.string IN: io.encodings.string

View File

@ -1,3 +1,5 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: strings io.encodings.utf8 io.encodings.utf16 USING: strings io.encodings.utf8 io.encodings.utf16
io.encodings.string tools.test ; io.encodings.string tools.test ;
IN: io.encodings.string.tests IN: io.encodings.string.tests

View File

@ -1,3 +1,5 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: io io.streams.byte-array ; USING: io io.streams.byte-array ;
IN: io.encodings.string IN: io.encodings.string

View File

@ -1,21 +1,20 @@
USING: io.encodings.utf8 tools.test sbufs kernel io io.encodings USING: io.encodings.utf8 tools.test io.encodings.string strings arrays ;
sequences strings arrays unicode io.streams.byte-array ;
: decode-utf8-w/stream ( array -- newarray ) : decode-utf8-w/stream ( array -- newarray )
utf8 <byte-reader> contents >array ; utf8 decode >array ;
: encode-utf8-w/stream ( array -- newarray ) : encode-utf8-w/stream ( array -- newarray )
utf8 [ write ] with-byte-writer >array ; utf8 encode >array ;
[ { CHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test [ { CHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream ] unit-test
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test [ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream ] unit-test
[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test [ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test [ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
[ { CHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test [ { CHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream ] unit-test
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test [ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test

View File

@ -60,29 +60,28 @@ M: utf8 init-decoder nip begin over set-utf8-state ;
! Encoding UTF-8 ! Encoding UTF-8
: encoded ( char -- ) : encoded ( char -- )
BIN: 111111 bitand BIN: 10000000 bitor , ; BIN: 111111 bitand BIN: 10000000 bitor write1 ;
: char>utf8 ( char -- ) : char>utf8 ( char -- )
{ {
{ [ dup -7 shift zero? ] [ , ] } { [ dup -7 shift zero? ] [ write1 ] }
{ [ dup -11 shift zero? ] [ { [ dup -11 shift zero? ] [
dup -6 shift BIN: 11000000 bitor , dup -6 shift BIN: 11000000 bitor write1
encoded encoded
] } ] }
{ [ dup -16 shift zero? ] [ { [ dup -16 shift zero? ] [
dup -12 shift BIN: 11100000 bitor , dup -12 shift BIN: 11100000 bitor write1
dup -6 shift encoded dup -6 shift encoded
encoded encoded
] } ] }
{ [ t ] [ { [ t ] [
dup -18 shift BIN: 11110000 bitor , dup -18 shift BIN: 11110000 bitor write1
dup -12 shift encoded dup -12 shift encoded
dup -6 shift encoded dup -6 shift encoded
encoded encoded
] } ] }
} cond ; } cond ;
: encode-utf8 ( str -- seq ) M: utf8 stream-write-encoded
[ [ char>utf8 ] each ] B{ } make ; ! For efficiency, this should be modified to avoid variable reads
drop [ [ char>utf8 ] each ] with-stream* ;
M: utf8 encode-string drop encode-utf8 ;

View File

@ -3,13 +3,13 @@
USING: io io.encodings strings kernel math sequences byte-arrays io.encodings ; USING: io io.encodings strings kernel math sequences byte-arrays io.encodings ;
IN: io.encodings.ascii IN: io.encodings.ascii
: encode-check<= ( string max -- byte-array ) : encode-check<= ( string stream max -- )
dupd [ <= ] curry all? [ >byte-array ] [ encode-error ] if ; [ pick > [ encode-error ] [ stream-write1 ] if ] 2curry each ;
TUPLE: ascii ; TUPLE: ascii ;
M: ascii encode-string M: ascii stream-write-encoded ( string stream encoding -- )
drop 127 encode-check<= ; drop 127 encode-check<= ;
M: ascii decode-step M: ascii decode-step
drop dup 128 >= [ encode-error ] [ swap push ] if ; drop dup 128 >= [ decode-error ] [ swap push ] if ;

View File

@ -5,8 +5,8 @@ IN: io.encodings.latin1
TUPLE: latin1 ; TUPLE: latin1 ;
M: latin1 encode-string M: latin1 stream-write-encoded
drop 255 encode-check<= ; drop 255 encode-check<= ;
M: latin1 decode-step M: latin1 decode-step
drop dup 256 >= [ encode-error ] [ swap push ] if ; drop dup 256 >= [ decode-error ] [ swap push ] if ;

View File

@ -1,28 +1,22 @@
USING: kernel tools.test io.encodings.utf16 arrays sbufs USING: kernel tools.test io.encodings.utf16 arrays sbufs
sequences io.encodings io unicode io.streams.byte-array ; sequences io.encodings io unicode io.encodings.string ;
: decode-w/stream ( array encoding -- newarray ) [ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode >array ] unit-test
<byte-reader> contents >array ; [ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } utf16be decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } utf16be decode >array ] unit-test
: encode-w/stream ( array encoding -- newarray ) [ { 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } utf16be encode >array ] unit-test
[ write ] with-byte-writer >array ;
[ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode-w/stream ] unit-test [ { CHAR: x } ] [ { CHAR: x 0 } utf16le decode >array ] unit-test
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode-w/stream ] unit-test [ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } utf16be decode-w/stream ] unit-test [ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } utf16be decode-w/stream ] unit-test [ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode >array ] unit-test
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode >array ] unit-test
[ { 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } utf16be encode-w/stream ] unit-test [ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode >array ] unit-test
[ { CHAR: x } ] [ { CHAR: x 0 } utf16le decode-w/stream ] unit-test [ { CHAR: x } ] [ { HEX: ff HEX: fe CHAR: x 0 } utf16 decode >array ] unit-test
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test [ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode >array ] unit-test
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode-w/stream ] unit-test
[ { CHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode-w/stream ] unit-test
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test
[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode-w/stream ] unit-test [ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode >array ] unit-test
[ { CHAR: x } ] [ { HEX: ff HEX: fe CHAR: x 0 } utf16 decode-w/stream ] unit-test
[ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode-w/stream ] unit-test
[ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode-w/stream ] unit-test

View File

@ -106,25 +106,28 @@ M: utf16le init-decoder nip begin over set-utf16le-state ;
: char>utf16be ( char -- ) : char>utf16be ( char -- )
dup HEX: FFFF > [ dup HEX: FFFF > [
HEX: 10000 - HEX: 10000 -
dup encode-first swap , , dup encode-first swap write1 write1
encode-second swap , , encode-second swap write1 write1
] [ h>b/b , , ] if ; ] [ h>b/b write1 write1 ] if ;
: encode-utf16be ( str -- seq ) : stream-write-utf16be ( string stream -- )
[ [ char>utf16be ] each ] B{ } make ; [ [ char>utf16be ] each ] with-stream* ;
M: utf16be stream-write-encoded ( string stream encoding -- )
drop stream-write-utf16be ;
: char>utf16le ( char -- ) : char>utf16le ( char -- )
dup HEX: FFFF > [ dup HEX: FFFF > [
HEX: 10000 - HEX: 10000 -
dup encode-first , , dup encode-first write1 write1
encode-second , , encode-second write1 write1
] [ h>b/b swap , , ] if ; ] [ h>b/b swap write1 write1 ] if ;
: encode-utf16le ( str -- seq ) : stream-write-utf16le ( string stream -- )
[ [ char>utf16le ] each ] B{ } make ; [ [ char>utf16le ] each ] with-stream* ;
M: utf16le encode-string drop encode-utf16le ; M: utf16le stream-write-encoded ( string stream encoding -- )
M: utf16be encode-string drop encode-utf16be ; drop stream-write-utf16le ;
! UTF-16 ! UTF-16
@ -132,19 +135,16 @@ M: utf16be encode-string drop encode-utf16be ;
: bom-be B{ HEX: fe HEX: ff } ; inline : bom-be B{ HEX: fe HEX: ff } ; inline
: encode-utf16 ( str -- seq )
encode-utf16le bom-le swap append ;
: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ; : start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ; : start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
TUPLE: utf16 started? ; TUPLE: utf16 started? ;
M: utf16 encode-string M: utf16 stream-write-encoded
>r encode-utf16le r>
dup utf16-started? [ drop ] dup utf16-started? [ drop ]
[ t swap set-utf16-started? bom-le swap append ] if ; [ t swap set-utf16-started? bom-le over stream-write ] if
stream-write-utf16le ;
: bom>le/be ( bom -- le/be ) : bom>le/be ( bom -- le/be )
dup bom-le sequence= [ drop utf16le ] [ dup bom-le sequence= [ drop utf16le ] [