Initial optimization of encodings

db4
Daniel Ehrenberg 2008-03-18 17:01:14 -04:00
parent 037c8cf35e
commit b362175d43
8 changed files with 65 additions and 67 deletions

View File

@ -44,25 +44,21 @@ $nl { $vocab-link "io.encodings.utf16" } ;
ARTICLE: "encodings-protocol" "Encoding protocol" ARTICLE: "encodings-protocol" "Encoding protocol"
"An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again." "An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again."
{ $subsection decode-step } { $subsection decode-char }
{ $subsection init-decoder } { $subsection encode-char }
{ $subsection stream-write-encoded } ; "The following methods are optional:"
{ $subsection <encoder> }
{ $subsection <decoder> } ;
HELP: decode-step ( buf char encoding -- ) HELP: decode-char ( stream encoding -- char/f )
{ $values { "buf" "A string buffer which characters can be pushed to" } { $values { "stream" "an underlying input stream" }
{ "char" "An octet which is read from a stream" }
{ "encoding" "An encoding descriptor tuple" } } { "encoding" "An encoding descriptor tuple" } }
{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change. This should not be used directly." } ; { $description "Reads a single code point from the underlying stream, interpreting it by the encoding. This should not be used directly." } ;
HELP: stream-write-encoded ( string stream encoding -- ) HELP: encode-char ( char stream encoding -- )
{ $values { "string" "a string" } { $values { "char" "a character" }
{ "stream" "an output stream" } { "stream" "an underlying output stream" }
{ "encoding" "an encoding descriptor" } } { "encoding" "an encoding descriptor" } }
{ $description "Encodes the string with the given encoding descriptor, outputing the result to the given stream. This should not be used directly." } ; { $description "Writes the code point in the encoding to the underlying stream given. This should not be used directly." } ;
HELP: init-decoder ( stream encoding -- encoding ) { encode-char decode-char } related-words
{ $values { "stream" "an input stream" }
{ "encoding" "an encoding descriptor" } }
{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM. This should not be used directly." } ;
{ init-decoder decode-step stream-write-encoded } related-words

View File

@ -61,25 +61,28 @@ M: tuple <decoder> f decoder construct-boa ;
] when nip ; ] when nip ;
: read-loop ( n stream -- string ) : read-loop ( n stream -- string )
over 0 <string> [ SBUF" " clone [
[ [
>r stream-read1 dup >r nip stream-read1 dup
[ swap r> set-nth-unsafe f ] [ r> 3drop t ] if [ r> push f ] [ r> 2drop t ] if
] 2curry find-integer ] 2curry find-integer drop
] keep swap [ head ] when* ; ] keep "" like f like ;
M: decoder stream-read M: decoder stream-read
tuck read-loop fix-read ; tuck read-loop fix-read ;
M: decoder stream-read-partial stream-read ;
: (read-until) ( buf quot -- string/f sep/f ) : (read-until) ( buf quot -- string/f sep/f )
! quot: -- char keep-going? ! quot: -- char stop?
dup call dup call
[ >r drop "" like r> ] [ >r drop "" like r> ]
[ pick push (read-until) ] if ; inline [ pick push (read-until) ] if ; inline
M: decoder stream-read-until M: decoder stream-read-until
SBUF" " clone -rot >decoder< SBUF" " clone -rot >decoder<
[ decode-char dup rot memq? ] 3curry (read-until) ; [ decode-char [ dup rot memq? ] [ drop f t ] if* ] 3curry
(read-until) ;
: fix-read1 ( stream char -- char ) : fix-read1 ( stream char -- char )
over decoder-cr [ over decoder-cr [
@ -118,6 +121,8 @@ M: encoder stream-write
M: encoder dispose encoder-stream dispose ; M: encoder dispose encoder-stream dispose ;
M: encoder stream-flush encoder-stream stream-flush ;
INSTANCE: encoder plain-writer INSTANCE: encoder plain-writer
! Rebinding duplex streams which have not read anything yet ! Rebinding duplex streams which have not read anything yet

View File

@ -15,7 +15,7 @@ TUPLE: utf8 ;
: append-nums ( stream byte -- stream char ) : append-nums ( stream byte -- stream char )
over stream-read1 dup starts-2? over stream-read1 dup starts-2?
[ 6 shift swap BIN: 111111 bitand bitor ] [ swap 6 shift swap BIN: 111111 bitand bitor ]
[ 2drop replacement-char ] if ; [ 2drop replacement-char ] if ;
: double ( stream byte -- stream char ) : double ( stream byte -- stream char )

View File

@ -1,5 +1,5 @@
USING: byte-arrays byte-vectors kernel io.encodings io.streams.string USING: byte-arrays byte-vectors kernel io.encodings io.streams.string
sequences io namespaces ; sequences io namespaces io.encodings.private ;
IN: io.streams.byte-array IN: io.streams.byte-array
: <byte-writer> ( encoding -- stream ) : <byte-writer> ( encoding -- stream )
@ -7,7 +7,7 @@ IN: io.streams.byte-array
: with-byte-writer ( encoding quot -- byte-array ) : with-byte-writer ( encoding quot -- byte-array )
>r <byte-writer> r> [ stdio get ] compose with-stream* >r <byte-writer> r> [ stdio get ] compose with-stream*
>byte-array ; inline dup encoder? [ encoder-stream ] when >byte-array ; inline
: <byte-reader> ( byte-array encoding -- stream ) : <byte-reader> ( byte-array encoding -- stream )
>r >byte-vector dup reverse-here r> <decoder> ; >r >byte-vector dup reverse-here r> <decoder> ;

View File

@ -49,8 +49,11 @@ M: growable stream-read
M: growable stream-read-partial M: growable stream-read-partial
stream-read ; stream-read ;
TUPLE: null ;
M: null decode-char drop stream-read1 ;
: <string-reader> ( str -- stream ) : <string-reader> ( str -- stream )
>sbuf dup reverse-here f <decoder> ; >sbuf dup reverse-here null <decoder> ;
: with-string-reader ( str quot -- ) : with-string-reader ( str quot -- )
>r <string-reader> r> with-stream ; inline >r <string-reader> r> with-stream ; inline

View File

@ -1,14 +1,16 @@
! Copyright (C) 2008 Daniel Ehrenberg. ! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: io io.encodings kernel math ; USING: io io.encodings kernel math io.encodings.private ;
IN: io.encodings.ascii IN: io.encodings.ascii
<PRIVATE <PRIVATE
: encode-if< ( char stream encoding max -- ) : encode-if< ( char stream encoding max -- )
nip pick > [ encode-error ] [ stream-write1 ] if ; nip 1- pick < [ encode-error ] [ stream-write1 ] if ;
: decode-if< ( stream encoding max -- character ) : decode-if< ( stream encoding max -- character )
nip swap stream-read1 tuck > [ drop replacement-character ] unless ; nip swap stream-read1
[ tuck > [ drop replacement-char ] unless ]
[ drop f ] if* ;
PRIVATE> PRIVATE>
TUPLE: ascii ; TUPLE: ascii ;

View File

@ -1,14 +1,14 @@
! Copyright (C) 2006, 2008 Daniel Ehrenberg. ! Copyright (C) 2006, 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors namespaces io.binary USING: math kernel sequences sbufs vectors namespaces io.binary
io.encodings combinators splitting io byte-arrays ; io.encodings combinators splitting io byte-arrays inspector ;
IN: io.encodings.utf16 IN: io.encodings.utf16
TUPLE: utf16be ; TUPLE: utf16be ;
TUPLE: utf16le ch state ; TUPLE: utf16le ;
TUPLE: utf16 started? ; TUPLE: utf16 ;
<PRIVATE <PRIVATE
@ -21,12 +21,12 @@ TUPLE: utf16 started? ;
over stream-read1 swap append-nums ; over stream-read1 swap append-nums ;
: quad-be ( stream byte -- stream char ) : quad-be ( stream byte -- stream char )
double-be over stream-read1 dup [ double-be over stream-read1 [
dup -2 shift BIN: 110111 number= [ dup -2 shift BIN: 110111 number= [
>r 2 shift r> BIN: 11 bitand bitor >r 2 shift r> BIN: 11 bitand bitor
over stream-read1 swap append-nums HEX: 10000 + over stream-read1 swap append-nums HEX: 10000 +
] [ 2drop replacement-char ] if ] [ 2drop dup stream-read1 drop replacement-char ] if
] when ; ] when* ;
: ignore ( stream -- stream char ) : ignore ( stream -- stream char )
dup stream-read1 drop replacement-char ; dup stream-read1 drop replacement-char ;
@ -38,7 +38,7 @@ TUPLE: utf16 started? ;
[ drop ignore ] if [ drop ignore ] if
] [ double-be ] if ; ] [ double-be ] if ;
M: decode-char M: utf16be decode-char
drop dup stream-read1 dup [ begin-utf16be ] when nip ; drop dup stream-read1 dup [ begin-utf16be ] when nip ;
! UTF-16LE decoding ! UTF-16LE decoding
@ -54,59 +54,48 @@ M: decode-char
dup BIN: 100 bitand 0 number= dup BIN: 100 bitand 0 number=
[ BIN: 11 bitand 8 shift bitor quad-le ] [ BIN: 11 bitand 8 shift bitor quad-le ]
[ 2drop replacement-char ] if [ 2drop replacement-char ] if
] [ swap append-nums ] if ; ] [ append-nums ] if ;
: decode-utf16le-step ( buf byte ch state -- buf ch state )
{
{ begin [ drop double ] }
{ double [ handle-double ] }
{ quad2 [ 10 shift bitor quad3 ] }
{ quad3 [ handle-quad3le ] }
} case ;
: begin-utf16le ( stream byte -- stream char ) : begin-utf16le ( stream byte -- stream char )
over stream-read1 [ double-le ] [ drop replacement-char ] if* over stream-read1 [ double-le ] [ drop replacement-char ] if* ;
M: decode-char M: utf16le decode-char
drop dup stream-read1 dup [ begin-utf16le ] when nip ; drop dup stream-read1 dup [ begin-utf16le ] when nip ;
! UTF-16LE/BE encoding ! UTF-16LE/BE encoding
: encode-first : encode-first ( char -- byte1 byte2 )
-10 shift -10 shift
dup -8 shift BIN: 11011000 bitor dup -8 shift BIN: 11011000 bitor
swap HEX: FF bitand ; swap HEX: FF bitand ;
: encode-second : encode-second ( char -- byte3 byte4 )
BIN: 1111111111 bitand BIN: 1111111111 bitand
dup -8 shift BIN: 11011100 bitor dup -8 shift BIN: 11011100 bitor
swap BIN: 11111111 bitand ; swap BIN: 11111111 bitand ;
: stream-write2 ( stream char1 char2 -- ) : stream-write2 ( stream char1 char2 -- )
rot [ stream-write1 ] 2apply ; rot [ stream-write1 ] curry 2apply ;
: char>utf16be ( stream char -- ) : char>utf16be ( stream char -- )
dup HEX: FFFF > [ dup HEX: FFFF > [
HEX: 10000 - HEX: 10000 -
dup encode-first stream-write2 2dup encode-first stream-write2
encode-second stream-write2 encode-second stream-write2
] [ h>b/b swap stream-write2 ] if ; ] [ h>b/b swap stream-write2 ] if ;
M: utf16be encode-char ( char stream encoding -- ) M: utf16be encode-char ( char stream encoding -- )
drop char>utf16be ; drop swap char>utf16be ;
: char>utf16le ( char -- ) : char>utf16le ( char stream -- )
dup HEX: FFFF > [ dup HEX: FFFF > [
HEX: 10000 - HEX: 10000 -
dup encode-first swap stream-write2 2dup encode-first swap stream-write2
encode-second swap stream-write2 encode-second swap stream-write2
] [ h>b/b stream-write2 ] if ; ] [ h>b/b stream-write2 ] if ;
: stream-write-utf16le ( string stream -- ) M: utf16le encode-char ( char stream encoding -- )
[ [ char>utf16le ] each ] with-stream* ; drop swap char>utf16le ;
M: utf16le stream-write-encoded ( string stream encoding -- )
drop stream-write-utf16le ;
! UTF-16 ! UTF-16
@ -118,13 +107,16 @@ M: utf16le stream-write-encoded ( string stream encoding -- )
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ; : start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
TUPLE: missing-bom ;
M: missing-bom summary drop "The BOM for a UTF-16 stream was missing" ;
: bom>le/be ( bom -- le/be ) : bom>le/be ( bom -- le/be )
dup bom-le sequence= [ drop utf16le ] [ dup bom-le sequence= [ drop utf16le ] [
bom-be sequence= [ utf16be ] [ decode-error ] if bom-be sequence= [ utf16be ] [ missing-bom ] if
] if ; ] if ;
M: utf16 <decoder> ( stream utf16 -- decoder ) M: utf16 <decoder> ( stream utf16 -- decoder )
2 rot stream-read bom>le/be <decoder> ; drop 2 over stream-read bom>le/be <decoder> ;
M: utf16 <encoder> ( stream utf16 -- encoder ) M: utf16 <encoder> ( stream utf16 -- encoder )
drop bom-le over stream-write utf16le <encoder> ; drop bom-le over stream-write utf16le <encoder> ;

View File

@ -1,6 +1,6 @@
IN: io.unix.launcher.tests IN: io.unix.launcher.tests
USING: io.files tools.test io.launcher arrays io namespaces USING: io.files tools.test io.launcher arrays io namespaces
continuations math io.encodings.ascii io.encodings.latin1 continuations math io.encodings.binary io.encodings.ascii
accessors kernel sequences ; accessors kernel sequences ;
[ ] [ [ ] [
@ -64,7 +64,7 @@ accessors kernel sequences ;
[ ] [ [ ] [
2 [ 2 [
"launcher-test-1" temp-file ascii <file-appender> [ "launcher-test-1" temp-file binary <file-appender> [
<process> <process>
swap >>stdout swap >>stdout
"echo Hello" >>command "echo Hello" >>command
@ -84,7 +84,7 @@ accessors kernel sequences ;
<process> <process>
"env" >>command "env" >>command
{ { "A" "B" } } >>environment { { "A" "B" } } >>environment
latin1 <process-stream> lines ascii <process-stream> lines
"A=B" swap member? "A=B" swap member?
] unit-test ] unit-test
@ -93,5 +93,5 @@ accessors kernel sequences ;
"env" >>command "env" >>command
{ { "A" "B" } } >>environment { { "A" "B" } } >>environment
+replace-environment+ >>environment-mode +replace-environment+ >>environment-mode
latin1 <process-stream> lines ascii <process-stream> lines
] unit-test ] unit-test