Merge branch 'master' of git://factorcode.org/git/littledan

db4
Slava Pestov 2008-03-19 13:23:55 -05:00
commit e826f1fe4a
10 changed files with 216 additions and 268 deletions

View File

@ -1,3 +1,8 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
IN: io.encodings.binary SYMBOL: binary
USING: io.encodings kernel ;
IN: io.encodings.binary
TUPLE: binary ;
M: binary <encoder> drop ;
M: binary <decoder> drop ;

View File

@ -44,25 +44,21 @@ $nl { $vocab-link "io.encodings.utf16" } ;
ARTICLE: "encodings-protocol" "Encoding protocol"
"An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again."
{ $subsection decode-step }
{ $subsection init-decoder }
{ $subsection stream-write-encoded } ;
{ $subsection decode-char }
{ $subsection encode-char }
"The following methods are optional:"
{ $subsection <encoder> }
{ $subsection <decoder> } ;
HELP: decode-step ( buf char encoding -- )
{ $values { "buf" "A string buffer which characters can be pushed to" }
{ "char" "An octet which is read from a stream" }
HELP: decode-char ( stream encoding -- char/f )
{ $values { "stream" "an underlying input stream" }
{ "encoding" "An encoding descriptor tuple" } }
{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change. This should not be used directly." } ;
{ $description "Reads a single code point from the underlying stream, interpreting it by the encoding. This should not be used directly." } ;
HELP: stream-write-encoded ( string stream encoding -- )
{ $values { "string" "a string" }
{ "stream" "an output stream" }
HELP: encode-char ( char stream encoding -- )
{ $values { "char" "a character" }
{ "stream" "an underlying output stream" }
{ "encoding" "an encoding descriptor" } }
{ $description "Encodes the string with the given encoding descriptor, outputing the result to the given stream. This should not be used directly." } ;
{ $description "Writes the code point in the encoding to the underlying stream given. This should not be used directly." } ;
HELP: init-decoder ( stream encoding -- encoding )
{ $values { "stream" "an input stream" }
{ "encoding" "an encoding descriptor" } }
{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM. This should not be used directly." } ;
{ init-decoder decode-step stream-write-encoded } related-words
{ encode-char decode-char } related-words

View File

@ -2,62 +2,36 @@
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors namespaces
growable strings io classes continuations combinators
io.styles io.streams.plain io.encodings.binary splitting
io.streams.duplex byte-arrays ;
io.styles io.streams.plain splitting
io.streams.duplex byte-arrays sequences.private ;
IN: io.encodings
! The encoding descriptor protocol
GENERIC: decode-step ( buf char encoding -- )
M: object decode-step drop swap push ;
GENERIC: decode-char ( stream encoding -- char/f )
GENERIC: init-decoder ( stream encoding -- encoding )
M: tuple-class init-decoder construct-empty init-decoder ;
M: object init-decoder nip ;
GENERIC: encode-char ( char stream encoding -- )
GENERIC: stream-write-encoded ( string stream encoding -- byte-array )
M: object stream-write-encoded drop stream-write ;
GENERIC: <decoder> ( stream decoding -- newstream )
GENERIC: <encoder> ( stream encoding -- newstream )
: replacement-char HEX: fffd ;
! Decoding
<PRIVATE
TUPLE: decode-error ;
: decode-error ( -- * ) \ decode-error construct-empty throw ;
SYMBOL: begin
TUPLE: decoder stream code cr ;
M: tuple-class <decoder> construct-empty <decoder> ;
M: tuple <decoder> f decoder construct-boa ;
: push-decoded ( buf ch -- buf ch state )
over push 0 begin ;
: push-replacement ( buf -- buf ch state )
! This is the replacement character
HEX: fffd push-decoded ;
: space ( resizable -- room-left )
dup underlying swap [ length ] 2apply - ;
: full? ( resizable -- ? ) space zero? ;
: end-read-loop ( buf ch state stream quot -- string/f )
2drop 2drop >string f like ;
: decode-read-loop ( buf stream encoding -- string/f )
pick full? [ 2drop >string ] [
over stream-read1 [
-rot tuck >r >r >r dupd r> decode-step r> r>
decode-read-loop
] [ 2drop >string f like ] if*
] if ;
: decode-read ( length stream encoding -- string )
rot <sbuf> -rot decode-read-loop ;
TUPLE: decoder code cr ;
: <decoder> ( stream encoding -- newstream )
dup binary eq? [ drop ] [
dupd init-decoder { set-delegate set-decoder-code }
decoder construct
] if ;
: >decoder< ( decoder -- stream encoding )
{ decoder-stream decoder-code } get-slots ;
: cr+ t swap set-decoder-cr ; inline
@ -82,72 +56,83 @@ TUPLE: decoder code cr ;
over decoder-cr [
over cr-
"\n" ?head [
swap stream-read1 [ add ] when*
] [ nip ] if
] [ nip ] if ;
over stream-read1 [ add ] when*
] when
] when nip ;
: read-loop ( n stream -- string )
SBUF" " clone [
[
>r nip stream-read1 dup
[ r> push f ] [ r> 2drop t ] if
] 2curry find-integer drop
] keep "" like f like ;
M: decoder stream-read
tuck { delegate decoder-code } get-slots decode-read fix-read ;
tuck read-loop fix-read ;
M: decoder stream-read-partial stream-read ;
: decoder-read-until ( stream delim -- ch )
! Copied from { c-reader stream-read-until }!!!
over stream-read1 dup [
dup pick memq? [ 2nip ] [ , decoder-read-until ] if
] [
2nip
] if ;
: (read-until) ( buf quot -- string/f sep/f )
! quot: -- char stop?
dup call
[ >r drop "" like r> ]
[ pick push (read-until) ] if ; inline
M: decoder stream-read-until
! Copied from { c-reader stream-read-until }!!!
[ swap decoder-read-until ] "" make
swap over empty? over not and [ 2drop f f ] when ;
SBUF" " clone -rot >decoder<
[ decode-char [ dup rot memq? ] [ drop f t ] if* ] 3curry
(read-until) ;
: fix-read1 ( stream char -- char )
over decoder-cr [
over cr-
dup CHAR: \n = [
drop stream-read1
] [ nip ] if
] [ nip ] if ;
drop dup stream-read1
] when
] when nip ;
M: decoder stream-read1
1 swap stream-read f like [ first ] [ f ] if* ;
dup >decoder< decode-char fix-read1 ;
M: decoder stream-readln ( stream -- str )
"\r\n" over stream-read-until handle-readln ;
M: decoder dispose decoder-stream dispose ;
! Encoding
TUPLE: encode-error ;
: encode-error ( -- * ) \ encode-error construct-empty throw ;
TUPLE: encoder code ;
: <encoder> ( stream encoding -- newstream )
dup binary eq? [ drop ] [
construct-empty { set-delegate set-encoder-code }
encoder construct
] if ;
TUPLE: encoder stream code ;
M: tuple-class <encoder> construct-empty <encoder> ;
M: tuple <encoder> encoder construct-boa ;
: >encoder< ( encoder -- stream encoding )
{ encoder-stream encoder-code } get-slots ;
M: encoder stream-write1
>r 1string r> stream-write ;
>encoder< encode-char ;
M: encoder stream-write
{ delegate encoder-code } get-slots stream-write-encoded ;
>encoder< [ encode-char ] 2curry each ;
M: encoder dispose delegate dispose ;
M: encoder dispose encoder-stream dispose ;
M: encoder stream-flush encoder-stream stream-flush ;
INSTANCE: encoder plain-writer
! Rebinding duplex streams which have not read anything yet
: reencode ( stream encoding -- newstream )
over encoder? [ >r delegate r> ] when <encoder> ;
over encoder? [ >r encoder-stream r> ] when <encoder> ;
: redecode ( stream encoding -- newstream )
over decoder? [ >r delegate r> ] when <decoder> ;
over decoder? [ >r decoder-stream r> ] when <decoder> ;
PRIVATE>
: <encoder-duplex> ( stream-in stream-out encoding -- duplex )
tuck reencode >r redecode r> <duplex-stream> ;

View File

@ -6,82 +6,68 @@ IN: io.encodings.utf8
! Decoding UTF-8
TUPLE: utf8 ch state ;
TUPLE: utf8 ;
SYMBOL: double
SYMBOL: triple
SYMBOL: triple2
SYMBOL: quad
SYMBOL: quad2
SYMBOL: quad3
<PRIVATE
: starts-2? ( char -- ? )
-6 shift BIN: 10 number= ;
dup [ -6 shift BIN: 10 number= ] when ;
: append-nums ( buf bottom top state-out -- buf num state )
>r over starts-2?
[ 6 shift swap BIN: 111111 bitand bitor r> ]
[ r> 3drop push-replacement ] if ;
: append-nums ( stream byte -- stream char )
over stream-read1 dup starts-2?
[ swap 6 shift swap BIN: 111111 bitand bitor ]
[ 2drop replacement-char ] if ;
: begin-utf8 ( buf byte -- buf ch state )
: double ( stream byte -- stream char )
BIN: 11111 bitand append-nums ;
: triple ( stream byte -- stream char )
BIN: 1111 bitand append-nums append-nums ;
: quad ( stream byte -- stream char )
BIN: 111 bitand append-nums append-nums append-nums ;
: begin-utf8 ( stream byte -- stream char )
{
{ [ dup -7 shift zero? ] [ push-decoded ] }
{ [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] }
{ [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] }
{ [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] }
{ [ t ] [ drop push-replacement ] }
{ [ dup -7 shift zero? ] [ ] }
{ [ dup -5 shift BIN: 110 number= ] [ double ] }
{ [ dup -4 shift BIN: 1110 number= ] [ triple ] }
{ [ dup -3 shift BIN: 11110 number= ] [ quad ] }
{ [ t ] [ drop replacement-char ] }
} cond ;
: end-multibyte ( buf byte ch -- buf ch state )
f append-nums [ push-decoded ] unless* ;
: decode-utf8 ( stream -- char/f )
dup stream-read1 dup [ begin-utf8 ] when nip ;
: decode-utf8-step ( buf byte ch state -- buf ch state )
{
{ begin [ drop begin-utf8 ] }
{ double [ end-multibyte ] }
{ triple [ triple2 append-nums ] }
{ triple2 [ end-multibyte ] }
{ quad [ quad2 append-nums ] }
{ quad2 [ quad3 append-nums ] }
{ quad3 [ end-multibyte ] }
} case ;
: unpack-state ( encoding -- ch state )
{ utf8-ch utf8-state } get-slots ;
: pack-state ( ch state encoding -- )
{ set-utf8-ch set-utf8-state } set-slots ;
M: utf8 decode-step ( buf char encoding -- )
[ unpack-state decode-utf8-step ] keep pack-state drop ;
M: utf8 init-decoder nip begin over set-utf8-state ;
M: utf8 decode-char
drop decode-utf8 ;
! Encoding UTF-8
: encoded ( char -- )
BIN: 111111 bitand BIN: 10000000 bitor write1 ;
: encoded ( stream char -- )
BIN: 111111 bitand BIN: 10000000 bitor swap stream-write1 ;
: char>utf8 ( char -- )
: char>utf8 ( stream char -- )
{
{ [ dup -7 shift zero? ] [ write1 ] }
{ [ dup -7 shift zero? ] [ swap stream-write1 ] }
{ [ dup -11 shift zero? ] [
dup -6 shift BIN: 11000000 bitor write1
2dup -6 shift BIN: 11000000 bitor swap stream-write1
encoded
] }
{ [ dup -16 shift zero? ] [
dup -12 shift BIN: 11100000 bitor write1
dup -6 shift encoded
2dup -12 shift BIN: 11100000 bitor swap stream-write1
2dup -6 shift encoded
encoded
] }
{ [ t ] [
dup -18 shift BIN: 11110000 bitor write1
dup -12 shift encoded
dup -6 shift encoded
2dup -18 shift BIN: 11110000 bitor swap stream-write1
2dup -12 shift encoded
2dup -6 shift encoded
encoded
] }
} cond ;
M: utf8 stream-write-encoded
! For efficiency, this should be modified to avoid variable reads
drop [ [ char>utf8 ] each ] with-stream* ;
M: utf8 encode-char
drop swap char>utf8 ;
PRIVATE>

View File

@ -1,5 +1,5 @@
USING: byte-arrays byte-vectors kernel io.encodings io.streams.string
sequences io namespaces ;
sequences io namespaces io.encodings.private ;
IN: io.streams.byte-array
: <byte-writer> ( encoding -- stream )
@ -7,7 +7,7 @@ IN: io.streams.byte-array
: with-byte-writer ( encoding quot -- byte-array )
>r <byte-writer> r> [ stdio get ] compose with-stream*
>byte-array ; inline
dup encoder? [ encoder-stream ] when >byte-array ; inline
: <byte-reader> ( byte-array encoding -- stream )
>r >byte-vector dup reverse-here r> <decoder> ;

View File

@ -1,9 +1,9 @@
! Copyright (C) 2003, 2007 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
IN: io.streams.string
USING: io kernel math namespaces sequences sbufs strings
generic splitting growable continuations io.streams.plain
io.encodings ;
io.encodings io.encodings.private ;
IN: io.streams.string
M: growable dispose drop ;
@ -49,8 +49,11 @@ M: growable stream-read
M: growable stream-read-partial
stream-read ;
TUPLE: null ;
M: null decode-char drop stream-read1 ;
: <string-reader> ( str -- stream )
>sbuf dup reverse-here f <decoder> ;
>sbuf dup reverse-here null <decoder> ;
: with-string-reader ( str quot -- )
>r <string-reader> r> with-stream ; inline

View File

@ -1,18 +1,22 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: io io.encodings strings kernel math sequences byte-arrays io.encodings ;
USING: io io.encodings kernel math io.encodings.private ;
IN: io.encodings.ascii
: encode-check< ( string stream max -- )
[ pick <= [ encode-error ] [ stream-write1 ] if ] 2curry each ;
<PRIVATE
: encode-if< ( char stream encoding max -- )
nip 1- pick < [ encode-error ] [ stream-write1 ] if ;
: push-if< ( sbuf character max -- )
over <= [ drop HEX: fffd ] when swap push ;
: decode-if< ( stream encoding max -- character )
nip swap stream-read1
[ tuck > [ drop replacement-char ] unless ]
[ drop f ] if* ;
PRIVATE>
TUPLE: ascii ;
M: ascii stream-write-encoded ( string stream encoding -- )
drop 128 encode-check< ;
M: ascii encode-char
128 encode-if< ;
M: ascii decode-step
drop 128 push-if< ;
M: ascii decode-char
128 decode-if< ;

View File

@ -1,12 +1,12 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: io io.encodings strings kernel io.encodings.ascii sequences math ;
USING: io io.encodings kernel io.encodings.ascii.private ;
IN: io.encodings.latin1
TUPLE: latin1 ;
M: latin1 stream-write-encoded
drop 256 encode-check< ;
M: latin1 encode-char
256 encode-if< ;
M: latin1 decode-step
drop swap push ;
M: latin1 decode-char
drop stream-read1 ;

View File

@ -1,133 +1,101 @@
! Copyright (C) 2006, 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors namespaces io.binary
io.encodings combinators splitting io byte-arrays ;
io.encodings combinators splitting io byte-arrays inspector ;
IN: io.encodings.utf16
TUPLE: utf16be ;
TUPLE: utf16le ;
TUPLE: utf16 ;
<PRIVATE
! UTF-16BE decoding
TUPLE: utf16be ch state ;
SYMBOL: double
SYMBOL: quad1
SYMBOL: quad2
SYMBOL: quad3
SYMBOL: ignore
: do-ignore ( -- ch state ) 0 ignore ;
: append-nums ( byte ch -- ch )
8 shift bitor ;
over [ 8 shift bitor ] [ 2drop replacement-char ] if ;
: end-multibyte ( buf byte ch -- buf ch state )
append-nums push-decoded ;
: double-be ( stream byte -- stream char )
over stream-read1 swap append-nums ;
: begin-utf16be ( buf byte -- buf ch state )
: quad-be ( stream byte -- stream char )
double-be over stream-read1 [
dup -2 shift BIN: 110111 number= [
>r 2 shift r> BIN: 11 bitand bitor
over stream-read1 swap append-nums HEX: 10000 +
] [ 2drop dup stream-read1 drop replacement-char ] if
] when* ;
: ignore ( stream -- stream char )
dup stream-read1 drop replacement-char ;
: begin-utf16be ( stream byte -- stream char )
dup -3 shift BIN: 11011 number= [
dup BIN: 00000100 bitand zero?
[ BIN: 11 bitand quad1 ]
[ drop do-ignore ] if
] [ double ] if ;
: handle-quad2be ( byte ch -- ch state )
swap dup -2 shift BIN: 110111 number= [
>r 2 shift r> BIN: 11 bitand bitor quad3
] [ 2drop do-ignore ] if ;
: decode-utf16be-step ( buf byte ch state -- buf ch state )
{
{ begin [ drop begin-utf16be ] }
{ double [ end-multibyte ] }
{ quad1 [ append-nums quad2 ] }
{ quad2 [ handle-quad2be ] }
{ quad3 [ append-nums HEX: 10000 + push-decoded ] }
{ ignore [ 2drop push-replacement ] }
} case ;
: unpack-state-be ( encoding -- ch state )
{ utf16be-ch utf16be-state } get-slots ;
: pack-state-be ( ch state encoding -- )
{ set-utf16be-ch set-utf16be-state } set-slots ;
M: utf16be decode-step
[ unpack-state-be decode-utf16be-step ] keep pack-state-be drop ;
M: utf16be init-decoder nip begin over set-utf16be-state ;
[ BIN: 11 bitand quad-be ]
[ drop ignore ] if
] [ double-be ] if ;
M: utf16be decode-char
drop dup stream-read1 dup [ begin-utf16be ] when nip ;
! UTF-16LE decoding
TUPLE: utf16le ch state ;
: quad-le ( stream ch -- stream char )
over stream-read1 swap 10 shift bitor
over stream-read1 dup -2 shift BIN: 110111 = [
BIN: 11 bitand append-nums HEX: 10000 +
] [ 2drop replacement-char ] if ;
: handle-double ( buf byte ch -- buf ch state )
swap dup -3 shift BIN: 11011 = [
: double-le ( stream byte1 byte2 -- stream char )
dup -3 shift BIN: 11011 = [
dup BIN: 100 bitand 0 number=
[ BIN: 11 bitand 8 shift bitor quad2 ]
[ 2drop push-replacement ] if
] [ end-multibyte ] if ;
[ BIN: 11 bitand 8 shift bitor quad-le ]
[ 2drop replacement-char ] if
] [ append-nums ] if ;
: handle-quad3le ( buf byte ch -- buf ch state )
swap dup -2 shift BIN: 110111 = [
BIN: 11 bitand append-nums HEX: 10000 + push-decoded
] [ 2drop push-replacement ] if ;
: begin-utf16le ( stream byte -- stream char )
over stream-read1 [ double-le ] [ drop replacement-char ] if* ;
: decode-utf16le-step ( buf byte ch state -- buf ch state )
{
{ begin [ drop double ] }
{ double [ handle-double ] }
{ quad1 [ append-nums quad2 ] }
{ quad2 [ 10 shift bitor quad3 ] }
{ quad3 [ handle-quad3le ] }
} case ;
: unpack-state-le ( encoding -- ch state )
{ utf16le-ch utf16le-state } get-slots ;
: pack-state-le ( ch state encoding -- )
{ set-utf16le-ch set-utf16le-state } set-slots ;
M: utf16le decode-step
[ unpack-state-le decode-utf16le-step ] keep pack-state-le drop ;
M: utf16le init-decoder nip begin over set-utf16le-state ;
M: utf16le decode-char
drop dup stream-read1 dup [ begin-utf16le ] when nip ;
! UTF-16LE/BE encoding
: encode-first
: encode-first ( char -- byte1 byte2 )
-10 shift
dup -8 shift BIN: 11011000 bitor
swap HEX: FF bitand ;
: encode-second
: encode-second ( char -- byte3 byte4 )
BIN: 1111111111 bitand
dup -8 shift BIN: 11011100 bitor
swap BIN: 11111111 bitand ;
: char>utf16be ( char -- )
: stream-write2 ( stream char1 char2 -- )
rot [ stream-write1 ] curry 2apply ;
: char>utf16be ( stream char -- )
dup HEX: FFFF > [
HEX: 10000 -
dup encode-first swap write1 write1
encode-second swap write1 write1
] [ h>b/b write1 write1 ] if ;
2dup encode-first stream-write2
encode-second stream-write2
] [ h>b/b swap stream-write2 ] if ;
: stream-write-utf16be ( string stream -- )
[ [ char>utf16be ] each ] with-stream* ;
M: utf16be encode-char ( char stream encoding -- )
drop swap char>utf16be ;
M: utf16be stream-write-encoded ( string stream encoding -- )
drop stream-write-utf16be ;
: char>utf16le ( char -- )
: char>utf16le ( char stream -- )
dup HEX: FFFF > [
HEX: 10000 -
dup encode-first write1 write1
encode-second write1 write1
] [ h>b/b swap write1 write1 ] if ;
2dup encode-first swap stream-write2
encode-second swap stream-write2
] [ h>b/b stream-write2 ] if ;
: stream-write-utf16le ( string stream -- )
[ [ char>utf16le ] each ] with-stream* ;
M: utf16le stream-write-encoded ( string stream encoding -- )
drop stream-write-utf16le ;
M: utf16le encode-char ( char stream encoding -- )
drop swap char>utf16le ;
! UTF-16
@ -139,17 +107,18 @@ M: utf16le stream-write-encoded ( string stream encoding -- )
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
TUPLE: utf16 started? ;
M: utf16 stream-write-encoded
dup utf16-started? [ drop ]
[ t swap set-utf16-started? bom-le over stream-write ] if
stream-write-utf16le ;
TUPLE: missing-bom ;
M: missing-bom summary drop "The BOM for a UTF-16 stream was missing" ;
: bom>le/be ( bom -- le/be )
dup bom-le sequence= [ drop utf16le ] [
bom-be sequence= [ utf16be ] [ decode-error ] if
bom-be sequence= [ utf16be ] [ missing-bom ] if
] if ;
M: utf16 init-decoder ( stream encoding -- newencoding )
2 rot stream-read bom>le/be construct-empty init-decoder ;
M: utf16 <decoder> ( stream utf16 -- decoder )
drop 2 over stream-read bom>le/be <decoder> ;
M: utf16 <encoder> ( stream utf16 -- encoder )
drop bom-le over stream-write utf16le <encoder> ;
PRIVATE>

View File

@ -1,6 +1,6 @@
IN: io.unix.launcher.tests
USING: io.files tools.test io.launcher arrays io namespaces
continuations math io.encodings.ascii io.encodings.latin1
continuations math io.encodings.binary io.encodings.ascii
accessors kernel sequences ;
[ ] [
@ -64,7 +64,7 @@ accessors kernel sequences ;
[ ] [
2 [
"launcher-test-1" temp-file ascii <file-appender> [
"launcher-test-1" temp-file binary <file-appender> [
<process>
swap >>stdout
"echo Hello" >>command
@ -84,7 +84,7 @@ accessors kernel sequences ;
<process>
"env" >>command
{ { "A" "B" } } >>environment
latin1 <process-stream> lines
ascii <process-stream> lines
"A=B" swap member?
] unit-test
@ -93,5 +93,5 @@ accessors kernel sequences ;
"env" >>command
{ { "A" "B" } } >>environment
+replace-environment+ >>environment-mode
latin1 <process-stream> lines
ascii <process-stream> lines
] unit-test