Beginning of encoded streams
							parent
							
								
									5c37add12a
								
							
						
					
					
						commit
						99ff43b404
					
				| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
text
 | 
			
		||||
| 
						 | 
				
			
			@ -23,6 +23,18 @@ SYMBOL: begin
 | 
			
		|||
: finish-decoding ( buf ch state -- str )
 | 
			
		||||
    begin eq? [ decode-error ] unless drop "" like ;
 | 
			
		||||
 | 
			
		||||
: decode ( seq quot -- str )
 | 
			
		||||
    >r [ length <sbuf> 0 begin ] keep r> each
 | 
			
		||||
    finish-decoding ; inline
 | 
			
		||||
: decode ( ch state seq quot -- buf ch state )
 | 
			
		||||
    [ -rot ] swap compose each ; inline
 | 
			
		||||
 | 
			
		||||
: start-decoding ( seq -- buf ch state seq )
 | 
			
		||||
    [ length <sbuf> 0 begin ] keep ;
 | 
			
		||||
 | 
			
		||||
GENERIC: init-decoding ( stream encoding -- decoded-stream )
 | 
			
		||||
 | 
			
		||||
: <decoding> ( stream decoding-class -- decoded-stream )
 | 
			
		||||
    construct-empty init-decoding ;
 | 
			
		||||
 | 
			
		||||
GENERIC: init-encoding ( stream encoding -- encoded-stream )
 | 
			
		||||
 | 
			
		||||
: <encoding> ( stream encoding-class -- encoded-stream )
 | 
			
		||||
    construct-empty init-encoding ;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
ISO 8859-1 encoding/decoding
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
Daniel Ehrenberg
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,5 @@
 | 
			
		|||
USING: help.syntax help.markup ;
 | 
			
		||||
IN: io.encodings.latin1
 | 
			
		||||
 | 
			
		||||
HELP: latin1
 | 
			
		||||
{ $class-description "This class is used for Latin 1 (ISO 8859-1) encoding and decoding" } ;
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,19 @@
 | 
			
		|||
USING: io.encodings strings kernel ;
 | 
			
		||||
IN: io.encodings.latin1
 | 
			
		||||
 | 
			
		||||
TUPLE: latin1 stream ;
 | 
			
		||||
 | 
			
		||||
M: latin1 init-decoding tuck set-latin1-stream ;
 | 
			
		||||
M: latin1 init-encoding drop ;
 | 
			
		||||
 | 
			
		||||
M: latin1 stream-read1
 | 
			
		||||
    latin1-stream stream-read1 ;
 | 
			
		||||
 | 
			
		||||
M: latin1 stream-read
 | 
			
		||||
    latin1-stream stream-read >string ;
 | 
			
		||||
 | 
			
		||||
M: latin1 stream-read-until
 | 
			
		||||
    latin1-stream stream-read-until >string ;
 | 
			
		||||
 | 
			
		||||
M: latin1 stream-readln
 | 
			
		||||
    latin1-stream stream-readln >string ;
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
text
 | 
			
		||||
										
											Binary file not shown.
										
									
								
							| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
UTF-16, UTF-16LE, UTF-16BE encoding and decoding
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
Daniel Ehrenberg
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
UTF16 encoding/decoding
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
text
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,45 @@
 | 
			
		|||
USING: help.markup help.syntax io.encodings strings ;
 | 
			
		||||
IN: io.encodings.utf16
 | 
			
		||||
 | 
			
		||||
ARTICLE: "io.utf16" "Working with UTF16-encoded data"
 | 
			
		||||
"The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences."
 | 
			
		||||
{ $subsection encode-utf16le }
 | 
			
		||||
{ $subsection encode-utf16be }
 | 
			
		||||
{ $subsection decode-utf16le }
 | 
			
		||||
{ $subsection decode-utf16be }
 | 
			
		||||
"Support for UTF16 data with a byte order mark:"
 | 
			
		||||
{ $subsection encode-utf16 }
 | 
			
		||||
{ $subsection decode-utf16 } ;
 | 
			
		||||
 | 
			
		||||
ABOUT: "io.utf16"
 | 
			
		||||
 | 
			
		||||
HELP: decode-utf16
 | 
			
		||||
{ $values { "seq" "a sequence of bytes" } { "str" string } }
 | 
			
		||||
{ $description "Decodes a sequence of bytes representing a Unicode string in UTF16 format. The bytes must begin with a UTF16 byte order mark, which determines if the input is in little or big endian. To decode data without a byte order mark, use " { $link decode-utf16le } " or " { $link decode-utf16be } "." }
 | 
			
		||||
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
 | 
			
		||||
 | 
			
		||||
HELP: decode-utf16be
 | 
			
		||||
{ $values { "seq" "a sequence of bytes" } { "str" string } }
 | 
			
		||||
{ $description "Decodes a sequence of bytes representing a Unicode string in big endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
 | 
			
		||||
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
 | 
			
		||||
 | 
			
		||||
HELP: decode-utf16le
 | 
			
		||||
{ $values { "seq" "a sequence of bytes" } { "str" string } }
 | 
			
		||||
{ $description "Decodes a sequence of bytes representing a Unicode string in little endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
 | 
			
		||||
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
 | 
			
		||||
 | 
			
		||||
{ decode-utf16 decode-utf16le decode-utf16be } related-words
 | 
			
		||||
 | 
			
		||||
HELP: encode-utf16be
 | 
			
		||||
{ $values { "str" string } { "seq" "a sequence of bytes" } }
 | 
			
		||||
{ $description "Encodes a Unicode string as a sequence of bytes in big endian UTF16 format." } ;
 | 
			
		||||
 | 
			
		||||
HELP: encode-utf16le
 | 
			
		||||
{ $values { "str" string } { "seq" "a sequence of bytes" } }
 | 
			
		||||
{ $description "Encodes a Unicode string as a sequence of bytes in little endian UTF16 format." } ;
 | 
			
		||||
 | 
			
		||||
HELP: encode-utf16
 | 
			
		||||
{ $values { "str" string } { "seq" "a sequence of bytes" } }
 | 
			
		||||
{ $description "Encodes a Unicode string as a sequence of bytes in UTF16 format with a byte order mark." } ;
 | 
			
		||||
 | 
			
		||||
{ encode-utf16 encode-utf16be encode-utf16le } related-words
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,15 @@
 | 
			
		|||
USING: tools.test io.utf16 arrays unicode.syntax ;
 | 
			
		||||
 | 
			
		||||
[ { CHAR: x } ] [ { 0 CHAR: x } decode-utf16be >array ] unit-test
 | 
			
		||||
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test
 | 
			
		||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test
 | 
			
		||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test
 | 
			
		||||
 | 
			
		||||
[ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test
 | 
			
		||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test
 | 
			
		||||
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test
 | 
			
		||||
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,116 @@
 | 
			
		|||
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: math kernel sequences sbufs vectors namespaces io.binary
 | 
			
		||||
io.encodings combinators splitting ;
 | 
			
		||||
IN: io.utf16
 | 
			
		||||
 | 
			
		||||
SYMBOL: double
 | 
			
		||||
SYMBOL: quad1
 | 
			
		||||
SYMBOL: quad2
 | 
			
		||||
SYMBOL: quad3
 | 
			
		||||
SYMBOL: ignore
 | 
			
		||||
 | 
			
		||||
: do-ignore ( -- ch state ) 0 ignore ;
 | 
			
		||||
 | 
			
		||||
: append-nums ( byte ch -- ch )
 | 
			
		||||
    8 shift bitor ;
 | 
			
		||||
 | 
			
		||||
: end-multibyte ( buf byte ch -- buf ch state )
 | 
			
		||||
    append-nums decoded ;
 | 
			
		||||
 | 
			
		||||
: begin-utf16be ( buf byte -- buf ch state )
 | 
			
		||||
    dup -3 shift BIN: 11011 number= [
 | 
			
		||||
        dup BIN: 00000100 bitand zero?
 | 
			
		||||
        [ BIN: 11 bitand quad1 ]
 | 
			
		||||
        [ drop do-ignore ] if
 | 
			
		||||
    ] [ double ] if ;
 | 
			
		||||
 | 
			
		||||
: handle-quad2be ( byte ch -- ch state )
 | 
			
		||||
    swap dup -2 shift BIN: 110111 number= [
 | 
			
		||||
        >r 2 shift r> BIN: 11 bitand bitor quad3
 | 
			
		||||
    ] [ 2drop do-ignore ] if ;
 | 
			
		||||
 | 
			
		||||
: (decode-utf16be) ( buf byte ch state -- buf ch state )
 | 
			
		||||
    {
 | 
			
		||||
        { begin [ drop begin-utf16be ] }
 | 
			
		||||
        { double [ end-multibyte ] }
 | 
			
		||||
        { quad1 [ append-nums quad2 ] }
 | 
			
		||||
        { quad2 [ handle-quad2be ] }
 | 
			
		||||
        { quad3 [ append-nums HEX: 10000 + decoded ] }
 | 
			
		||||
        { ignore [ 2drop push-replacement ] }
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
: decode-utf16be ( seq -- str )
 | 
			
		||||
    [ (decode-utf16be) ] decode ;
 | 
			
		||||
 | 
			
		||||
: handle-double ( buf byte ch -- buf ch state )
 | 
			
		||||
    swap dup -3 shift BIN: 11011 = [
 | 
			
		||||
        dup BIN: 100 bitand 0 number=
 | 
			
		||||
        [ BIN: 11 bitand 8 shift bitor quad2 ]
 | 
			
		||||
        [ 2drop push-replacement ] if
 | 
			
		||||
    ] [ end-multibyte ] if ;
 | 
			
		||||
 | 
			
		||||
: handle-quad3le ( buf byte ch -- buf ch state )
 | 
			
		||||
    swap dup -2 shift BIN: 110111 = [
 | 
			
		||||
        BIN: 11 bitand append-nums HEX: 10000 + decoded
 | 
			
		||||
    ] [ 2drop push-replacement ] if ;
 | 
			
		||||
 | 
			
		||||
: (decode-utf16le) ( buf byte ch state -- buf ch state )
 | 
			
		||||
    {
 | 
			
		||||
        { begin [ drop double ] }
 | 
			
		||||
        { double [ handle-double ] }
 | 
			
		||||
        { quad1 [ append-nums quad2 ] }
 | 
			
		||||
        { quad2 [ 10 shift bitor quad3 ] }
 | 
			
		||||
        { quad3 [ handle-quad3le ] }
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
: decode-utf16le ( seq -- str )
 | 
			
		||||
    [ (decode-utf16le) ] decode ;
 | 
			
		||||
 | 
			
		||||
: encode-first
 | 
			
		||||
    -10 shift
 | 
			
		||||
    dup -8 shift BIN: 11011000 bitor
 | 
			
		||||
    swap HEX: FF bitand ;
 | 
			
		||||
 | 
			
		||||
: encode-second
 | 
			
		||||
    BIN: 1111111111 bitand
 | 
			
		||||
    dup -8 shift BIN: 11011100 bitor
 | 
			
		||||
    swap BIN: 11111111 bitand ;
 | 
			
		||||
 | 
			
		||||
: char>utf16be ( char -- )
 | 
			
		||||
    dup HEX: FFFF > [
 | 
			
		||||
        HEX: 10000 -
 | 
			
		||||
        dup encode-first swap , ,
 | 
			
		||||
        encode-second swap , ,
 | 
			
		||||
    ] [ h>b/b , , ] if ;
 | 
			
		||||
 | 
			
		||||
: encode-utf16be ( str -- seq )
 | 
			
		||||
    [ [ char>utf16be ] each ] B{ } make ;
 | 
			
		||||
 | 
			
		||||
: char>utf16le ( char -- )
 | 
			
		||||
    dup HEX: FFFF > [
 | 
			
		||||
        HEX: 10000 -
 | 
			
		||||
        dup encode-first , ,
 | 
			
		||||
        encode-second , ,
 | 
			
		||||
    ] [ h>b/b swap , , ] if ; 
 | 
			
		||||
 | 
			
		||||
: encode-utf16le ( str -- seq )
 | 
			
		||||
    [ [ char>utf16le ] each ] B{ } make ;
 | 
			
		||||
 | 
			
		||||
: bom-le B{ HEX: ff HEX: fe } ; inline
 | 
			
		||||
 | 
			
		||||
: bom-be B{ HEX: fe HEX: ff } ; inline
 | 
			
		||||
 | 
			
		||||
: encode-utf16 ( str -- seq )
 | 
			
		||||
    encode-utf16le bom-le swap append ;
 | 
			
		||||
 | 
			
		||||
: utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
 | 
			
		||||
 | 
			
		||||
: utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
 | 
			
		||||
 | 
			
		||||
: decode-utf16 ( seq -- str )
 | 
			
		||||
    {
 | 
			
		||||
        { [ utf16le? ] [ decode-utf16le ] }
 | 
			
		||||
        { [ utf16be? ] [ decode-utf16be ] }
 | 
			
		||||
        { [ t ] [ decode-error ] }
 | 
			
		||||
    } cond ;
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
UTF-8 encoding and decoding
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
Daniel Ehrenberg
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
UTF8 encoding/decoding
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
text
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,18 @@
 | 
			
		|||
USING: help.markup help.syntax io.encodings strings ;
 | 
			
		||||
IN: io.encodings.utf8
 | 
			
		||||
 | 
			
		||||
ARTICLE: "io.utf8" "Working with UTF8-encoded data"
 | 
			
		||||
"The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences."
 | 
			
		||||
{ $subsection encode-utf8 }
 | 
			
		||||
{ $subsection decode-utf8 } ;
 | 
			
		||||
 | 
			
		||||
ABOUT: "io.utf8"
 | 
			
		||||
 | 
			
		||||
HELP: decode-utf8
 | 
			
		||||
{ $values { "seq" "a sequence of bytes" } { "str" string } }
 | 
			
		||||
{ $description "Decodes a sequence of bytes representing a Unicode string in UTF8 format." }
 | 
			
		||||
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
 | 
			
		||||
 | 
			
		||||
HELP: encode-utf8
 | 
			
		||||
{ $values { "str" string } { "seq" "a sequence of bytes" } }
 | 
			
		||||
{ $description "Encodes a Unicode string as a sequence of bytes in UTF8 format." } ;
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,16 @@
 | 
			
		|||
USING: io.utf8 tools.test strings arrays unicode.syntax ;
 | 
			
		||||
 | 
			
		||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ "x" ] [ "x" decode-utf8 >string ] unit-test
 | 
			
		||||
 | 
			
		||||
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
 | 
			
		||||
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,117 @@
 | 
			
		|||
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: math kernel sequences sbufs vectors growable io
 | 
			
		||||
namespaces io.encodings combinators ;
 | 
			
		||||
IN: io.utf8
 | 
			
		||||
 | 
			
		||||
! Decoding UTF-8
 | 
			
		||||
 | 
			
		||||
SYMBOL: double
 | 
			
		||||
SYMBOL: triple
 | 
			
		||||
SYMBOL: triple2
 | 
			
		||||
SYMBOL: quad
 | 
			
		||||
SYMBOL: quad2
 | 
			
		||||
SYMBOL: quad3
 | 
			
		||||
 | 
			
		||||
: starts-2? ( char -- ? )
 | 
			
		||||
    -6 shift BIN: 10 number= ;
 | 
			
		||||
 | 
			
		||||
: append-nums ( buf bottom top state-out -- buf num state )
 | 
			
		||||
    >r over starts-2?
 | 
			
		||||
    [ 6 shift swap BIN: 111111 bitand bitor r> ]
 | 
			
		||||
    [ r> 3drop push-replacement ] if ;
 | 
			
		||||
 | 
			
		||||
: begin-utf8 ( buf byte -- buf ch state )
 | 
			
		||||
    {
 | 
			
		||||
        { [ dup -7 shift zero? ] [ decoded ] }
 | 
			
		||||
        { [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] }
 | 
			
		||||
        { [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] }
 | 
			
		||||
        { [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] }
 | 
			
		||||
        { [ t ] [ drop push-replacement ] }
 | 
			
		||||
    } cond ;
 | 
			
		||||
 | 
			
		||||
: end-multibyte ( buf byte ch -- buf ch state )
 | 
			
		||||
    f append-nums [ decoded ] unless* ;
 | 
			
		||||
 | 
			
		||||
: (decode-utf8) ( buf byte ch state -- buf ch state )
 | 
			
		||||
    {
 | 
			
		||||
        { begin [ drop begin-utf8 ] }
 | 
			
		||||
        { double [ end-multibyte ] }
 | 
			
		||||
        { triple [ triple2 append-nums ] }
 | 
			
		||||
        { triple2 [ end-multibyte ] }
 | 
			
		||||
        { quad [ quad2 append-nums ] }
 | 
			
		||||
        { quad2 [ quad3 append-nums ] }
 | 
			
		||||
        { quad3 [ end-multibyte ] }
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
: decode-utf8-chunk ( ch state seq -- buf ch state )
 | 
			
		||||
    [ (decode-utf8) ] decode ;
 | 
			
		||||
 | 
			
		||||
: decode-utf8 ( seq -- str )
 | 
			
		||||
    start-decoding decode-utf8-chunk finish-decoding ;
 | 
			
		||||
 | 
			
		||||
! Encoding UTF-8
 | 
			
		||||
 | 
			
		||||
: encoded ( char -- )
 | 
			
		||||
    BIN: 111111 bitand BIN: 10000000 bitor , ;
 | 
			
		||||
 | 
			
		||||
: char>utf8 ( char -- )
 | 
			
		||||
    {
 | 
			
		||||
        { [ dup -7 shift zero? ] [ , ] }
 | 
			
		||||
        { [ dup -11 shift zero? ] [
 | 
			
		||||
            dup -6 shift BIN: 11000000 bitor ,
 | 
			
		||||
            encoded
 | 
			
		||||
        ] }
 | 
			
		||||
        { [ dup -16 shift zero? ] [
 | 
			
		||||
            dup -12 shift BIN: 11100000 bitor ,
 | 
			
		||||
            dup -6 shift encoded
 | 
			
		||||
            encoded
 | 
			
		||||
        ] }
 | 
			
		||||
        { [ t ] [
 | 
			
		||||
            dup -18 shift BIN: 11110000 bitor ,
 | 
			
		||||
            dup -12 shift encoded
 | 
			
		||||
            dup -6 shift encoded
 | 
			
		||||
            encoded
 | 
			
		||||
        ] }
 | 
			
		||||
    } cond ;
 | 
			
		||||
 | 
			
		||||
: encode-utf8 ( str -- seq )
 | 
			
		||||
    [ [ char>utf8 ] each ] B{ } make ;
 | 
			
		||||
 | 
			
		||||
! Interface for streams
 | 
			
		||||
 | 
			
		||||
TUPLE: utf8 ;
 | 
			
		||||
! In the future, this should detect and ignore a BOM at the beginning
 | 
			
		||||
 | 
			
		||||
M: utf8 init-decoding ( stream utf8 -- utf8-stream )
 | 
			
		||||
    tuck set-delegate ;
 | 
			
		||||
 | 
			
		||||
M: utf8 init-encoding ( stream utf8 -- utf8-stream )
 | 
			
		||||
    tuck set-delegate ;
 | 
			
		||||
 | 
			
		||||
M: utf8 stream-read1 1 swap stream-read ;
 | 
			
		||||
 | 
			
		||||
: space ( resizable -- room-left )
 | 
			
		||||
    dup underlying swap [ length ] 2apply - ;
 | 
			
		||||
 | 
			
		||||
: full? ( resizable -- ? ) space zero? ;
 | 
			
		||||
 | 
			
		||||
: utf8-stream-read ( buf ch state stream -- string )
 | 
			
		||||
    >r pick full? [ r> 3drop >string ]  [
 | 
			
		||||
        pick space r> [ stream-read decode-utf8-chunk ] keep
 | 
			
		||||
        utf8-stream-read
 | 
			
		||||
    ] if ;
 | 
			
		||||
 | 
			
		||||
M: utf8 stream-read
 | 
			
		||||
    >r start-decoding drop r> delegate utf8-stream-read ;
 | 
			
		||||
 | 
			
		||||
M: utf8 stream-read-until
 | 
			
		||||
    ! Copied from { c-reader stream-read-until }!!!
 | 
			
		||||
    [ swap read-until-loop ] "" make
 | 
			
		||||
    swap over empty? over not and [ 2drop f f ] when ;
 | 
			
		||||
 | 
			
		||||
M: utf8 stream-write1
 | 
			
		||||
    >r 1string r> stream-write ;
 | 
			
		||||
 | 
			
		||||
M: utf8 stream-write
 | 
			
		||||
    >r encode-utf8 r> delegate stream-write ;
 | 
			
		||||
| 
						 | 
				
			
			@ -1,7 +1,7 @@
 | 
			
		|||
! Copyright (C) 2007 Daniel Ehrenberg
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: delegate sequences.private sequences assocs prettyprint.sections 
 | 
			
		||||
io definitions kernel ;
 | 
			
		||||
io definitions kernel continuations ;
 | 
			
		||||
IN: delegate.protocols
 | 
			
		||||
 | 
			
		||||
PROTOCOL: sequence-protocol
 | 
			
		||||
| 
						 | 
				
			
			@ -15,7 +15,7 @@ PROTOCOL: assoc-protocol
 | 
			
		|||
    ! everything should work, just slower (with >alist)
 | 
			
		||||
 | 
			
		||||
PROTOCOL: stream-protocol
 | 
			
		||||
    stream-read1 stream-read stream-read-until
 | 
			
		||||
    stream-read1 stream-read stream-read-until dispose
 | 
			
		||||
    stream-flush stream-write1 stream-write stream-format
 | 
			
		||||
    stream-nl make-span-stream make-block-stream stream-readln
 | 
			
		||||
    make-cell-stream stream-write-table set-timeout ;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue