Encodings updates; /* */ comments in multline

db4
Daniel Ehrenberg 2008-03-05 14:51:01 -06:00
parent 84052ac5f3
commit 93c4ac23a8
15 changed files with 188 additions and 154 deletions

View File

@ -3,7 +3,7 @@ IN: io.encodings
ABOUT: "encodings" ABOUT: "encodings"
ARTICLE: "encodings" "I/O encodings" ARTICLE: "io.encodings" "I/O encodings"
"Many streams deal with bytes, rather than Unicode code points, at some level. The translation between these two things is specified by an encoding. To abstract this away from the programmer, Factor provides a system where these streams are associated with an encoding which is always used when the stream is read from or written to. For most purposes, an encoding descriptor consisting of a symbol is all that is needed when initializing a stream." "Many streams deal with bytes, rather than Unicode code points, at some level. The translation between these two things is specified by an encoding. To abstract this away from the programmer, Factor provides a system where these streams are associated with an encoding which is always used when the stream is read from or written to. For most purposes, an encoding descriptor consisting of a symbol is all that is needed when initializing a stream."
{ $subsection "encodings-constructors" } { $subsection "encodings-constructors" }
{ $subsection "encodings-descriptors" } { $subsection "encodings-descriptors" }
@ -37,17 +37,37 @@ HELP: <encoder-duplex> ( stream-in stream-out encoding -- duplex )
{ <encoder> <decoder> <encoder-duplex> } related-words { <encoder> <decoder> <encoder-duplex> } related-words
ARTICLE: "encodings-descriptors" "Encoding descriptors" ARTICLE: "encodings-descriptors" "Encoding descriptors"
"An encoding descriptor is something which can be used for input or output streams to encode or decode files. It must conform to the " { $link "encodings-protocol" } ". Encodings which you can use include:" "An encoding descriptor is something which can be used for input or output streams to encode or decode files. It must conform to the " { $link "encodings-protocol" } ". Encodings which you can use are defined in the following vocabularies:"
{ $vocab-link "io.encodings.utf8" } $nl { $vocab-link "io.encodings.utf8" }
{ $vocab-link "io.encodings.ascii" } $nl { $vocab-link "io.encodings.ascii" }
{ $vocab-link "io.encodings.binary" } $nl { $vocab-link "io.encodings.binary" }
{ $vocab-link "io.encodings.utf16" } ; $nl { $vocab-link "io.encodings.utf16" } ;
ARTICLE: "encodings-protocol" "Encoding protocol" ARTICLE: "encodings-protocol" "Encoding protocol"
"An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again." "An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again."
{ $subsection decode-step } { $subsection decode-step }
{ $subsection init-decoder }
{ $subsection encode-string } ; { $subsection encode-string } ;
ARTICLE: "encodings-string" "Encoding and decoding strings" ARTICLE: "encodings-string" "Encoding and decoding strings"
"Strings can be encoded and decoded with the following words:" "Strings can be encoded and decoded with the following words:"
{ $subsection encode-string } ; { $subsection encode-string } ;
HELP: decode-step ( buf char encoding -- )
{ $values { "buf" "A string buffer which characters can be pushed to" }
{ "char" "An octet which is read from a stream" }
{ "encoding" "An encoding descriptor tuple" } }
{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change." } ;
HELP: encode-string ( string encoding -- byte-array )
{ $values { "string" "a string" }
{ "encoding" "an encoding descriptor" }
{ "byte-array" "an encoded byte-array" } }
{ $description "Encodes the string with the given encoding descriptor, outputting the result to a byte-array." } ;
HELP: init-decoder ( stream encoding -- encoding )
{ $values { "stream" "an input stream" }
{ "encoding" "an encoding descriptor" } }
{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM." } ;
{ init-decoder decode-step encode-string } related-words

View File

@ -1,9 +1,9 @@
USING: io.streams.lines io.files io.streams.string io USING: io.files io.streams.string io
tools.test kernel ; tools.test kernel io.encodings.ascii ;
IN: io.streams.lines.tests IN: io.streams.encodings.tests
: <resource-reader> ( resource -- stream ) : <resource-reader> ( resource -- stream )
resource-path <file-reader> ; resource-path ascii <file-reader> ;
[ { } ] [ { } ]
[ "/core/io/test/empty-file.txt" <resource-reader> lines ] [ "/core/io/test/empty-file.txt" <resource-reader> lines ]

View File

@ -3,9 +3,22 @@
USING: math kernel sequences sbufs vectors namespaces USING: math kernel sequences sbufs vectors namespaces
growable strings io classes continuations combinators growable strings io classes continuations combinators
io.styles io.streams.plain io.encodings.binary splitting io.styles io.streams.plain io.encodings.binary splitting
io.streams.duplex ; io.streams.duplex byte-arrays ;
IN: io.encodings IN: io.encodings
! The encoding descriptor protocol
GENERIC: decode-step ( buf char encoding -- )
M: object decode-step drop swap push ;
GENERIC: init-decoder ( stream encoding -- encoding )
M: tuple-class init-decoder construct-empty init-decoder ;
M: object init-decoder nip ;
GENERIC: encode-string ( string encoding -- byte-array )
M: tuple-class encode-string construct-empty encode-string ;
M: object encode-string drop >byte-array ;
! Decoding ! Decoding
TUPLE: decode-error ; TUPLE: decode-error ;
@ -21,19 +34,6 @@ SYMBOL: begin
! This is the replacement character ! This is the replacement character
HEX: fffd push-decoded ; HEX: fffd push-decoded ;
: finish-decoding ( buf ch state -- str )
begin eq? [ decode-error ] unless drop "" like ;
: start-decoding ( seq length -- buf ch state seq )
<sbuf> 0 begin roll ;
GENERIC: decode-step ( buf byte ch state encoding -- buf ch state )
: decode ( seq quot -- string )
>r dup length start-decoding r>
[ -rot ] swap compose each
finish-decoding ; inline
: space ( resizable -- room-left ) : space ( resizable -- room-left )
dup underlying swap [ length ] 2apply - ; dup underlying swap [ length ] 2apply - ;
@ -42,34 +42,34 @@ GENERIC: decode-step ( buf byte ch state encoding -- buf ch state )
: end-read-loop ( buf ch state stream quot -- string/f ) : end-read-loop ( buf ch state stream quot -- string/f )
2drop 2drop >string f like ; 2drop 2drop >string f like ;
: decode-read-loop ( buf ch state stream encoding -- string/f ) : decode-read-loop ( buf stream encoding -- string/f )
>r >r pick r> r> rot full? [ end-read-loop ] [ pick full? [ 2drop >string ] [
over stream-read1 [ over stream-read1 [
-rot tuck >r >r >r -rot r> decode-step r> r> decode-read-loop -rot tuck >r >r >r dupd r> decode-step r> r>
] [ end-read-loop ] if* decode-read-loop
] [ 2drop >string f like ] if*
] if ; ] if ;
: decode-read ( length stream encoding -- string ) : decode-read ( length stream encoding -- string )
>r swap >fixnum start-decoding r> rot <sbuf> -rot decode-read-loop ;
decode-read-loop ;
TUPLE: decoded code cr ; TUPLE: decoder code cr ;
: <decoder> ( stream encoding -- newstream ) : <decoder> ( stream encoding -- newstream )
dup binary eq? [ drop ] [ dup binary eq? [ drop ] [
construct-empty { set-delegate set-decoded-code } dupd init-decoder { set-delegate set-decoder-code }
decoded construct decoder construct
] if ; ] if ;
: cr+ t swap set-decoded-cr ; inline : cr+ t swap set-decoder-cr ; inline
: cr- f swap set-decoded-cr ; inline : cr- f swap set-decoder-cr ; inline
: line-ends/eof ( stream str -- str ) f like swap cr- ; inline : line-ends/eof ( stream str -- str ) f like swap cr- ; inline
: line-ends\r ( stream str -- str ) swap cr+ ; inline : line-ends\r ( stream str -- str ) swap cr+ ; inline
: line-ends\n ( stream str -- str ) : line-ends\n ( stream str -- str )
over decoded-cr over empty? and over decoder-cr over empty? and
[ drop dup cr- stream-readln ] [ swap cr- ] if ; inline [ drop dup cr- stream-readln ] [ swap cr- ] if ; inline
: handle-readln ( stream str ch -- str ) : handle-readln ( stream str ch -- str )
@ -80,43 +80,43 @@ TUPLE: decoded code cr ;
} case ; } case ;
: fix-read ( stream string -- string ) : fix-read ( stream string -- string )
over decoded-cr [ over decoder-cr [
over cr- over cr-
"\n" ?head [ "\n" ?head [
swap stream-read1 [ add ] when* swap stream-read1 [ add ] when*
] [ nip ] if ] [ nip ] if
] [ nip ] if ; ] [ nip ] if ;
M: decoded stream-read M: decoder stream-read
tuck { delegate decoded-code } get-slots decode-read fix-read ; tuck { delegate decoder-code } get-slots decode-read fix-read ;
M: decoded stream-read-partial stream-read ; M: decoder stream-read-partial stream-read ;
: decoded-read-until ( stream delim -- ch ) : decoder-read-until ( stream delim -- ch )
! Copied from { c-reader stream-read-until }!!! ! Copied from { c-reader stream-read-until }!!!
over stream-read1 dup [ over stream-read1 dup [
dup pick memq? [ 2nip ] [ , decoded-read-until ] if dup pick memq? [ 2nip ] [ , decoder-read-until ] if
] [ ] [
2nip 2nip
] if ; ] if ;
M: decoded stream-read-until M: decoder stream-read-until
! Copied from { c-reader stream-read-until }!!! ! Copied from { c-reader stream-read-until }!!!
[ swap decoded-read-until ] "" make [ swap decoder-read-until ] "" make
swap over empty? over not and [ 2drop f f ] when ; swap over empty? over not and [ 2drop f f ] when ;
: fix-read1 ( stream char -- char ) : fix-read1 ( stream char -- char )
over decoded-cr [ over decoder-cr [
over cr- over cr-
dup CHAR: \n = [ dup CHAR: \n = [
drop stream-read1 drop stream-read1
] [ nip ] if ] [ nip ] if
] [ nip ] if ; ] [ nip ] if ;
M: decoded stream-read1 M: decoder stream-read1
1 swap stream-read f like [ first ] [ f ] if* ; 1 swap stream-read f like [ first ] [ f ] if* ;
M: decoded stream-readln ( stream -- str ) M: decoder stream-readln ( stream -- str )
"\r\n" over stream-read-until handle-readln ; "\r\n" over stream-read-until handle-readln ;
! Encoding ! Encoding
@ -125,41 +125,30 @@ TUPLE: encode-error ;
: encode-error ( -- * ) \ encode-error construct-empty throw ; : encode-error ( -- * ) \ encode-error construct-empty throw ;
TUPLE: encoded code ; TUPLE: encoder code ;
: <encoder> ( stream encoding -- newstream ) : <encoder> ( stream encoding -- newstream )
dup binary eq? [ drop ] [ dup binary eq? [ drop ] [
construct-empty { set-delegate set-encoded-code } construct-empty { set-delegate set-encoder-code }
encoded construct encoder construct
] if ; ] if ;
GENERIC: encode-string ( string encoding -- byte-array ) M: encoder stream-write1
M: tuple-class encode-string construct-empty encode-string ;
M: encoded stream-write1
>r 1string r> stream-write ; >r 1string r> stream-write ;
M: encoded stream-write M: encoder stream-write
[ encoded-code encode-string ] keep delegate stream-write ; [ encoder-code encode-string ] keep delegate stream-write ;
M: encoded dispose delegate dispose ; M: encoder dispose delegate dispose ;
INSTANCE: encoded plain-writer INSTANCE: encoder plain-writer
! Rebinding duplex streams which have not read anything yet ! Rebinding duplex streams which have not read anything yet
: reencode ( stream encoding -- newstream ) : reencode ( stream encoding -- newstream )
over encoded? [ >r delegate r> ] when <encoder> ; over encoder? [ >r delegate r> ] when <encoder> ;
: redecode ( stream encoding -- newstream ) : redecode ( stream encoding -- newstream )
over decoded? [ >r delegate r> ] when <decoder> ; over decoder? [ >r delegate r> ] when <decoder> ;
: <encoder-duplex> ( stream-in stream-out encoding -- duplex ) : <encoder-duplex> ( stream-in stream-out encoding -- duplex )
tuck reencode >r redecode r> <duplex-stream> ; tuck reencode >r redecode r> <duplex-stream> ;
! The null encoding does nothing
! (used to wrap things as line-reader/plain-writer)
! Later this will be replaced by inheritance
TUPLE: null-encoding ;
M: null-encoding encode-string drop ;
M: null-encoding decode-step 3drop over push f f ;

View File

@ -6,6 +6,6 @@ ARTICLE: "io.encodings.utf8" "Working with UTF8-encoded data"
{ $subsection utf8 } ; { $subsection utf8 } ;
HELP: utf8 HELP: utf8
{ $class-description "This is the class of encoding tuples which denote a UTF-8 encoding. You can pass this class symbol as an encoding descriptor to words like " { $link <file-reader> } " and " { $link encode-string } "." } ; { $class-description "This is the class of encoding tuples which denote a UTF-8 encoding. You can pass this class symbol as an encoding descriptor to words like " { $link <file-reader> } " and " { $link encode-string } ". This conforms to the " { $link "encodings-protocol" } "." } ;
ABOUT: "io.encodings.utf8" ABOUT: "io.encodings.utf8"

View File

@ -6,6 +6,8 @@ IN: io.encodings.utf8
! Decoding UTF-8 ! Decoding UTF-8
TUPLE: utf8 ch state ;
SYMBOL: double SYMBOL: double
SYMBOL: triple SYMBOL: triple
SYMBOL: triple2 SYMBOL: triple2
@ -44,8 +46,16 @@ SYMBOL: quad3
{ quad3 [ end-multibyte ] } { quad3 [ end-multibyte ] }
} case ; } case ;
: decode-utf8 ( seq -- str ) : unpack-state ( encoding -- ch state )
[ decode-utf8-step ] decode ; { utf8-ch utf8-state } get-slots ;
: pack-state ( ch state encoding -- )
{ set-utf8-ch set-utf8-state } set-slots ;
M: utf8 decode-step ( buf char encoding -- )
[ unpack-state decode-utf8-step ] keep pack-state drop ;
M: utf8 init-decoder nip begin over set-utf8-state ;
! Encoding UTF-8 ! Encoding UTF-8
@ -75,10 +85,4 @@ SYMBOL: quad3
: encode-utf8 ( str -- seq ) : encode-utf8 ( str -- seq )
[ [ char>utf8 ] each ] B{ } make ; [ [ char>utf8 ] each ] B{ } make ;
! Interface for streams
TUPLE: utf8 ;
M: utf8 encode-string drop encode-utf8 ; M: utf8 encode-string drop encode-utf8 ;
M: utf8 decode-step drop decode-utf8-step ;
! In the future, this should detect and ignore a BOM at the beginning

View File

@ -100,7 +100,7 @@ $nl
{ $subsection "stream-protocol" } { $subsection "stream-protocol" }
{ $subsection "stdio" } { $subsection "stdio" }
{ $subsection "stream-utils" } { $subsection "stream-utils" }
{ $see-also "io.streams.string" "io.streams.lines" "io.streams.plain" "io.streams.duplex" } ; { $see-also "io.streams.string" "io.streams.plain" "io.streams.duplex" } ;
ABOUT: "streams" ABOUT: "streams"

View File

@ -0,0 +1,33 @@
USING: help.syntax help.markup io byte-arrays quotations ;
IN: io.streams.byte-array
ABOUT: "io.streams.byte-array"
ARTICLE: "io.streams.byte-array" "Byte-array streams"
"Byte array streams:"
{ $subsection <byte-reader> }
{ $subsection <byte-writer> }
"Utility combinators:"
{ $subsection with-byte-reader }
{ $subsection with-byte-writer } ;
HELP: <byte-reader>
{ $values { "byte-array" byte-array }
{ "encoding" "an encoding descriptor" } }
{ $description "Provides an input stream reading off the given byte array using the given encoding." } ;
HELP: <byte-writer>
{ $values { "encoding" "an encoding descriptor" }
{ "stream" "an output stream" } }
{ $description "Provides an output stream, putting things in the given encoding, storing everything written to it in a byte-array." } ;
HELP: with-byte-reader
{ $values { "encoding" "an encoding descriptor" }
{ "quot" quotation } { "byte-array" byte-array } }
{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to an input stream reading the byte array in the given encoding from beginning to end." } ;
HELP: with-byte-writer
{ $values { "encoding" "an encoding descriptor" }
{ "quot" quotation }
{ "byte-array" byte-array } }
{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to a new byte array writer, putting things in the given encoding. The accumulated byte array is output when the quotation returns." } ;

View File

@ -26,4 +26,4 @@ HELP: <string-reader>
HELP: with-string-reader HELP: with-string-reader
{ $values { "str" string } { "quot" quotation } } { $values { "str" string } { "quot" quotation } }
{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to an input stream reading " { $snippet "str" } " from beginning to end. The accumulated string is output when the quotation returns." } ; { $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to an input stream reading " { $snippet "str" } " from beginning to end." } ;

View File

@ -50,7 +50,7 @@ M: growable stream-read-partial
stream-read ; stream-read ;
: <string-reader> ( str -- stream ) : <string-reader> ( str -- stream )
>sbuf dup reverse-here null-encoding <decoder> ; >sbuf dup reverse-here f <decoder> ;
: with-string-reader ( str quot -- ) : with-string-reader ( str quot -- )
>r <string-reader> r> with-stream ; inline >r <string-reader> r> with-stream ; inline

View File

@ -2,7 +2,7 @@ USING: help help.markup help.syntax help.definitions help.topics
namespaces words sequences classes assocs vocabs kernel arrays namespaces words sequences classes assocs vocabs kernel arrays
prettyprint.backend kernel.private io generic math system prettyprint.backend kernel.private io generic math system
strings sbufs vectors byte-arrays bit-arrays float-arrays strings sbufs vectors byte-arrays bit-arrays float-arrays
quotations ; quotations io.streams.byte-array ;
IN: help.handbook IN: help.handbook
ARTICLE: "conventions" "Conventions" ARTICLE: "conventions" "Conventions"
@ -176,9 +176,9 @@ ARTICLE: "io" "Input and output"
{ $subsection "streams" } { $subsection "streams" }
"Wrapper streams:" "Wrapper streams:"
{ $subsection "io.streams.duplex" } { $subsection "io.streams.duplex" }
{ $subsection "io.streams.lines" }
{ $subsection "io.streams.plain" } { $subsection "io.streams.plain" }
{ $subsection "io.streams.string" } { $subsection "io.streams.string" }
{ $subsection "io.streams.byte-array" }
"Utilities:" "Utilities:"
{ $subsection "stream-binary" } { $subsection "stream-binary" }
{ $subsection "styles" } { $subsection "styles" }
@ -187,6 +187,7 @@ ARTICLE: "io" "Input and output"
{ $subsection "io.mmap" } { $subsection "io.mmap" }
{ $subsection "io.monitors" } { $subsection "io.monitors" }
{ $heading "Other features" } { $heading "Other features" }
{ $subsection "io.encodings" }
{ $subsection "network-streams" } { $subsection "network-streams" }
{ $subsection "io.launcher" } { $subsection "io.launcher" }
{ $subsection "io.timeouts" } ; { $subsection "io.timeouts" } ;

View File

@ -12,4 +12,4 @@ M: ascii encode-string
drop 127 encode-check<= ; drop 127 encode-check<= ;
M: ascii decode-step M: ascii decode-step
3drop dup 127 >= [ encode-error ] when over push f f ; drop dup 128 >= [ encode-error ] [ swap push ] if ;

View File

@ -1,6 +1,6 @@
! Copyright (C) 2008 Daniel Ehrenberg. ! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: io io.encodings strings kernel io.encodings.ascii sequences ; USING: io io.encodings strings kernel io.encodings.ascii sequences math ;
IN: io.encodings.latin1 IN: io.encodings.latin1
TUPLE: latin1 ; TUPLE: latin1 ;
@ -9,4 +9,4 @@ M: latin1 encode-string
drop 255 encode-check<= ; drop 255 encode-check<= ;
M: latin1 decode-step M: latin1 decode-step
3drop over push f f ; drop dup 256 >= [ encode-error ] [ swap push ] if ;

View File

@ -1,45 +1,22 @@
USING: help.markup help.syntax io.encodings strings ; USING: help.markup help.syntax io.encodings strings ;
IN: io.encodings.utf16 IN: io.encodings.utf16
ARTICLE: "io.utf16" "Working with UTF16-encoded data" ARTICLE: "utf16" "Working with UTF-16-encoded data"
"The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences." "The UTF-16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences. There are three encoding descriptor classes for working with UTF-16, depending on endianness or the presence of a BOM:"
{ $subsection encode-utf16le } { $subsection utf16le }
{ $subsection encode-utf16be } { $subsection utf16be }
{ $subsection decode-utf16le } { $subsection utf16 }
{ $subsection decode-utf16be } "All of these conform to the " { $link "encodings-protocol" } "." ;
"Support for UTF16 data with a byte order mark:"
{ $subsection encode-utf16 }
{ $subsection decode-utf16 } ;
ABOUT: "io.utf16" ABOUT: "utf16"
HELP: decode-utf16 HELP: utf16le
{ $values { "seq" "a sequence of bytes" } { "str" string } } { $class-description "The encoding protocol for UTF-16LE, that is, UTF-16 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ;
{ $description "Decodes a sequence of bytes representing a Unicode string in UTF16 format. The bytes must begin with a UTF16 byte order mark, which determines if the input is in little or big endian. To decode data without a byte order mark, use " { $link decode-utf16le } " or " { $link decode-utf16be } "." }
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
HELP: decode-utf16be HELP: utf16be
{ $values { "seq" "a sequence of bytes" } { "str" string } } { $class-description "The encoding protocol for UTF-16BE, that is, UTF-16 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ;
{ $description "Decodes a sequence of bytes representing a Unicode string in big endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
HELP: decode-utf16le HELP: utf16
{ $values { "seq" "a sequence of bytes" } { "str" string } } { $class-description "The encoding protocol for UTF-16, that is, UTF-16 with a byte order mark. This is the most useful for general input and output in UTF-16. Streams can be made which read or write wth this encoding." } ;
{ $description "Decodes a sequence of bytes representing a Unicode string in little endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
{ decode-utf16 decode-utf16le decode-utf16be } related-words { utf16 utf16le utf16be } related-words
HELP: encode-utf16be
{ $values { "str" string } { "seq" "a sequence of bytes" } }
{ $description "Encodes a Unicode string as a sequence of bytes in big endian UTF16 format." } ;
HELP: encode-utf16le
{ $values { "str" string } { "seq" "a sequence of bytes" } }
{ $description "Encodes a Unicode string as a sequence of bytes in little endian UTF16 format." } ;
HELP: encode-utf16
{ $values { "str" string } { "seq" "a sequence of bytes" } }
{ $description "Encodes a Unicode string as a sequence of bytes in UTF16 format with a byte order mark." } ;
{ encode-utf16 encode-utf16be encode-utf16le } related-words

View File

@ -4,6 +4,10 @@ USING: math kernel sequences sbufs vectors namespaces io.binary
io.encodings combinators splitting io byte-arrays ; io.encodings combinators splitting io byte-arrays ;
IN: io.encodings.utf16 IN: io.encodings.utf16
! UTF-16BE decoding
TUPLE: utf16be ch state ;
SYMBOL: double SYMBOL: double
SYMBOL: quad1 SYMBOL: quad1
SYMBOL: quad2 SYMBOL: quad2
@ -40,8 +44,20 @@ SYMBOL: ignore
{ ignore [ 2drop push-replacement ] } { ignore [ 2drop push-replacement ] }
} case ; } case ;
: decode-utf16be ( seq -- str ) : unpack-state-be ( encoding -- ch state )
[ decode-utf16be-step ] decode ; { utf16be-ch utf16be-state } get-slots ;
: pack-state-be ( ch state encoding -- )
{ set-utf16be-ch set-utf16be-state } set-slots ;
M: utf16be decode-step
[ unpack-state-be decode-utf16be-step ] keep pack-state-be drop ;
M: utf16be init-decoder nip begin over set-utf16be-state ;
! UTF-16LE decoding
TUPLE: utf16le ch state ;
: handle-double ( buf byte ch -- buf ch state ) : handle-double ( buf byte ch -- buf ch state )
swap dup -3 shift BIN: 11011 = [ swap dup -3 shift BIN: 11011 = [
@ -64,8 +80,18 @@ SYMBOL: ignore
{ quad3 [ handle-quad3le ] } { quad3 [ handle-quad3le ] }
} case ; } case ;
: decode-utf16le ( seq -- str ) : unpack-state-le ( encoding -- ch state )
[ decode-utf16le-step ] decode ; { utf16le-ch utf16le-state } get-slots ;
: pack-state-le ( ch state encoding -- )
{ set-utf16le-ch set-utf16le-state } set-slots ;
M: utf16le decode-step
[ unpack-state-le decode-utf16le-step ] keep pack-state-le drop ;
M: utf16le init-decoder nip begin over set-utf16le-state ;
! UTF-16LE/BE encoding
: encode-first : encode-first
-10 shift -10 shift
@ -97,6 +123,11 @@ SYMBOL: ignore
: encode-utf16le ( str -- seq ) : encode-utf16le ( str -- seq )
[ [ char>utf16le ] each ] B{ } make ; [ [ char>utf16le ] each ] B{ } make ;
M: utf16le encode-string drop encode-utf16le ;
M: utf16be encode-string drop encode-utf16be ;
! UTF-16
: bom-le B{ HEX: ff HEX: fe } ; inline : bom-le B{ HEX: ff HEX: fe } ; inline
: bom-be B{ HEX: fe HEX: ff } ; inline : bom-be B{ HEX: fe HEX: ff } ; inline
@ -108,40 +139,17 @@ SYMBOL: ignore
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ; : start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
: decode-utf16 ( seq -- str ) TUPLE: utf16 started? ;
{
{ [ start-utf16le? ] [ decode-utf16le ] }
{ [ start-utf16be? ] [ decode-utf16be ] }
{ [ t ] [ decode-error ] }
} cond ;
TUPLE: utf16le ;
M: utf16le encode-string drop encode-utf16le ;
M: utf16le decode-step drop decode-utf16le-step ;
TUPLE: utf16be ;
M: utf16be encode-string drop encode-utf16be ;
M: utf16be decode-step drop decode-utf16be-step ;
TUPLE: utf16 encoding ;
M: utf16 encode-string M: utf16 encode-string
>r encode-utf16le r> >r encode-utf16le r>
dup utf16-encoding [ drop ] dup utf16-started? [ drop ]
[ t swap set-utf16-encoding bom-le swap append ] if ; [ t swap set-utf16-started? bom-le swap append ] if ;
: bom>le/be ( bom -- le/be ) : bom>le/be ( bom -- le/be )
dup bom-le sequence= [ drop utf16le ] [ dup bom-le sequence= [ drop utf16le ] [
bom-be sequence= [ utf16be ] [ decode-error ] if bom-be sequence= [ utf16be ] [ decode-error ] if
] if ; ] if ;
: read-bom ( utf16 -- encoding ) M: utf16 init-decoder ( stream encoding -- newencoding )
2 over delegate stream-read bom>le/be construct-empty 2 rot stream-read bom>le/be construct-empty init-decoder ;
[ swap set-utf16-encoding ] keep ;
M: utf16 decode-step
! inefficient: checks if bom is done many times
! This should transform itself into utf16be or utf16le after reading BOM
dup utf16-encoding [ ] [ read-bom ] ?if decode-step ;

View File

@ -38,3 +38,5 @@ IN: multiline
: <" : <"
"\">" parse-multiline-string parsed ; parsing "\">" parse-multiline-string parsed ; parsing
: /* "*/" parse-multiline-string drop ; parsing