Encodings updates; /* */ comments in multline
parent
84052ac5f3
commit
93c4ac23a8
|
@ -3,7 +3,7 @@ IN: io.encodings
|
|||
|
||||
ABOUT: "encodings"
|
||||
|
||||
ARTICLE: "encodings" "I/O encodings"
|
||||
ARTICLE: "io.encodings" "I/O encodings"
|
||||
"Many streams deal with bytes, rather than Unicode code points, at some level. The translation between these two things is specified by an encoding. To abstract this away from the programmer, Factor provides a system where these streams are associated with an encoding which is always used when the stream is read from or written to. For most purposes, an encoding descriptor consisting of a symbol is all that is needed when initializing a stream."
|
||||
{ $subsection "encodings-constructors" }
|
||||
{ $subsection "encodings-descriptors" }
|
||||
|
@ -37,17 +37,37 @@ HELP: <encoder-duplex> ( stream-in stream-out encoding -- duplex )
|
|||
{ <encoder> <decoder> <encoder-duplex> } related-words
|
||||
|
||||
ARTICLE: "encodings-descriptors" "Encoding descriptors"
|
||||
"An encoding descriptor is something which can be used for input or output streams to encode or decode files. It must conform to the " { $link "encodings-protocol" } ". Encodings which you can use include:"
|
||||
{ $vocab-link "io.encodings.utf8" }
|
||||
{ $vocab-link "io.encodings.ascii" }
|
||||
{ $vocab-link "io.encodings.binary" }
|
||||
{ $vocab-link "io.encodings.utf16" } ;
|
||||
"An encoding descriptor is something which can be used for input or output streams to encode or decode files. It must conform to the " { $link "encodings-protocol" } ". Encodings which you can use are defined in the following vocabularies:"
|
||||
$nl { $vocab-link "io.encodings.utf8" }
|
||||
$nl { $vocab-link "io.encodings.ascii" }
|
||||
$nl { $vocab-link "io.encodings.binary" }
|
||||
$nl { $vocab-link "io.encodings.utf16" } ;
|
||||
|
||||
ARTICLE: "encodings-protocol" "Encoding protocol"
|
||||
"An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again."
|
||||
{ $subsection decode-step }
|
||||
{ $subsection init-decoder }
|
||||
{ $subsection encode-string } ;
|
||||
|
||||
ARTICLE: "encodings-string" "Encoding and decoding strings"
|
||||
"Strings can be encoded and decoded with the following words:"
|
||||
{ $subsection encode-string } ;
|
||||
|
||||
HELP: decode-step ( buf char encoding -- )
|
||||
{ $values { "buf" "A string buffer which characters can be pushed to" }
|
||||
{ "char" "An octet which is read from a stream" }
|
||||
{ "encoding" "An encoding descriptor tuple" } }
|
||||
{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change." } ;
|
||||
|
||||
HELP: encode-string ( string encoding -- byte-array )
|
||||
{ $values { "string" "a string" }
|
||||
{ "encoding" "an encoding descriptor" }
|
||||
{ "byte-array" "an encoded byte-array" } }
|
||||
{ $description "Encodes the string with the given encoding descriptor, outputting the result to a byte-array." } ;
|
||||
|
||||
HELP: init-decoder ( stream encoding -- encoding )
|
||||
{ $values { "stream" "an input stream" }
|
||||
{ "encoding" "an encoding descriptor" } }
|
||||
{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM." } ;
|
||||
|
||||
{ init-decoder decode-step encode-string } related-words
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
USING: io.streams.lines io.files io.streams.string io
|
||||
tools.test kernel ;
|
||||
IN: io.streams.lines.tests
|
||||
USING: io.files io.streams.string io
|
||||
tools.test kernel io.encodings.ascii ;
|
||||
IN: io.streams.encodings.tests
|
||||
|
||||
: <resource-reader> ( resource -- stream )
|
||||
resource-path <file-reader> ;
|
||||
resource-path ascii <file-reader> ;
|
||||
|
||||
[ { } ]
|
||||
[ "/core/io/test/empty-file.txt" <resource-reader> lines ]
|
|
@ -3,9 +3,22 @@
|
|||
USING: math kernel sequences sbufs vectors namespaces
|
||||
growable strings io classes continuations combinators
|
||||
io.styles io.streams.plain io.encodings.binary splitting
|
||||
io.streams.duplex ;
|
||||
io.streams.duplex byte-arrays ;
|
||||
IN: io.encodings
|
||||
|
||||
! The encoding descriptor protocol
|
||||
|
||||
GENERIC: decode-step ( buf char encoding -- )
|
||||
M: object decode-step drop swap push ;
|
||||
|
||||
GENERIC: init-decoder ( stream encoding -- encoding )
|
||||
M: tuple-class init-decoder construct-empty init-decoder ;
|
||||
M: object init-decoder nip ;
|
||||
|
||||
GENERIC: encode-string ( string encoding -- byte-array )
|
||||
M: tuple-class encode-string construct-empty encode-string ;
|
||||
M: object encode-string drop >byte-array ;
|
||||
|
||||
! Decoding
|
||||
|
||||
TUPLE: decode-error ;
|
||||
|
@ -21,19 +34,6 @@ SYMBOL: begin
|
|||
! This is the replacement character
|
||||
HEX: fffd push-decoded ;
|
||||
|
||||
: finish-decoding ( buf ch state -- str )
|
||||
begin eq? [ decode-error ] unless drop "" like ;
|
||||
|
||||
: start-decoding ( seq length -- buf ch state seq )
|
||||
<sbuf> 0 begin roll ;
|
||||
|
||||
GENERIC: decode-step ( buf byte ch state encoding -- buf ch state )
|
||||
|
||||
: decode ( seq quot -- string )
|
||||
>r dup length start-decoding r>
|
||||
[ -rot ] swap compose each
|
||||
finish-decoding ; inline
|
||||
|
||||
: space ( resizable -- room-left )
|
||||
dup underlying swap [ length ] 2apply - ;
|
||||
|
||||
|
@ -42,34 +42,34 @@ GENERIC: decode-step ( buf byte ch state encoding -- buf ch state )
|
|||
: end-read-loop ( buf ch state stream quot -- string/f )
|
||||
2drop 2drop >string f like ;
|
||||
|
||||
: decode-read-loop ( buf ch state stream encoding -- string/f )
|
||||
>r >r pick r> r> rot full? [ end-read-loop ] [
|
||||
: decode-read-loop ( buf stream encoding -- string/f )
|
||||
pick full? [ 2drop >string ] [
|
||||
over stream-read1 [
|
||||
-rot tuck >r >r >r -rot r> decode-step r> r> decode-read-loop
|
||||
] [ end-read-loop ] if*
|
||||
-rot tuck >r >r >r dupd r> decode-step r> r>
|
||||
decode-read-loop
|
||||
] [ 2drop >string f like ] if*
|
||||
] if ;
|
||||
|
||||
: decode-read ( length stream encoding -- string )
|
||||
>r swap >fixnum start-decoding r>
|
||||
decode-read-loop ;
|
||||
rot <sbuf> -rot decode-read-loop ;
|
||||
|
||||
TUPLE: decoded code cr ;
|
||||
TUPLE: decoder code cr ;
|
||||
: <decoder> ( stream encoding -- newstream )
|
||||
dup binary eq? [ drop ] [
|
||||
construct-empty { set-delegate set-decoded-code }
|
||||
decoded construct
|
||||
dupd init-decoder { set-delegate set-decoder-code }
|
||||
decoder construct
|
||||
] if ;
|
||||
|
||||
: cr+ t swap set-decoded-cr ; inline
|
||||
: cr+ t swap set-decoder-cr ; inline
|
||||
|
||||
: cr- f swap set-decoded-cr ; inline
|
||||
: cr- f swap set-decoder-cr ; inline
|
||||
|
||||
: line-ends/eof ( stream str -- str ) f like swap cr- ; inline
|
||||
|
||||
: line-ends\r ( stream str -- str ) swap cr+ ; inline
|
||||
|
||||
: line-ends\n ( stream str -- str )
|
||||
over decoded-cr over empty? and
|
||||
over decoder-cr over empty? and
|
||||
[ drop dup cr- stream-readln ] [ swap cr- ] if ; inline
|
||||
|
||||
: handle-readln ( stream str ch -- str )
|
||||
|
@ -80,43 +80,43 @@ TUPLE: decoded code cr ;
|
|||
} case ;
|
||||
|
||||
: fix-read ( stream string -- string )
|
||||
over decoded-cr [
|
||||
over decoder-cr [
|
||||
over cr-
|
||||
"\n" ?head [
|
||||
swap stream-read1 [ add ] when*
|
||||
] [ nip ] if
|
||||
] [ nip ] if ;
|
||||
|
||||
M: decoded stream-read
|
||||
tuck { delegate decoded-code } get-slots decode-read fix-read ;
|
||||
M: decoder stream-read
|
||||
tuck { delegate decoder-code } get-slots decode-read fix-read ;
|
||||
|
||||
M: decoded stream-read-partial stream-read ;
|
||||
M: decoder stream-read-partial stream-read ;
|
||||
|
||||
: decoded-read-until ( stream delim -- ch )
|
||||
: decoder-read-until ( stream delim -- ch )
|
||||
! Copied from { c-reader stream-read-until }!!!
|
||||
over stream-read1 dup [
|
||||
dup pick memq? [ 2nip ] [ , decoded-read-until ] if
|
||||
dup pick memq? [ 2nip ] [ , decoder-read-until ] if
|
||||
] [
|
||||
2nip
|
||||
] if ;
|
||||
|
||||
M: decoded stream-read-until
|
||||
M: decoder stream-read-until
|
||||
! Copied from { c-reader stream-read-until }!!!
|
||||
[ swap decoded-read-until ] "" make
|
||||
[ swap decoder-read-until ] "" make
|
||||
swap over empty? over not and [ 2drop f f ] when ;
|
||||
|
||||
: fix-read1 ( stream char -- char )
|
||||
over decoded-cr [
|
||||
over decoder-cr [
|
||||
over cr-
|
||||
dup CHAR: \n = [
|
||||
drop stream-read1
|
||||
] [ nip ] if
|
||||
] [ nip ] if ;
|
||||
|
||||
M: decoded stream-read1
|
||||
M: decoder stream-read1
|
||||
1 swap stream-read f like [ first ] [ f ] if* ;
|
||||
|
||||
M: decoded stream-readln ( stream -- str )
|
||||
M: decoder stream-readln ( stream -- str )
|
||||
"\r\n" over stream-read-until handle-readln ;
|
||||
|
||||
! Encoding
|
||||
|
@ -125,41 +125,30 @@ TUPLE: encode-error ;
|
|||
|
||||
: encode-error ( -- * ) \ encode-error construct-empty throw ;
|
||||
|
||||
TUPLE: encoded code ;
|
||||
TUPLE: encoder code ;
|
||||
: <encoder> ( stream encoding -- newstream )
|
||||
dup binary eq? [ drop ] [
|
||||
construct-empty { set-delegate set-encoded-code }
|
||||
encoded construct
|
||||
construct-empty { set-delegate set-encoder-code }
|
||||
encoder construct
|
||||
] if ;
|
||||
|
||||
GENERIC: encode-string ( string encoding -- byte-array )
|
||||
M: tuple-class encode-string construct-empty encode-string ;
|
||||
|
||||
M: encoded stream-write1
|
||||
M: encoder stream-write1
|
||||
>r 1string r> stream-write ;
|
||||
|
||||
M: encoded stream-write
|
||||
[ encoded-code encode-string ] keep delegate stream-write ;
|
||||
M: encoder stream-write
|
||||
[ encoder-code encode-string ] keep delegate stream-write ;
|
||||
|
||||
M: encoded dispose delegate dispose ;
|
||||
M: encoder dispose delegate dispose ;
|
||||
|
||||
INSTANCE: encoded plain-writer
|
||||
INSTANCE: encoder plain-writer
|
||||
|
||||
! Rebinding duplex streams which have not read anything yet
|
||||
|
||||
: reencode ( stream encoding -- newstream )
|
||||
over encoded? [ >r delegate r> ] when <encoder> ;
|
||||
over encoder? [ >r delegate r> ] when <encoder> ;
|
||||
|
||||
: redecode ( stream encoding -- newstream )
|
||||
over decoded? [ >r delegate r> ] when <decoder> ;
|
||||
over decoder? [ >r delegate r> ] when <decoder> ;
|
||||
|
||||
: <encoder-duplex> ( stream-in stream-out encoding -- duplex )
|
||||
tuck reencode >r redecode r> <duplex-stream> ;
|
||||
|
||||
! The null encoding does nothing
|
||||
! (used to wrap things as line-reader/plain-writer)
|
||||
! Later this will be replaced by inheritance
|
||||
|
||||
TUPLE: null-encoding ;
|
||||
M: null-encoding encode-string drop ;
|
||||
M: null-encoding decode-step 3drop over push f f ;
|
||||
|
|
|
@ -6,6 +6,6 @@ ARTICLE: "io.encodings.utf8" "Working with UTF8-encoded data"
|
|||
{ $subsection utf8 } ;
|
||||
|
||||
HELP: utf8
|
||||
{ $class-description "This is the class of encoding tuples which denote a UTF-8 encoding. You can pass this class symbol as an encoding descriptor to words like " { $link <file-reader> } " and " { $link encode-string } "." } ;
|
||||
{ $class-description "This is the class of encoding tuples which denote a UTF-8 encoding. You can pass this class symbol as an encoding descriptor to words like " { $link <file-reader> } " and " { $link encode-string } ". This conforms to the " { $link "encodings-protocol" } "." } ;
|
||||
|
||||
ABOUT: "io.encodings.utf8"
|
||||
|
|
|
@ -6,6 +6,8 @@ IN: io.encodings.utf8
|
|||
|
||||
! Decoding UTF-8
|
||||
|
||||
TUPLE: utf8 ch state ;
|
||||
|
||||
SYMBOL: double
|
||||
SYMBOL: triple
|
||||
SYMBOL: triple2
|
||||
|
@ -44,8 +46,16 @@ SYMBOL: quad3
|
|||
{ quad3 [ end-multibyte ] }
|
||||
} case ;
|
||||
|
||||
: decode-utf8 ( seq -- str )
|
||||
[ decode-utf8-step ] decode ;
|
||||
: unpack-state ( encoding -- ch state )
|
||||
{ utf8-ch utf8-state } get-slots ;
|
||||
|
||||
: pack-state ( ch state encoding -- )
|
||||
{ set-utf8-ch set-utf8-state } set-slots ;
|
||||
|
||||
M: utf8 decode-step ( buf char encoding -- )
|
||||
[ unpack-state decode-utf8-step ] keep pack-state drop ;
|
||||
|
||||
M: utf8 init-decoder nip begin over set-utf8-state ;
|
||||
|
||||
! Encoding UTF-8
|
||||
|
||||
|
@ -75,10 +85,4 @@ SYMBOL: quad3
|
|||
: encode-utf8 ( str -- seq )
|
||||
[ [ char>utf8 ] each ] B{ } make ;
|
||||
|
||||
! Interface for streams
|
||||
|
||||
TUPLE: utf8 ;
|
||||
|
||||
M: utf8 encode-string drop encode-utf8 ;
|
||||
M: utf8 decode-step drop decode-utf8-step ;
|
||||
! In the future, this should detect and ignore a BOM at the beginning
|
||||
|
|
|
@ -100,7 +100,7 @@ $nl
|
|||
{ $subsection "stream-protocol" }
|
||||
{ $subsection "stdio" }
|
||||
{ $subsection "stream-utils" }
|
||||
{ $see-also "io.streams.string" "io.streams.lines" "io.streams.plain" "io.streams.duplex" } ;
|
||||
{ $see-also "io.streams.string" "io.streams.plain" "io.streams.duplex" } ;
|
||||
|
||||
ABOUT: "streams"
|
||||
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
USING: help.syntax help.markup io byte-arrays quotations ;
|
||||
IN: io.streams.byte-array
|
||||
|
||||
ABOUT: "io.streams.byte-array"
|
||||
|
||||
ARTICLE: "io.streams.byte-array" "Byte-array streams"
|
||||
"Byte array streams:"
|
||||
{ $subsection <byte-reader> }
|
||||
{ $subsection <byte-writer> }
|
||||
"Utility combinators:"
|
||||
{ $subsection with-byte-reader }
|
||||
{ $subsection with-byte-writer } ;
|
||||
|
||||
HELP: <byte-reader>
|
||||
{ $values { "byte-array" byte-array }
|
||||
{ "encoding" "an encoding descriptor" } }
|
||||
{ $description "Provides an input stream reading off the given byte array using the given encoding." } ;
|
||||
|
||||
HELP: <byte-writer>
|
||||
{ $values { "encoding" "an encoding descriptor" }
|
||||
{ "stream" "an output stream" } }
|
||||
{ $description "Provides an output stream, putting things in the given encoding, storing everything written to it in a byte-array." } ;
|
||||
|
||||
HELP: with-byte-reader
|
||||
{ $values { "encoding" "an encoding descriptor" }
|
||||
{ "quot" quotation } { "byte-array" byte-array } }
|
||||
{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to an input stream reading the byte array in the given encoding from beginning to end." } ;
|
||||
|
||||
HELP: with-byte-writer
|
||||
{ $values { "encoding" "an encoding descriptor" }
|
||||
{ "quot" quotation }
|
||||
{ "byte-array" byte-array } }
|
||||
{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to a new byte array writer, putting things in the given encoding. The accumulated byte array is output when the quotation returns." } ;
|
|
@ -26,4 +26,4 @@ HELP: <string-reader>
|
|||
|
||||
HELP: with-string-reader
|
||||
{ $values { "str" string } { "quot" quotation } }
|
||||
{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to an input stream reading " { $snippet "str" } " from beginning to end. The accumulated string is output when the quotation returns." } ;
|
||||
{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to an input stream reading " { $snippet "str" } " from beginning to end." } ;
|
||||
|
|
|
@ -50,7 +50,7 @@ M: growable stream-read-partial
|
|||
stream-read ;
|
||||
|
||||
: <string-reader> ( str -- stream )
|
||||
>sbuf dup reverse-here null-encoding <decoder> ;
|
||||
>sbuf dup reverse-here f <decoder> ;
|
||||
|
||||
: with-string-reader ( str quot -- )
|
||||
>r <string-reader> r> with-stream ; inline
|
||||
|
|
|
@ -2,7 +2,7 @@ USING: help help.markup help.syntax help.definitions help.topics
|
|||
namespaces words sequences classes assocs vocabs kernel arrays
|
||||
prettyprint.backend kernel.private io generic math system
|
||||
strings sbufs vectors byte-arrays bit-arrays float-arrays
|
||||
quotations ;
|
||||
quotations io.streams.byte-array ;
|
||||
IN: help.handbook
|
||||
|
||||
ARTICLE: "conventions" "Conventions"
|
||||
|
@ -176,9 +176,9 @@ ARTICLE: "io" "Input and output"
|
|||
{ $subsection "streams" }
|
||||
"Wrapper streams:"
|
||||
{ $subsection "io.streams.duplex" }
|
||||
{ $subsection "io.streams.lines" }
|
||||
{ $subsection "io.streams.plain" }
|
||||
{ $subsection "io.streams.string" }
|
||||
{ $subsection "io.streams.byte-array" }
|
||||
"Utilities:"
|
||||
{ $subsection "stream-binary" }
|
||||
{ $subsection "styles" }
|
||||
|
@ -187,6 +187,7 @@ ARTICLE: "io" "Input and output"
|
|||
{ $subsection "io.mmap" }
|
||||
{ $subsection "io.monitors" }
|
||||
{ $heading "Other features" }
|
||||
{ $subsection "io.encodings" }
|
||||
{ $subsection "network-streams" }
|
||||
{ $subsection "io.launcher" }
|
||||
{ $subsection "io.timeouts" } ;
|
||||
|
|
|
@ -12,4 +12,4 @@ M: ascii encode-string
|
|||
drop 127 encode-check<= ;
|
||||
|
||||
M: ascii decode-step
|
||||
3drop dup 127 >= [ encode-error ] when over push f f ;
|
||||
drop dup 128 >= [ encode-error ] [ swap push ] if ;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: io io.encodings strings kernel io.encodings.ascii sequences ;
|
||||
USING: io io.encodings strings kernel io.encodings.ascii sequences math ;
|
||||
IN: io.encodings.latin1
|
||||
|
||||
TUPLE: latin1 ;
|
||||
|
@ -9,4 +9,4 @@ M: latin1 encode-string
|
|||
drop 255 encode-check<= ;
|
||||
|
||||
M: latin1 decode-step
|
||||
3drop over push f f ;
|
||||
drop dup 256 >= [ encode-error ] [ swap push ] if ;
|
||||
|
|
|
@ -1,45 +1,22 @@
|
|||
USING: help.markup help.syntax io.encodings strings ;
|
||||
IN: io.encodings.utf16
|
||||
|
||||
ARTICLE: "io.utf16" "Working with UTF16-encoded data"
|
||||
"The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences."
|
||||
{ $subsection encode-utf16le }
|
||||
{ $subsection encode-utf16be }
|
||||
{ $subsection decode-utf16le }
|
||||
{ $subsection decode-utf16be }
|
||||
"Support for UTF16 data with a byte order mark:"
|
||||
{ $subsection encode-utf16 }
|
||||
{ $subsection decode-utf16 } ;
|
||||
ARTICLE: "utf16" "Working with UTF-16-encoded data"
|
||||
"The UTF-16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences. There are three encoding descriptor classes for working with UTF-16, depending on endianness or the presence of a BOM:"
|
||||
{ $subsection utf16le }
|
||||
{ $subsection utf16be }
|
||||
{ $subsection utf16 }
|
||||
"All of these conform to the " { $link "encodings-protocol" } "." ;
|
||||
|
||||
ABOUT: "io.utf16"
|
||||
ABOUT: "utf16"
|
||||
|
||||
HELP: decode-utf16
|
||||
{ $values { "seq" "a sequence of bytes" } { "str" string } }
|
||||
{ $description "Decodes a sequence of bytes representing a Unicode string in UTF16 format. The bytes must begin with a UTF16 byte order mark, which determines if the input is in little or big endian. To decode data without a byte order mark, use " { $link decode-utf16le } " or " { $link decode-utf16be } "." }
|
||||
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
|
||||
HELP: utf16le
|
||||
{ $class-description "The encoding protocol for UTF-16LE, that is, UTF-16 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ;
|
||||
|
||||
HELP: decode-utf16be
|
||||
{ $values { "seq" "a sequence of bytes" } { "str" string } }
|
||||
{ $description "Decodes a sequence of bytes representing a Unicode string in big endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
|
||||
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
|
||||
HELP: utf16be
|
||||
{ $class-description "The encoding protocol for UTF-16BE, that is, UTF-16 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ;
|
||||
|
||||
HELP: decode-utf16le
|
||||
{ $values { "seq" "a sequence of bytes" } { "str" string } }
|
||||
{ $description "Decodes a sequence of bytes representing a Unicode string in little endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
|
||||
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
|
||||
HELP: utf16
|
||||
{ $class-description "The encoding protocol for UTF-16, that is, UTF-16 with a byte order mark. This is the most useful for general input and output in UTF-16. Streams can be made which read or write wth this encoding." } ;
|
||||
|
||||
{ decode-utf16 decode-utf16le decode-utf16be } related-words
|
||||
|
||||
HELP: encode-utf16be
|
||||
{ $values { "str" string } { "seq" "a sequence of bytes" } }
|
||||
{ $description "Encodes a Unicode string as a sequence of bytes in big endian UTF16 format." } ;
|
||||
|
||||
HELP: encode-utf16le
|
||||
{ $values { "str" string } { "seq" "a sequence of bytes" } }
|
||||
{ $description "Encodes a Unicode string as a sequence of bytes in little endian UTF16 format." } ;
|
||||
|
||||
HELP: encode-utf16
|
||||
{ $values { "str" string } { "seq" "a sequence of bytes" } }
|
||||
{ $description "Encodes a Unicode string as a sequence of bytes in UTF16 format with a byte order mark." } ;
|
||||
|
||||
{ encode-utf16 encode-utf16be encode-utf16le } related-words
|
||||
{ utf16 utf16le utf16be } related-words
|
||||
|
|
|
@ -4,6 +4,10 @@ USING: math kernel sequences sbufs vectors namespaces io.binary
|
|||
io.encodings combinators splitting io byte-arrays ;
|
||||
IN: io.encodings.utf16
|
||||
|
||||
! UTF-16BE decoding
|
||||
|
||||
TUPLE: utf16be ch state ;
|
||||
|
||||
SYMBOL: double
|
||||
SYMBOL: quad1
|
||||
SYMBOL: quad2
|
||||
|
@ -40,8 +44,20 @@ SYMBOL: ignore
|
|||
{ ignore [ 2drop push-replacement ] }
|
||||
} case ;
|
||||
|
||||
: decode-utf16be ( seq -- str )
|
||||
[ decode-utf16be-step ] decode ;
|
||||
: unpack-state-be ( encoding -- ch state )
|
||||
{ utf16be-ch utf16be-state } get-slots ;
|
||||
|
||||
: pack-state-be ( ch state encoding -- )
|
||||
{ set-utf16be-ch set-utf16be-state } set-slots ;
|
||||
|
||||
M: utf16be decode-step
|
||||
[ unpack-state-be decode-utf16be-step ] keep pack-state-be drop ;
|
||||
|
||||
M: utf16be init-decoder nip begin over set-utf16be-state ;
|
||||
|
||||
! UTF-16LE decoding
|
||||
|
||||
TUPLE: utf16le ch state ;
|
||||
|
||||
: handle-double ( buf byte ch -- buf ch state )
|
||||
swap dup -3 shift BIN: 11011 = [
|
||||
|
@ -64,8 +80,18 @@ SYMBOL: ignore
|
|||
{ quad3 [ handle-quad3le ] }
|
||||
} case ;
|
||||
|
||||
: decode-utf16le ( seq -- str )
|
||||
[ decode-utf16le-step ] decode ;
|
||||
: unpack-state-le ( encoding -- ch state )
|
||||
{ utf16le-ch utf16le-state } get-slots ;
|
||||
|
||||
: pack-state-le ( ch state encoding -- )
|
||||
{ set-utf16le-ch set-utf16le-state } set-slots ;
|
||||
|
||||
M: utf16le decode-step
|
||||
[ unpack-state-le decode-utf16le-step ] keep pack-state-le drop ;
|
||||
|
||||
M: utf16le init-decoder nip begin over set-utf16le-state ;
|
||||
|
||||
! UTF-16LE/BE encoding
|
||||
|
||||
: encode-first
|
||||
-10 shift
|
||||
|
@ -97,6 +123,11 @@ SYMBOL: ignore
|
|||
: encode-utf16le ( str -- seq )
|
||||
[ [ char>utf16le ] each ] B{ } make ;
|
||||
|
||||
M: utf16le encode-string drop encode-utf16le ;
|
||||
M: utf16be encode-string drop encode-utf16be ;
|
||||
|
||||
! UTF-16
|
||||
|
||||
: bom-le B{ HEX: ff HEX: fe } ; inline
|
||||
|
||||
: bom-be B{ HEX: fe HEX: ff } ; inline
|
||||
|
@ -108,40 +139,17 @@ SYMBOL: ignore
|
|||
|
||||
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
|
||||
|
||||
: decode-utf16 ( seq -- str )
|
||||
{
|
||||
{ [ start-utf16le? ] [ decode-utf16le ] }
|
||||
{ [ start-utf16be? ] [ decode-utf16be ] }
|
||||
{ [ t ] [ decode-error ] }
|
||||
} cond ;
|
||||
|
||||
TUPLE: utf16le ;
|
||||
|
||||
M: utf16le encode-string drop encode-utf16le ;
|
||||
M: utf16le decode-step drop decode-utf16le-step ;
|
||||
|
||||
TUPLE: utf16be ;
|
||||
|
||||
M: utf16be encode-string drop encode-utf16be ;
|
||||
M: utf16be decode-step drop decode-utf16be-step ;
|
||||
|
||||
TUPLE: utf16 encoding ;
|
||||
TUPLE: utf16 started? ;
|
||||
|
||||
M: utf16 encode-string
|
||||
>r encode-utf16le r>
|
||||
dup utf16-encoding [ drop ]
|
||||
[ t swap set-utf16-encoding bom-le swap append ] if ;
|
||||
dup utf16-started? [ drop ]
|
||||
[ t swap set-utf16-started? bom-le swap append ] if ;
|
||||
|
||||
: bom>le/be ( bom -- le/be )
|
||||
dup bom-le sequence= [ drop utf16le ] [
|
||||
bom-be sequence= [ utf16be ] [ decode-error ] if
|
||||
] if ;
|
||||
|
||||
: read-bom ( utf16 -- encoding )
|
||||
2 over delegate stream-read bom>le/be construct-empty
|
||||
[ swap set-utf16-encoding ] keep ;
|
||||
|
||||
M: utf16 decode-step
|
||||
! inefficient: checks if bom is done many times
|
||||
! This should transform itself into utf16be or utf16le after reading BOM
|
||||
dup utf16-encoding [ ] [ read-bom ] ?if decode-step ;
|
||||
M: utf16 init-decoder ( stream encoding -- newencoding )
|
||||
2 rot stream-read bom>le/be construct-empty init-decoder ;
|
||||
|
|
|
@ -38,3 +38,5 @@ IN: multiline
|
|||
|
||||
: <"
|
||||
"\">" parse-multiline-string parsed ; parsing
|
||||
|
||||
: /* "*/" parse-multiline-string drop ; parsing
|
||||
|
|
Loading…
Reference in New Issue