Encodings updates; /* */ comments in multline
parent
84052ac5f3
commit
93c4ac23a8
|
@ -3,7 +3,7 @@ IN: io.encodings
|
||||||
|
|
||||||
ABOUT: "encodings"
|
ABOUT: "encodings"
|
||||||
|
|
||||||
ARTICLE: "encodings" "I/O encodings"
|
ARTICLE: "io.encodings" "I/O encodings"
|
||||||
"Many streams deal with bytes, rather than Unicode code points, at some level. The translation between these two things is specified by an encoding. To abstract this away from the programmer, Factor provides a system where these streams are associated with an encoding which is always used when the stream is read from or written to. For most purposes, an encoding descriptor consisting of a symbol is all that is needed when initializing a stream."
|
"Many streams deal with bytes, rather than Unicode code points, at some level. The translation between these two things is specified by an encoding. To abstract this away from the programmer, Factor provides a system where these streams are associated with an encoding which is always used when the stream is read from or written to. For most purposes, an encoding descriptor consisting of a symbol is all that is needed when initializing a stream."
|
||||||
{ $subsection "encodings-constructors" }
|
{ $subsection "encodings-constructors" }
|
||||||
{ $subsection "encodings-descriptors" }
|
{ $subsection "encodings-descriptors" }
|
||||||
|
@ -37,17 +37,37 @@ HELP: <encoder-duplex> ( stream-in stream-out encoding -- duplex )
|
||||||
{ <encoder> <decoder> <encoder-duplex> } related-words
|
{ <encoder> <decoder> <encoder-duplex> } related-words
|
||||||
|
|
||||||
ARTICLE: "encodings-descriptors" "Encoding descriptors"
|
ARTICLE: "encodings-descriptors" "Encoding descriptors"
|
||||||
"An encoding descriptor is something which can be used for input or output streams to encode or decode files. It must conform to the " { $link "encodings-protocol" } ". Encodings which you can use include:"
|
"An encoding descriptor is something which can be used for input or output streams to encode or decode files. It must conform to the " { $link "encodings-protocol" } ". Encodings which you can use are defined in the following vocabularies:"
|
||||||
{ $vocab-link "io.encodings.utf8" }
|
$nl { $vocab-link "io.encodings.utf8" }
|
||||||
{ $vocab-link "io.encodings.ascii" }
|
$nl { $vocab-link "io.encodings.ascii" }
|
||||||
{ $vocab-link "io.encodings.binary" }
|
$nl { $vocab-link "io.encodings.binary" }
|
||||||
{ $vocab-link "io.encodings.utf16" } ;
|
$nl { $vocab-link "io.encodings.utf16" } ;
|
||||||
|
|
||||||
ARTICLE: "encodings-protocol" "Encoding protocol"
|
ARTICLE: "encodings-protocol" "Encoding protocol"
|
||||||
"An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again."
|
"An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again."
|
||||||
{ $subsection decode-step }
|
{ $subsection decode-step }
|
||||||
|
{ $subsection init-decoder }
|
||||||
{ $subsection encode-string } ;
|
{ $subsection encode-string } ;
|
||||||
|
|
||||||
ARTICLE: "encodings-string" "Encoding and decoding strings"
|
ARTICLE: "encodings-string" "Encoding and decoding strings"
|
||||||
"Strings can be encoded and decoded with the following words:"
|
"Strings can be encoded and decoded with the following words:"
|
||||||
{ $subsection encode-string } ;
|
{ $subsection encode-string } ;
|
||||||
|
|
||||||
|
HELP: decode-step ( buf char encoding -- )
|
||||||
|
{ $values { "buf" "A string buffer which characters can be pushed to" }
|
||||||
|
{ "char" "An octet which is read from a stream" }
|
||||||
|
{ "encoding" "An encoding descriptor tuple" } }
|
||||||
|
{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change." } ;
|
||||||
|
|
||||||
|
HELP: encode-string ( string encoding -- byte-array )
|
||||||
|
{ $values { "string" "a string" }
|
||||||
|
{ "encoding" "an encoding descriptor" }
|
||||||
|
{ "byte-array" "an encoded byte-array" } }
|
||||||
|
{ $description "Encodes the string with the given encoding descriptor, outputting the result to a byte-array." } ;
|
||||||
|
|
||||||
|
HELP: init-decoder ( stream encoding -- encoding )
|
||||||
|
{ $values { "stream" "an input stream" }
|
||||||
|
{ "encoding" "an encoding descriptor" } }
|
||||||
|
{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM." } ;
|
||||||
|
|
||||||
|
{ init-decoder decode-step encode-string } related-words
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
USING: io.streams.lines io.files io.streams.string io
|
USING: io.files io.streams.string io
|
||||||
tools.test kernel ;
|
tools.test kernel io.encodings.ascii ;
|
||||||
IN: io.streams.lines.tests
|
IN: io.streams.encodings.tests
|
||||||
|
|
||||||
: <resource-reader> ( resource -- stream )
|
: <resource-reader> ( resource -- stream )
|
||||||
resource-path <file-reader> ;
|
resource-path ascii <file-reader> ;
|
||||||
|
|
||||||
[ { } ]
|
[ { } ]
|
||||||
[ "/core/io/test/empty-file.txt" <resource-reader> lines ]
|
[ "/core/io/test/empty-file.txt" <resource-reader> lines ]
|
|
@ -3,9 +3,22 @@
|
||||||
USING: math kernel sequences sbufs vectors namespaces
|
USING: math kernel sequences sbufs vectors namespaces
|
||||||
growable strings io classes continuations combinators
|
growable strings io classes continuations combinators
|
||||||
io.styles io.streams.plain io.encodings.binary splitting
|
io.styles io.streams.plain io.encodings.binary splitting
|
||||||
io.streams.duplex ;
|
io.streams.duplex byte-arrays ;
|
||||||
IN: io.encodings
|
IN: io.encodings
|
||||||
|
|
||||||
|
! The encoding descriptor protocol
|
||||||
|
|
||||||
|
GENERIC: decode-step ( buf char encoding -- )
|
||||||
|
M: object decode-step drop swap push ;
|
||||||
|
|
||||||
|
GENERIC: init-decoder ( stream encoding -- encoding )
|
||||||
|
M: tuple-class init-decoder construct-empty init-decoder ;
|
||||||
|
M: object init-decoder nip ;
|
||||||
|
|
||||||
|
GENERIC: encode-string ( string encoding -- byte-array )
|
||||||
|
M: tuple-class encode-string construct-empty encode-string ;
|
||||||
|
M: object encode-string drop >byte-array ;
|
||||||
|
|
||||||
! Decoding
|
! Decoding
|
||||||
|
|
||||||
TUPLE: decode-error ;
|
TUPLE: decode-error ;
|
||||||
|
@ -21,19 +34,6 @@ SYMBOL: begin
|
||||||
! This is the replacement character
|
! This is the replacement character
|
||||||
HEX: fffd push-decoded ;
|
HEX: fffd push-decoded ;
|
||||||
|
|
||||||
: finish-decoding ( buf ch state -- str )
|
|
||||||
begin eq? [ decode-error ] unless drop "" like ;
|
|
||||||
|
|
||||||
: start-decoding ( seq length -- buf ch state seq )
|
|
||||||
<sbuf> 0 begin roll ;
|
|
||||||
|
|
||||||
GENERIC: decode-step ( buf byte ch state encoding -- buf ch state )
|
|
||||||
|
|
||||||
: decode ( seq quot -- string )
|
|
||||||
>r dup length start-decoding r>
|
|
||||||
[ -rot ] swap compose each
|
|
||||||
finish-decoding ; inline
|
|
||||||
|
|
||||||
: space ( resizable -- room-left )
|
: space ( resizable -- room-left )
|
||||||
dup underlying swap [ length ] 2apply - ;
|
dup underlying swap [ length ] 2apply - ;
|
||||||
|
|
||||||
|
@ -42,34 +42,34 @@ GENERIC: decode-step ( buf byte ch state encoding -- buf ch state )
|
||||||
: end-read-loop ( buf ch state stream quot -- string/f )
|
: end-read-loop ( buf ch state stream quot -- string/f )
|
||||||
2drop 2drop >string f like ;
|
2drop 2drop >string f like ;
|
||||||
|
|
||||||
: decode-read-loop ( buf ch state stream encoding -- string/f )
|
: decode-read-loop ( buf stream encoding -- string/f )
|
||||||
>r >r pick r> r> rot full? [ end-read-loop ] [
|
pick full? [ 2drop >string ] [
|
||||||
over stream-read1 [
|
over stream-read1 [
|
||||||
-rot tuck >r >r >r -rot r> decode-step r> r> decode-read-loop
|
-rot tuck >r >r >r dupd r> decode-step r> r>
|
||||||
] [ end-read-loop ] if*
|
decode-read-loop
|
||||||
|
] [ 2drop >string f like ] if*
|
||||||
] if ;
|
] if ;
|
||||||
|
|
||||||
: decode-read ( length stream encoding -- string )
|
: decode-read ( length stream encoding -- string )
|
||||||
>r swap >fixnum start-decoding r>
|
rot <sbuf> -rot decode-read-loop ;
|
||||||
decode-read-loop ;
|
|
||||||
|
|
||||||
TUPLE: decoded code cr ;
|
TUPLE: decoder code cr ;
|
||||||
: <decoder> ( stream encoding -- newstream )
|
: <decoder> ( stream encoding -- newstream )
|
||||||
dup binary eq? [ drop ] [
|
dup binary eq? [ drop ] [
|
||||||
construct-empty { set-delegate set-decoded-code }
|
dupd init-decoder { set-delegate set-decoder-code }
|
||||||
decoded construct
|
decoder construct
|
||||||
] if ;
|
] if ;
|
||||||
|
|
||||||
: cr+ t swap set-decoded-cr ; inline
|
: cr+ t swap set-decoder-cr ; inline
|
||||||
|
|
||||||
: cr- f swap set-decoded-cr ; inline
|
: cr- f swap set-decoder-cr ; inline
|
||||||
|
|
||||||
: line-ends/eof ( stream str -- str ) f like swap cr- ; inline
|
: line-ends/eof ( stream str -- str ) f like swap cr- ; inline
|
||||||
|
|
||||||
: line-ends\r ( stream str -- str ) swap cr+ ; inline
|
: line-ends\r ( stream str -- str ) swap cr+ ; inline
|
||||||
|
|
||||||
: line-ends\n ( stream str -- str )
|
: line-ends\n ( stream str -- str )
|
||||||
over decoded-cr over empty? and
|
over decoder-cr over empty? and
|
||||||
[ drop dup cr- stream-readln ] [ swap cr- ] if ; inline
|
[ drop dup cr- stream-readln ] [ swap cr- ] if ; inline
|
||||||
|
|
||||||
: handle-readln ( stream str ch -- str )
|
: handle-readln ( stream str ch -- str )
|
||||||
|
@ -80,43 +80,43 @@ TUPLE: decoded code cr ;
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
: fix-read ( stream string -- string )
|
: fix-read ( stream string -- string )
|
||||||
over decoded-cr [
|
over decoder-cr [
|
||||||
over cr-
|
over cr-
|
||||||
"\n" ?head [
|
"\n" ?head [
|
||||||
swap stream-read1 [ add ] when*
|
swap stream-read1 [ add ] when*
|
||||||
] [ nip ] if
|
] [ nip ] if
|
||||||
] [ nip ] if ;
|
] [ nip ] if ;
|
||||||
|
|
||||||
M: decoded stream-read
|
M: decoder stream-read
|
||||||
tuck { delegate decoded-code } get-slots decode-read fix-read ;
|
tuck { delegate decoder-code } get-slots decode-read fix-read ;
|
||||||
|
|
||||||
M: decoded stream-read-partial stream-read ;
|
M: decoder stream-read-partial stream-read ;
|
||||||
|
|
||||||
: decoded-read-until ( stream delim -- ch )
|
: decoder-read-until ( stream delim -- ch )
|
||||||
! Copied from { c-reader stream-read-until }!!!
|
! Copied from { c-reader stream-read-until }!!!
|
||||||
over stream-read1 dup [
|
over stream-read1 dup [
|
||||||
dup pick memq? [ 2nip ] [ , decoded-read-until ] if
|
dup pick memq? [ 2nip ] [ , decoder-read-until ] if
|
||||||
] [
|
] [
|
||||||
2nip
|
2nip
|
||||||
] if ;
|
] if ;
|
||||||
|
|
||||||
M: decoded stream-read-until
|
M: decoder stream-read-until
|
||||||
! Copied from { c-reader stream-read-until }!!!
|
! Copied from { c-reader stream-read-until }!!!
|
||||||
[ swap decoded-read-until ] "" make
|
[ swap decoder-read-until ] "" make
|
||||||
swap over empty? over not and [ 2drop f f ] when ;
|
swap over empty? over not and [ 2drop f f ] when ;
|
||||||
|
|
||||||
: fix-read1 ( stream char -- char )
|
: fix-read1 ( stream char -- char )
|
||||||
over decoded-cr [
|
over decoder-cr [
|
||||||
over cr-
|
over cr-
|
||||||
dup CHAR: \n = [
|
dup CHAR: \n = [
|
||||||
drop stream-read1
|
drop stream-read1
|
||||||
] [ nip ] if
|
] [ nip ] if
|
||||||
] [ nip ] if ;
|
] [ nip ] if ;
|
||||||
|
|
||||||
M: decoded stream-read1
|
M: decoder stream-read1
|
||||||
1 swap stream-read f like [ first ] [ f ] if* ;
|
1 swap stream-read f like [ first ] [ f ] if* ;
|
||||||
|
|
||||||
M: decoded stream-readln ( stream -- str )
|
M: decoder stream-readln ( stream -- str )
|
||||||
"\r\n" over stream-read-until handle-readln ;
|
"\r\n" over stream-read-until handle-readln ;
|
||||||
|
|
||||||
! Encoding
|
! Encoding
|
||||||
|
@ -125,41 +125,30 @@ TUPLE: encode-error ;
|
||||||
|
|
||||||
: encode-error ( -- * ) \ encode-error construct-empty throw ;
|
: encode-error ( -- * ) \ encode-error construct-empty throw ;
|
||||||
|
|
||||||
TUPLE: encoded code ;
|
TUPLE: encoder code ;
|
||||||
: <encoder> ( stream encoding -- newstream )
|
: <encoder> ( stream encoding -- newstream )
|
||||||
dup binary eq? [ drop ] [
|
dup binary eq? [ drop ] [
|
||||||
construct-empty { set-delegate set-encoded-code }
|
construct-empty { set-delegate set-encoder-code }
|
||||||
encoded construct
|
encoder construct
|
||||||
] if ;
|
] if ;
|
||||||
|
|
||||||
GENERIC: encode-string ( string encoding -- byte-array )
|
M: encoder stream-write1
|
||||||
M: tuple-class encode-string construct-empty encode-string ;
|
|
||||||
|
|
||||||
M: encoded stream-write1
|
|
||||||
>r 1string r> stream-write ;
|
>r 1string r> stream-write ;
|
||||||
|
|
||||||
M: encoded stream-write
|
M: encoder stream-write
|
||||||
[ encoded-code encode-string ] keep delegate stream-write ;
|
[ encoder-code encode-string ] keep delegate stream-write ;
|
||||||
|
|
||||||
M: encoded dispose delegate dispose ;
|
M: encoder dispose delegate dispose ;
|
||||||
|
|
||||||
INSTANCE: encoded plain-writer
|
INSTANCE: encoder plain-writer
|
||||||
|
|
||||||
! Rebinding duplex streams which have not read anything yet
|
! Rebinding duplex streams which have not read anything yet
|
||||||
|
|
||||||
: reencode ( stream encoding -- newstream )
|
: reencode ( stream encoding -- newstream )
|
||||||
over encoded? [ >r delegate r> ] when <encoder> ;
|
over encoder? [ >r delegate r> ] when <encoder> ;
|
||||||
|
|
||||||
: redecode ( stream encoding -- newstream )
|
: redecode ( stream encoding -- newstream )
|
||||||
over decoded? [ >r delegate r> ] when <decoder> ;
|
over decoder? [ >r delegate r> ] when <decoder> ;
|
||||||
|
|
||||||
: <encoder-duplex> ( stream-in stream-out encoding -- duplex )
|
: <encoder-duplex> ( stream-in stream-out encoding -- duplex )
|
||||||
tuck reencode >r redecode r> <duplex-stream> ;
|
tuck reencode >r redecode r> <duplex-stream> ;
|
||||||
|
|
||||||
! The null encoding does nothing
|
|
||||||
! (used to wrap things as line-reader/plain-writer)
|
|
||||||
! Later this will be replaced by inheritance
|
|
||||||
|
|
||||||
TUPLE: null-encoding ;
|
|
||||||
M: null-encoding encode-string drop ;
|
|
||||||
M: null-encoding decode-step 3drop over push f f ;
|
|
||||||
|
|
|
@ -6,6 +6,6 @@ ARTICLE: "io.encodings.utf8" "Working with UTF8-encoded data"
|
||||||
{ $subsection utf8 } ;
|
{ $subsection utf8 } ;
|
||||||
|
|
||||||
HELP: utf8
|
HELP: utf8
|
||||||
{ $class-description "This is the class of encoding tuples which denote a UTF-8 encoding. You can pass this class symbol as an encoding descriptor to words like " { $link <file-reader> } " and " { $link encode-string } "." } ;
|
{ $class-description "This is the class of encoding tuples which denote a UTF-8 encoding. You can pass this class symbol as an encoding descriptor to words like " { $link <file-reader> } " and " { $link encode-string } ". This conforms to the " { $link "encodings-protocol" } "." } ;
|
||||||
|
|
||||||
ABOUT: "io.encodings.utf8"
|
ABOUT: "io.encodings.utf8"
|
||||||
|
|
|
@ -6,6 +6,8 @@ IN: io.encodings.utf8
|
||||||
|
|
||||||
! Decoding UTF-8
|
! Decoding UTF-8
|
||||||
|
|
||||||
|
TUPLE: utf8 ch state ;
|
||||||
|
|
||||||
SYMBOL: double
|
SYMBOL: double
|
||||||
SYMBOL: triple
|
SYMBOL: triple
|
||||||
SYMBOL: triple2
|
SYMBOL: triple2
|
||||||
|
@ -44,8 +46,16 @@ SYMBOL: quad3
|
||||||
{ quad3 [ end-multibyte ] }
|
{ quad3 [ end-multibyte ] }
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
: decode-utf8 ( seq -- str )
|
: unpack-state ( encoding -- ch state )
|
||||||
[ decode-utf8-step ] decode ;
|
{ utf8-ch utf8-state } get-slots ;
|
||||||
|
|
||||||
|
: pack-state ( ch state encoding -- )
|
||||||
|
{ set-utf8-ch set-utf8-state } set-slots ;
|
||||||
|
|
||||||
|
M: utf8 decode-step ( buf char encoding -- )
|
||||||
|
[ unpack-state decode-utf8-step ] keep pack-state drop ;
|
||||||
|
|
||||||
|
M: utf8 init-decoder nip begin over set-utf8-state ;
|
||||||
|
|
||||||
! Encoding UTF-8
|
! Encoding UTF-8
|
||||||
|
|
||||||
|
@ -75,10 +85,4 @@ SYMBOL: quad3
|
||||||
: encode-utf8 ( str -- seq )
|
: encode-utf8 ( str -- seq )
|
||||||
[ [ char>utf8 ] each ] B{ } make ;
|
[ [ char>utf8 ] each ] B{ } make ;
|
||||||
|
|
||||||
! Interface for streams
|
|
||||||
|
|
||||||
TUPLE: utf8 ;
|
|
||||||
|
|
||||||
M: utf8 encode-string drop encode-utf8 ;
|
M: utf8 encode-string drop encode-utf8 ;
|
||||||
M: utf8 decode-step drop decode-utf8-step ;
|
|
||||||
! In the future, this should detect and ignore a BOM at the beginning
|
|
||||||
|
|
|
@ -100,7 +100,7 @@ $nl
|
||||||
{ $subsection "stream-protocol" }
|
{ $subsection "stream-protocol" }
|
||||||
{ $subsection "stdio" }
|
{ $subsection "stdio" }
|
||||||
{ $subsection "stream-utils" }
|
{ $subsection "stream-utils" }
|
||||||
{ $see-also "io.streams.string" "io.streams.lines" "io.streams.plain" "io.streams.duplex" } ;
|
{ $see-also "io.streams.string" "io.streams.plain" "io.streams.duplex" } ;
|
||||||
|
|
||||||
ABOUT: "streams"
|
ABOUT: "streams"
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
USING: help.syntax help.markup io byte-arrays quotations ;
|
||||||
|
IN: io.streams.byte-array
|
||||||
|
|
||||||
|
ABOUT: "io.streams.byte-array"
|
||||||
|
|
||||||
|
ARTICLE: "io.streams.byte-array" "Byte-array streams"
|
||||||
|
"Byte array streams:"
|
||||||
|
{ $subsection <byte-reader> }
|
||||||
|
{ $subsection <byte-writer> }
|
||||||
|
"Utility combinators:"
|
||||||
|
{ $subsection with-byte-reader }
|
||||||
|
{ $subsection with-byte-writer } ;
|
||||||
|
|
||||||
|
HELP: <byte-reader>
|
||||||
|
{ $values { "byte-array" byte-array }
|
||||||
|
{ "encoding" "an encoding descriptor" } }
|
||||||
|
{ $description "Provides an input stream reading off the given byte array using the given encoding." } ;
|
||||||
|
|
||||||
|
HELP: <byte-writer>
|
||||||
|
{ $values { "encoding" "an encoding descriptor" }
|
||||||
|
{ "stream" "an output stream" } }
|
||||||
|
{ $description "Provides an output stream, putting things in the given encoding, storing everything written to it in a byte-array." } ;
|
||||||
|
|
||||||
|
HELP: with-byte-reader
|
||||||
|
{ $values { "encoding" "an encoding descriptor" }
|
||||||
|
{ "quot" quotation } { "byte-array" byte-array } }
|
||||||
|
{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to an input stream reading the byte array in the given encoding from beginning to end." } ;
|
||||||
|
|
||||||
|
HELP: with-byte-writer
|
||||||
|
{ $values { "encoding" "an encoding descriptor" }
|
||||||
|
{ "quot" quotation }
|
||||||
|
{ "byte-array" byte-array } }
|
||||||
|
{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to a new byte array writer, putting things in the given encoding. The accumulated byte array is output when the quotation returns." } ;
|
|
@ -26,4 +26,4 @@ HELP: <string-reader>
|
||||||
|
|
||||||
HELP: with-string-reader
|
HELP: with-string-reader
|
||||||
{ $values { "str" string } { "quot" quotation } }
|
{ $values { "str" string } { "quot" quotation } }
|
||||||
{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to an input stream reading " { $snippet "str" } " from beginning to end. The accumulated string is output when the quotation returns." } ;
|
{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to an input stream reading " { $snippet "str" } " from beginning to end." } ;
|
||||||
|
|
|
@ -50,7 +50,7 @@ M: growable stream-read-partial
|
||||||
stream-read ;
|
stream-read ;
|
||||||
|
|
||||||
: <string-reader> ( str -- stream )
|
: <string-reader> ( str -- stream )
|
||||||
>sbuf dup reverse-here null-encoding <decoder> ;
|
>sbuf dup reverse-here f <decoder> ;
|
||||||
|
|
||||||
: with-string-reader ( str quot -- )
|
: with-string-reader ( str quot -- )
|
||||||
>r <string-reader> r> with-stream ; inline
|
>r <string-reader> r> with-stream ; inline
|
||||||
|
|
|
@ -2,7 +2,7 @@ USING: help help.markup help.syntax help.definitions help.topics
|
||||||
namespaces words sequences classes assocs vocabs kernel arrays
|
namespaces words sequences classes assocs vocabs kernel arrays
|
||||||
prettyprint.backend kernel.private io generic math system
|
prettyprint.backend kernel.private io generic math system
|
||||||
strings sbufs vectors byte-arrays bit-arrays float-arrays
|
strings sbufs vectors byte-arrays bit-arrays float-arrays
|
||||||
quotations ;
|
quotations io.streams.byte-array ;
|
||||||
IN: help.handbook
|
IN: help.handbook
|
||||||
|
|
||||||
ARTICLE: "conventions" "Conventions"
|
ARTICLE: "conventions" "Conventions"
|
||||||
|
@ -176,9 +176,9 @@ ARTICLE: "io" "Input and output"
|
||||||
{ $subsection "streams" }
|
{ $subsection "streams" }
|
||||||
"Wrapper streams:"
|
"Wrapper streams:"
|
||||||
{ $subsection "io.streams.duplex" }
|
{ $subsection "io.streams.duplex" }
|
||||||
{ $subsection "io.streams.lines" }
|
|
||||||
{ $subsection "io.streams.plain" }
|
{ $subsection "io.streams.plain" }
|
||||||
{ $subsection "io.streams.string" }
|
{ $subsection "io.streams.string" }
|
||||||
|
{ $subsection "io.streams.byte-array" }
|
||||||
"Utilities:"
|
"Utilities:"
|
||||||
{ $subsection "stream-binary" }
|
{ $subsection "stream-binary" }
|
||||||
{ $subsection "styles" }
|
{ $subsection "styles" }
|
||||||
|
@ -187,6 +187,7 @@ ARTICLE: "io" "Input and output"
|
||||||
{ $subsection "io.mmap" }
|
{ $subsection "io.mmap" }
|
||||||
{ $subsection "io.monitors" }
|
{ $subsection "io.monitors" }
|
||||||
{ $heading "Other features" }
|
{ $heading "Other features" }
|
||||||
|
{ $subsection "io.encodings" }
|
||||||
{ $subsection "network-streams" }
|
{ $subsection "network-streams" }
|
||||||
{ $subsection "io.launcher" }
|
{ $subsection "io.launcher" }
|
||||||
{ $subsection "io.timeouts" } ;
|
{ $subsection "io.timeouts" } ;
|
||||||
|
|
|
@ -12,4 +12,4 @@ M: ascii encode-string
|
||||||
drop 127 encode-check<= ;
|
drop 127 encode-check<= ;
|
||||||
|
|
||||||
M: ascii decode-step
|
M: ascii decode-step
|
||||||
3drop dup 127 >= [ encode-error ] when over push f f ;
|
drop dup 128 >= [ encode-error ] [ swap push ] if ;
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
! Copyright (C) 2008 Daniel Ehrenberg.
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: io io.encodings strings kernel io.encodings.ascii sequences ;
|
USING: io io.encodings strings kernel io.encodings.ascii sequences math ;
|
||||||
IN: io.encodings.latin1
|
IN: io.encodings.latin1
|
||||||
|
|
||||||
TUPLE: latin1 ;
|
TUPLE: latin1 ;
|
||||||
|
@ -9,4 +9,4 @@ M: latin1 encode-string
|
||||||
drop 255 encode-check<= ;
|
drop 255 encode-check<= ;
|
||||||
|
|
||||||
M: latin1 decode-step
|
M: latin1 decode-step
|
||||||
3drop over push f f ;
|
drop dup 256 >= [ encode-error ] [ swap push ] if ;
|
||||||
|
|
|
@ -1,45 +1,22 @@
|
||||||
USING: help.markup help.syntax io.encodings strings ;
|
USING: help.markup help.syntax io.encodings strings ;
|
||||||
IN: io.encodings.utf16
|
IN: io.encodings.utf16
|
||||||
|
|
||||||
ARTICLE: "io.utf16" "Working with UTF16-encoded data"
|
ARTICLE: "utf16" "Working with UTF-16-encoded data"
|
||||||
"The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences."
|
"The UTF-16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences. There are three encoding descriptor classes for working with UTF-16, depending on endianness or the presence of a BOM:"
|
||||||
{ $subsection encode-utf16le }
|
{ $subsection utf16le }
|
||||||
{ $subsection encode-utf16be }
|
{ $subsection utf16be }
|
||||||
{ $subsection decode-utf16le }
|
{ $subsection utf16 }
|
||||||
{ $subsection decode-utf16be }
|
"All of these conform to the " { $link "encodings-protocol" } "." ;
|
||||||
"Support for UTF16 data with a byte order mark:"
|
|
||||||
{ $subsection encode-utf16 }
|
|
||||||
{ $subsection decode-utf16 } ;
|
|
||||||
|
|
||||||
ABOUT: "io.utf16"
|
ABOUT: "utf16"
|
||||||
|
|
||||||
HELP: decode-utf16
|
HELP: utf16le
|
||||||
{ $values { "seq" "a sequence of bytes" } { "str" string } }
|
{ $class-description "The encoding protocol for UTF-16LE, that is, UTF-16 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ;
|
||||||
{ $description "Decodes a sequence of bytes representing a Unicode string in UTF16 format. The bytes must begin with a UTF16 byte order mark, which determines if the input is in little or big endian. To decode data without a byte order mark, use " { $link decode-utf16le } " or " { $link decode-utf16be } "." }
|
|
||||||
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
|
|
||||||
|
|
||||||
HELP: decode-utf16be
|
HELP: utf16be
|
||||||
{ $values { "seq" "a sequence of bytes" } { "str" string } }
|
{ $class-description "The encoding protocol for UTF-16BE, that is, UTF-16 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ;
|
||||||
{ $description "Decodes a sequence of bytes representing a Unicode string in big endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
|
|
||||||
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
|
|
||||||
|
|
||||||
HELP: decode-utf16le
|
HELP: utf16
|
||||||
{ $values { "seq" "a sequence of bytes" } { "str" string } }
|
{ $class-description "The encoding protocol for UTF-16, that is, UTF-16 with a byte order mark. This is the most useful for general input and output in UTF-16. Streams can be made which read or write wth this encoding." } ;
|
||||||
{ $description "Decodes a sequence of bytes representing a Unicode string in little endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
|
|
||||||
{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
|
|
||||||
|
|
||||||
{ decode-utf16 decode-utf16le decode-utf16be } related-words
|
{ utf16 utf16le utf16be } related-words
|
||||||
|
|
||||||
HELP: encode-utf16be
|
|
||||||
{ $values { "str" string } { "seq" "a sequence of bytes" } }
|
|
||||||
{ $description "Encodes a Unicode string as a sequence of bytes in big endian UTF16 format." } ;
|
|
||||||
|
|
||||||
HELP: encode-utf16le
|
|
||||||
{ $values { "str" string } { "seq" "a sequence of bytes" } }
|
|
||||||
{ $description "Encodes a Unicode string as a sequence of bytes in little endian UTF16 format." } ;
|
|
||||||
|
|
||||||
HELP: encode-utf16
|
|
||||||
{ $values { "str" string } { "seq" "a sequence of bytes" } }
|
|
||||||
{ $description "Encodes a Unicode string as a sequence of bytes in UTF16 format with a byte order mark." } ;
|
|
||||||
|
|
||||||
{ encode-utf16 encode-utf16be encode-utf16le } related-words
|
|
||||||
|
|
|
@ -4,6 +4,10 @@ USING: math kernel sequences sbufs vectors namespaces io.binary
|
||||||
io.encodings combinators splitting io byte-arrays ;
|
io.encodings combinators splitting io byte-arrays ;
|
||||||
IN: io.encodings.utf16
|
IN: io.encodings.utf16
|
||||||
|
|
||||||
|
! UTF-16BE decoding
|
||||||
|
|
||||||
|
TUPLE: utf16be ch state ;
|
||||||
|
|
||||||
SYMBOL: double
|
SYMBOL: double
|
||||||
SYMBOL: quad1
|
SYMBOL: quad1
|
||||||
SYMBOL: quad2
|
SYMBOL: quad2
|
||||||
|
@ -40,8 +44,20 @@ SYMBOL: ignore
|
||||||
{ ignore [ 2drop push-replacement ] }
|
{ ignore [ 2drop push-replacement ] }
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
: decode-utf16be ( seq -- str )
|
: unpack-state-be ( encoding -- ch state )
|
||||||
[ decode-utf16be-step ] decode ;
|
{ utf16be-ch utf16be-state } get-slots ;
|
||||||
|
|
||||||
|
: pack-state-be ( ch state encoding -- )
|
||||||
|
{ set-utf16be-ch set-utf16be-state } set-slots ;
|
||||||
|
|
||||||
|
M: utf16be decode-step
|
||||||
|
[ unpack-state-be decode-utf16be-step ] keep pack-state-be drop ;
|
||||||
|
|
||||||
|
M: utf16be init-decoder nip begin over set-utf16be-state ;
|
||||||
|
|
||||||
|
! UTF-16LE decoding
|
||||||
|
|
||||||
|
TUPLE: utf16le ch state ;
|
||||||
|
|
||||||
: handle-double ( buf byte ch -- buf ch state )
|
: handle-double ( buf byte ch -- buf ch state )
|
||||||
swap dup -3 shift BIN: 11011 = [
|
swap dup -3 shift BIN: 11011 = [
|
||||||
|
@ -64,8 +80,18 @@ SYMBOL: ignore
|
||||||
{ quad3 [ handle-quad3le ] }
|
{ quad3 [ handle-quad3le ] }
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
: decode-utf16le ( seq -- str )
|
: unpack-state-le ( encoding -- ch state )
|
||||||
[ decode-utf16le-step ] decode ;
|
{ utf16le-ch utf16le-state } get-slots ;
|
||||||
|
|
||||||
|
: pack-state-le ( ch state encoding -- )
|
||||||
|
{ set-utf16le-ch set-utf16le-state } set-slots ;
|
||||||
|
|
||||||
|
M: utf16le decode-step
|
||||||
|
[ unpack-state-le decode-utf16le-step ] keep pack-state-le drop ;
|
||||||
|
|
||||||
|
M: utf16le init-decoder nip begin over set-utf16le-state ;
|
||||||
|
|
||||||
|
! UTF-16LE/BE encoding
|
||||||
|
|
||||||
: encode-first
|
: encode-first
|
||||||
-10 shift
|
-10 shift
|
||||||
|
@ -97,6 +123,11 @@ SYMBOL: ignore
|
||||||
: encode-utf16le ( str -- seq )
|
: encode-utf16le ( str -- seq )
|
||||||
[ [ char>utf16le ] each ] B{ } make ;
|
[ [ char>utf16le ] each ] B{ } make ;
|
||||||
|
|
||||||
|
M: utf16le encode-string drop encode-utf16le ;
|
||||||
|
M: utf16be encode-string drop encode-utf16be ;
|
||||||
|
|
||||||
|
! UTF-16
|
||||||
|
|
||||||
: bom-le B{ HEX: ff HEX: fe } ; inline
|
: bom-le B{ HEX: ff HEX: fe } ; inline
|
||||||
|
|
||||||
: bom-be B{ HEX: fe HEX: ff } ; inline
|
: bom-be B{ HEX: fe HEX: ff } ; inline
|
||||||
|
@ -108,40 +139,17 @@ SYMBOL: ignore
|
||||||
|
|
||||||
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
|
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
|
||||||
|
|
||||||
: decode-utf16 ( seq -- str )
|
TUPLE: utf16 started? ;
|
||||||
{
|
|
||||||
{ [ start-utf16le? ] [ decode-utf16le ] }
|
|
||||||
{ [ start-utf16be? ] [ decode-utf16be ] }
|
|
||||||
{ [ t ] [ decode-error ] }
|
|
||||||
} cond ;
|
|
||||||
|
|
||||||
TUPLE: utf16le ;
|
|
||||||
|
|
||||||
M: utf16le encode-string drop encode-utf16le ;
|
|
||||||
M: utf16le decode-step drop decode-utf16le-step ;
|
|
||||||
|
|
||||||
TUPLE: utf16be ;
|
|
||||||
|
|
||||||
M: utf16be encode-string drop encode-utf16be ;
|
|
||||||
M: utf16be decode-step drop decode-utf16be-step ;
|
|
||||||
|
|
||||||
TUPLE: utf16 encoding ;
|
|
||||||
|
|
||||||
M: utf16 encode-string
|
M: utf16 encode-string
|
||||||
>r encode-utf16le r>
|
>r encode-utf16le r>
|
||||||
dup utf16-encoding [ drop ]
|
dup utf16-started? [ drop ]
|
||||||
[ t swap set-utf16-encoding bom-le swap append ] if ;
|
[ t swap set-utf16-started? bom-le swap append ] if ;
|
||||||
|
|
||||||
: bom>le/be ( bom -- le/be )
|
: bom>le/be ( bom -- le/be )
|
||||||
dup bom-le sequence= [ drop utf16le ] [
|
dup bom-le sequence= [ drop utf16le ] [
|
||||||
bom-be sequence= [ utf16be ] [ decode-error ] if
|
bom-be sequence= [ utf16be ] [ decode-error ] if
|
||||||
] if ;
|
] if ;
|
||||||
|
|
||||||
: read-bom ( utf16 -- encoding )
|
M: utf16 init-decoder ( stream encoding -- newencoding )
|
||||||
2 over delegate stream-read bom>le/be construct-empty
|
2 rot stream-read bom>le/be construct-empty init-decoder ;
|
||||||
[ swap set-utf16-encoding ] keep ;
|
|
||||||
|
|
||||||
M: utf16 decode-step
|
|
||||||
! inefficient: checks if bom is done many times
|
|
||||||
! This should transform itself into utf16be or utf16le after reading BOM
|
|
||||||
dup utf16-encoding [ ] [ read-bom ] ?if decode-step ;
|
|
||||||
|
|
|
@ -38,3 +38,5 @@ IN: multiline
|
||||||
|
|
||||||
: <"
|
: <"
|
||||||
"\">" parse-multiline-string parsed ; parsing
|
"\">" parse-multiline-string parsed ; parsing
|
||||||
|
|
||||||
|
: /* "*/" parse-multiline-string drop ; parsing
|
||||||
|
|
Loading…
Reference in New Issue