diff --git a/core/io/encodings/encodings-docs.factor b/core/io/encodings/encodings-docs.factor index 132da2bb70..6dfd94a2b9 100644 --- a/core/io/encodings/encodings-docs.factor +++ b/core/io/encodings/encodings-docs.factor @@ -3,7 +3,7 @@ IN: io.encodings ABOUT: "encodings" -ARTICLE: "encodings" "I/O encodings" +ARTICLE: "io.encodings" "I/O encodings" "Many streams deal with bytes, rather than Unicode code points, at some level. The translation between these two things is specified by an encoding. To abstract this away from the programmer, Factor provides a system where these streams are associated with an encoding which is always used when the stream is read from or written to. For most purposes, an encoding descriptor consisting of a symbol is all that is needed when initializing a stream." { $subsection "encodings-constructors" } { $subsection "encodings-descriptors" } @@ -37,17 +37,37 @@ HELP: ( stream-in stream-out encoding -- duplex ) { } related-words ARTICLE: "encodings-descriptors" "Encoding descriptors" -"An encoding descriptor is something which can be used for input or output streams to encode or decode files. It must conform to the " { $link "encodings-protocol" } ". Encodings which you can use include:" -{ $vocab-link "io.encodings.utf8" } -{ $vocab-link "io.encodings.ascii" } -{ $vocab-link "io.encodings.binary" } -{ $vocab-link "io.encodings.utf16" } ; +"An encoding descriptor is something which can be used for input or output streams to encode or decode files. It must conform to the " { $link "encodings-protocol" } ". Encodings which you can use are defined in the following vocabularies:" +$nl { $vocab-link "io.encodings.utf8" } +$nl { $vocab-link "io.encodings.ascii" } +$nl { $vocab-link "io.encodings.binary" } +$nl { $vocab-link "io.encodings.utf16" } ; ARTICLE: "encodings-protocol" "Encoding protocol" "An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again." { $subsection decode-step } +{ $subsection init-decoder } { $subsection encode-string } ; ARTICLE: "encodings-string" "Encoding and decoding strings" "Strings can be encoded and decoded with the following words:" { $subsection encode-string } ; + +HELP: decode-step ( buf char encoding -- ) +{ $values { "buf" "A string buffer which characters can be pushed to" } + { "char" "An octet which is read from a stream" } + { "encoding" "An encoding descriptor tuple" } } +{ $description "A single step in the decoding process must be defined for the decoding descriptor. When each octet is read, this word is called, and depending on the decoder's internal state, something may be pushed to the buffer or the state may change." } ; + +HELP: encode-string ( string encoding -- byte-array ) +{ $values { "string" "a string" } + { "encoding" "an encoding descriptor" } + { "byte-array" "an encoded byte-array" } } +{ $description "Encodes the string with the given encoding descriptor, outputting the result to a byte-array." } ; + +HELP: init-decoder ( stream encoding -- encoding ) +{ $values { "stream" "an input stream" } + { "encoding" "an encoding descriptor" } } +{ $description "Initializes the decoder tuple's state. The stream is exposed so that it can be read, eg for a BOM." } ; + +{ init-decoder decode-step encode-string } related-words diff --git a/core/io/streams/lines/lines-tests.factor b/core/io/encodings/encodings-tests.factor similarity index 86% rename from core/io/streams/lines/lines-tests.factor rename to core/io/encodings/encodings-tests.factor index e8ecc65526..73d2efa7d4 100755 --- a/core/io/streams/lines/lines-tests.factor +++ b/core/io/encodings/encodings-tests.factor @@ -1,9 +1,9 @@ -USING: io.streams.lines io.files io.streams.string io -tools.test kernel ; -IN: io.streams.lines.tests +USING: io.files io.streams.string io +tools.test kernel io.encodings.ascii ; +IN: io.streams.encodings.tests : ( resource -- stream ) - resource-path ; + resource-path ascii ; [ { } ] [ "/core/io/test/empty-file.txt" lines ] diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index bc4f21e8e2..8489c46d2e 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -3,9 +3,22 @@ USING: math kernel sequences sbufs vectors namespaces growable strings io classes continuations combinators io.styles io.streams.plain io.encodings.binary splitting -io.streams.duplex ; +io.streams.duplex byte-arrays ; IN: io.encodings +! The encoding descriptor protocol + +GENERIC: decode-step ( buf char encoding -- ) +M: object decode-step drop swap push ; + +GENERIC: init-decoder ( stream encoding -- encoding ) +M: tuple-class init-decoder construct-empty init-decoder ; +M: object init-decoder nip ; + +GENERIC: encode-string ( string encoding -- byte-array ) +M: tuple-class encode-string construct-empty encode-string ; +M: object encode-string drop >byte-array ; + ! Decoding TUPLE: decode-error ; @@ -21,19 +34,6 @@ SYMBOL: begin ! This is the replacement character HEX: fffd push-decoded ; -: finish-decoding ( buf ch state -- str ) - begin eq? [ decode-error ] unless drop "" like ; - -: start-decoding ( seq length -- buf ch state seq ) - 0 begin roll ; - -GENERIC: decode-step ( buf byte ch state encoding -- buf ch state ) - -: decode ( seq quot -- string ) - >r dup length start-decoding r> - [ -rot ] swap compose each - finish-decoding ; inline - : space ( resizable -- room-left ) dup underlying swap [ length ] 2apply - ; @@ -42,34 +42,34 @@ GENERIC: decode-step ( buf byte ch state encoding -- buf ch state ) : end-read-loop ( buf ch state stream quot -- string/f ) 2drop 2drop >string f like ; -: decode-read-loop ( buf ch state stream encoding -- string/f ) - >r >r pick r> r> rot full? [ end-read-loop ] [ +: decode-read-loop ( buf stream encoding -- string/f ) + pick full? [ 2drop >string ] [ over stream-read1 [ - -rot tuck >r >r >r -rot r> decode-step r> r> decode-read-loop - ] [ end-read-loop ] if* + -rot tuck >r >r >r dupd r> decode-step r> r> + decode-read-loop + ] [ 2drop >string f like ] if* ] if ; : decode-read ( length stream encoding -- string ) - >r swap >fixnum start-decoding r> - decode-read-loop ; + rot -rot decode-read-loop ; -TUPLE: decoded code cr ; +TUPLE: decoder code cr ; : ( stream encoding -- newstream ) dup binary eq? [ drop ] [ - construct-empty { set-delegate set-decoded-code } - decoded construct + dupd init-decoder { set-delegate set-decoder-code } + decoder construct ] if ; -: cr+ t swap set-decoded-cr ; inline +: cr+ t swap set-decoder-cr ; inline -: cr- f swap set-decoded-cr ; inline +: cr- f swap set-decoder-cr ; inline : line-ends/eof ( stream str -- str ) f like swap cr- ; inline : line-ends\r ( stream str -- str ) swap cr+ ; inline : line-ends\n ( stream str -- str ) - over decoded-cr over empty? and + over decoder-cr over empty? and [ drop dup cr- stream-readln ] [ swap cr- ] if ; inline : handle-readln ( stream str ch -- str ) @@ -80,43 +80,43 @@ TUPLE: decoded code cr ; } case ; : fix-read ( stream string -- string ) - over decoded-cr [ + over decoder-cr [ over cr- "\n" ?head [ swap stream-read1 [ add ] when* ] [ nip ] if ] [ nip ] if ; -M: decoded stream-read - tuck { delegate decoded-code } get-slots decode-read fix-read ; +M: decoder stream-read + tuck { delegate decoder-code } get-slots decode-read fix-read ; -M: decoded stream-read-partial stream-read ; +M: decoder stream-read-partial stream-read ; -: decoded-read-until ( stream delim -- ch ) +: decoder-read-until ( stream delim -- ch ) ! Copied from { c-reader stream-read-until }!!! over stream-read1 dup [ - dup pick memq? [ 2nip ] [ , decoded-read-until ] if + dup pick memq? [ 2nip ] [ , decoder-read-until ] if ] [ 2nip ] if ; -M: decoded stream-read-until +M: decoder stream-read-until ! Copied from { c-reader stream-read-until }!!! - [ swap decoded-read-until ] "" make + [ swap decoder-read-until ] "" make swap over empty? over not and [ 2drop f f ] when ; : fix-read1 ( stream char -- char ) - over decoded-cr [ + over decoder-cr [ over cr- dup CHAR: \n = [ drop stream-read1 ] [ nip ] if ] [ nip ] if ; -M: decoded stream-read1 +M: decoder stream-read1 1 swap stream-read f like [ first ] [ f ] if* ; -M: decoded stream-readln ( stream -- str ) +M: decoder stream-readln ( stream -- str ) "\r\n" over stream-read-until handle-readln ; ! Encoding @@ -125,41 +125,30 @@ TUPLE: encode-error ; : encode-error ( -- * ) \ encode-error construct-empty throw ; -TUPLE: encoded code ; +TUPLE: encoder code ; : ( stream encoding -- newstream ) dup binary eq? [ drop ] [ - construct-empty { set-delegate set-encoded-code } - encoded construct + construct-empty { set-delegate set-encoder-code } + encoder construct ] if ; -GENERIC: encode-string ( string encoding -- byte-array ) -M: tuple-class encode-string construct-empty encode-string ; - -M: encoded stream-write1 +M: encoder stream-write1 >r 1string r> stream-write ; -M: encoded stream-write - [ encoded-code encode-string ] keep delegate stream-write ; +M: encoder stream-write + [ encoder-code encode-string ] keep delegate stream-write ; -M: encoded dispose delegate dispose ; +M: encoder dispose delegate dispose ; -INSTANCE: encoded plain-writer +INSTANCE: encoder plain-writer ! Rebinding duplex streams which have not read anything yet : reencode ( stream encoding -- newstream ) - over encoded? [ >r delegate r> ] when ; + over encoder? [ >r delegate r> ] when ; : redecode ( stream encoding -- newstream ) - over decoded? [ >r delegate r> ] when ; + over decoder? [ >r delegate r> ] when ; : ( stream-in stream-out encoding -- duplex ) tuck reencode >r redecode r> ; - -! The null encoding does nothing -! (used to wrap things as line-reader/plain-writer) -! Later this will be replaced by inheritance - -TUPLE: null-encoding ; -M: null-encoding encode-string drop ; -M: null-encoding decode-step 3drop over push f f ; diff --git a/core/io/encodings/utf8/utf8-docs.factor b/core/io/encodings/utf8/utf8-docs.factor index 734a7f7236..3e901ec83b 100755 --- a/core/io/encodings/utf8/utf8-docs.factor +++ b/core/io/encodings/utf8/utf8-docs.factor @@ -6,6 +6,6 @@ ARTICLE: "io.encodings.utf8" "Working with UTF8-encoded data" { $subsection utf8 } ; HELP: utf8 -{ $class-description "This is the class of encoding tuples which denote a UTF-8 encoding. You can pass this class symbol as an encoding descriptor to words like " { $link } " and " { $link encode-string } "." } ; +{ $class-description "This is the class of encoding tuples which denote a UTF-8 encoding. You can pass this class symbol as an encoding descriptor to words like " { $link } " and " { $link encode-string } ". This conforms to the " { $link "encodings-protocol" } "." } ; ABOUT: "io.encodings.utf8" diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor index 2e7585b8a9..edc4663214 100644 --- a/core/io/encodings/utf8/utf8.factor +++ b/core/io/encodings/utf8/utf8.factor @@ -6,6 +6,8 @@ IN: io.encodings.utf8 ! Decoding UTF-8 +TUPLE: utf8 ch state ; + SYMBOL: double SYMBOL: triple SYMBOL: triple2 @@ -44,8 +46,16 @@ SYMBOL: quad3 { quad3 [ end-multibyte ] } } case ; -: decode-utf8 ( seq -- str ) - [ decode-utf8-step ] decode ; +: unpack-state ( encoding -- ch state ) + { utf8-ch utf8-state } get-slots ; + +: pack-state ( ch state encoding -- ) + { set-utf8-ch set-utf8-state } set-slots ; + +M: utf8 decode-step ( buf char encoding -- ) + [ unpack-state decode-utf8-step ] keep pack-state drop ; + +M: utf8 init-decoder nip begin over set-utf8-state ; ! Encoding UTF-8 @@ -75,10 +85,4 @@ SYMBOL: quad3 : encode-utf8 ( str -- seq ) [ [ char>utf8 ] each ] B{ } make ; -! Interface for streams - -TUPLE: utf8 ; - M: utf8 encode-string drop encode-utf8 ; -M: utf8 decode-step drop decode-utf8-step ; -! In the future, this should detect and ignore a BOM at the beginning diff --git a/core/io/io-docs.factor b/core/io/io-docs.factor index 0986196e8d..fd40950e62 100755 --- a/core/io/io-docs.factor +++ b/core/io/io-docs.factor @@ -100,7 +100,7 @@ $nl { $subsection "stream-protocol" } { $subsection "stdio" } { $subsection "stream-utils" } -{ $see-also "io.streams.string" "io.streams.lines" "io.streams.plain" "io.streams.duplex" } ; +{ $see-also "io.streams.string" "io.streams.plain" "io.streams.duplex" } ; ABOUT: "streams" diff --git a/core/io/streams/byte-array/byte-array-docs.factor b/core/io/streams/byte-array/byte-array-docs.factor new file mode 100644 index 0000000000..8e0b97e06b --- /dev/null +++ b/core/io/streams/byte-array/byte-array-docs.factor @@ -0,0 +1,33 @@ +USING: help.syntax help.markup io byte-arrays quotations ; +IN: io.streams.byte-array + +ABOUT: "io.streams.byte-array" + +ARTICLE: "io.streams.byte-array" "Byte-array streams" +"Byte array streams:" +{ $subsection } +{ $subsection } +"Utility combinators:" +{ $subsection with-byte-reader } +{ $subsection with-byte-writer } ; + +HELP: +{ $values { "byte-array" byte-array } + { "encoding" "an encoding descriptor" } } +{ $description "Provides an input stream reading off the given byte array using the given encoding." } ; + +HELP: +{ $values { "encoding" "an encoding descriptor" } + { "stream" "an output stream" } } +{ $description "Provides an output stream, putting things in the given encoding, storing everything written to it in a byte-array." } ; + +HELP: with-byte-reader +{ $values { "encoding" "an encoding descriptor" } + { "quot" quotation } { "byte-array" byte-array } } +{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to an input stream reading the byte array in the given encoding from beginning to end." } ; + +HELP: with-byte-writer +{ $values { "encoding" "an encoding descriptor" } + { "quot" quotation } + { "byte-array" byte-array } } +{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to a new byte array writer, putting things in the given encoding. The accumulated byte array is output when the quotation returns." } ; diff --git a/core/io/streams/string/string-docs.factor b/core/io/streams/string/string-docs.factor index e948d2162a..91ac244608 100644 --- a/core/io/streams/string/string-docs.factor +++ b/core/io/streams/string/string-docs.factor @@ -26,4 +26,4 @@ HELP: HELP: with-string-reader { $values { "str" string } { "quot" quotation } } -{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to an input stream reading " { $snippet "str" } " from beginning to end. The accumulated string is output when the quotation returns." } ; +{ $description "Calls the quotation in a new dynamic scope with " { $link stdio } " rebound to an input stream reading " { $snippet "str" } " from beginning to end." } ; diff --git a/core/io/streams/string/string.factor b/core/io/streams/string/string.factor index 15fc2b704e..7833e0aa47 100755 --- a/core/io/streams/string/string.factor +++ b/core/io/streams/string/string.factor @@ -50,7 +50,7 @@ M: growable stream-read-partial stream-read ; : ( str -- stream ) - >sbuf dup reverse-here null-encoding ; + >sbuf dup reverse-here f ; : with-string-reader ( str quot -- ) >r r> with-stream ; inline diff --git a/extra/help/handbook/handbook.factor b/extra/help/handbook/handbook.factor index 178b7a5d35..a078db8762 100755 --- a/extra/help/handbook/handbook.factor +++ b/extra/help/handbook/handbook.factor @@ -2,7 +2,7 @@ USING: help help.markup help.syntax help.definitions help.topics namespaces words sequences classes assocs vocabs kernel arrays prettyprint.backend kernel.private io generic math system strings sbufs vectors byte-arrays bit-arrays float-arrays -quotations ; +quotations io.streams.byte-array ; IN: help.handbook ARTICLE: "conventions" "Conventions" @@ -176,9 +176,9 @@ ARTICLE: "io" "Input and output" { $subsection "streams" } "Wrapper streams:" { $subsection "io.streams.duplex" } -{ $subsection "io.streams.lines" } { $subsection "io.streams.plain" } { $subsection "io.streams.string" } +{ $subsection "io.streams.byte-array" } "Utilities:" { $subsection "stream-binary" } { $subsection "styles" } @@ -187,6 +187,7 @@ ARTICLE: "io" "Input and output" { $subsection "io.mmap" } { $subsection "io.monitors" } { $heading "Other features" } +{ $subsection "io.encodings" } { $subsection "network-streams" } { $subsection "io.launcher" } { $subsection "io.timeouts" } ; diff --git a/extra/io/encodings/ascii/ascii.factor b/extra/io/encodings/ascii/ascii.factor index 0878a7a624..6dd8d23155 100644 --- a/extra/io/encodings/ascii/ascii.factor +++ b/extra/io/encodings/ascii/ascii.factor @@ -12,4 +12,4 @@ M: ascii encode-string drop 127 encode-check<= ; M: ascii decode-step - 3drop dup 127 >= [ encode-error ] when over push f f ; + drop dup 128 >= [ encode-error ] [ swap push ] if ; diff --git a/extra/io/encodings/latin1/latin1.factor b/extra/io/encodings/latin1/latin1.factor index b914589dc9..36e38caa1c 100755 --- a/extra/io/encodings/latin1/latin1.factor +++ b/extra/io/encodings/latin1/latin1.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: io io.encodings strings kernel io.encodings.ascii sequences ; +USING: io io.encodings strings kernel io.encodings.ascii sequences math ; IN: io.encodings.latin1 TUPLE: latin1 ; @@ -9,4 +9,4 @@ M: latin1 encode-string drop 255 encode-check<= ; M: latin1 decode-step - 3drop over push f f ; + drop dup 256 >= [ encode-error ] [ swap push ] if ; diff --git a/extra/io/encodings/utf16/utf16-docs.factor b/extra/io/encodings/utf16/utf16-docs.factor index c49c030ef3..018a15a534 100644 --- a/extra/io/encodings/utf16/utf16-docs.factor +++ b/extra/io/encodings/utf16/utf16-docs.factor @@ -1,45 +1,22 @@ USING: help.markup help.syntax io.encodings strings ; IN: io.encodings.utf16 -ARTICLE: "io.utf16" "Working with UTF16-encoded data" -"The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences." -{ $subsection encode-utf16le } -{ $subsection encode-utf16be } -{ $subsection decode-utf16le } -{ $subsection decode-utf16be } -"Support for UTF16 data with a byte order mark:" -{ $subsection encode-utf16 } -{ $subsection decode-utf16 } ; +ARTICLE: "utf16" "Working with UTF-16-encoded data" +"The UTF-16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences. There are three encoding descriptor classes for working with UTF-16, depending on endianness or the presence of a BOM:" +{ $subsection utf16le } +{ $subsection utf16be } +{ $subsection utf16 } +"All of these conform to the " { $link "encodings-protocol" } "." ; -ABOUT: "io.utf16" +ABOUT: "utf16" -HELP: decode-utf16 -{ $values { "seq" "a sequence of bytes" } { "str" string } } -{ $description "Decodes a sequence of bytes representing a Unicode string in UTF16 format. The bytes must begin with a UTF16 byte order mark, which determines if the input is in little or big endian. To decode data without a byte order mark, use " { $link decode-utf16le } " or " { $link decode-utf16be } "." } -{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; +HELP: utf16le +{ $class-description "The encoding protocol for UTF-16LE, that is, UTF-16 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ; -HELP: decode-utf16be -{ $values { "seq" "a sequence of bytes" } { "str" string } } -{ $description "Decodes a sequence of bytes representing a Unicode string in big endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." } -{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; +HELP: utf16be +{ $class-description "The encoding protocol for UTF-16BE, that is, UTF-16 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ; -HELP: decode-utf16le -{ $values { "seq" "a sequence of bytes" } { "str" string } } -{ $description "Decodes a sequence of bytes representing a Unicode string in little endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." } -{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; +HELP: utf16 +{ $class-description "The encoding protocol for UTF-16, that is, UTF-16 with a byte order mark. This is the most useful for general input and output in UTF-16. Streams can be made which read or write wth this encoding." } ; -{ decode-utf16 decode-utf16le decode-utf16be } related-words - -HELP: encode-utf16be -{ $values { "str" string } { "seq" "a sequence of bytes" } } -{ $description "Encodes a Unicode string as a sequence of bytes in big endian UTF16 format." } ; - -HELP: encode-utf16le -{ $values { "str" string } { "seq" "a sequence of bytes" } } -{ $description "Encodes a Unicode string as a sequence of bytes in little endian UTF16 format." } ; - -HELP: encode-utf16 -{ $values { "str" string } { "seq" "a sequence of bytes" } } -{ $description "Encodes a Unicode string as a sequence of bytes in UTF16 format with a byte order mark." } ; - -{ encode-utf16 encode-utf16be encode-utf16le } related-words +{ utf16 utf16le utf16be } related-words diff --git a/extra/io/encodings/utf16/utf16.factor b/extra/io/encodings/utf16/utf16.factor index 3e10dcba35..8815d588ad 100755 --- a/extra/io/encodings/utf16/utf16.factor +++ b/extra/io/encodings/utf16/utf16.factor @@ -4,6 +4,10 @@ USING: math kernel sequences sbufs vectors namespaces io.binary io.encodings combinators splitting io byte-arrays ; IN: io.encodings.utf16 +! UTF-16BE decoding + +TUPLE: utf16be ch state ; + SYMBOL: double SYMBOL: quad1 SYMBOL: quad2 @@ -40,8 +44,20 @@ SYMBOL: ignore { ignore [ 2drop push-replacement ] } } case ; -: decode-utf16be ( seq -- str ) - [ decode-utf16be-step ] decode ; +: unpack-state-be ( encoding -- ch state ) + { utf16be-ch utf16be-state } get-slots ; + +: pack-state-be ( ch state encoding -- ) + { set-utf16be-ch set-utf16be-state } set-slots ; + +M: utf16be decode-step + [ unpack-state-be decode-utf16be-step ] keep pack-state-be drop ; + +M: utf16be init-decoder nip begin over set-utf16be-state ; + +! UTF-16LE decoding + +TUPLE: utf16le ch state ; : handle-double ( buf byte ch -- buf ch state ) swap dup -3 shift BIN: 11011 = [ @@ -64,8 +80,18 @@ SYMBOL: ignore { quad3 [ handle-quad3le ] } } case ; -: decode-utf16le ( seq -- str ) - [ decode-utf16le-step ] decode ; +: unpack-state-le ( encoding -- ch state ) + { utf16le-ch utf16le-state } get-slots ; + +: pack-state-le ( ch state encoding -- ) + { set-utf16le-ch set-utf16le-state } set-slots ; + +M: utf16le decode-step + [ unpack-state-le decode-utf16le-step ] keep pack-state-le drop ; + +M: utf16le init-decoder nip begin over set-utf16le-state ; + +! UTF-16LE/BE encoding : encode-first -10 shift @@ -97,6 +123,11 @@ SYMBOL: ignore : encode-utf16le ( str -- seq ) [ [ char>utf16le ] each ] B{ } make ; +M: utf16le encode-string drop encode-utf16le ; +M: utf16be encode-string drop encode-utf16be ; + +! UTF-16 + : bom-le B{ HEX: ff HEX: fe } ; inline : bom-be B{ HEX: fe HEX: ff } ; inline @@ -108,40 +139,17 @@ SYMBOL: ignore : start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ; -: decode-utf16 ( seq -- str ) - { - { [ start-utf16le? ] [ decode-utf16le ] } - { [ start-utf16be? ] [ decode-utf16be ] } - { [ t ] [ decode-error ] } - } cond ; - -TUPLE: utf16le ; - -M: utf16le encode-string drop encode-utf16le ; -M: utf16le decode-step drop decode-utf16le-step ; - -TUPLE: utf16be ; - -M: utf16be encode-string drop encode-utf16be ; -M: utf16be decode-step drop decode-utf16be-step ; - -TUPLE: utf16 encoding ; +TUPLE: utf16 started? ; M: utf16 encode-string >r encode-utf16le r> - dup utf16-encoding [ drop ] - [ t swap set-utf16-encoding bom-le swap append ] if ; + dup utf16-started? [ drop ] + [ t swap set-utf16-started? bom-le swap append ] if ; : bom>le/be ( bom -- le/be ) dup bom-le sequence= [ drop utf16le ] [ bom-be sequence= [ utf16be ] [ decode-error ] if ] if ; -: read-bom ( utf16 -- encoding ) - 2 over delegate stream-read bom>le/be construct-empty - [ swap set-utf16-encoding ] keep ; - -M: utf16 decode-step - ! inefficient: checks if bom is done many times - ! This should transform itself into utf16be or utf16le after reading BOM - dup utf16-encoding [ ] [ read-bom ] ?if decode-step ; +M: utf16 init-decoder ( stream encoding -- newencoding ) + 2 rot stream-read bom>le/be construct-empty init-decoder ; diff --git a/extra/multiline/multiline.factor b/extra/multiline/multiline.factor index d32c11dd06..5baa205d15 100755 --- a/extra/multiline/multiline.factor +++ b/extra/multiline/multiline.factor @@ -38,3 +38,5 @@ IN: multiline : <" "\">" parse-multiline-string parsed ; parsing + +: /* "*/" parse-multiline-string drop ; parsing