From 2bc882bf5aabb65198a798fb645c65c90bceacf0 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Tue, 25 Mar 2008 22:45:26 -0400 Subject: [PATCH 1/3] XML reports its encoding as UTF-8 --- extra/xml/tests/errors.factor | 2 +- extra/xml/tests/templating.factor | 2 +- extra/xml/tests/test.factor | 6 +++--- extra/xml/tokenize/tokenize.factor | 2 +- extra/xml/utilities/utilities.factor | 2 +- extra/xml/xml.factor | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/extra/xml/tests/errors.factor b/extra/xml/tests/errors.factor index b421ae011a..6ba0b0d560 100755 --- a/extra/xml/tests/errors.factor +++ b/extra/xml/tests/errors.factor @@ -16,7 +16,7 @@ T{ extra-attrs T{ parsing-error f 1 32 } V{ T{ name f "" "foo" f } } T{ bad-version T{ parsing-error f 1 28 } "5 million" } "" xml-error-test T{ notags f } "" xml-error-test T{ multitags f } "" xml-error-test -T{ bad-prolog T{ parsing-error f 1 26 } T{ prolog f "1.0" "iso-8859-1" f } +T{ bad-prolog T{ parsing-error f 1 26 } T{ prolog f "1.0" "UTF-8" f } } "" xml-error-test T{ capitalized-prolog T{ parsing-error f 1 6 } "XmL" } "" xml-error-test diff --git a/extra/xml/tests/templating.factor b/extra/xml/tests/templating.factor index 6db98ec848..d81e807fe5 100644 --- a/extra/xml/tests/templating.factor +++ b/extra/xml/tests/templating.factor @@ -40,4 +40,4 @@ M: object (r-ref) drop ; sample-doc string>xml dup template xml>string ] with-scope ; -[ "foo
blah

" ] [ test-refs ] unit-test +[ "foo

" ] [ test-refs ] unit-test diff --git a/extra/xml/tests/test.factor b/extra/xml/tests/test.factor index 02c7aecb13..98146136e6 100644 --- a/extra/xml/tests/test.factor +++ b/extra/xml/tests/test.factor @@ -26,7 +26,7 @@ SYMBOL: xml-file ] unit-test [ V{ "fa&g" } ] [ xml-file get "x" get-id tag-children ] unit-test [ "that" ] [ xml-file get "this" swap at ] unit-test -[ "" ] +[ "" ] [ "" string>xml xml>string ] unit-test [ "abcd" ] [ "

abcd
" string>xml @@ -44,7 +44,7 @@ SYMBOL: xml-file at swap "z" >r tuck r> swap set-at T{ name f "blah" "z" f } swap at ] unit-test [ "foo" ] [ "" string>xml children>string ] unit-test -[ "bar baz" ] +[ "bar baz" ] [ "bar" string>xml [ " baz" append ] map xml>string ] unit-test -[ "\n\n bar\n" ] +[ "\n\n bar\n" ] [ " bar " string>xml pprint-xml>string ] unit-test diff --git a/extra/xml/tokenize/tokenize.factor b/extra/xml/tokenize/tokenize.factor index d99c306b2b..b2b7d78b3e 100644 --- a/extra/xml/tokenize/tokenize.factor +++ b/extra/xml/tokenize/tokenize.factor @@ -172,7 +172,7 @@ SYMBOL: ns-stack [ T{ name f "" "version" f } swap at [ good-version ] [ throw ] if* ] keep [ T{ name f "" "encoding" f } swap at - "iso-8859-1" or ] keep + "UTF-8" or ] keep T{ name f "" "standalone" f } swap at [ yes/no>bool ] [ f ] if* ; diff --git a/extra/xml/utilities/utilities.factor b/extra/xml/utilities/utilities.factor index d6814851ee..b397e3c7b1 100755 --- a/extra/xml/utilities/utilities.factor +++ b/extra/xml/utilities/utilities.factor @@ -42,7 +42,7 @@ M: process-missing error. >r 1array r> build-tag* ; : standard-prolog ( -- prolog ) - T{ prolog f "1.0" "iso-8859-1" f } ; + T{ prolog f "1.0" "UTF-8" f } ; : build-xml ( tag -- xml ) standard-prolog { } rot { } ; diff --git a/extra/xml/xml.factor b/extra/xml/xml.factor index 970ff39cf1..61ef27b72e 100644 --- a/extra/xml/xml.factor +++ b/extra/xml/xml.factor @@ -63,7 +63,7 @@ M: closer process V{ } clone xml-stack set f push-xml ; : default-prolog ( -- prolog ) - "1.0" "iso-8859-1" f ; + "1.0" "UTF-8" f ; : reset-prolog ( -- ) default-prolog prolog-data set ; From 64203f762d23849b23f0421f20b6123bcd0e6665 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 26 Mar 2008 14:41:09 -0400 Subject: [PATCH 2/3] Docs improvements; simplification of design of io.encodings.8-bit --- core/io/encodings/encodings-docs.factor | 15 +++++--- extra/help/handbook/handbook.factor | 11 ++++-- extra/io/encodings/8-bit/8-bit-docs.factor | 8 +++-- extra/io/encodings/8-bit/8-bit.factor | 41 ++++++++-------------- extra/io/encodings/utf16/utf16-docs.factor | 12 ++++--- 5 files changed, 47 insertions(+), 40 deletions(-) diff --git a/core/io/encodings/encodings-docs.factor b/core/io/encodings/encodings-docs.factor index 07e0f9f401..bdd9e56d87 100644 --- a/core/io/encodings/encodings-docs.factor +++ b/core/io/encodings/encodings-docs.factor @@ -19,20 +19,23 @@ HELP: { $values { "stream" "an output stream" } { "encoding" "an encoding descriptor" } { "newstream" "an encoded output stream" } } -{ $description "Wraps the given stream in a new stream using the given encoding for all output. The encoding descriptor can either be a class or an instance of something conforming to the " { $link "encodings-protocol" } "." } ; +{ $description "Wraps the given stream in a new stream using the given encoding for all output. The encoding descriptor can either be a class or an instance of something conforming to the " { $link "encodings-protocol" } "." } +$low-level-note ; HELP: { $values { "stream" "an input stream" } { "encoding" "an encoding descriptor" } { "newstream" "an encoded output stream" } } -{ $description "Wraps the given stream in a new stream using the given encoding for all input. The encoding descriptor can either be a class or an instance of something conforming to the " { $link "encodings-protocol" } "." } ; +{ $description "Wraps the given stream in a new stream using the given encoding for all input. The encoding descriptor can either be a class or an instance of something conforming to the " { $link "encodings-protocol" } "." } +$low-level-note ; HELP: { $values { "stream-in" "an input stream" } { "stream-out" "an output stream" } { "encoding" "an encoding descriptor" } { "duplex" "an encoded duplex stream" } } -{ $description "Wraps the given streams in an encoder or decoder stream, and puts them together in a duplex stream for input and output. If either input stream is already encoded, that encoding is stripped off before it is reencoded. The encoding descriptor must conform to the " { $link "encodings-protocol" } "." } ; +{ $description "Wraps the given streams in an encoder or decoder stream, and puts them together in a duplex stream for input and output. If either input stream is already encoded, that encoding is stripped off before it is reencoded. The encoding descriptor must conform to the " { $link "encodings-protocol" } "." } +$low-level-note ; { } related-words @@ -58,12 +61,14 @@ ARTICLE: "encodings-protocol" "Encoding protocol" HELP: decode-char { $values { "stream" "an underlying input stream" } { "encoding" "An encoding descriptor tuple" } { "char/f" "a code point or " { $link f } } } -{ $description "Reads a single code point from the underlying stream, interpreting it by the encoding. This should not be used directly." } ; +{ $contract "Reads a single code point from the underlying stream, interpreting it by the encoding." } +$low-level-note ; HELP: encode-char { $values { "char" "a character" } { "stream" "an underlying output stream" } { "encoding" "an encoding descriptor" } } -{ $description "Writes the code point in the encoding to the underlying stream given. This should not be used directly." } ; +{ $contract "Writes the code point in the encoding to the underlying stream given." } +$low-level-note ; { encode-char decode-char } related-words diff --git a/extra/help/handbook/handbook.factor b/extra/help/handbook/handbook.factor index 4079386d7f..8963c2b1ad 100755 --- a/extra/help/handbook/handbook.factor +++ b/extra/help/handbook/handbook.factor @@ -178,9 +178,16 @@ ARTICLE: "encodings-introduction" "An introduction to encodings" "Not all encodings can represent all Unicode code points, but Unicode can represent basically everything that exists in modern encodings. Some encodings are language-specific, and some can represent everything in Unicode. Though the world is moving toward Unicode and UTF-8, the reality today is that there are several encodings which must be taken into account." $nl "Factor uses a system of encoding descriptors to denote encodings. Encoding descriptors are objects which describe encodings. Examples are " { $link utf8 } ", " { $link ascii } " and " { $link binary } ". Encoding descriptors can be passed around independently. Each encoding descriptor has some method for constructing an encoded or decoded stream, and the resulting stream has an encoding descriptor stored which has methods for reading or writing characters." $nl "Constructors for streams which deal with bytes usually take an encoding as an explicit parameter. For example, to open a text file for reading whose contents are in UTF-8, use the following" -{ $code "\"filename\" utf8 " } +{ $code "\"file.txt\" utf8 " } "If there is an error in the encoded stream, a replacement character (0xFFFD) will be inserted. To throw an exception upon error, use a strict encoding as follows" -{ $code "\"filename\" utf8 strict " } ; +{ $code "\"file.txt\" utf8 strict " } +"In a similar way, encodings can be specified when opening a file for writing." +{ $code "\"file.txt\" ascii " } +"An encoding is also needed for some words that don't return streams, such as " { $link file-contents } ", for example" +{ $code "\"file.txt\" utf16 file-contents" } +"Encoding descriptors are also used by " { $link "io.streams.byte-array" } " and taken by combinators like " { $link with-file-writer } " and " { $link with-byte-reader } " which deal with streams. It is " { $emphasis "not" } " used with " { $link "io.streams.string" } " because these deal with abstract text." +$nl +"When the " { $link binary } " encoding is used, a " { $link byte-array } " is expected for writing and returned for reading, since the stream deals with bytes. All other encodings deal with strings, since they are used to represent text." ; ARTICLE: "io" "Input and output" { $heading "Streams" } diff --git a/extra/io/encodings/8-bit/8-bit-docs.factor b/extra/io/encodings/8-bit/8-bit-docs.factor index 8e5fd815bc..e8dadc13f7 100644 --- a/extra/io/encodings/8-bit/8-bit-docs.factor +++ b/extra/io/encodings/8-bit/8-bit-docs.factor @@ -24,14 +24,18 @@ ARTICLE: "io.encodings.8-bit" "8-bit encodings" { $subsection windows-1252 } { $subsection ebcdic } { $subsection mac-roman } -"Other encodings can be defined using the following utility" +"Words used in defining these" +{ $subsection 8-bit } { $subsection define-8-bit-encoding } ; ABOUT: "io.encodings.8-bit" +HELP: 8-bit +{ $class-description "Describes an 8-bit encoding, including its name (a symbol) and a table used for encoding and decoding." } ; + HELP: define-8-bit-encoding { $values { "name" "a string" } { "path" "a path" } } -{ $description "Creates a new encoding with the given name, using the resource file at the path to tell how to encode and decode octets. The resource file should be in a similar format to those at ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/" } ; +{ $description "Creates a new encoding with the given name, using the resource file at the path to tell how to encode and decode octets. The resource file should be in a similar format to those at " { $url "ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/" } } ; HELP: latin1 { $description "This is the ISO-8859-1 encoding, also called Latin-1: Western European. It is an 8-bit superset of ASCII which is the default for a mimetype starting with 'text' and provides the characters necessary for most western European languages." } diff --git a/extra/io/encodings/8-bit/8-bit.factor b/extra/io/encodings/8-bit/8-bit.factor index c041e699a2..2e33075df0 100644 --- a/extra/io/encodings/8-bit/8-bit.factor +++ b/extra/io/encodings/8-bit/8-bit.factor @@ -3,7 +3,7 @@ USING: math.parser arrays io.encodings sequences kernel assocs hashtables io.encodings.ascii combinators.cleave generic parser tuples words io io.files splitting namespaces -classes quotations math compiler.units ; +classes quotations math compiler.units accessors ; IN: io.encodings.8-bit ch ] [ ch>byte ] bi ; -: empty-tuple-class ( string -- class ) - in get create - dup { f } "slots" set-word-prop - dup predicate-word drop - dup { } define-tuple-class ; +TUPLE: 8-bit name decode encode ; -: data-quot ( class word data -- quot ) - >r [ word-name ] 2apply "/" swap 3append - "/data" append in get create dup 1quotation swap r> - 1quotation define ; +: encode-8-bit ( char stream assoc -- ) + swapd at* [ encode-error ] unless swap stream-write1 ; -: method-with-data ( class data word quot -- ) - >r swap >r 2dup r> data-quot r> - compose >r create-method r> define ; +M: 8-bit encode-char + encode>> encode-8-bit ; -: encode-8-bit ( char stream encoding assoc -- ) - nip swapd at* [ encode-error ] unless swap stream-write1 ; - -: define-encode-char ( class assoc -- ) - \ encode-char [ encode-8-bit ] method-with-data ; - -: decode-8-bit ( stream encoding array -- char/f ) - nip swap stream-read1 +: decode-8-bit ( stream array -- char/f ) + swap stream-read1 dup [ swap nth [ replacement-char ] unless* ] - [ drop f ] if* ; + [ nip ] if ; -: define-decode-char ( class array -- ) - \ decode-char [ decode-8-bit ] method-with-data ; +M: 8-bit decode-char + decode>> decode-8-bit ; -: 8-bit-methods ( class byte>ch ch>byte -- ) - >r over r> define-encode-char define-decode-char ; +: make-8-bit ( word byte>ch ch>byte -- ) + [ 8-bit construct-boa ] 2curry dupd curry define ; : define-8-bit-encoding ( name path -- ) - >r empty-tuple-class r> parse-file 8-bit-methods ; + >r in get create r> parse-file make-8-bit ; PRIVATE> diff --git a/extra/io/encodings/utf16/utf16-docs.factor b/extra/io/encodings/utf16/utf16-docs.factor index 7198cb2b27..bc0e943415 100644 --- a/extra/io/encodings/utf16/utf16-docs.factor +++ b/extra/io/encodings/utf16/utf16-docs.factor @@ -11,15 +11,19 @@ ARTICLE: "io.encodings.utf16" "UTF-16" ABOUT: "io.encodings.utf16" HELP: utf16le -{ $class-description "The encoding descriptor for UTF-16LE, that is, UTF-16 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ; +{ $class-description "The encoding descriptor for UTF-16LE, that is, UTF-16 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." } +{ $see-also "encodings-introduction" } ; HELP: utf16be -{ $class-description "The encoding descriptor for UTF-16BE, that is, UTF-16 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ; +{ $class-description "The encoding descriptor for UTF-16BE, that is, UTF-16 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." } +{ $see-also "encodings-introduction" } ; HELP: utf16 -{ $class-description "The encoding descriptor for UTF-16, that is, UTF-16 with a byte order mark. This is the most useful for general input and output in UTF-16. Streams can be made which read or write wth this encoding." } ; +{ $class-description "The encoding descriptor for UTF-16, that is, UTF-16 with a byte order mark. This is the most useful for general input and output in UTF-16. Streams can be made which read or write wth this encoding." } +{ $see-also "encodings-introduction" } ; HELP: utf16n -{ $class-description "The encoding descriptor for UTF-16 without a byte order mark in native endian order. This is useful mostly for FFI calls which take input of strings in of wide_t*." } ; +{ $class-description "The encoding descriptor for UTF-16 without a byte order mark in native endian order. This is useful mostly for FFI calls which take input of strings in of wide_t*." } +{ $see-also "encodings-introduction" } ; { utf16 utf16le utf16be utf16n } related-words From caf3ebb31d9278970cf78bdcc80e98f0f320c121 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Thu, 27 Mar 2008 00:32:41 -0400 Subject: [PATCH 3/3] Fixing 8-bit encodings --- extra/io/encodings/8-bit/8-bit.factor | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/extra/io/encodings/8-bit/8-bit.factor b/extra/io/encodings/8-bit/8-bit.factor index 2e33075df0..d29760a3e0 100644 --- a/extra/io/encodings/8-bit/8-bit.factor +++ b/extra/io/encodings/8-bit/8-bit.factor @@ -3,7 +3,7 @@ USING: math.parser arrays io.encodings sequences kernel assocs hashtables io.encodings.ascii combinators.cleave generic parser tuples words io io.files splitting namespaces -classes quotations math compiler.units accessors ; +math compiler.units accessors ; IN: io.encodings.8-bit ] map ] map ; + [ "\t" split 2 head [ 2 tail-if hex> ] map ] map ; : byte>ch ( assoc -- array ) 256 replacement-char @@ -77,4 +77,8 @@ M: 8-bit decode-char PRIVATE> -[ mappings [ full-path define-8-bit-encoding ] assoc-each ] with-compilation-unit +[ + "io.encodings.8-bit" in [ + mappings [ full-path define-8-bit-encoding ] assoc-each + ] with-variable +] with-compilation-unit