From 246c4bb13cf67bacc88b7c05a77cb42e9d703660 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Tue, 3 Feb 2009 17:32:05 -0600 Subject: [PATCH] Fixing encodings/XML --- basis/io/encodings/8-bit/8-bit.factor | 63 +++++++++-------- basis/io/encodings/ascii/ascii.factor | 4 +- basis/io/encodings/chinese/chinese.factor | 4 +- basis/io/encodings/iana/iana-docs.factor | 29 +++++++- basis/io/encodings/iana/iana-tests.factor | 31 ++++++-- basis/io/encodings/iana/iana.factor | 70 ++++++++----------- .../encodings/japanese/japanese-docs.factor | 2 +- basis/io/encodings/japanese/japanese.factor | 24 +++++-- basis/io/encodings/utf16/utf16.factor | 10 ++- basis/io/encodings/utf32/utf32.factor | 8 ++- basis/xml/tests/encodings.factor | 5 +- 11 files changed, 163 insertions(+), 87 deletions(-) diff --git a/basis/io/encodings/8-bit/8-bit.factor b/basis/io/encodings/8-bit/8-bit.factor index f5b102ba31..a11edeb703 100644 --- a/basis/io/encodings/8-bit/8-bit.factor +++ b/basis/io/encodings/8-bit/8-bit.factor @@ -3,31 +3,33 @@ USING: math.parser arrays io.encodings sequences kernel assocs hashtables io.encodings.ascii generic parser classes.tuple words words.symbol io io.files splitting namespaces math -compiler.units accessors ; +compiler.units accessors classes.singleton classes.mixin +io.encodings.iana ; IN: io.encodings.8-bit > encode-8-bit ; M: 8-bit decode-char decode>> decode-8-bit ; -PREDICATE: 8-bit-encoding < word - 8-bit-encodings get-global key? ; +MIXIN: 8-bit-encoding M: 8-bit-encoding 8-bit-encodings get-global at ; @@ -74,15 +75,21 @@ M: 8-bit-encoding M: 8-bit-encoding 8-bit-encodings get-global at ; +: create-encoding ( name -- word ) + "io.encodings.8-bit" create + [ define-singleton-class ] + [ 8-bit-encoding add-mixin-instance ] + [ ] tri ; + PRIVATE> [ mappings [ - [ "io.encodings.8-bit" create ] + first3 + [ create-encoding ] + [ dupd register-encoding ] [ encoding-file parse-file 8-bit boa ] - bi* - ] assoc-map - [ keys [ define-symbol ] each ] - [ 8-bit-encodings set-global ] - bi + tri* + ] H{ } map>assoc + 8-bit-encodings set-global ] with-compilation-unit diff --git a/basis/io/encodings/ascii/ascii.factor b/basis/io/encodings/ascii/ascii.factor index d971cf2e60..deb1a7121f 100644 --- a/basis/io/encodings/ascii/ascii.factor +++ b/basis/io/encodings/ascii/ascii.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: io io.encodings kernel math io.encodings.private ; +USING: io io.encodings kernel math io.encodings.private io.encodings.iana ; IN: io.encodings.ascii encoding } +{ $subsection encoding>name } +"To let a new encoding be used with the above words, use the following:" +{ $subsection register-encoding } +"Exceptions when encodings or names are not found:" +{ $subsection missing-encoding } +{ $subsection missing-name } ; + +HELP: missing-encoding +{ $error-description "The error called from " { $link name>encoding } " when there is no encoding descriptor registered corresponding to the given name." } ; + +HELP: missing-name +{ $error-description "The error called from " { $link encoding>name } " when there is no name registered corresponding to the given encoding." } ; + HELP: name>encoding { $values { "name" "an encoding name" } { "encoding" "an encoding descriptor" } } -{ "Given an IANA-registered encoding name, find the encoding descriptor that represents it, or " { $code f } " if it is not found (either not implemented in Factor or not registered)." } ; +{ $description "Given an IANA-registered encoding name, find the encoding descriptor that represents it, or " { $code f } " if it is not found (either not implemented in Factor or not registered)." } ; HELP: encoding>name { $values { "encoding" "an encoding descriptor" } { "name" "an encoding name" } } -{ "Given an encoding descriptor, return the preferred IANA name." } ; +{ $description "Given an encoding descriptor, return the preferred IANA name." } ; { name>encoding encoding>name } related-words + +HELP: register-encoding +{ $values { "descriptor" "an encoding descriptor" } { "name" string } } +{ $description "Registers an encoding descriptor with the given name, available for lookup through " { $link name>encoding } " and " { $link encoding>name } ". IANA-registered aliases are automatically included. The name given must be the first name in the " { $snippet "resources:basis/io/encodings/iana/character-sets" } " file." } ; diff --git a/basis/io/encodings/iana/iana-tests.factor b/basis/io/encodings/iana/iana-tests.factor index 8cee07b984..5ffcc161d4 100644 --- a/basis/io/encodings/iana/iana-tests.factor +++ b/basis/io/encodings/iana/iana-tests.factor @@ -1,5 +1,28 @@ -USING: io.encodings.iana io.encodings.ascii tools.test ; +USING: io.encodings.iana io.encodings.iana.private +io.encodings.utf8 tools.test assocs ; +IN: io.encodings.iana.tests -[ ascii ] [ "US-ASCII" name>encoding ] unit-test -[ ascii ] [ "ASCII" name>encoding ] unit-test -[ "US-ASCII" ] [ ascii encoding>name ] unit-test +[ utf8 ] [ "UTF-8" name>encoding ] unit-test +[ utf8 ] [ "utf8" name>encoding ] unit-test +[ "UTF-8" ] [ utf8 encoding>name ] unit-test + +! We will never implement EBCDIC-FI-SE-A +SINGLETON: ebcdic-fisea +ebcdic-fisea "EBCDIC-FI-SE-A" register-encoding +[ ebcdic-fisea ] [ "EBCDIC-FI-SE-A" name>encoding ] unit-test +[ ebcdic-fisea ] [ "csEBCDICFISEA" name>encoding ] unit-test +[ "EBCDIC-FI-SE-A" ] [ ebcdic-fisea encoding>name ] unit-test + +! Clean up after myself +[ ] [ + "EBCDIC-FI-SE-A" n>e-table delete-at + "csEBCDICFISEA" n>e-table delete-at + ebcdic-fisea e>n-table delete-at +] unit-test +[ "EBCDIC-FI-SE-A" name>encoding ] must-fail +[ "csEBCDICFISEA" name>encoding ] must-fail +[ ebcdic-fisea encoding>name ] must-fail + +[ ebcdic-fisea "foobar" register-encoding ] must-fail +[ "foobar" name>encoding ] must-fail +[ ebcdic-fisea encoding>name ] must-fail diff --git a/basis/io/encodings/iana/iana.factor b/basis/io/encodings/iana/iana.factor index 5e192025fc..a56bd1194b 100644 --- a/basis/io/encodings/iana/iana.factor +++ b/basis/io/encodings/iana/iana.factor @@ -1,41 +1,24 @@ ! Copyright (C) 2008 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. USING: kernel strings values io.files assocs -splitting sequences io namespaces sets io.encodings.8-bit -io.encodings.ascii io.encodings.utf8 io.encodings.utf16 -io.encodings.chinese io.encodings.japanese ; +splitting sequences io namespaces sets io.encodings.utf8 ; IN: io.encodings.iana e-table - -: e>n-table H{ - { ascii "US-ASCII" } - { utf8 "UTF-8" } - { utf16 "UTF-16" } - { utf16be "UTF-16BE" } - { utf16le "UTF-16LE" } - { latin1 "ISO-8859-1" } - { latin2 "ISO-8859-2" } - { latin3 "ISO-8859-3" } - { latin4 "ISO-8859-4" } - { latin/cyrillic "ISO-8859-5" } - { latin/arabic "ISO-8859-6" } - { latin/greek "ISO-8859-7" } - { latin/hebrew "ISO-8859-8" } - { latin5 "ISO-8859-9" } - { latin6 "ISO-8859-10" } - { shift-jis "Shift_JIS" } - { windows-31j "Windows-31J" } - { gb18030 "GB18030" } -} ; +SYMBOL: n>e-table +SYMBOL: e>n-table +SYMBOL: aliases PRIVATE> +ERROR: missing-encoding name ; + : name>encoding ( name -- encoding ) - n>e-table at ; + dup n>e-table get-global at [ ] [ missing-encoding ] ?if ; + +ERROR: missing-name encoding ; : encoding>name ( encoding -- name ) - e>n-table at ; + dup e>n-table get-global at [ ] [ missing-name ] ?if ; [ " " split ] map [ first { "Name:" "Alias:" } member? ] filter [ second ] map { "None" } diff - ] map ; + ] map harvest ; -: more-aliases ( -- assoc ) +: make-aliases ( stream -- n>e ) + parse-iana [ [ first ] [ ] bi ] H{ } map>assoc ; + +: initial-n>e ( -- assoc ) H{ { "UTF8" utf8 } { "utf8" utf8 } { "utf-8" utf8 } - } ; + { "UTF-8" utf8 } + } clone ; + +: initial-e>n ( -- assoc ) + H{ { utf8 "UTF-8" } } clone ; -: make-n>e ( stream -- n>e ) - parse-iana [ [ - dup [ - e>n-table value-at - [ swap [ set ] with each ] - [ drop ] if* - ] with each - ] each ] H{ } make-assoc more-aliases assoc-union ; PRIVATE> "resource:basis/io/encodings/iana/character-sets" -ascii make-n>e to: n>e-table +utf8 make-aliases aliases set-global + +n>e-table global [ initial-n>e or ] change-at +e>n-table global [ initial-e>n or ] change-at + +: register-encoding ( descriptor name -- ) + [ + aliases get at [ + [ n>e-table get-global set-at ] with each + ] [ "Bad encoding registration" throw ] if* + ] [ swap e>n-table get-global set-at ] 2bi ; diff --git a/basis/io/encodings/japanese/japanese-docs.factor b/basis/io/encodings/japanese/japanese-docs.factor index e34f5736f2..48f94af7b4 100644 --- a/basis/io/encodings/japanese/japanese-docs.factor +++ b/basis/io/encodings/japanese/japanese-docs.factor @@ -4,7 +4,7 @@ USING: help.markup help.syntax ; IN: io.encodings.japanese ARTICLE: "io.encodings.japanese" "Japanese text encodings" -"The " { $vocab-link "io.encodings.japanese" } " vocabulary implements Japanese-specific text encodings. Several encodings are used for Japanese text besides the standard UTF encodings for Unicode strings. These are mostly based on the character set defined in the JIS X 208 standard. Current coverage of encodings is incomplete." +"Several encodings are used for Japanese text besides the standard UTF encodings for Unicode strings. These are mostly based on the character set defined in the JIS X 208 standard. Current coverage of encodings is incomplete." { $subsection shift-jis } { $subsection windows-31j } ; diff --git a/basis/io/encodings/japanese/japanese.factor b/basis/io/encodings/japanese/japanese.factor index 3a66181db1..e3257ad63e 100644 --- a/basis/io/encodings/japanese/japanese.factor +++ b/basis/io/encodings/japanese/japanese.factor @@ -3,15 +3,29 @@ USING: sequences kernel io io.files combinators.short-circuit math.order values assocs io.encodings io.binary fry strings math io.encodings.ascii arrays accessors splitting math.parser -biassocs ; +biassocs io.encodings.iana ; IN: io.encodings.japanese -VALUE: shift-jis +SINGLETON: shift-jis -VALUE: windows-31j +shift-jis "Shift_JIS" register-encoding + +SINGLETON: windows-31j + +windows-31j "Windows-31J" register-encoding drop shift-jis-table ; +M: shift-jis drop shift-jis-table ; + +VALUE: windows-31j-table + +M: windows-31j drop windows-31j-table ; +M: windows-31j drop windows-31j-table ; + TUPLE: jis assoc ; : ( assoc -- jis ) @@ -31,10 +45,10 @@ TUPLE: jis assoc ; ascii file-lines process-jis ; "resource:basis/io/encodings/japanese/CP932.txt" -make-jis to: windows-31j +make-jis to: windows-31j-table "resource:basis/io/encodings/japanese/sjis-0208-1997-std.txt" -make-jis to: shift-jis +make-jis to: shift-jis-table : small? ( char -- ? ) ! ASCII range or single-byte halfwidth katakana diff --git a/basis/io/encodings/utf16/utf16.factor b/basis/io/encodings/utf16/utf16.factor index f8a6434d90..d61c07f806 100644 --- a/basis/io/encodings/utf16/utf16.factor +++ b/basis/io/encodings/utf16/utf16.factor @@ -1,15 +1,21 @@ -! Copyright (C) 2006, 2008 Daniel Ehrenberg. +! Copyright (C) 2006, 2009 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: math kernel sequences sbufs vectors namespaces io.binary -io.encodings combinators splitting io byte-arrays ; +io.encodings combinators splitting io byte-arrays io.encodings.iana ; IN: io.encodings.utf16 SINGLETON: utf16be +utf16be "UTF-16BE" register-encoding + SINGLETON: utf16le +utf16le "UTF-16LE" register-encoding + SINGLETON: utf16 +utf16 "UTF-16" register-encoding + ERROR: missing-bom ; xml children>string ] unit-test [ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test @@ -11,4 +12,4 @@ USING: xml xml.data xml.utilities tools.test accessors kernel ; [ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le-bom.xml" file>xml children>string ] unit-test [ "\u0000e9" ] [ "resource:basis/xml/tests/prologless.xml" file>xml children>string ] unit-test [ "e" ] [ "resource:basis/xml/tests/ascii.xml" file>xml children>string ] unit-test -[ "\u0000e9" "x" ] [ "resource:basis/xml/tests/unitag.xml" file>xml [ name>> main>> ] [ children>string ] bi ] unit-test \ No newline at end of file +[ "\u0000e9" "x" ] [ "resource:basis/xml/tests/unitag.xml" file>xml [ name>> main>> ] [ children>string ] bi ] unit-test