Fixing encodings/XML

db4
Daniel Ehrenberg 2009-02-03 17:32:05 -06:00
parent a3e52f283b
commit 246c4bb13c
11 changed files with 163 additions and 87 deletions

View File

@ -3,31 +3,33 @@
USING: math.parser arrays io.encodings sequences kernel assocs
hashtables io.encodings.ascii generic parser classes.tuple words
words.symbol io io.files splitting namespaces math
compiler.units accessors ;
compiler.units accessors classes.singleton classes.mixin
io.encodings.iana ;
IN: io.encodings.8-bit
<PRIVATE
: mappings {
{ "latin1" "8859-1" }
{ "latin2" "8859-2" }
{ "latin3" "8859-3" }
{ "latin4" "8859-4" }
{ "latin/cyrillic" "8859-5" }
{ "latin/arabic" "8859-6" }
{ "latin/greek" "8859-7" }
{ "latin/hebrew" "8859-8" }
{ "latin5" "8859-9" }
{ "latin6" "8859-10" }
{ "latin/thai" "8859-11" }
{ "latin7" "8859-13" }
{ "latin8" "8859-14" }
{ "latin9" "8859-15" }
{ "latin10" "8859-16" }
{ "koi8-r" "KOI8-R" }
{ "windows-1252" "CP1252" }
{ "ebcdic" "CP037" }
{ "mac-roman" "ROMAN" }
! encoding-name iana-name file-name
{ "latin1" "ISO_8859-1:1987" "8859-1" }
{ "latin2" "ISO_8859-2:1987" "8859-2" }
{ "latin3" "ISO_8859-3:1988" "8859-3" }
{ "latin4" "ISO_8859-4:1988" "8859-4" }
{ "latin/cyrillic" "ISO_8859-5:1988" "8859-5" }
{ "latin/arabic" "ISO_8859-6:1987" "8859-6" }
{ "latin/greek" "ISO_8859-7:1987" "8859-7" }
{ "latin/hebrew" "ISO_8859-8:1988" "8859-8" }
{ "latin5" "ISO_8859-9:1989" "8859-9" }
{ "latin6" "ISO-8859-10" "8859-10" }
{ "latin/thai" "TIS-620" "8859-11" }
{ "latin7" "ISO-8859-13" "8859-13" }
{ "latin8" "ISO-8859-14" "8859-14" }
{ "latin9" "ISO-8859-15" "8859-15" }
{ "latin10" "ISO-8859-16" "8859-16" }
{ "koi8-r" "KOI8-R" "KOI8-R" }
{ "windows-1252" "windows-1252" "CP1252" }
{ "ebcdic" "IBM037" "CP037" }
{ "mac-roman" "macintosh" "ROMAN" }
} ;
: encoding-file ( file-name -- stream )
@ -65,8 +67,7 @@ M: 8-bit encode-char encode>> encode-8-bit ;
M: 8-bit decode-char decode>> decode-8-bit ;
PREDICATE: 8-bit-encoding < word
8-bit-encodings get-global key? ;
MIXIN: 8-bit-encoding
M: 8-bit-encoding <encoder>
8-bit-encodings get-global at <encoder> ;
@ -74,15 +75,21 @@ M: 8-bit-encoding <encoder>
M: 8-bit-encoding <decoder>
8-bit-encodings get-global at <decoder> ;
: create-encoding ( name -- word )
"io.encodings.8-bit" create
[ define-singleton-class ]
[ 8-bit-encoding add-mixin-instance ]
[ ] tri ;
PRIVATE>
[
mappings [
[ "io.encodings.8-bit" create ]
first3
[ create-encoding ]
[ dupd register-encoding ]
[ encoding-file parse-file 8-bit boa ]
bi*
] assoc-map
[ keys [ define-symbol ] each ]
[ 8-bit-encodings set-global ]
bi
tri*
] H{ } map>assoc
8-bit-encodings set-global
] with-compilation-unit

View File

@ -1,6 +1,6 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: io io.encodings kernel math io.encodings.private ;
USING: io io.encodings kernel math io.encodings.private io.encodings.iana ;
IN: io.encodings.ascii
<PRIVATE
@ -20,3 +20,5 @@ M: ascii encode-char
M: ascii decode-char
128 decode-if< ;
ascii "ANSI_X3.4-1968" register-encoding

View File

@ -3,11 +3,13 @@
USING: xml xml.data kernel io io.encodings interval-maps splitting fry
math.parser sequences combinators assocs locals accessors math
arrays values io.encodings.ascii ascii io.files biassocs math.order
combinators.short-circuit io.binary ;
combinators.short-circuit io.binary io.encodings.iana ;
IN: io.encodings.chinese
SINGLETON: gb18030
gb18030 "GB18030" register-encoding
<PRIVATE
! GB to mean GB18030 is a terrible abuse of notation

View File

@ -1,12 +1,35 @@
USING: help.syntax help.markup ;
USING: help.syntax help.markup strings ;
IN: io.encodings.iana
ABOUT: "io.encodings.iana"
ARTICLE: "io.encodings.iana" "IANA-registered encoding names"
"The " { $vocab-link "io.encodings.iana" } " vocabulary provides words for accessing the names of encodings and the encoding descriptors corresponding to names." $nl
"Most text encodings in common use have been registered with IANA. There is a standard set of names for each encoding. Simple conversion functions:"
{ $subsection name>encoding }
{ $subsection encoding>name }
"To let a new encoding be used with the above words, use the following:"
{ $subsection register-encoding }
"Exceptions when encodings or names are not found:"
{ $subsection missing-encoding }
{ $subsection missing-name } ;
HELP: missing-encoding
{ $error-description "The error called from " { $link name>encoding } " when there is no encoding descriptor registered corresponding to the given name." } ;
HELP: missing-name
{ $error-description "The error called from " { $link encoding>name } " when there is no name registered corresponding to the given encoding." } ;
HELP: name>encoding
{ $values { "name" "an encoding name" } { "encoding" "an encoding descriptor" } }
{ "Given an IANA-registered encoding name, find the encoding descriptor that represents it, or " { $code f } " if it is not found (either not implemented in Factor or not registered)." } ;
{ $description "Given an IANA-registered encoding name, find the encoding descriptor that represents it, or " { $code f } " if it is not found (either not implemented in Factor or not registered)." } ;
HELP: encoding>name
{ $values { "encoding" "an encoding descriptor" } { "name" "an encoding name" } }
{ "Given an encoding descriptor, return the preferred IANA name." } ;
{ $description "Given an encoding descriptor, return the preferred IANA name." } ;
{ name>encoding encoding>name } related-words
HELP: register-encoding
{ $values { "descriptor" "an encoding descriptor" } { "name" string } }
{ $description "Registers an encoding descriptor with the given name, available for lookup through " { $link name>encoding } " and " { $link encoding>name } ". IANA-registered aliases are automatically included. The name given must be the first name in the " { $snippet "resources:basis/io/encodings/iana/character-sets" } " file." } ;

View File

@ -1,5 +1,28 @@
USING: io.encodings.iana io.encodings.ascii tools.test ;
USING: io.encodings.iana io.encodings.iana.private
io.encodings.utf8 tools.test assocs ;
IN: io.encodings.iana.tests
[ ascii ] [ "US-ASCII" name>encoding ] unit-test
[ ascii ] [ "ASCII" name>encoding ] unit-test
[ "US-ASCII" ] [ ascii encoding>name ] unit-test
[ utf8 ] [ "UTF-8" name>encoding ] unit-test
[ utf8 ] [ "utf8" name>encoding ] unit-test
[ "UTF-8" ] [ utf8 encoding>name ] unit-test
! We will never implement EBCDIC-FI-SE-A
SINGLETON: ebcdic-fisea
ebcdic-fisea "EBCDIC-FI-SE-A" register-encoding
[ ebcdic-fisea ] [ "EBCDIC-FI-SE-A" name>encoding ] unit-test
[ ebcdic-fisea ] [ "csEBCDICFISEA" name>encoding ] unit-test
[ "EBCDIC-FI-SE-A" ] [ ebcdic-fisea encoding>name ] unit-test
! Clean up after myself
[ ] [
"EBCDIC-FI-SE-A" n>e-table delete-at
"csEBCDICFISEA" n>e-table delete-at
ebcdic-fisea e>n-table delete-at
] unit-test
[ "EBCDIC-FI-SE-A" name>encoding ] must-fail
[ "csEBCDICFISEA" name>encoding ] must-fail
[ ebcdic-fisea encoding>name ] must-fail
[ ebcdic-fisea "foobar" register-encoding ] must-fail
[ "foobar" name>encoding ] must-fail
[ ebcdic-fisea encoding>name ] must-fail

View File

@ -1,41 +1,24 @@
! Copyright (C) 2008 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: kernel strings values io.files assocs
splitting sequences io namespaces sets io.encodings.8-bit
io.encodings.ascii io.encodings.utf8 io.encodings.utf16
io.encodings.chinese io.encodings.japanese ;
splitting sequences io namespaces sets io.encodings.utf8 ;
IN: io.encodings.iana
<PRIVATE
VALUE: n>e-table
: e>n-table H{
{ ascii "US-ASCII" }
{ utf8 "UTF-8" }
{ utf16 "UTF-16" }
{ utf16be "UTF-16BE" }
{ utf16le "UTF-16LE" }
{ latin1 "ISO-8859-1" }
{ latin2 "ISO-8859-2" }
{ latin3 "ISO-8859-3" }
{ latin4 "ISO-8859-4" }
{ latin/cyrillic "ISO-8859-5" }
{ latin/arabic "ISO-8859-6" }
{ latin/greek "ISO-8859-7" }
{ latin/hebrew "ISO-8859-8" }
{ latin5 "ISO-8859-9" }
{ latin6 "ISO-8859-10" }
{ shift-jis "Shift_JIS" }
{ windows-31j "Windows-31J" }
{ gb18030 "GB18030" }
} ;
SYMBOL: n>e-table
SYMBOL: e>n-table
SYMBOL: aliases
PRIVATE>
ERROR: missing-encoding name ;
: name>encoding ( name -- encoding )
n>e-table at ;
dup n>e-table get-global at [ ] [ missing-encoding ] ?if ;
ERROR: missing-name encoding ;
: encoding>name ( encoding -- name )
e>n-table at ;
dup e>n-table get-global at [ ] [ missing-name ] ?if ;
<PRIVATE
: parse-iana ( stream -- synonym-set )
@ -43,24 +26,33 @@ PRIVATE>
[ " " split ] map
[ first { "Name:" "Alias:" } member? ] filter
[ second ] map { "None" } diff
] map ;
] map harvest ;
: more-aliases ( -- assoc )
: make-aliases ( stream -- n>e )
parse-iana [ [ first ] [ ] bi ] H{ } map>assoc ;
: initial-n>e ( -- assoc )
H{
{ "UTF8" utf8 }
{ "utf8" utf8 }
{ "utf-8" utf8 }
} ;
{ "UTF-8" utf8 }
} clone ;
: initial-e>n ( -- assoc )
H{ { utf8 "UTF-8" } } clone ;
: make-n>e ( stream -- n>e )
parse-iana [ [
dup [
e>n-table value-at
[ swap [ set ] with each ]
[ drop ] if*
] with each
] each ] H{ } make-assoc more-aliases assoc-union ;
PRIVATE>
"resource:basis/io/encodings/iana/character-sets"
ascii <file-reader> make-n>e to: n>e-table
utf8 <file-reader> make-aliases aliases set-global
n>e-table global [ initial-n>e or ] change-at
e>n-table global [ initial-e>n or ] change-at
: register-encoding ( descriptor name -- )
[
aliases get at [
[ n>e-table get-global set-at ] with each
] [ "Bad encoding registration" throw ] if*
] [ swap e>n-table get-global set-at ] 2bi ;

View File

@ -4,7 +4,7 @@ USING: help.markup help.syntax ;
IN: io.encodings.japanese
ARTICLE: "io.encodings.japanese" "Japanese text encodings"
"The " { $vocab-link "io.encodings.japanese" } " vocabulary implements Japanese-specific text encodings. Several encodings are used for Japanese text besides the standard UTF encodings for Unicode strings. These are mostly based on the character set defined in the JIS X 208 standard. Current coverage of encodings is incomplete."
"Several encodings are used for Japanese text besides the standard UTF encodings for Unicode strings. These are mostly based on the character set defined in the JIS X 208 standard. Current coverage of encodings is incomplete."
{ $subsection shift-jis }
{ $subsection windows-31j } ;

View File

@ -3,15 +3,29 @@
USING: sequences kernel io io.files combinators.short-circuit
math.order values assocs io.encodings io.binary fry strings
math io.encodings.ascii arrays accessors splitting math.parser
biassocs ;
biassocs io.encodings.iana ;
IN: io.encodings.japanese
VALUE: shift-jis
SINGLETON: shift-jis
VALUE: windows-31j
shift-jis "Shift_JIS" register-encoding
SINGLETON: windows-31j
windows-31j "Windows-31J" register-encoding
<PRIVATE
VALUE: shift-jis-table
M: shift-jis <encoder> drop shift-jis-table <encoder> ;
M: shift-jis <decoder> drop shift-jis-table <decoder> ;
VALUE: windows-31j-table
M: windows-31j <encoder> drop windows-31j-table <encoder> ;
M: windows-31j <decoder> drop windows-31j-table <decoder> ;
TUPLE: jis assoc ;
: <jis> ( assoc -- jis )
@ -31,10 +45,10 @@ TUPLE: jis assoc ;
ascii file-lines process-jis <jis> ;
"resource:basis/io/encodings/japanese/CP932.txt"
make-jis to: windows-31j
make-jis to: windows-31j-table
"resource:basis/io/encodings/japanese/sjis-0208-1997-std.txt"
make-jis to: shift-jis
make-jis to: shift-jis-table
: small? ( char -- ? )
! ASCII range or single-byte halfwidth katakana

View File

@ -1,15 +1,21 @@
! Copyright (C) 2006, 2008 Daniel Ehrenberg.
! Copyright (C) 2006, 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors namespaces io.binary
io.encodings combinators splitting io byte-arrays ;
io.encodings combinators splitting io byte-arrays io.encodings.iana ;
IN: io.encodings.utf16
SINGLETON: utf16be
utf16be "UTF-16BE" register-encoding
SINGLETON: utf16le
utf16le "UTF-16LE" register-encoding
SINGLETON: utf16
utf16 "UTF-16" register-encoding
ERROR: missing-bom ;
<PRIVATE

View File

@ -1,15 +1,21 @@
! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel io.encodings combinators io io.encodings.utf16
sequences io.binary ;
sequences io.binary io.encodings.iana ;
IN: io.encodings.utf32
SINGLETON: utf32be
utf32be "UTF-32BE" register-encoding
SINGLETON: utf32le
utf32le "UTF-32LE" register-encoding
SINGLETON: utf32
utf32 "UTF-32" register-encoding
<PRIVATE
! Decoding

View File

@ -1,4 +1,5 @@
USING: xml xml.data xml.utilities tools.test accessors kernel ;
USING: xml xml.data xml.utilities tools.test accessors kernel
io.encodings.8-bit ;
[ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test