Fixing encodings/XML

db4
Daniel Ehrenberg 2009-02-03 17:32:05 -06:00
parent a3e52f283b
commit 246c4bb13c
11 changed files with 163 additions and 87 deletions

View File

@ -3,31 +3,33 @@
USING: math.parser arrays io.encodings sequences kernel assocs USING: math.parser arrays io.encodings sequences kernel assocs
hashtables io.encodings.ascii generic parser classes.tuple words hashtables io.encodings.ascii generic parser classes.tuple words
words.symbol io io.files splitting namespaces math words.symbol io io.files splitting namespaces math
compiler.units accessors ; compiler.units accessors classes.singleton classes.mixin
io.encodings.iana ;
IN: io.encodings.8-bit IN: io.encodings.8-bit
<PRIVATE <PRIVATE
: mappings { : mappings {
{ "latin1" "8859-1" } ! encoding-name iana-name file-name
{ "latin2" "8859-2" } { "latin1" "ISO_8859-1:1987" "8859-1" }
{ "latin3" "8859-3" } { "latin2" "ISO_8859-2:1987" "8859-2" }
{ "latin4" "8859-4" } { "latin3" "ISO_8859-3:1988" "8859-3" }
{ "latin/cyrillic" "8859-5" } { "latin4" "ISO_8859-4:1988" "8859-4" }
{ "latin/arabic" "8859-6" } { "latin/cyrillic" "ISO_8859-5:1988" "8859-5" }
{ "latin/greek" "8859-7" } { "latin/arabic" "ISO_8859-6:1987" "8859-6" }
{ "latin/hebrew" "8859-8" } { "latin/greek" "ISO_8859-7:1987" "8859-7" }
{ "latin5" "8859-9" } { "latin/hebrew" "ISO_8859-8:1988" "8859-8" }
{ "latin6" "8859-10" } { "latin5" "ISO_8859-9:1989" "8859-9" }
{ "latin/thai" "8859-11" } { "latin6" "ISO-8859-10" "8859-10" }
{ "latin7" "8859-13" } { "latin/thai" "TIS-620" "8859-11" }
{ "latin8" "8859-14" } { "latin7" "ISO-8859-13" "8859-13" }
{ "latin9" "8859-15" } { "latin8" "ISO-8859-14" "8859-14" }
{ "latin10" "8859-16" } { "latin9" "ISO-8859-15" "8859-15" }
{ "koi8-r" "KOI8-R" } { "latin10" "ISO-8859-16" "8859-16" }
{ "windows-1252" "CP1252" } { "koi8-r" "KOI8-R" "KOI8-R" }
{ "ebcdic" "CP037" } { "windows-1252" "windows-1252" "CP1252" }
{ "mac-roman" "ROMAN" } { "ebcdic" "IBM037" "CP037" }
{ "mac-roman" "macintosh" "ROMAN" }
} ; } ;
: encoding-file ( file-name -- stream ) : encoding-file ( file-name -- stream )
@ -65,8 +67,7 @@ M: 8-bit encode-char encode>> encode-8-bit ;
M: 8-bit decode-char decode>> decode-8-bit ; M: 8-bit decode-char decode>> decode-8-bit ;
PREDICATE: 8-bit-encoding < word MIXIN: 8-bit-encoding
8-bit-encodings get-global key? ;
M: 8-bit-encoding <encoder> M: 8-bit-encoding <encoder>
8-bit-encodings get-global at <encoder> ; 8-bit-encodings get-global at <encoder> ;
@ -74,15 +75,21 @@ M: 8-bit-encoding <encoder>
M: 8-bit-encoding <decoder> M: 8-bit-encoding <decoder>
8-bit-encodings get-global at <decoder> ; 8-bit-encodings get-global at <decoder> ;
: create-encoding ( name -- word )
"io.encodings.8-bit" create
[ define-singleton-class ]
[ 8-bit-encoding add-mixin-instance ]
[ ] tri ;
PRIVATE> PRIVATE>
[ [
mappings [ mappings [
[ "io.encodings.8-bit" create ] first3
[ create-encoding ]
[ dupd register-encoding ]
[ encoding-file parse-file 8-bit boa ] [ encoding-file parse-file 8-bit boa ]
bi* tri*
] assoc-map ] H{ } map>assoc
[ keys [ define-symbol ] each ] 8-bit-encodings set-global
[ 8-bit-encodings set-global ]
bi
] with-compilation-unit ] with-compilation-unit

View File

@ -1,6 +1,6 @@
! Copyright (C) 2008 Daniel Ehrenberg. ! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: io io.encodings kernel math io.encodings.private ; USING: io io.encodings kernel math io.encodings.private io.encodings.iana ;
IN: io.encodings.ascii IN: io.encodings.ascii
<PRIVATE <PRIVATE
@ -20,3 +20,5 @@ M: ascii encode-char
M: ascii decode-char M: ascii decode-char
128 decode-if< ; 128 decode-if< ;
ascii "ANSI_X3.4-1968" register-encoding

View File

@ -3,11 +3,13 @@
USING: xml xml.data kernel io io.encodings interval-maps splitting fry USING: xml xml.data kernel io io.encodings interval-maps splitting fry
math.parser sequences combinators assocs locals accessors math math.parser sequences combinators assocs locals accessors math
arrays values io.encodings.ascii ascii io.files biassocs math.order arrays values io.encodings.ascii ascii io.files biassocs math.order
combinators.short-circuit io.binary ; combinators.short-circuit io.binary io.encodings.iana ;
IN: io.encodings.chinese IN: io.encodings.chinese
SINGLETON: gb18030 SINGLETON: gb18030
gb18030 "GB18030" register-encoding
<PRIVATE <PRIVATE
! GB to mean GB18030 is a terrible abuse of notation ! GB to mean GB18030 is a terrible abuse of notation

View File

@ -1,12 +1,35 @@
USING: help.syntax help.markup ; USING: help.syntax help.markup strings ;
IN: io.encodings.iana IN: io.encodings.iana
ABOUT: "io.encodings.iana"
ARTICLE: "io.encodings.iana" "IANA-registered encoding names"
"The " { $vocab-link "io.encodings.iana" } " vocabulary provides words for accessing the names of encodings and the encoding descriptors corresponding to names." $nl
"Most text encodings in common use have been registered with IANA. There is a standard set of names for each encoding. Simple conversion functions:"
{ $subsection name>encoding }
{ $subsection encoding>name }
"To let a new encoding be used with the above words, use the following:"
{ $subsection register-encoding }
"Exceptions when encodings or names are not found:"
{ $subsection missing-encoding }
{ $subsection missing-name } ;
HELP: missing-encoding
{ $error-description "The error called from " { $link name>encoding } " when there is no encoding descriptor registered corresponding to the given name." } ;
HELP: missing-name
{ $error-description "The error called from " { $link encoding>name } " when there is no name registered corresponding to the given encoding." } ;
HELP: name>encoding HELP: name>encoding
{ $values { "name" "an encoding name" } { "encoding" "an encoding descriptor" } } { $values { "name" "an encoding name" } { "encoding" "an encoding descriptor" } }
{ "Given an IANA-registered encoding name, find the encoding descriptor that represents it, or " { $code f } " if it is not found (either not implemented in Factor or not registered)." } ; { $description "Given an IANA-registered encoding name, find the encoding descriptor that represents it, or " { $code f } " if it is not found (either not implemented in Factor or not registered)." } ;
HELP: encoding>name HELP: encoding>name
{ $values { "encoding" "an encoding descriptor" } { "name" "an encoding name" } } { $values { "encoding" "an encoding descriptor" } { "name" "an encoding name" } }
{ "Given an encoding descriptor, return the preferred IANA name." } ; { $description "Given an encoding descriptor, return the preferred IANA name." } ;
{ name>encoding encoding>name } related-words { name>encoding encoding>name } related-words
HELP: register-encoding
{ $values { "descriptor" "an encoding descriptor" } { "name" string } }
{ $description "Registers an encoding descriptor with the given name, available for lookup through " { $link name>encoding } " and " { $link encoding>name } ". IANA-registered aliases are automatically included. The name given must be the first name in the " { $snippet "resources:basis/io/encodings/iana/character-sets" } " file." } ;

View File

@ -1,5 +1,28 @@
USING: io.encodings.iana io.encodings.ascii tools.test ; USING: io.encodings.iana io.encodings.iana.private
io.encodings.utf8 tools.test assocs ;
IN: io.encodings.iana.tests
[ ascii ] [ "US-ASCII" name>encoding ] unit-test [ utf8 ] [ "UTF-8" name>encoding ] unit-test
[ ascii ] [ "ASCII" name>encoding ] unit-test [ utf8 ] [ "utf8" name>encoding ] unit-test
[ "US-ASCII" ] [ ascii encoding>name ] unit-test [ "UTF-8" ] [ utf8 encoding>name ] unit-test
! We will never implement EBCDIC-FI-SE-A
SINGLETON: ebcdic-fisea
ebcdic-fisea "EBCDIC-FI-SE-A" register-encoding
[ ebcdic-fisea ] [ "EBCDIC-FI-SE-A" name>encoding ] unit-test
[ ebcdic-fisea ] [ "csEBCDICFISEA" name>encoding ] unit-test
[ "EBCDIC-FI-SE-A" ] [ ebcdic-fisea encoding>name ] unit-test
! Clean up after myself
[ ] [
"EBCDIC-FI-SE-A" n>e-table delete-at
"csEBCDICFISEA" n>e-table delete-at
ebcdic-fisea e>n-table delete-at
] unit-test
[ "EBCDIC-FI-SE-A" name>encoding ] must-fail
[ "csEBCDICFISEA" name>encoding ] must-fail
[ ebcdic-fisea encoding>name ] must-fail
[ ebcdic-fisea "foobar" register-encoding ] must-fail
[ "foobar" name>encoding ] must-fail
[ ebcdic-fisea encoding>name ] must-fail

View File

@ -1,41 +1,24 @@
! Copyright (C) 2008 Daniel Ehrenberg ! Copyright (C) 2008 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: kernel strings values io.files assocs USING: kernel strings values io.files assocs
splitting sequences io namespaces sets io.encodings.8-bit splitting sequences io namespaces sets io.encodings.utf8 ;
io.encodings.ascii io.encodings.utf8 io.encodings.utf16
io.encodings.chinese io.encodings.japanese ;
IN: io.encodings.iana IN: io.encodings.iana
<PRIVATE <PRIVATE
VALUE: n>e-table SYMBOL: n>e-table
SYMBOL: e>n-table
: e>n-table H{ SYMBOL: aliases
{ ascii "US-ASCII" }
{ utf8 "UTF-8" }
{ utf16 "UTF-16" }
{ utf16be "UTF-16BE" }
{ utf16le "UTF-16LE" }
{ latin1 "ISO-8859-1" }
{ latin2 "ISO-8859-2" }
{ latin3 "ISO-8859-3" }
{ latin4 "ISO-8859-4" }
{ latin/cyrillic "ISO-8859-5" }
{ latin/arabic "ISO-8859-6" }
{ latin/greek "ISO-8859-7" }
{ latin/hebrew "ISO-8859-8" }
{ latin5 "ISO-8859-9" }
{ latin6 "ISO-8859-10" }
{ shift-jis "Shift_JIS" }
{ windows-31j "Windows-31J" }
{ gb18030 "GB18030" }
} ;
PRIVATE> PRIVATE>
ERROR: missing-encoding name ;
: name>encoding ( name -- encoding ) : name>encoding ( name -- encoding )
n>e-table at ; dup n>e-table get-global at [ ] [ missing-encoding ] ?if ;
ERROR: missing-name encoding ;
: encoding>name ( encoding -- name ) : encoding>name ( encoding -- name )
e>n-table at ; dup e>n-table get-global at [ ] [ missing-name ] ?if ;
<PRIVATE <PRIVATE
: parse-iana ( stream -- synonym-set ) : parse-iana ( stream -- synonym-set )
@ -43,24 +26,33 @@ PRIVATE>
[ " " split ] map [ " " split ] map
[ first { "Name:" "Alias:" } member? ] filter [ first { "Name:" "Alias:" } member? ] filter
[ second ] map { "None" } diff [ second ] map { "None" } diff
] map ; ] map harvest ;
: more-aliases ( -- assoc ) : make-aliases ( stream -- n>e )
parse-iana [ [ first ] [ ] bi ] H{ } map>assoc ;
: initial-n>e ( -- assoc )
H{ H{
{ "UTF8" utf8 } { "UTF8" utf8 }
{ "utf8" utf8 } { "utf8" utf8 }
{ "utf-8" utf8 } { "utf-8" utf8 }
} ; { "UTF-8" utf8 }
} clone ;
: initial-e>n ( -- assoc )
H{ { utf8 "UTF-8" } } clone ;
: make-n>e ( stream -- n>e )
parse-iana [ [
dup [
e>n-table value-at
[ swap [ set ] with each ]
[ drop ] if*
] with each
] each ] H{ } make-assoc more-aliases assoc-union ;
PRIVATE> PRIVATE>
"resource:basis/io/encodings/iana/character-sets" "resource:basis/io/encodings/iana/character-sets"
ascii <file-reader> make-n>e to: n>e-table utf8 <file-reader> make-aliases aliases set-global
n>e-table global [ initial-n>e or ] change-at
e>n-table global [ initial-e>n or ] change-at
: register-encoding ( descriptor name -- )
[
aliases get at [
[ n>e-table get-global set-at ] with each
] [ "Bad encoding registration" throw ] if*
] [ swap e>n-table get-global set-at ] 2bi ;

View File

@ -4,7 +4,7 @@ USING: help.markup help.syntax ;
IN: io.encodings.japanese IN: io.encodings.japanese
ARTICLE: "io.encodings.japanese" "Japanese text encodings" ARTICLE: "io.encodings.japanese" "Japanese text encodings"
"The " { $vocab-link "io.encodings.japanese" } " vocabulary implements Japanese-specific text encodings. Several encodings are used for Japanese text besides the standard UTF encodings for Unicode strings. These are mostly based on the character set defined in the JIS X 208 standard. Current coverage of encodings is incomplete." "Several encodings are used for Japanese text besides the standard UTF encodings for Unicode strings. These are mostly based on the character set defined in the JIS X 208 standard. Current coverage of encodings is incomplete."
{ $subsection shift-jis } { $subsection shift-jis }
{ $subsection windows-31j } ; { $subsection windows-31j } ;

View File

@ -3,15 +3,29 @@
USING: sequences kernel io io.files combinators.short-circuit USING: sequences kernel io io.files combinators.short-circuit
math.order values assocs io.encodings io.binary fry strings math.order values assocs io.encodings io.binary fry strings
math io.encodings.ascii arrays accessors splitting math.parser math io.encodings.ascii arrays accessors splitting math.parser
biassocs ; biassocs io.encodings.iana ;
IN: io.encodings.japanese IN: io.encodings.japanese
VALUE: shift-jis SINGLETON: shift-jis
VALUE: windows-31j shift-jis "Shift_JIS" register-encoding
SINGLETON: windows-31j
windows-31j "Windows-31J" register-encoding
<PRIVATE <PRIVATE
VALUE: shift-jis-table
M: shift-jis <encoder> drop shift-jis-table <encoder> ;
M: shift-jis <decoder> drop shift-jis-table <decoder> ;
VALUE: windows-31j-table
M: windows-31j <encoder> drop windows-31j-table <encoder> ;
M: windows-31j <decoder> drop windows-31j-table <decoder> ;
TUPLE: jis assoc ; TUPLE: jis assoc ;
: <jis> ( assoc -- jis ) : <jis> ( assoc -- jis )
@ -31,10 +45,10 @@ TUPLE: jis assoc ;
ascii file-lines process-jis <jis> ; ascii file-lines process-jis <jis> ;
"resource:basis/io/encodings/japanese/CP932.txt" "resource:basis/io/encodings/japanese/CP932.txt"
make-jis to: windows-31j make-jis to: windows-31j-table
"resource:basis/io/encodings/japanese/sjis-0208-1997-std.txt" "resource:basis/io/encodings/japanese/sjis-0208-1997-std.txt"
make-jis to: shift-jis make-jis to: shift-jis-table
: small? ( char -- ? ) : small? ( char -- ? )
! ASCII range or single-byte halfwidth katakana ! ASCII range or single-byte halfwidth katakana

View File

@ -1,15 +1,21 @@
! Copyright (C) 2006, 2008 Daniel Ehrenberg. ! Copyright (C) 2006, 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors namespaces io.binary USING: math kernel sequences sbufs vectors namespaces io.binary
io.encodings combinators splitting io byte-arrays ; io.encodings combinators splitting io byte-arrays io.encodings.iana ;
IN: io.encodings.utf16 IN: io.encodings.utf16
SINGLETON: utf16be SINGLETON: utf16be
utf16be "UTF-16BE" register-encoding
SINGLETON: utf16le SINGLETON: utf16le
utf16le "UTF-16LE" register-encoding
SINGLETON: utf16 SINGLETON: utf16
utf16 "UTF-16" register-encoding
ERROR: missing-bom ; ERROR: missing-bom ;
<PRIVATE <PRIVATE

View File

@ -1,15 +1,21 @@
! Copyright (C) 2009 Daniel Ehrenberg. ! Copyright (C) 2009 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: math kernel io.encodings combinators io io.encodings.utf16 USING: math kernel io.encodings combinators io io.encodings.utf16
sequences io.binary ; sequences io.binary io.encodings.iana ;
IN: io.encodings.utf32 IN: io.encodings.utf32
SINGLETON: utf32be SINGLETON: utf32be
utf32be "UTF-32BE" register-encoding
SINGLETON: utf32le SINGLETON: utf32le
utf32le "UTF-32LE" register-encoding
SINGLETON: utf32 SINGLETON: utf32
utf32 "UTF-32" register-encoding
<PRIVATE <PRIVATE
! Decoding ! Decoding

View File

@ -1,4 +1,5 @@
USING: xml xml.data xml.utilities tools.test accessors kernel ; USING: xml xml.data xml.utilities tools.test accessors kernel
io.encodings.8-bit ;
[ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test [ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test [ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test