Changing 8-bit encoding names; documentation

db4
Daniel Ehrenberg 2008-03-23 00:43:43 -04:00
parent 86efc8467c
commit d967d04e4c
7 changed files with 128 additions and 27 deletions

View File

@ -9,7 +9,7 @@ IN: io.tests
] unit-test
: <resource-reader> ( resource -- stream )
resource-path iso-8859-1 <file-reader> ;
resource-path latin1 <file-reader> ;
[
"This is a line.\rThis is another line.\r"

View File

@ -52,7 +52,7 @@ PRIVATE>
: http-request ( request -- response stream )
dup request [
dup request-addr iso-8859-1 <client>
dup request-addr latin1 <client>
1 minutes over set-timeout
[
write-request flush
@ -82,7 +82,7 @@ PRIVATE>
: download-to ( url file -- )
#! Downloads the contents of a URL to a file.
swap http-get-stream swap check-response
[ swap iso-8859-1 <file-writer> stream-copy ] with-disposal ;
[ swap latin1 <file-writer> stream-copy ] with-disposal ;
: download ( url -- )
dup download-name download-to ;

View File

@ -217,7 +217,7 @@ SYMBOL: exit-continuation
: httpd ( port -- )
internet-server "http.server"
iso-8859-1 [ handle-client ] with-server ;
latin1 [ handle-client ] with-server ;
: httpd-main ( -- ) 8888 httpd ;

View File

@ -0,0 +1,91 @@
! Copyright (C) 2008 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: help.syntax help.markup io.encodings.8-bit.private ;
IN: io.encodings.8-bit
ARTICLE: "io.encodings.8-bit" "8-bit encodings"
"Many encodings are a simple mapping of bytes onto characters. The " { $vocab-link "io.encodings.8-bit" } " vocabulary implements these generically using existing resource files. These encodings should be used with extreme caution, as fully general Unicode encodings like UTF-8 are nearly always more appropriate. The following 8-bit encodings are already defined:"
{ $subsection latin1 }
{ $subsection latin2 }
{ $subsection latin3 }
{ $subsection latin4 }
{ $subsection latin/cyrillic }
{ $subsection latin/arabic }
{ $subsection latin/greek }
{ $subsection latin/hebrew }
{ $subsection latin5 }
{ $subsection latin6 }
{ $subsection latin/thai }
{ $subsection latin7 }
{ $subsection latin8 }
{ $subsection latin9 }
{ $subsection latin10 }
{ $subsection koi8-r }
{ $subsection windows-1252 }
{ $subsection ebcdic }
{ $subsection mac-roman }
"Other encodings can be defined using the following utility"
{ $subsection define-8-bit-encoding } ;
ABOUT: "io.encodings.8-bit"
HELP: define-8-bit-encoding
{ $values { "name" "a string" } { "path" "a path" } }
{ $description "Creates a new encoding with the given name, using the resource file at the path to tell how to encode and decode octets. The resource file should be in a similar format to those at ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/" } ;
HELP: latin1
{ $description "This is the ISO-8859-1 encoding, also called Latin-1: Western European. It is an 8-bit superset of ASCII which is the default encoding for MIME types starting with 'text', and it provides the characters necessary for most western European languages." } ;
HELP: latin2
{ $description "This is the ISO-8859-2 encoding, also called Latin-2: Eastern European. It is an 8-bit superset of ASCII and provides the characters necessary for most eastern European languages." } ;
HELP: latin3
{ $description "This is the ISO-8859-3 encoding, also called Latin-3: South European. It is an 8-bit superset of ASCII and provides the characters necessary for Turkish, Maltese and Esperanto." } ;
HELP: latin4
{ $description "This is the ISO-8859-4 encoding, also called Latin-4: North European. It is an 8-bit superset of ASCII and provides the characters necessary for Latvian, Lithuanian, Estonian, Greenlandic and Sami." } ;
HELP: latin/cyrillic
{ $description "This is the ISO-8859-5 encoding, also called Latin/Cyrillic. It is an 8-bit superset of ASCII and provides the characters necessary for most languages which use Cyrillic, including Russian, Macedonian, Belarusian, Bulgarian, Serbian, and Ukrainian. KOI8-R is used much more commonly." } ;
HELP: latin/arabic
{ $description "This is the ISO-8859-6 encoding, also called Latin/Arabic. It is an 8-bit superset of ASCII and provides the characters necessary for Arabic, though not other languages which use Arabic script." } ;
HELP: latin/greek
{ $description "This is the ISO-8859-7 encoding, also called Latin/Greek. It is an 8-bit superset of ASCII and provides the characters necessary for Greek written in modern monotonic orthography, or ancient Greek without accent marks." } ;
HELP: latin/hebrew
{ $description "This is the ISO-8859-8 encoding, also called Latin/Hebrew. It is an 8-bit superset of ASCII and provides the characters necessary for modern Hebrew without explicit vowels. Generally, this is interpreted in logical order, making it ISO-8859-8-I, technically." } ;
HELP: latin5
{ $description "This is the ISO-8859-9 encoding, also called Latin-5: Turkish. It is an 8-bit superset of ASCII and provides the characters necessary for Turkish, similar to Latin-1 but replacing the spots used for Icelandic with characters used in Turkish." } ;
HELP: latin6
{ $description "This is the ISO-8859-10 encoding, also called Latin-6: Nordic. It is an 8-bit superset of ASCII containing the same characters as Latin-4, but rearranged to be of better use to Nordic languages." } ;
HELP: latin/thai
{ $description "This is the ISO-8859-11 encoding, also called Latin/Thai. It is an 8-bit superset of ASCII containing the characters necessary to represent Thai. It is basically identical to TIS-620." } ;
HELP: latin7
{ $description "This is the ISO-8859-13 encoding, also called Latin-7: Baltic Rim. It is an 8-bit superset of ASCII containing all characters necessary to represent Baltic Rim languages, as previous character sets were incomplete." } ;
HELP: latin8
{ $description "This is the ISO-8859-14 encoding, also called Latin-8: Celtic. It is an 8-bit superset of ASCII designed for Celtic languages like Gaelic and Breton." } ;
HELP: latin9
{ $description "This is the ISO-8859-15 encoding, also called Latin-9 and, unofficially, Latin-0. It is an 8-bit superset of ASCII designed as a modification of Latin-1, removing little-used characters in favor of the Euro symbol and other characters." } ;
HELP: latin10
{ $description "This is the ISO-8859-16 encoding, also called Latin-10: South-Eastern European. It is an 8-bit superset of ASCII." } ;
HELP: windows-1252
{ $description "Windows 1252 is an 8-bit superset of ASCII which is closely related to Latin-1. Control characters in the 0x80 to 0x9F range are replaced with printable characters such as the Euro symbol." } ;
HELP: ebcdic
{ $description "EBCDIC is an 8-bit legacy encoding designed for IBM mainframes like System/360 in the 1960s. It has since fallen into disuse. It contains large unallocated regions, and the version included here (code page 37) contains auxiliary characters in these regions for English- and Portuguese-speaking countries." } ;
HELP: mac-roman
{ $description "Mac Roman is an 8-bit superset of ASCII which was the standard encoding on Mac OS prior to version 10. Outside of the ASCII range, it is incompatible with Latin-1 in all but a few places, but it is suitable for encoding many Western European languages." } ;
HELP: koi8-r
{ $description "KOI8-R is an 8-bit superset of ASCII which encodes the Cyrillic alphabet, as used in Russian and Bulgarian. Characters are in such an order that, if the eighth bit is stripped, text is still interpretable as ASCII. Block-building characters also exist." } ;

View File

@ -1,10 +1,10 @@
USING: io.encodings.string io.encodings.8-bit tools.test strings arrays ;
IN: io.encodings.8-bit.tests
[ B{ CHAR: f CHAR: o CHAR: o } ] [ "foo" iso-8859-1 encode ] unit-test
[ { 256 } >string iso-8859-1 encode ] must-fail
[ B{ 255 } ] [ { 255 } iso-8859-1 encode ] unit-test
[ B{ CHAR: f CHAR: o CHAR: o } ] [ "foo" latin1 encode ] unit-test
[ { 256 } >string latin1 encode ] must-fail
[ B{ 255 } ] [ { 255 } latin1 encode ] unit-test
[ "bar" ] [ "bar" iso-8859-1 decode ] unit-test
[ { CHAR: b 233 CHAR: r } ] [ { CHAR: b 233 CHAR: r } iso-8859-1 decode >array ] unit-test
[ "bar" ] [ "bar" latin1 decode ] unit-test
[ { CHAR: b 233 CHAR: r } ] [ { CHAR: b 233 CHAR: r } latin1 decode >array ] unit-test
[ { HEX: fffd HEX: 20AC } ] [ { HEX: 81 HEX: 80 } windows-1252 decode >array ] unit-test

View File

@ -9,21 +9,21 @@ IN: io.encodings.8-bit
<PRIVATE
: mappings {
{ "iso-8859-1" "8859-1" }
{ "iso-8859-2" "8859-2" }
{ "iso-8859-3" "8859-3" }
{ "iso-8859-4" "8859-4" }
{ "iso-8859-5" "8859-5" }
{ "iso-8859-6" "8859-6" }
{ "iso-8859-7" "8859-7" }
{ "iso-8859-8" "8859-8" }
{ "iso-8859-9" "8859-9" }
{ "iso-8859-10" "8859-10" }
{ "iso-8859-11" "8859-11" }
{ "iso-8859-13" "8859-13" }
{ "iso-8859-14" "8859-14" }
{ "iso-8859-15" "8859-15" }
{ "iso-8859-16" "8859-16" }
{ "latin1" "8859-1" }
{ "latin2" "8859-2" }
{ "latin3" "8859-3" }
{ "latin4" "8859-4" }
{ "latin/cyrillic" "8859-5" }
{ "latin/arabic" "8859-6" }
{ "latin/greek" "8859-7" }
{ "latin/hebrew" "8859-8" }
{ "latin5" "8859-9" }
{ "latin6" "8859-10" }
{ "latin/thai" "8859-11" }
{ "latin7" "8859-13" }
{ "latin8" "8859-14" }
{ "latin9" "8859-15" }
{ "latin10" "8859-16" }
{ "koi8-r" "KOI8-R" }
{ "windows-1252" "CP1252" }
{ "ebcdic" "CP037" }
@ -50,7 +50,7 @@ IN: io.encodings.8-bit
[ swap ] assoc-map >hashtable ;
: parse-file ( file-name -- byte>ch ch>byte )
full-path ascii file-lines process-contents
ascii file-lines process-contents
[ byte>ch ] [ ch>byte ] bi ;
: empty-tuple-class ( string -- class )
@ -85,9 +85,9 @@ IN: io.encodings.8-bit
: 8-bit-methods ( class byte>ch ch>byte -- )
>r over r> define-encode-char define-decode-char ;
: define-8-bit-encoding ( tuple-name file-name -- )
: define-8-bit-encoding ( name path -- )
>r empty-tuple-class r> parse-file 8-bit-methods ;
PRIVATE>
[ mappings [ define-8-bit-encoding ] assoc-each ] with-compilation-unit
[ mappings [ full-path define-8-bit-encoding ] assoc-each ] with-compilation-unit

View File

@ -0,0 +1,10 @@
! Copyright (C) 2008 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: help.syntax help.markup ;
IN: io.encodings.strict
HELP: strict ( encoding -- strict-encoding )
{ $values { "encoding" "an encoding descriptor" } { "strict-encoding" "a strict encoding descriptor" } }
{ $description "Makes an encoding strict; that is, an error is thrown when a malformed code point is encountered. Note that the existence of a replacement character in a file (U+FFFD) also throws an error." } ;
ABOUT: strict