diff --git a/.gitignore b/.gitignore index 7e1e52d866..f2cf3de119 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,4 @@ factor temp logs work -buildsupport/wordsize +build-support/wordsize diff --git a/build-support/target b/build-support/target index 1903a6da64..1fbfb31d11 100755 --- a/build-support/target +++ b/build-support/target @@ -1,38 +1,38 @@ #!/bin/sh -if [ \( `uname -s ` = FreeBSD \) -a \( `uname -p` = i386 \) ] -then - echo freebsd-x86-32 -elif [ \( `uname -s` = FreeBSD \) -a \( `uname -m` = amd64 \) ] -then - echo freebsd-x86-64 -elif [ \( `uname -s` = OpenBSD \) -a \( `uname -m` = i386 \) ] -then - echo openbsd-x86-32 -elif [ \( `uname -s` = OpenBSD \) -a \( `uname -m` = amd64 \) ] -then - echo openbsd-x86-64 -elif [ \( `uname -s` = NetBSD \) -a \( `uname -p` = i386 \) ] -then - echo netbsd-x86-32 -elif [ \( `uname -s` = NetBSD \) -a \( `uname -p` = x86_64 \) ] -then - echo netbsd-x86-64 -elif [ \( `uname -s` = Darwin \) -a \( `uname -p` = powerpc \) ] -then - echo macosx-ppc -elif [ `uname -s` = Darwin ] -then - echo macosx-x86-`./build-support/wordsize` -elif [ \( `uname -s` = Linux \) -a \( `uname -m` = i686 \) ] -then - echo linux-x86-32 -elif [ \( `uname -s` = Linux \) -a \( `uname -m` = x86_64 \) ] -then - echo linux-x86-64 -elif [ \( `uname -o` = Cygwin \) -a \( `uname -m` = i686 \) ] -then - echo winnt-x86-`./build-support/wordsize` -else - echo help +uname_s=`uname -s` +case $uname_s in + CYGWIN_NT-5.2-WOW64) OS=winnt;; + *CYGWIN_NT*) OS=winnt;; + *CYGWIN*) OS=winnt;; + *darwin*) OS=macosx;; + *Darwin*) OS=macosx;; + *linux*) OS=linux;; + *Linux*) OS=linux;; + *NetBSD*) OS=netbsd;; + *FreeBSD*) OS=freebsd;; + *OpenBSD*) OS=openbsd;; + *DragonFly*) OS=dragonflybsd;; +esac + +uname_m=`uname -m` +case $uname_m in + i386) ARCH=x86;; + i686) ARCH=x86;; + amd64) ARCH=x86;; + *86) ARCH=x86;; + *86_64) ARCH=x86;; + "Power Macintosh") ARCH=ppc;; +esac + +WORD=`./build-support/wordsize` + +MAKE_TARGET=$OS-$ARCH-$WORD +if [[ $OS == macosx && $ARCH == ppc ]] ; then + MAKE_TARGET=$OS-$ARCH fi +if [[ $OS == linux && $ARCH == ppc ]] ; then + MAKE_TARGET=$OS-$ARCH +fi + +echo $MAKE_TARGET diff --git a/core/io/encodings/binary/binary-docs.factor b/core/io/encodings/binary/binary-docs.factor index 823eea67be..fdd9828867 100644 --- a/core/io/encodings/binary/binary-docs.factor +++ b/core/io/encodings/binary/binary-docs.factor @@ -2,4 +2,7 @@ USING: help.syntax help.markup ; IN: io.encodings.binary HELP: binary -{ $class-description "This is the encoding descriptor for binary I/O. Making an encoded stream with the binary encoding is a no-op; streams with this encoding deal with byte-arrays, not strings." } ; +{ $class-description "This is the encoding descriptor for binary I/O. Making an encoded stream with the binary encoding is a no-op; streams with this encoding deal with byte-arrays, not strings." } +{ $see-also "encodings-introduction" } ; + +ABOUT: binary diff --git a/core/io/encodings/encodings-docs.factor b/core/io/encodings/encodings-docs.factor index d5bdf24dc0..07e0f9f401 100644 --- a/core/io/encodings/encodings-docs.factor +++ b/core/io/encodings/encodings-docs.factor @@ -1,15 +1,16 @@ USING: help.markup help.syntax ; IN: io.encodings -ABOUT: "encodings" +ABOUT: "io.encodings" ARTICLE: "io.encodings" "I/O encodings" -"Many streams deal with bytes, rather than Unicode code points, at some level. The translation between these two things is specified by an encoding. To abstract this away from the programmer, Factor provides a system where these streams are associated with an encoding which is always used when the stream is read from or written to. For most purposes, an encoding descriptor consisting of a symbol is all that is needed when initializing a stream." +"Bytes can't be understood in isolation as text. They must be interpreted under a certain encoding. Factor provides utilities for dealing with encoded text by declaring that a stream has a particular encoding, and utilities to encode and decode strings." { $subsection "encodings-constructors" } { $subsection "encodings-descriptors" } { $subsection "encodings-protocol" } ; -ARTICLE: "encodings-constructors" "Constructing an encoded stream" +ARTICLE: "encodings-constructors" "Manually constructing an encoded stream" +"The following words can be used to construct encoded streams. Note that they are usually not used directly, but rather by the stream constructors themselves. Most stream constructors take an encoding descriptor as a parameter and internally call these constructors." { $subsection } { $subsection } { $subsection } ; @@ -37,19 +38,22 @@ HELP: ARTICLE: "encodings-descriptors" "Encoding descriptors" "An encoding descriptor is something which can be used for input or output streams to encode or decode files. It must conform to the " { $link "encodings-protocol" } ". Encodings which you can use are defined in the following vocabularies:" -{ $vocab-subsection "io.encodings.utf8" } -{ $vocab-subsection "io.encodings.ascii" } -{ $vocab-subsection "io.encodings.8-bit" } -{ $vocab-subsection "io.encodings.binary" } -{ $vocab-subsection "io.encodings.utf16" } ; +{ $vocab-subsection "ASCII" "io.encodings.ascii" } +{ $vocab-subsection "Binary" "io.encodings.binary" } +{ $vocab-subsection "Strict encodings" "io.encodings.strict" } +{ $vocab-subsection "8-bit encodings" "io.encodings.8-bit" } +{ $vocab-subsection "UTF-8" "io.encodings.utf8" } +{ $vocab-subsection "UTF-16" "io.encodings.utf16" } +{ $see-also "encodings-introduction" } ; ARTICLE: "encodings-protocol" "Encoding protocol" -"An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again." +"There are two parts to implementing a new encoding. First, methods for creating an encoded or decoded stream must be provided. These have defaults, however, which wrap a stream in an encoder or decoder wrapper with the given encoding descriptor." +{ $subsection } +{ $subsection } +"If an encoding might be contained in the code slot of an encoder or decoder tuple, then the following methods must be implemented to read or write one code point from a stream:" { $subsection decode-char } { $subsection encode-char } -"The following methods are optional:" -{ $subsection } -{ $subsection } ; +{ $see-also "encodings-introduction" } ; HELP: decode-char { $values { "stream" "an underlying input stream" } diff --git a/core/io/encodings/utf8/utf8-docs.factor b/core/io/encodings/utf8/utf8-docs.factor index dbbc193a02..7a29039eca 100755 --- a/core/io/encodings/utf8/utf8-docs.factor +++ b/core/io/encodings/utf8/utf8-docs.factor @@ -1,11 +1,8 @@ -USING: help.markup help.syntax io.encodings strings io.files ; +USING: help.markup help.syntax ; IN: io.encodings.utf8 -ARTICLE: "io.encodings.utf8" "Working with UTF8-encoded data" -"The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences. The encoding descriptor for UTF-8:" -{ $subsection utf8 } ; - HELP: utf8 -{ $class-description "This is the class of encoding tuples which denote a UTF-8 encoding. This conforms to the " { $link "encodings-protocol" } "." } ; +{ $class-description "This is the encoding descriptor for a UTF-8 encoding. UTF-8 is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences." } +{ $see-also "encodings-introduction" } ; -ABOUT: "io.encodings.utf8" +ABOUT: utf8 diff --git a/core/io/io-tests.factor b/core/io/io-tests.factor index 91e51f25b0..abae63c82b 100755 --- a/core/io/io-tests.factor +++ b/core/io/io-tests.factor @@ -9,7 +9,7 @@ IN: io.tests ] unit-test : ( resource -- stream ) - resource-path iso-8859-1 ; + resource-path latin1 ; [ "This is a line.\rThis is another line.\r" diff --git a/extra/help/handbook/handbook.factor b/extra/help/handbook/handbook.factor index 1310b58133..4079386d7f 100755 --- a/extra/help/handbook/handbook.factor +++ b/extra/help/handbook/handbook.factor @@ -170,7 +170,17 @@ ARTICLE: "collections" "Collections" { $subsection "graphs" } { $subsection "buffers" } ; -USING: io.sockets io.launcher io.mmap io.monitors ; +USING: io.sockets io.launcher io.mmap io.monitors +io.encodings.utf8 io.encodings.binary io.encodings.ascii io.files ; + +ARTICLE: "encodings-introduction" "An introduction to encodings" +"In order to express text in terms of binary, some sort of encoding has to be used. In a modern context, this is understood as a two-way mapping between Unicode code points (characters) and some amount of binary. Since English isn't the only language in the world, ASCII is not sufficient as a mapping from binary to Unicode; it can't even express em-dashes or curly quotes. Unicode was designed as a universal character set that could potentially represent everything." $nl +"Not all encodings can represent all Unicode code points, but Unicode can represent basically everything that exists in modern encodings. Some encodings are language-specific, and some can represent everything in Unicode. Though the world is moving toward Unicode and UTF-8, the reality today is that there are several encodings which must be taken into account." $nl +"Factor uses a system of encoding descriptors to denote encodings. Encoding descriptors are objects which describe encodings. Examples are " { $link utf8 } ", " { $link ascii } " and " { $link binary } ". Encoding descriptors can be passed around independently. Each encoding descriptor has some method for constructing an encoded or decoded stream, and the resulting stream has an encoding descriptor stored which has methods for reading or writing characters." $nl +"Constructors for streams which deal with bytes usually take an encoding as an explicit parameter. For example, to open a text file for reading whose contents are in UTF-8, use the following" +{ $code "\"filename\" utf8 " } +"If there is an error in the encoded stream, a replacement character (0xFFFD) will be inserted. To throw an exception upon error, use a strict encoding as follows" +{ $code "\"filename\" utf8 strict " } ; ARTICLE: "io" "Input and output" { $heading "Streams" } @@ -188,6 +198,7 @@ ARTICLE: "io" "Input and output" { $subsection "io.mmap" } { $subsection "io.monitors" } { $heading "Encodings" } +{ $subsection "encodings-introduction" } { $subsection "io.encodings" } { $subsection "io.encodings.string" } { $heading "Other features" } diff --git a/extra/http/client/client.factor b/extra/http/client/client.factor index 233b61ea74..e4bbf0279f 100755 --- a/extra/http/client/client.factor +++ b/extra/http/client/client.factor @@ -52,7 +52,7 @@ PRIVATE> : http-request ( request -- response stream ) dup request [ - dup request-addr iso-8859-1 + dup request-addr latin1 1 minutes over set-timeout [ write-request flush @@ -82,7 +82,7 @@ PRIVATE> : download-to ( url file -- ) #! Downloads the contents of a URL to a file. swap http-get-stream swap check-response - [ swap iso-8859-1 stream-copy ] with-disposal ; + [ swap latin1 stream-copy ] with-disposal ; : download ( url -- ) dup download-name download-to ; diff --git a/extra/http/server/server.factor b/extra/http/server/server.factor index 3df21adf26..81201dd3fe 100755 --- a/extra/http/server/server.factor +++ b/extra/http/server/server.factor @@ -217,7 +217,7 @@ SYMBOL: exit-continuation : httpd ( port -- ) internet-server "http.server" - iso-8859-1 [ handle-client ] with-server ; + latin1 [ handle-client ] with-server ; : httpd-main ( -- ) 8888 httpd ; diff --git a/extra/io/encodings/8-bit/8-bit-docs.factor b/extra/io/encodings/8-bit/8-bit-docs.factor new file mode 100644 index 0000000000..8e5fd815bc --- /dev/null +++ b/extra/io/encodings/8-bit/8-bit-docs.factor @@ -0,0 +1,110 @@ +! Copyright (C) 2008 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: help.syntax help.markup io.encodings.8-bit.private ; +IN: io.encodings.8-bit + +ARTICLE: "io.encodings.8-bit" "8-bit encodings" +"Many encodings are a simple mapping of bytes onto characters. The " { $vocab-link "io.encodings.8-bit" } " vocabulary implements these generically using existing resource files. These encodings should be used with extreme caution, as fully general Unicode encodings like UTF-8 are nearly always more appropriate. The following 8-bit encodings are already defined:" +{ $subsection latin1 } +{ $subsection latin2 } +{ $subsection latin3 } +{ $subsection latin4 } +{ $subsection latin/cyrillic } +{ $subsection latin/arabic } +{ $subsection latin/greek } +{ $subsection latin/hebrew } +{ $subsection latin5 } +{ $subsection latin6 } +{ $subsection latin/thai } +{ $subsection latin7 } +{ $subsection latin8 } +{ $subsection latin9 } +{ $subsection latin10 } +{ $subsection koi8-r } +{ $subsection windows-1252 } +{ $subsection ebcdic } +{ $subsection mac-roman } +"Other encodings can be defined using the following utility" +{ $subsection define-8-bit-encoding } ; + +ABOUT: "io.encodings.8-bit" + +HELP: define-8-bit-encoding +{ $values { "name" "a string" } { "path" "a path" } } +{ $description "Creates a new encoding with the given name, using the resource file at the path to tell how to encode and decode octets. The resource file should be in a similar format to those at ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/" } ; + +HELP: latin1 +{ $description "This is the ISO-8859-1 encoding, also called Latin-1: Western European. It is an 8-bit superset of ASCII which is the default for a mimetype starting with 'text' and provides the characters necessary for most western European languages." } +{ $see-also "encodings-introduction" } ; + +HELP: latin2 +{ $description "This is the ISO-8859-2 encoding, also called Latin-2: Eastern European. It is an 8-bit superset of ASCII and provides the characters necessary for most eastern European languages." } +{ $see-also "encodings-introduction" } ; + +HELP: latin3 +{ $description "This is the ISO-8859-3 encoding, also called Latin-3: South European. It is an 8-bit superset of ASCII and provides the characters necessary for Turkish, Maltese and Esperanto." } +{ $see-also "encodings-introduction" } ; + +HELP: latin4 +{ $description "This is the ISO-8859-4 encoding, also called Latin-4: North European. It is an 8-bit superset of ASCII and provides the characters necessary for Latvian, Lithuanian, Estonian, Greenlandic and Sami." } +{ $see-also "encodings-introduction" } ; + +HELP: latin/cyrillic +{ $description "This is the ISO-8859-5 encoding, also called Latin/Cyrillic. It is an 8-bit superset of ASCII and provides the characters necessary for most languages which use Cyrilic, including Russian, Macedonian, Belarusian, Bulgarian, Serbian, and Ukrainian. KOI8-R is used much more commonly." } +{ $see-also "encodings-introduction" } ; + +HELP: latin/arabic +{ $description "This is the ISO-8859-6 encoding, also called Latin/Arabic. It is an 8-bit superset of ASCII and provides the characters necessary for Arabic, though not other languages which use Arabic script." } +{ $see-also "encodings-introduction" } ; + +HELP: latin/greek +{ $description "This is the ISO-8859-7 encoding, also called Latin/Greek. It is an 8-bit superset of ASCII and provides the characters necessary for Greek written in modern monotonic orthography, or ancient Greek without accent marks." } +{ $see-also "encodings-introduction" } ; + +HELP: latin/hebrew +{ $description "This is the ISO-8859-8 encoding, also called Latin/Hebrew. It is an 8-bit superset of ASCII and provides the characters necessary for modern Hebrew without explicit vowels. Generally, this is interpreted in logical order, making it ISO-8859-8-I, technically." } +{ $see-also "encodings-introduction" } ; + +HELP: latin5 +{ $description "This is the ISO-8859-9 encoding, also called Latin-5: Turkish. It is an 8-bit superset of ASCII and provides the characters necessary for Turkish, similar to Latin-1 but replacing the spots used for Icelandic with characters used in Turkish." } +{ $see-also "encodings-introduction" } ; + +HELP: latin6 +{ $description "This is the ISO-8859-10 encoding, also called Latin-6: Nordic. It is an 8-bit superset of ASCII containing the same characters as Latin-4, but rearranged to be of better use to nordic languages." } +{ $see-also "encodings-introduction" } ; + +HELP: latin/thai +{ $description "This is the ISO-8859-11 encoding, also called Latin/Thai. It is an 8-bit superset of ASCII containing the characters necessary to represent Thai. It is basically identical to TIS-620." } +{ $see-also "encodings-introduction" } ; + +HELP: latin7 +{ $description "This is the ISO-8859-13 encoding, also called Latin-7: Baltic Rim. It is an 8-bit superset of ASCII containing all characters necesary to represent Baltic Rim languages, as previous character sets were incomplete." } +{ $see-also "encodings-introduction" } ; + +HELP: latin8 +{ $description "This is the ISO-8859-14 encoding, also called Latin-8: Celtic. It is an 8-bit superset of ASCII designed for Celtic languages like Gaelic and Breton." } +{ $see-also "encodings-introduction" } ; + +HELP: latin9 +{ $description "This is the ISO-8859-15 encoding, also called Latin-9 and unoffically as Latin-0. It is an 8-bit superset of ASCII designed as a modification of Latin-1, removing little-used characters in favor of the Euro symbol and other characters." } +{ $see-also "encodings-introduction" } ; + +HELP: latin10 +{ $description "This is the ISO-8859-16 encoding, also called Latin-10: South-Eastern European. It is an 8-bit superset of ASCII." } +{ $see-also "encodings-introduction" } ; + +HELP: windows-1252 +{ $description "Windows 1252 is an 8-bit superset of ASCII which is closely related to Latin-1. Control characters in the 0x80 to 0x9F range are replaced with printable characters such as the Euro symbol." } +{ $see-also "encodings-introduction" } ; + +HELP: ebcdic +{ $description "EBCDIC is an 8-bit legacy encoding designed for IBM mainframes like System/360 in the 1960s. It has since fallen into disuse. It contains large unallocated regions, and the version included here (code page 37) contains auxiliary characters in this region for English- and Portugese-speaking countries." } +{ $see-also "encodings-introduction" } ; + +HELP: mac-roman +{ $description "Mac Roman is an 8-bit superset of ASCII which was the standard encoding on Mac OS prior to version 10. It is incompatible with Latin-1 in all but a few places and ASCII, and it is suitable for encoding many Western European languages." } +{ $see-also "encodings-introduction" } ; + +HELP: koi8-r +{ $description "KOI8-R is an 8-bit superset of ASCII which encodes the Cyrillic alphabet, as used in Russian and Bulgarian. Characters are in such an order that, if the eight bit is stripped, text is still interpretable as ASCII. Block-building characters also exist." } +{ $see-also "encodings-introduction" } ; diff --git a/extra/io/encodings/8-bit/8-bit-tests.factor b/extra/io/encodings/8-bit/8-bit-tests.factor index 5dbe28cb14..24cd4137d4 100644 --- a/extra/io/encodings/8-bit/8-bit-tests.factor +++ b/extra/io/encodings/8-bit/8-bit-tests.factor @@ -1,10 +1,10 @@ USING: io.encodings.string io.encodings.8-bit tools.test strings arrays ; IN: io.encodings.8-bit.tests -[ B{ CHAR: f CHAR: o CHAR: o } ] [ "foo" iso-8859-1 encode ] unit-test -[ { 256 } >string iso-8859-1 encode ] must-fail -[ B{ 255 } ] [ { 255 } iso-8859-1 encode ] unit-test +[ B{ CHAR: f CHAR: o CHAR: o } ] [ "foo" latin1 encode ] unit-test +[ { 256 } >string latin1 encode ] must-fail +[ B{ 255 } ] [ { 255 } latin1 encode ] unit-test -[ "bar" ] [ "bar" iso-8859-1 decode ] unit-test -[ { CHAR: b 233 CHAR: r } ] [ { CHAR: b 233 CHAR: r } iso-8859-1 decode >array ] unit-test +[ "bar" ] [ "bar" latin1 decode ] unit-test +[ { CHAR: b 233 CHAR: r } ] [ { CHAR: b 233 CHAR: r } latin1 decode >array ] unit-test [ { HEX: fffd HEX: 20AC } ] [ { HEX: 81 HEX: 80 } windows-1252 decode >array ] unit-test diff --git a/extra/io/encodings/8-bit/8-bit.factor b/extra/io/encodings/8-bit/8-bit.factor index 2cc6b2e57c..c041e699a2 100644 --- a/extra/io/encodings/8-bit/8-bit.factor +++ b/extra/io/encodings/8-bit/8-bit.factor @@ -9,21 +9,21 @@ IN: io.encodings.8-bit hashtable ; : parse-file ( file-name -- byte>ch ch>byte ) - full-path ascii file-lines process-contents + ascii file-lines process-contents [ byte>ch ] [ ch>byte ] bi ; : empty-tuple-class ( string -- class ) @@ -85,9 +85,9 @@ IN: io.encodings.8-bit : 8-bit-methods ( class byte>ch ch>byte -- ) >r over r> define-encode-char define-decode-char ; -: define-8-bit-encoding ( tuple-name file-name -- ) +: define-8-bit-encoding ( name path -- ) >r empty-tuple-class r> parse-file 8-bit-methods ; PRIVATE> -[ mappings [ define-8-bit-encoding ] assoc-each ] with-compilation-unit +[ mappings [ full-path define-8-bit-encoding ] assoc-each ] with-compilation-unit diff --git a/extra/io/encodings/ascii/ascii-docs.factor b/extra/io/encodings/ascii/ascii-docs.factor new file mode 100644 index 0000000000..0b54a341d9 --- /dev/null +++ b/extra/io/encodings/ascii/ascii-docs.factor @@ -0,0 +1,8 @@ +USING: help.markup help.syntax ; +IN: io.encodings.ascii + +HELP: ascii +{ $class-description "This is the encoding descriptor which denotes an ASCII encoding. By default, if there's a non-ASCII character in an input stream, it will be replaced with a replacement character (U+FFFD), and if a non-ASCII character is used in output, an exception is thrown." } +{ $see-also "encodings-introduction" } ; + +ABOUT: ascii diff --git a/extra/io/encodings/strict/strict-docs.factor b/extra/io/encodings/strict/strict-docs.factor new file mode 100644 index 0000000000..e8a4f18179 --- /dev/null +++ b/extra/io/encodings/strict/strict-docs.factor @@ -0,0 +1,10 @@ +! Copyright (C) 2008 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: help.syntax help.markup ; +IN: io.encodings.strict + +HELP: strict ( encoding -- strict-encoding ) +{ $values { "encoding" "an encoding descriptor" } { "strict-encoding" "a strict encoding descriptor" } } +{ $description "Makes an encoding strict, that is, in the presence of a malformed code point, an error is thrown. Note that the existence of a replacement character in a file (U+FFFD) also throws an error." } ; + +ABOUT: strict diff --git a/extra/io/encodings/utf16/utf16-docs.factor b/extra/io/encodings/utf16/utf16-docs.factor index 018a15a534..7198cb2b27 100644 --- a/extra/io/encodings/utf16/utf16-docs.factor +++ b/extra/io/encodings/utf16/utf16-docs.factor @@ -1,22 +1,25 @@ USING: help.markup help.syntax io.encodings strings ; IN: io.encodings.utf16 -ARTICLE: "utf16" "Working with UTF-16-encoded data" +ARTICLE: "io.encodings.utf16" "UTF-16" "The UTF-16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences. There are three encoding descriptor classes for working with UTF-16, depending on endianness or the presence of a BOM:" +{ $subsection utf16 } { $subsection utf16le } { $subsection utf16be } -{ $subsection utf16 } -"All of these conform to the " { $link "encodings-protocol" } "." ; +{ $subsection utf16n } ; -ABOUT: "utf16" +ABOUT: "io.encodings.utf16" HELP: utf16le -{ $class-description "The encoding protocol for UTF-16LE, that is, UTF-16 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ; +{ $class-description "The encoding descriptor for UTF-16LE, that is, UTF-16 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ; HELP: utf16be -{ $class-description "The encoding protocol for UTF-16BE, that is, UTF-16 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ; +{ $class-description "The encoding descriptor for UTF-16BE, that is, UTF-16 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ; HELP: utf16 -{ $class-description "The encoding protocol for UTF-16, that is, UTF-16 with a byte order mark. This is the most useful for general input and output in UTF-16. Streams can be made which read or write wth this encoding." } ; +{ $class-description "The encoding descriptor for UTF-16, that is, UTF-16 with a byte order mark. This is the most useful for general input and output in UTF-16. Streams can be made which read or write wth this encoding." } ; -{ utf16 utf16le utf16be } related-words +HELP: utf16n +{ $class-description "The encoding descriptor for UTF-16 without a byte order mark in native endian order. This is useful mostly for FFI calls which take input of strings in of wide_t*." } ; + +{ utf16 utf16le utf16be utf16n } related-words diff --git a/extra/io/encodings/utf16/utf16-tests.factor b/extra/io/encodings/utf16/utf16-tests.factor index 89b61a3e37..6985983917 100755 --- a/extra/io/encodings/utf16/utf16-tests.factor +++ b/extra/io/encodings/utf16/utf16-tests.factor @@ -1,5 +1,7 @@ USING: kernel tools.test io.encodings.utf16 arrays sbufs -sequences io.encodings io unicode io.encodings.string ; +io.streams.byte-array sequences io.encodings io unicode +io.encodings.string alien.c-types accessors classes ; +IN: io.encodings.utf16.tests [ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode >array ] unit-test [ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode >array ] unit-test @@ -20,3 +22,9 @@ sequences io.encodings io unicode io.encodings.string ; [ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode >array ] unit-test [ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode >array ] unit-test + +: correct-endian + code>> class little-endian? [ utf16le = ] [ utf16be = ] if ; + +[ t ] [ B{ } utf16n correct-endian ] unit-test +[ t ] [ utf16n correct-endian ] unit-test diff --git a/extra/io/encodings/utf16/utf16.factor b/extra/io/encodings/utf16/utf16.factor index 290761ec91..e8ca04af35 100755 --- a/extra/io/encodings/utf16/utf16.factor +++ b/extra/io/encodings/utf16/utf16.factor @@ -1,7 +1,8 @@ ! Copyright (C) 2006, 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: math kernel sequences sbufs vectors namespaces io.binary -io.encodings combinators splitting io byte-arrays inspector ; +io.encodings combinators splitting io byte-arrays inspector +alien.c-types ; IN: io.encodings.utf16 TUPLE: utf16be ; @@ -10,6 +11,8 @@ TUPLE: utf16le ; TUPLE: utf16 ; +TUPLE: utf16n ; + ( stream utf16 -- decoder ) M: utf16 ( stream utf16 -- encoder ) drop bom-le over stream-write utf16le ; +! Native-order UTF-16 + +: native-utf16 ( -- descriptor ) + little-endian? utf16le utf16be ? ; + +M: utf16n drop native-utf16 ; + +M: utf16n drop native-utf16 ; + PRIVATE> diff --git a/extra/peg/parsers/parsers-tests.factor b/extra/peg/parsers/parsers-tests.factor index 08bde98419..e80baf3c4f 100644 --- a/extra/peg/parsers/parsers-tests.factor +++ b/extra/peg/parsers/parsers-tests.factor @@ -48,3 +48,7 @@ IN: peg.parsers.tests [ V{ } ] [ "" epsilon parse parse-result-ast ] unit-test + +{ "a" } [ + "a" "a" token just parse parse-result-ast +] unit-test \ No newline at end of file diff --git a/extra/peg/parsers/parsers.factor b/extra/peg/parsers/parsers.factor index 4bba60bb09..13509e81f7 100755 --- a/extra/peg/parsers/parsers.factor +++ b/extra/peg/parsers/parsers.factor @@ -3,14 +3,14 @@ USING: kernel sequences strings namespaces math assocs shuffle vectors arrays combinators.lib math.parser match unicode.categories sequences.deep peg peg.private - peg.search math.ranges ; + peg.search math.ranges words ; IN: peg.parsers TUPLE: just-parser p1 ; : just-pattern [ - dup [ + execute dup [ dup parse-result-remaining empty? [ drop f ] unless ] when ] ; diff --git a/extra/peg/peg.factor b/extra/peg/peg.factor index 1707193e70..ae5ed2f8b2 100755 --- a/extra/peg/peg.factor +++ b/extra/peg/peg.factor @@ -43,17 +43,16 @@ TUPLE: token-parser symbol ; MATCH-VARS: ?token ; -: token-pattern ( -- quot ) - [ - ?token 2dup head? [ - dup >r length tail-slice r> - ] [ - 2drop f - ] if - ] ; - +: parse-token ( input string -- result ) + #! Parse the string, returning a parse result + 2dup head? [ + dup >r length tail-slice r> + ] [ + 2drop f + ] if ; + M: token-parser (compile) ( parser -- quot ) - token-parser-symbol \ ?token token-pattern match-replace ; + token-parser-symbol [ parse-token ] curry ; TUPLE: satisfy-parser quot ; diff --git a/extra/unix/stat/linux/64/64.factor b/extra/unix/stat/linux/64/64.factor index be6ad1e3fc..a374551385 100644 --- a/extra/unix/stat/linux/64/64.factor +++ b/extra/unix/stat/linux/64/64.factor @@ -27,5 +27,5 @@ C-STRUCT: stat FUNCTION: int __xstat ( int ver, char* pathname, stat* buf ) ; FUNCTION: int __lxstat ( int ver, char* pathname, stat* buf ) ; -: stat ( pathname buf -- int ) 3 -rot __xstat ; -: lstat ( pathname buf -- int ) 3 -rot __lxstat ; \ No newline at end of file +: stat ( pathname buf -- int ) 1 -rot __xstat ; +: lstat ( pathname buf -- int ) 1 -rot __lxstat ;