From d967d04e4cf961af1919b55620119381c2251ef7 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Sun, 23 Mar 2008 00:43:43 -0400 Subject: [PATCH 01/15] Changing 8-bit encoding names; documentation --- core/io/io-tests.factor | 2 +- extra/http/client/client.factor | 4 +- extra/http/server/server.factor | 2 +- extra/io/encodings/8-bit/8-bit-docs.factor | 91 ++++++++++++++++++++ extra/io/encodings/8-bit/8-bit-tests.factor | 10 +-- extra/io/encodings/8-bit/8-bit.factor | 36 ++++---- extra/io/encodings/strict/strict-docs.factor | 10 +++ 7 files changed, 128 insertions(+), 27 deletions(-) create mode 100644 extra/io/encodings/8-bit/8-bit-docs.factor create mode 100644 extra/io/encodings/strict/strict-docs.factor diff --git a/core/io/io-tests.factor b/core/io/io-tests.factor index 91e51f25b0..abae63c82b 100755 --- a/core/io/io-tests.factor +++ b/core/io/io-tests.factor @@ -9,7 +9,7 @@ IN: io.tests ] unit-test : ( resource -- stream ) - resource-path iso-8859-1 ; + resource-path latin1 ; [ "This is a line.\rThis is another line.\r" diff --git a/extra/http/client/client.factor b/extra/http/client/client.factor index 233b61ea74..e4bbf0279f 100755 --- a/extra/http/client/client.factor +++ b/extra/http/client/client.factor @@ -52,7 +52,7 @@ PRIVATE> : http-request ( request -- response stream ) dup request [ - dup request-addr iso-8859-1 + dup request-addr latin1 1 minutes over set-timeout [ write-request flush @@ -82,7 +82,7 @@ PRIVATE> : download-to ( url file -- ) #! Downloads the contents of a URL to a file. swap http-get-stream swap check-response - [ swap iso-8859-1 stream-copy ] with-disposal ; + [ swap latin1 stream-copy ] with-disposal ; : download ( url -- ) dup download-name download-to ; diff --git a/extra/http/server/server.factor b/extra/http/server/server.factor index 3df21adf26..81201dd3fe 100755 --- a/extra/http/server/server.factor +++ b/extra/http/server/server.factor @@ -217,7 +217,7 @@ SYMBOL: exit-continuation : httpd ( port -- ) internet-server "http.server" - iso-8859-1 [ handle-client ] with-server ; + latin1 [ handle-client ] with-server ; : httpd-main ( -- ) 8888 httpd ; diff --git a/extra/io/encodings/8-bit/8-bit-docs.factor b/extra/io/encodings/8-bit/8-bit-docs.factor new file mode 100644 index 0000000000..ff21094ba1 --- /dev/null +++ b/extra/io/encodings/8-bit/8-bit-docs.factor @@ -0,0 +1,91 @@ +! Copyright (C) 2008 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: help.syntax help.markup io.encodings.8-bit.private ; +IN: io.encodings.8-bit + +ARTICLE: "io.encodings.8-bit" "8-bit encodings" +"Many encodings are a simple mapping of bytes onto characters. The " { $vocab-link "io.encodings.8-bit" } " vocabulary implements these generically using existing resource files. These encodings should be used with extreme caution, as fully general Unicode encodings like UTF-8 are nearly always more appropriate. The following 8-bit encodings are already defined:" +{ $subsection latin1 } +{ $subsection latin2 } +{ $subsection latin3 } +{ $subsection latin4 } +{ $subsection latin/cyrillic } +{ $subsection latin/arabic } +{ $subsection latin/greek } +{ $subsection latin/hebrew } +{ $subsection latin5 } +{ $subsection latin6 } +{ $subsection latin/thai } +{ $subsection latin7 } +{ $subsection latin8 } +{ $subsection latin9 } +{ $subsection latin10 } +{ $subsection koi8-r } +{ $subsection windows-1252 } +{ $subsection ebcdic } +{ $subsection mac-roman } +"Other encodings can be defined using the following utility" +{ $subsection define-8-bit-encoding } ; + +ABOUT: "io.encodings.8-bit" + +HELP: define-8-bit-encoding +{ $values { "name" "a string" } { "path" "a path" } } +{ $description "Creates a new encoding with the given name, using the resource file at the path to tell how to encode and decode octets. The resource file should be in a similar format to those at ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/" } ; + +HELP: latin1 +{ $description "This is the ISO-8859-1 encoding, also called Latin-1: Western European. It is an 8-bit superset of ASCII which is the default for a mimetype starting with 'text' and provides the characters necessary for most western European languages." } ; + +HELP: latin2 +{ $description "This is the ISO-8859-2 encoding, also called Latin-2: Eastern European. It is an 8-bit superset of ASCII and provides the characters necessary for most eastern European languages." } ; + +HELP: latin3 +{ $description "This is the ISO-8859-3 encoding, also called Latin-3: South European. It is an 8-bit superset of ASCII and provides the characters necessary for Turkish, Maltese and Esperanto." } ; + +HELP: latin4 +{ $description "This is the ISO-8859-4 encoding, also called Latin-4: North European. It is an 8-bit superset of ASCII and provides the characters necessary for Latvian, Lithuanian, Estonian, Greenlandic and Sami." } ; + +HELP: latin/cyrillic +{ $description "This is the ISO-8859-5 encoding, also called Latin/Cyrillic. It is an 8-bit superset of ASCII and provides the characters necessary for most languages which use Cyrilic, including Russian, Macedonian, Belarusian, Bulgarian, Serbian, and Ukrainian. KOI8-R is used much more commonly." } ; + +HELP: latin/arabic +{ $description "This is the ISO-8859-6 encoding, also called Latin/Arabic. It is an 8-bit superset of ASCII and provides the characters necessary for Arabic, though not other languages which use Arabic script." } ; + +HELP: latin/greek +{ $description "This is the ISO-8859-7 encoding, also called Latin/Greek. It is an 8-bit superset of ASCII and provides the characters necessary for Greek written in modern monotonic orthography, or ancient Greek without accent marks." } ; + +HELP: latin/hebrew +{ $description "This is the ISO-8859-8 encoding, also called Latin/Hebrew. It is an 8-bit superset of ASCII and provides the characters necessary for modern Hebrew without explicit vowels. Generally, this is interpreted in logical order, making it ISO-8859-8-I, technically." } ; + +HELP: latin5 +{ $description "This is the ISO-8859-9 encoding, also called Latin-5: Turkish. It is an 8-bit superset of ASCII and provides the characters necessary for Turkish, similar to Latin-1 but replacing the spots used for Icelandic with characters used in Turkish." } ; + +HELP: latin6 +{ $description "This is the ISO-8859-10 encoding, also called Latin-6: Nordic. It is an 8-bit superset of ASCII containing the same characters as Latin-4, but rearranged to be of better use to nordic languages." } ; + +HELP: latin/thai +{ $description "This is the ISO-8859-11 encoding, also called Latin/Thai. It is an 8-bit superset of ASCII containing the characters necessary to represent Thai. It is basically identical to TIS-620." } ; + +HELP: latin7 +{ $description "This is the ISO-8859-13 encoding, also called Latin-7: Baltic Rim. It is an 8-bit superset of ASCII containing all characters necesary to represent Baltic Rim languages, as previous character sets were incomplete." } ; + +HELP: latin8 +{ $description "This is the ISO-8859-14 encoding, also called Latin-8: Celtic. It is an 8-bit superset of ASCII designed for Celtic languages like Gaelic and Breton." } ; + +HELP: latin9 +{ $description "This is the ISO-8859-15 encoding, also called Latin-9 and unoffically as Latin-0. It is an 8-bit superset of ASCII designed as a modification of Latin-1, removing little-used characters in favor of the Euro symbol and other characters." } ; + +HELP: latin10 +{ $description "This is the ISO-8859-16 encoding, also called Latin-10: South-Eastern European. It is an 8-bit superset of ASCII." } ; + +HELP: windows-1252 +{ $description "Windows 1252 is an 8-bit superset of ASCII which is closely related to Latin-1. Control characters in the 0x80 to 0x9F range are replaced with printable characters such as the Euro symbol." } ; + +HELP: ebcdic +{ $description "EBCDIC is an 8-bit legacy encoding designed for IBM mainframes like System/360 in the 1960s. It has since fallen into disuse. It contains large unallocated regions, and the version included here (code page 37) contains auxiliary characters in this region for English- and Portugese-speaking countries." } ; + +HELP: mac-roman +{ $description "Mac Roman is an 8-bit superset of ASCII which was the standard encoding on Mac OS prior to version 10. It is incompatible with Latin-1 in all but a few places and ASCII, and it is suitable for encoding many Western European languages." } ; + +HELP: koi8-r +{ $description "KOI8-R is an 8-bit superset of ASCII which encodes the Cyrillic alphabet, as used in Russian and Bulgarian. Characters are in such an order that, if the eight bit is stripped, text is still interpretable as ASCII. Block-building characters also exist." } ; diff --git a/extra/io/encodings/8-bit/8-bit-tests.factor b/extra/io/encodings/8-bit/8-bit-tests.factor index 5dbe28cb14..24cd4137d4 100644 --- a/extra/io/encodings/8-bit/8-bit-tests.factor +++ b/extra/io/encodings/8-bit/8-bit-tests.factor @@ -1,10 +1,10 @@ USING: io.encodings.string io.encodings.8-bit tools.test strings arrays ; IN: io.encodings.8-bit.tests -[ B{ CHAR: f CHAR: o CHAR: o } ] [ "foo" iso-8859-1 encode ] unit-test -[ { 256 } >string iso-8859-1 encode ] must-fail -[ B{ 255 } ] [ { 255 } iso-8859-1 encode ] unit-test +[ B{ CHAR: f CHAR: o CHAR: o } ] [ "foo" latin1 encode ] unit-test +[ { 256 } >string latin1 encode ] must-fail +[ B{ 255 } ] [ { 255 } latin1 encode ] unit-test -[ "bar" ] [ "bar" iso-8859-1 decode ] unit-test -[ { CHAR: b 233 CHAR: r } ] [ { CHAR: b 233 CHAR: r } iso-8859-1 decode >array ] unit-test +[ "bar" ] [ "bar" latin1 decode ] unit-test +[ { CHAR: b 233 CHAR: r } ] [ { CHAR: b 233 CHAR: r } latin1 decode >array ] unit-test [ { HEX: fffd HEX: 20AC } ] [ { HEX: 81 HEX: 80 } windows-1252 decode >array ] unit-test diff --git a/extra/io/encodings/8-bit/8-bit.factor b/extra/io/encodings/8-bit/8-bit.factor index 2cc6b2e57c..c041e699a2 100644 --- a/extra/io/encodings/8-bit/8-bit.factor +++ b/extra/io/encodings/8-bit/8-bit.factor @@ -9,21 +9,21 @@ IN: io.encodings.8-bit hashtable ; : parse-file ( file-name -- byte>ch ch>byte ) - full-path ascii file-lines process-contents + ascii file-lines process-contents [ byte>ch ] [ ch>byte ] bi ; : empty-tuple-class ( string -- class ) @@ -85,9 +85,9 @@ IN: io.encodings.8-bit : 8-bit-methods ( class byte>ch ch>byte -- ) >r over r> define-encode-char define-decode-char ; -: define-8-bit-encoding ( tuple-name file-name -- ) +: define-8-bit-encoding ( name path -- ) >r empty-tuple-class r> parse-file 8-bit-methods ; PRIVATE> -[ mappings [ define-8-bit-encoding ] assoc-each ] with-compilation-unit +[ mappings [ full-path define-8-bit-encoding ] assoc-each ] with-compilation-unit diff --git a/extra/io/encodings/strict/strict-docs.factor b/extra/io/encodings/strict/strict-docs.factor new file mode 100644 index 0000000000..e8a4f18179 --- /dev/null +++ b/extra/io/encodings/strict/strict-docs.factor @@ -0,0 +1,10 @@ +! Copyright (C) 2008 Daniel Ehrenberg +! See http://factorcode.org/license.txt for BSD license. +USING: help.syntax help.markup ; +IN: io.encodings.strict + +HELP: strict ( encoding -- strict-encoding ) +{ $values { "encoding" "an encoding descriptor" } { "strict-encoding" "a strict encoding descriptor" } } +{ $description "Makes an encoding strict, that is, in the presence of a malformed code point, an error is thrown. Note that the existence of a replacement character in a file (U+FFFD) also throws an error." } ; + +ABOUT: strict From 78886019496045a8436df8c6f2c6777bc1396144 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Sun, 23 Mar 2008 00:58:17 -0400 Subject: [PATCH 02/15] Change to encodings docs --- core/io/encodings/encodings-docs.factor | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/io/encodings/encodings-docs.factor b/core/io/encodings/encodings-docs.factor index d5bdf24dc0..0f43bba0db 100644 --- a/core/io/encodings/encodings-docs.factor +++ b/core/io/encodings/encodings-docs.factor @@ -10,6 +10,7 @@ ARTICLE: "io.encodings" "I/O encodings" { $subsection "encodings-protocol" } ; ARTICLE: "encodings-constructors" "Constructing an encoded stream" +"The following words can be used to construct encoded streams. Note that they are usually not used directly, but rather by the stream constructors themselves." { $subsection } { $subsection } { $subsection } ; @@ -47,7 +48,7 @@ ARTICLE: "encodings-protocol" "Encoding protocol" "An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again." { $subsection decode-char } { $subsection encode-char } -"The following methods are optional:" +"Optionally, an encoding can override the constructor words:" { $subsection } { $subsection } ; From 8362ef09588d883934f66cb6a4febdb00f55a49e Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Tue, 25 Mar 2008 13:51:09 -0500 Subject: [PATCH 03/15] fix netbsd32 --- misc/factor.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/misc/factor.sh b/misc/factor.sh index 9d4f26fa46..a1437c67bf 100755 --- a/misc/factor.sh +++ b/misc/factor.sh @@ -90,6 +90,8 @@ set_gcc() { openbsd) ensure_program_installed egcc; CC=egcc;; netbsd) if [[ $WORD -eq 64 ]] ; then CC=/usr/pkg/gcc34/bin/gcc + else + CC=gcc fi ;; *) CC=gcc;; esac From b13e0f7042f38814ed28166e6d11ad97b488089c Mon Sep 17 00:00:00 2001 From: erg Date: Tue, 25 Mar 2008 19:50:39 -0500 Subject: [PATCH 04/15] redo path handling --- core/io/files/files-tests.factor | 51 +++++++++ core/io/files/files.factor | 152 ++++++++++++++++--------- extra/io/unix/files/files-tests.factor | 6 + 3 files changed, 155 insertions(+), 54 deletions(-) diff --git a/core/io/files/files-tests.factor b/core/io/files/files-tests.factor index 4cda463983..e3765fead0 100755 --- a/core/io/files/files-tests.factor +++ b/core/io/files/files-tests.factor @@ -9,6 +9,9 @@ io.files.unique sequences strings accessors ; [ "passwd" ] [ "/etc/passwd" file-name ] unit-test [ "awk" ] [ "/usr/libexec/awk/" file-name ] unit-test [ "awk" ] [ "/usr/libexec/awk///" file-name ] unit-test +[ "" ] [ "" file-name ] unit-test +[ "/" ] [ "/" file-name ] unit-test +[ "///" ] [ "///" file-name ] unit-test [ ] [ { "Hello world." } @@ -144,3 +147,51 @@ io.files.unique sequences strings accessors ; ] keep file-info size>> ] with-unique-file ] unit-test + +[ "/usr/lib" ] [ "/usr" "lib" append-path ] unit-test +[ "/usr/lib" ] [ "/usr/" "lib" append-path ] unit-test +[ "/lib" ] [ "/usr/" "/lib" append-path ] unit-test +[ "/lib/" ] [ "/usr/" "/lib/" append-path ] unit-test +[ "/usr/lib" ] [ "/usr" "./lib" append-path ] unit-test +[ "/usr/lib/" ] [ "/usr" "./lib/" append-path ] unit-test +[ "/lib" ] [ "/usr" "../lib" append-path ] unit-test +[ "/lib/" ] [ "/usr" "../lib/" append-path ] unit-test +[ "/lib" ] [ "/" "../lib" append-path ] unit-test +[ "/lib/" ] [ "/" "../lib/" append-path ] unit-test + +[ "" ] [ "" "." append-path ] unit-test +[ "" ".." append-path ] must-fail + +[ "/" ] [ "/" "./." append-path ] unit-test +[ "/" ] [ "/" "././" append-path ] unit-test +[ "/" ] [ "/" "../.." append-path ] unit-test +[ "/" ] [ "/" "../../" append-path ] unit-test +[ "/lib" ] [ "/" "../../lib" append-path ] unit-test +[ "/lib/" ] [ "/" "../../lib/" append-path ] unit-test +[ "/a/b/lib" ] [ "/a/b/c/d/e/f/" "../../../../lib" append-path ] unit-test +[ "/a/b/lib/" ] [ "/a/b/c/d/e/f/" "../../../../lib/" append-path ] unit-test + +[ "" "../lib/" append-path ] must-fail +[ "lib" ] [ "" "lib" append-path ] unit-test +[ "lib" ] [ "" "./lib" append-path ] unit-test + +[ "/lib/bux" ] [ "/usr" "/lib/bux" append-path ] unit-test +[ "/lib/bux/" ] [ "/usr" "/lib/bux/" append-path ] unit-test + +[ "foo/" ] [ "foo/bar/." parent-directory ] unit-test +[ "foo/" ] [ "foo/bar/./" parent-directory ] unit-test +[ "foo/" ] [ "foo/bar/baz/.." parent-directory ] unit-test +[ "foo/" ] [ "foo/bar/baz/../" parent-directory ] unit-test + +[ "." parent-directory ] must-fail +[ "./" parent-directory ] must-fail +[ ".." parent-directory ] must-fail +[ "../" parent-directory ] must-fail +[ "../../" parent-directory ] must-fail +[ "foo/.." parent-directory ] must-fail +[ "foo/../" parent-directory ] must-fail + +[ "bar/foo" ] [ "bar/baz" "..///foo" append-path ] unit-test +[ "bar/baz/foo" ] [ "bar/baz" ".///foo" append-path ] unit-test +[ "bar/foo" ] [ "bar/baz" "./..//foo" append-path ] unit-test +[ "bar/foo" ] [ "bar/baz" "./../././././././///foo" append-path ] unit-test diff --git a/core/io/files/files.factor b/core/io/files/files.factor index 21cc7c8f0a..8595f227bf 100755 --- a/core/io/files/files.factor +++ b/core/io/files/files.factor @@ -3,7 +3,7 @@ USING: io.backend io.files.private io hashtables kernel math memory namespaces sequences strings assocs arrays definitions system combinators splitting sbufs continuations io.encodings -io.encodings.binary ; +io.encodings.binary init ; IN: io.files HOOK: (file-reader) io-backend ( path -- stream ) @@ -21,7 +21,26 @@ HOOK: (file-appender) io-backend ( path -- stream ) : ( path encoding -- stream ) swap (file-appender) swap ; -HOOK: rename-file io-backend ( from to -- ) +: file-lines ( path encoding -- seq ) + lines ; + +: with-file-reader ( path encoding quot -- ) + >r r> with-stream ; inline + +: file-contents ( path encoding -- str ) + contents ; + +: with-file-writer ( path encoding quot -- ) + >r r> with-stream ; inline + +: set-file-lines ( seq path encoding -- ) + [ [ print ] each ] with-file-writer ; + +: set-file-contents ( str path encoding -- ) + [ write ] with-file-writer ; + +: with-file-appender ( path encoding quot -- ) + >r r> with-stream ; inline ! Pathnames : path-separator? ( ch -- ? ) windows? "/\\" "/" ? member? ; @@ -32,42 +51,84 @@ HOOK: rename-file io-backend ( from to -- ) : left-trim-separators ( str -- newstr ) [ path-separator? ] left-trim ; -: append-path ( str1 str2 -- str ) - >r right-trim-separators "/" r> - left-trim-separators 3append ; - -: prepend-path ( str1 str2 -- str ) - swap append-path ; inline - : last-path-separator ( path -- n ? ) [ length 1- ] keep [ path-separator? ] find-last* ; HOOK: root-directory? io-backend ( path -- ? ) -M: object root-directory? ( path -- ? ) path-separator? ; - -: special-directory? ( name -- ? ) { "." ".." } member? ; +M: object root-directory? ( path -- ? ) + dup empty? [ drop f ] [ [ path-separator? ] all? ] if ; ERROR: no-parent-directory path ; : parent-directory ( path -- parent ) - right-trim-separators { - { [ dup empty? ] [ drop "/" ] } - { [ dup root-directory? ] [ ] } - { [ dup [ path-separator? ] contains? not ] [ drop "." ] } + dup root-directory? [ + right-trim-separators + dup last-path-separator [ + 1+ cut + { + { "." [ 1 head* parent-directory ] } + { ".." [ + 2 head* parent-directory parent-directory + ] } + [ drop ] + } case + ] [ no-parent-directory ] if + ] unless ; + + + +: absolute-path? ( path -- ? ) + dup empty? [ drop f ] [ first path-separator? ] if ; + +: append-path ( str1 str2 -- str ) + { + { [ over empty? ] [ append-path-empty ] } + { [ dup empty? ] [ drop ] } + { [ dup absolute-path? ] [ nip ] } + { [ dup head.? ] [ 1 tail left-trim-separators append-path ] } + { [ dup head..? ] [ + 2 tail left-trim-separators + >r parent-directory r> append-path + ] } { [ t ] [ - dup last-path-separator drop 1+ cut - special-directory? [ no-parent-directory ] when + >r right-trim-separators "/" r> + left-trim-separators 3append ] } } cond ; -: file-name ( path -- string ) - right-trim-separators { - { [ dup empty? ] [ drop "/" ] } - { [ dup last-path-separator ] [ 1+ tail ] } - { [ t ] [ drop ] } - } cond ; +: prepend-path ( str1 str2 -- str ) + swap append-path ; inline +: file-name ( path -- string ) + dup root-directory? [ + right-trim-separators + dup last-path-separator [ 1+ tail ] [ drop ] if + ] unless ; + +! File info TUPLE: file-info type size permissions modified ; HOOK: file-info io-backend ( path -- info ) @@ -94,8 +155,12 @@ HOOK: cd io-backend ( path -- ) HOOK: cwd io-backend ( -- path ) +SYMBOL: current-directory + +[ cwd current-directory set-global ] "current-directory" add-init-hook + : with-directory ( path quot -- ) - cwd [ cd ] curry rot cd [ ] cleanup ; inline + current-directory swap with-variable ; inline ! Creating directories HOOK: make-directory io-backend ( path -- ) @@ -118,7 +183,7 @@ HOOK: make-directory io-backend ( path -- ) dup string? [ tuck append-path directory? 2array ] [ nip ] if ] with map - [ first special-directory? not ] subset ; + [ first { "." ".." } member? not ] subset ; : directory ( path -- seq ) normalize-directory dup (directory) fixup-directory ; @@ -199,34 +264,6 @@ DEFER: copy-tree-into : resource-exists? ( path -- ? ) ?resource-path exists? ; -! Pathname presentations -TUPLE: pathname string ; - -C: pathname - -M: pathname <=> [ pathname-string ] compare ; - -: file-lines ( path encoding -- seq ) - lines ; - -: with-file-reader ( path encoding quot -- ) - >r r> with-stream ; inline - -: file-contents ( path encoding -- str ) - contents ; - -: with-file-writer ( path encoding quot -- ) - >r r> with-stream ; inline - -: set-file-lines ( seq path encoding -- ) - [ [ print ] each ] with-file-writer ; - -: set-file-contents ( str path encoding -- ) - [ write ] with-file-writer ; - -: with-file-appender ( path encoding quot -- ) - >r r> with-stream ; inline - : temp-directory ( -- path ) "temp" resource-path dup exists? not @@ -235,6 +272,13 @@ M: pathname <=> [ pathname-string ] compare ; : temp-file ( name -- path ) temp-directory prepend-path ; +! Pathname presentations +TUPLE: pathname string ; + +C: pathname + +M: pathname <=> [ pathname-string ] compare ; + ! Home directory : home ( -- dir ) { diff --git a/extra/io/unix/files/files-tests.factor b/extra/io/unix/files/files-tests.factor index f5366d32ae..98de09e8f1 100755 --- a/extra/io/unix/files/files-tests.factor +++ b/extra/io/unix/files/files-tests.factor @@ -6,3 +6,9 @@ IN: io.unix.files.tests [ "/" ] [ "/etc/" parent-directory ] unit-test [ "/" ] [ "/etc" parent-directory ] unit-test [ "/" ] [ "/" parent-directory ] unit-test +[ "asdf" parent-directory ] must-fail + +[ f ] [ "" root-directory? ] unit-test +[ t ] [ "/" root-directory? ] unit-test +[ t ] [ "//" root-directory? ] unit-test +[ t ] [ "///////" root-directory? ] unit-test From 807c84918b952c377e949454fc13b59dfbeeb93b Mon Sep 17 00:00:00 2001 From: erg Date: Tue, 25 Mar 2008 19:52:07 -0500 Subject: [PATCH 05/15] minor cleanup in windows path handling --- extra/io/windows/nt/files/files.factor | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/extra/io/windows/nt/files/files.factor b/extra/io/windows/nt/files/files.factor index 7cf056674f..540737004b 100755 --- a/extra/io/windows/nt/files/files.factor +++ b/extra/io/windows/nt/files/files.factor @@ -2,7 +2,8 @@ USING: continuations destructors io.buffers io.files io.backend io.timeouts io.nonblocking io.windows io.windows.nt.backend kernel libc math threads windows windows.kernel32 alien.c-types alien.arrays sequences combinators combinators.lib -sequences.lib ascii splitting alien strings assocs ; +sequences.lib ascii splitting alien strings assocs +combinators.cleave ; IN: io.windows.nt.files M: windows-nt-io cwd @@ -18,18 +19,19 @@ M: windows-nt-io cd M: windows-nt-io root-directory? ( path -- ? ) dup length 2 = [ - dup first Letter? - swap second CHAR: : = and + first2 + [ Letter? ] [ CHAR: : = ] bi* and ] [ drop f ] if ; +ERROR: not-absolute-path ; : root-directory ( string -- string' ) { [ dup length 2 >= ] [ dup second CHAR: : = ] [ dup first Letter? ] - } && [ 2 head ] [ "Not an absolute path" throw ] if ; + } && [ 2 head ] [ not-absolute-path ] if ; : prepend-prefix ( string -- string' ) unicode-prefix prepend ; @@ -58,9 +60,12 @@ M: windows-nt-io root-directory? ( path -- ? ) ] } } cond ; +ERROR: nonstring-pathname ; +ERROR: empty-pathname ; + M: windows-nt-io normalize-pathname ( string -- string ) - dup string? [ "Pathname must be a string" throw ] unless - dup empty? [ "Empty pathname" throw ] when + dup string? [ nonstring-pathname ] unless + dup empty? [ empty-pathname ] when { { CHAR: / CHAR: \\ } } substitute cwd swap windows-append-path [ "/\\." member? ] right-trim From 06848c8e7575983cb590beb7c1ad43ed1dfdf66f Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Tue, 25 Mar 2008 21:17:37 -0400 Subject: [PATCH 06/15] UTF-16 native order; better encodings docs --- core/io/encodings/binary/binary-docs.factor | 5 +- core/io/encodings/encodings-docs.factor | 29 ++++++----- core/io/encodings/utf8/utf8-docs.factor | 11 ++-- extra/help/handbook/handbook.factor | 13 ++++- extra/io/encodings/8-bit/8-bit-docs.factor | 57 ++++++++++++++------- extra/io/encodings/ascii/ascii-docs.factor | 8 +++ extra/io/encodings/utf16/utf16-docs.factor | 19 ++++--- extra/io/encodings/utf16/utf16-tests.factor | 10 +++- extra/io/encodings/utf16/utf16.factor | 14 ++++- 9 files changed, 115 insertions(+), 51 deletions(-) create mode 100644 extra/io/encodings/ascii/ascii-docs.factor diff --git a/core/io/encodings/binary/binary-docs.factor b/core/io/encodings/binary/binary-docs.factor index 823eea67be..fdd9828867 100644 --- a/core/io/encodings/binary/binary-docs.factor +++ b/core/io/encodings/binary/binary-docs.factor @@ -2,4 +2,7 @@ USING: help.syntax help.markup ; IN: io.encodings.binary HELP: binary -{ $class-description "This is the encoding descriptor for binary I/O. Making an encoded stream with the binary encoding is a no-op; streams with this encoding deal with byte-arrays, not strings." } ; +{ $class-description "This is the encoding descriptor for binary I/O. Making an encoded stream with the binary encoding is a no-op; streams with this encoding deal with byte-arrays, not strings." } +{ $see-also "encodings-introduction" } ; + +ABOUT: binary diff --git a/core/io/encodings/encodings-docs.factor b/core/io/encodings/encodings-docs.factor index 0f43bba0db..07e0f9f401 100644 --- a/core/io/encodings/encodings-docs.factor +++ b/core/io/encodings/encodings-docs.factor @@ -1,16 +1,16 @@ USING: help.markup help.syntax ; IN: io.encodings -ABOUT: "encodings" +ABOUT: "io.encodings" ARTICLE: "io.encodings" "I/O encodings" -"Many streams deal with bytes, rather than Unicode code points, at some level. The translation between these two things is specified by an encoding. To abstract this away from the programmer, Factor provides a system where these streams are associated with an encoding which is always used when the stream is read from or written to. For most purposes, an encoding descriptor consisting of a symbol is all that is needed when initializing a stream." +"Bytes can't be understood in isolation as text. They must be interpreted under a certain encoding. Factor provides utilities for dealing with encoded text by declaring that a stream has a particular encoding, and utilities to encode and decode strings." { $subsection "encodings-constructors" } { $subsection "encodings-descriptors" } { $subsection "encodings-protocol" } ; -ARTICLE: "encodings-constructors" "Constructing an encoded stream" -"The following words can be used to construct encoded streams. Note that they are usually not used directly, but rather by the stream constructors themselves." +ARTICLE: "encodings-constructors" "Manually constructing an encoded stream" +"The following words can be used to construct encoded streams. Note that they are usually not used directly, but rather by the stream constructors themselves. Most stream constructors take an encoding descriptor as a parameter and internally call these constructors." { $subsection } { $subsection } { $subsection } ; @@ -38,19 +38,22 @@ HELP: ARTICLE: "encodings-descriptors" "Encoding descriptors" "An encoding descriptor is something which can be used for input or output streams to encode or decode files. It must conform to the " { $link "encodings-protocol" } ". Encodings which you can use are defined in the following vocabularies:" -{ $vocab-subsection "io.encodings.utf8" } -{ $vocab-subsection "io.encodings.ascii" } -{ $vocab-subsection "io.encodings.8-bit" } -{ $vocab-subsection "io.encodings.binary" } -{ $vocab-subsection "io.encodings.utf16" } ; +{ $vocab-subsection "ASCII" "io.encodings.ascii" } +{ $vocab-subsection "Binary" "io.encodings.binary" } +{ $vocab-subsection "Strict encodings" "io.encodings.strict" } +{ $vocab-subsection "8-bit encodings" "io.encodings.8-bit" } +{ $vocab-subsection "UTF-8" "io.encodings.utf8" } +{ $vocab-subsection "UTF-16" "io.encodings.utf16" } +{ $see-also "encodings-introduction" } ; ARTICLE: "encodings-protocol" "Encoding protocol" -"An encoding descriptor must implement the following methods. The methods are implemented on tuple classes by instantiating the class and calling the method again." +"There are two parts to implementing a new encoding. First, methods for creating an encoded or decoded stream must be provided. These have defaults, however, which wrap a stream in an encoder or decoder wrapper with the given encoding descriptor." +{ $subsection } +{ $subsection } +"If an encoding might be contained in the code slot of an encoder or decoder tuple, then the following methods must be implemented to read or write one code point from a stream:" { $subsection decode-char } { $subsection encode-char } -"Optionally, an encoding can override the constructor words:" -{ $subsection } -{ $subsection } ; +{ $see-also "encodings-introduction" } ; HELP: decode-char { $values { "stream" "an underlying input stream" } diff --git a/core/io/encodings/utf8/utf8-docs.factor b/core/io/encodings/utf8/utf8-docs.factor index dbbc193a02..7a29039eca 100755 --- a/core/io/encodings/utf8/utf8-docs.factor +++ b/core/io/encodings/utf8/utf8-docs.factor @@ -1,11 +1,8 @@ -USING: help.markup help.syntax io.encodings strings io.files ; +USING: help.markup help.syntax ; IN: io.encodings.utf8 -ARTICLE: "io.encodings.utf8" "Working with UTF8-encoded data" -"The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences. The encoding descriptor for UTF-8:" -{ $subsection utf8 } ; - HELP: utf8 -{ $class-description "This is the class of encoding tuples which denote a UTF-8 encoding. This conforms to the " { $link "encodings-protocol" } "." } ; +{ $class-description "This is the encoding descriptor for a UTF-8 encoding. UTF-8 is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences." } +{ $see-also "encodings-introduction" } ; -ABOUT: "io.encodings.utf8" +ABOUT: utf8 diff --git a/extra/help/handbook/handbook.factor b/extra/help/handbook/handbook.factor index 1310b58133..4079386d7f 100755 --- a/extra/help/handbook/handbook.factor +++ b/extra/help/handbook/handbook.factor @@ -170,7 +170,17 @@ ARTICLE: "collections" "Collections" { $subsection "graphs" } { $subsection "buffers" } ; -USING: io.sockets io.launcher io.mmap io.monitors ; +USING: io.sockets io.launcher io.mmap io.monitors +io.encodings.utf8 io.encodings.binary io.encodings.ascii io.files ; + +ARTICLE: "encodings-introduction" "An introduction to encodings" +"In order to express text in terms of binary, some sort of encoding has to be used. In a modern context, this is understood as a two-way mapping between Unicode code points (characters) and some amount of binary. Since English isn't the only language in the world, ASCII is not sufficient as a mapping from binary to Unicode; it can't even express em-dashes or curly quotes. Unicode was designed as a universal character set that could potentially represent everything." $nl +"Not all encodings can represent all Unicode code points, but Unicode can represent basically everything that exists in modern encodings. Some encodings are language-specific, and some can represent everything in Unicode. Though the world is moving toward Unicode and UTF-8, the reality today is that there are several encodings which must be taken into account." $nl +"Factor uses a system of encoding descriptors to denote encodings. Encoding descriptors are objects which describe encodings. Examples are " { $link utf8 } ", " { $link ascii } " and " { $link binary } ". Encoding descriptors can be passed around independently. Each encoding descriptor has some method for constructing an encoded or decoded stream, and the resulting stream has an encoding descriptor stored which has methods for reading or writing characters." $nl +"Constructors for streams which deal with bytes usually take an encoding as an explicit parameter. For example, to open a text file for reading whose contents are in UTF-8, use the following" +{ $code "\"filename\" utf8 " } +"If there is an error in the encoded stream, a replacement character (0xFFFD) will be inserted. To throw an exception upon error, use a strict encoding as follows" +{ $code "\"filename\" utf8 strict " } ; ARTICLE: "io" "Input and output" { $heading "Streams" } @@ -188,6 +198,7 @@ ARTICLE: "io" "Input and output" { $subsection "io.mmap" } { $subsection "io.monitors" } { $heading "Encodings" } +{ $subsection "encodings-introduction" } { $subsection "io.encodings" } { $subsection "io.encodings.string" } { $heading "Other features" } diff --git a/extra/io/encodings/8-bit/8-bit-docs.factor b/extra/io/encodings/8-bit/8-bit-docs.factor index ff21094ba1..8e5fd815bc 100644 --- a/extra/io/encodings/8-bit/8-bit-docs.factor +++ b/extra/io/encodings/8-bit/8-bit-docs.factor @@ -34,58 +34,77 @@ HELP: define-8-bit-encoding { $description "Creates a new encoding with the given name, using the resource file at the path to tell how to encode and decode octets. The resource file should be in a similar format to those at ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/" } ; HELP: latin1 -{ $description "This is the ISO-8859-1 encoding, also called Latin-1: Western European. It is an 8-bit superset of ASCII which is the default for a mimetype starting with 'text' and provides the characters necessary for most western European languages." } ; +{ $description "This is the ISO-8859-1 encoding, also called Latin-1: Western European. It is an 8-bit superset of ASCII which is the default for a mimetype starting with 'text' and provides the characters necessary for most western European languages." } +{ $see-also "encodings-introduction" } ; HELP: latin2 -{ $description "This is the ISO-8859-2 encoding, also called Latin-2: Eastern European. It is an 8-bit superset of ASCII and provides the characters necessary for most eastern European languages." } ; +{ $description "This is the ISO-8859-2 encoding, also called Latin-2: Eastern European. It is an 8-bit superset of ASCII and provides the characters necessary for most eastern European languages." } +{ $see-also "encodings-introduction" } ; HELP: latin3 -{ $description "This is the ISO-8859-3 encoding, also called Latin-3: South European. It is an 8-bit superset of ASCII and provides the characters necessary for Turkish, Maltese and Esperanto." } ; +{ $description "This is the ISO-8859-3 encoding, also called Latin-3: South European. It is an 8-bit superset of ASCII and provides the characters necessary for Turkish, Maltese and Esperanto." } +{ $see-also "encodings-introduction" } ; HELP: latin4 -{ $description "This is the ISO-8859-4 encoding, also called Latin-4: North European. It is an 8-bit superset of ASCII and provides the characters necessary for Latvian, Lithuanian, Estonian, Greenlandic and Sami." } ; +{ $description "This is the ISO-8859-4 encoding, also called Latin-4: North European. It is an 8-bit superset of ASCII and provides the characters necessary for Latvian, Lithuanian, Estonian, Greenlandic and Sami." } +{ $see-also "encodings-introduction" } ; HELP: latin/cyrillic -{ $description "This is the ISO-8859-5 encoding, also called Latin/Cyrillic. It is an 8-bit superset of ASCII and provides the characters necessary for most languages which use Cyrilic, including Russian, Macedonian, Belarusian, Bulgarian, Serbian, and Ukrainian. KOI8-R is used much more commonly." } ; +{ $description "This is the ISO-8859-5 encoding, also called Latin/Cyrillic. It is an 8-bit superset of ASCII and provides the characters necessary for most languages which use Cyrilic, including Russian, Macedonian, Belarusian, Bulgarian, Serbian, and Ukrainian. KOI8-R is used much more commonly." } +{ $see-also "encodings-introduction" } ; HELP: latin/arabic -{ $description "This is the ISO-8859-6 encoding, also called Latin/Arabic. It is an 8-bit superset of ASCII and provides the characters necessary for Arabic, though not other languages which use Arabic script." } ; +{ $description "This is the ISO-8859-6 encoding, also called Latin/Arabic. It is an 8-bit superset of ASCII and provides the characters necessary for Arabic, though not other languages which use Arabic script." } +{ $see-also "encodings-introduction" } ; HELP: latin/greek -{ $description "This is the ISO-8859-7 encoding, also called Latin/Greek. It is an 8-bit superset of ASCII and provides the characters necessary for Greek written in modern monotonic orthography, or ancient Greek without accent marks." } ; +{ $description "This is the ISO-8859-7 encoding, also called Latin/Greek. It is an 8-bit superset of ASCII and provides the characters necessary for Greek written in modern monotonic orthography, or ancient Greek without accent marks." } +{ $see-also "encodings-introduction" } ; HELP: latin/hebrew -{ $description "This is the ISO-8859-8 encoding, also called Latin/Hebrew. It is an 8-bit superset of ASCII and provides the characters necessary for modern Hebrew without explicit vowels. Generally, this is interpreted in logical order, making it ISO-8859-8-I, technically." } ; +{ $description "This is the ISO-8859-8 encoding, also called Latin/Hebrew. It is an 8-bit superset of ASCII and provides the characters necessary for modern Hebrew without explicit vowels. Generally, this is interpreted in logical order, making it ISO-8859-8-I, technically." } +{ $see-also "encodings-introduction" } ; HELP: latin5 -{ $description "This is the ISO-8859-9 encoding, also called Latin-5: Turkish. It is an 8-bit superset of ASCII and provides the characters necessary for Turkish, similar to Latin-1 but replacing the spots used for Icelandic with characters used in Turkish." } ; +{ $description "This is the ISO-8859-9 encoding, also called Latin-5: Turkish. It is an 8-bit superset of ASCII and provides the characters necessary for Turkish, similar to Latin-1 but replacing the spots used for Icelandic with characters used in Turkish." } +{ $see-also "encodings-introduction" } ; HELP: latin6 -{ $description "This is the ISO-8859-10 encoding, also called Latin-6: Nordic. It is an 8-bit superset of ASCII containing the same characters as Latin-4, but rearranged to be of better use to nordic languages." } ; +{ $description "This is the ISO-8859-10 encoding, also called Latin-6: Nordic. It is an 8-bit superset of ASCII containing the same characters as Latin-4, but rearranged to be of better use to nordic languages." } +{ $see-also "encodings-introduction" } ; HELP: latin/thai -{ $description "This is the ISO-8859-11 encoding, also called Latin/Thai. It is an 8-bit superset of ASCII containing the characters necessary to represent Thai. It is basically identical to TIS-620." } ; +{ $description "This is the ISO-8859-11 encoding, also called Latin/Thai. It is an 8-bit superset of ASCII containing the characters necessary to represent Thai. It is basically identical to TIS-620." } +{ $see-also "encodings-introduction" } ; HELP: latin7 -{ $description "This is the ISO-8859-13 encoding, also called Latin-7: Baltic Rim. It is an 8-bit superset of ASCII containing all characters necesary to represent Baltic Rim languages, as previous character sets were incomplete." } ; +{ $description "This is the ISO-8859-13 encoding, also called Latin-7: Baltic Rim. It is an 8-bit superset of ASCII containing all characters necesary to represent Baltic Rim languages, as previous character sets were incomplete." } +{ $see-also "encodings-introduction" } ; HELP: latin8 -{ $description "This is the ISO-8859-14 encoding, also called Latin-8: Celtic. It is an 8-bit superset of ASCII designed for Celtic languages like Gaelic and Breton." } ; +{ $description "This is the ISO-8859-14 encoding, also called Latin-8: Celtic. It is an 8-bit superset of ASCII designed for Celtic languages like Gaelic and Breton." } +{ $see-also "encodings-introduction" } ; HELP: latin9 -{ $description "This is the ISO-8859-15 encoding, also called Latin-9 and unoffically as Latin-0. It is an 8-bit superset of ASCII designed as a modification of Latin-1, removing little-used characters in favor of the Euro symbol and other characters." } ; +{ $description "This is the ISO-8859-15 encoding, also called Latin-9 and unoffically as Latin-0. It is an 8-bit superset of ASCII designed as a modification of Latin-1, removing little-used characters in favor of the Euro symbol and other characters." } +{ $see-also "encodings-introduction" } ; HELP: latin10 -{ $description "This is the ISO-8859-16 encoding, also called Latin-10: South-Eastern European. It is an 8-bit superset of ASCII." } ; +{ $description "This is the ISO-8859-16 encoding, also called Latin-10: South-Eastern European. It is an 8-bit superset of ASCII." } +{ $see-also "encodings-introduction" } ; HELP: windows-1252 -{ $description "Windows 1252 is an 8-bit superset of ASCII which is closely related to Latin-1. Control characters in the 0x80 to 0x9F range are replaced with printable characters such as the Euro symbol." } ; +{ $description "Windows 1252 is an 8-bit superset of ASCII which is closely related to Latin-1. Control characters in the 0x80 to 0x9F range are replaced with printable characters such as the Euro symbol." } +{ $see-also "encodings-introduction" } ; HELP: ebcdic -{ $description "EBCDIC is an 8-bit legacy encoding designed for IBM mainframes like System/360 in the 1960s. It has since fallen into disuse. It contains large unallocated regions, and the version included here (code page 37) contains auxiliary characters in this region for English- and Portugese-speaking countries." } ; +{ $description "EBCDIC is an 8-bit legacy encoding designed for IBM mainframes like System/360 in the 1960s. It has since fallen into disuse. It contains large unallocated regions, and the version included here (code page 37) contains auxiliary characters in this region for English- and Portugese-speaking countries." } +{ $see-also "encodings-introduction" } ; HELP: mac-roman -{ $description "Mac Roman is an 8-bit superset of ASCII which was the standard encoding on Mac OS prior to version 10. It is incompatible with Latin-1 in all but a few places and ASCII, and it is suitable for encoding many Western European languages." } ; +{ $description "Mac Roman is an 8-bit superset of ASCII which was the standard encoding on Mac OS prior to version 10. It is incompatible with Latin-1 in all but a few places and ASCII, and it is suitable for encoding many Western European languages." } +{ $see-also "encodings-introduction" } ; HELP: koi8-r -{ $description "KOI8-R is an 8-bit superset of ASCII which encodes the Cyrillic alphabet, as used in Russian and Bulgarian. Characters are in such an order that, if the eight bit is stripped, text is still interpretable as ASCII. Block-building characters also exist." } ; +{ $description "KOI8-R is an 8-bit superset of ASCII which encodes the Cyrillic alphabet, as used in Russian and Bulgarian. Characters are in such an order that, if the eight bit is stripped, text is still interpretable as ASCII. Block-building characters also exist." } +{ $see-also "encodings-introduction" } ; diff --git a/extra/io/encodings/ascii/ascii-docs.factor b/extra/io/encodings/ascii/ascii-docs.factor new file mode 100644 index 0000000000..0b54a341d9 --- /dev/null +++ b/extra/io/encodings/ascii/ascii-docs.factor @@ -0,0 +1,8 @@ +USING: help.markup help.syntax ; +IN: io.encodings.ascii + +HELP: ascii +{ $class-description "This is the encoding descriptor which denotes an ASCII encoding. By default, if there's a non-ASCII character in an input stream, it will be replaced with a replacement character (U+FFFD), and if a non-ASCII character is used in output, an exception is thrown." } +{ $see-also "encodings-introduction" } ; + +ABOUT: ascii diff --git a/extra/io/encodings/utf16/utf16-docs.factor b/extra/io/encodings/utf16/utf16-docs.factor index 018a15a534..7198cb2b27 100644 --- a/extra/io/encodings/utf16/utf16-docs.factor +++ b/extra/io/encodings/utf16/utf16-docs.factor @@ -1,22 +1,25 @@ USING: help.markup help.syntax io.encodings strings ; IN: io.encodings.utf16 -ARTICLE: "utf16" "Working with UTF-16-encoded data" +ARTICLE: "io.encodings.utf16" "UTF-16" "The UTF-16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences. There are three encoding descriptor classes for working with UTF-16, depending on endianness or the presence of a BOM:" +{ $subsection utf16 } { $subsection utf16le } { $subsection utf16be } -{ $subsection utf16 } -"All of these conform to the " { $link "encodings-protocol" } "." ; +{ $subsection utf16n } ; -ABOUT: "utf16" +ABOUT: "io.encodings.utf16" HELP: utf16le -{ $class-description "The encoding protocol for UTF-16LE, that is, UTF-16 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ; +{ $class-description "The encoding descriptor for UTF-16LE, that is, UTF-16 in little endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ; HELP: utf16be -{ $class-description "The encoding protocol for UTF-16BE, that is, UTF-16 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ; +{ $class-description "The encoding descriptor for UTF-16BE, that is, UTF-16 in big endian, without a byte order mark. Streams can be made which read or write wth this encoding." } ; HELP: utf16 -{ $class-description "The encoding protocol for UTF-16, that is, UTF-16 with a byte order mark. This is the most useful for general input and output in UTF-16. Streams can be made which read or write wth this encoding." } ; +{ $class-description "The encoding descriptor for UTF-16, that is, UTF-16 with a byte order mark. This is the most useful for general input and output in UTF-16. Streams can be made which read or write wth this encoding." } ; -{ utf16 utf16le utf16be } related-words +HELP: utf16n +{ $class-description "The encoding descriptor for UTF-16 without a byte order mark in native endian order. This is useful mostly for FFI calls which take input of strings in of wide_t*." } ; + +{ utf16 utf16le utf16be utf16n } related-words diff --git a/extra/io/encodings/utf16/utf16-tests.factor b/extra/io/encodings/utf16/utf16-tests.factor index 89b61a3e37..6985983917 100755 --- a/extra/io/encodings/utf16/utf16-tests.factor +++ b/extra/io/encodings/utf16/utf16-tests.factor @@ -1,5 +1,7 @@ USING: kernel tools.test io.encodings.utf16 arrays sbufs -sequences io.encodings io unicode io.encodings.string ; +io.streams.byte-array sequences io.encodings io unicode +io.encodings.string alien.c-types accessors classes ; +IN: io.encodings.utf16.tests [ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode >array ] unit-test [ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode >array ] unit-test @@ -20,3 +22,9 @@ sequences io.encodings io unicode io.encodings.string ; [ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode >array ] unit-test [ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode >array ] unit-test + +: correct-endian + code>> class little-endian? [ utf16le = ] [ utf16be = ] if ; + +[ t ] [ B{ } utf16n correct-endian ] unit-test +[ t ] [ utf16n correct-endian ] unit-test diff --git a/extra/io/encodings/utf16/utf16.factor b/extra/io/encodings/utf16/utf16.factor index 290761ec91..e8ca04af35 100755 --- a/extra/io/encodings/utf16/utf16.factor +++ b/extra/io/encodings/utf16/utf16.factor @@ -1,7 +1,8 @@ ! Copyright (C) 2006, 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: math kernel sequences sbufs vectors namespaces io.binary -io.encodings combinators splitting io byte-arrays inspector ; +io.encodings combinators splitting io byte-arrays inspector +alien.c-types ; IN: io.encodings.utf16 TUPLE: utf16be ; @@ -10,6 +11,8 @@ TUPLE: utf16le ; TUPLE: utf16 ; +TUPLE: utf16n ; + ( stream utf16 -- decoder ) M: utf16 ( stream utf16 -- encoder ) drop bom-le over stream-write utf16le ; +! Native-order UTF-16 + +: native-utf16 ( -- descriptor ) + little-endian? utf16le utf16be ? ; + +M: utf16n drop native-utf16 ; + +M: utf16n drop native-utf16 ; + PRIVATE> From c9b22c92a67fd50177378e5566f0e55a3cba9715 Mon Sep 17 00:00:00 2001 From: erg Date: Tue, 25 Mar 2008 21:09:39 -0500 Subject: [PATCH 07/15] redo target --- build-support/target | 70 ++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/build-support/target b/build-support/target index 1903a6da64..1fbfb31d11 100755 --- a/build-support/target +++ b/build-support/target @@ -1,38 +1,38 @@ #!/bin/sh -if [ \( `uname -s ` = FreeBSD \) -a \( `uname -p` = i386 \) ] -then - echo freebsd-x86-32 -elif [ \( `uname -s` = FreeBSD \) -a \( `uname -m` = amd64 \) ] -then - echo freebsd-x86-64 -elif [ \( `uname -s` = OpenBSD \) -a \( `uname -m` = i386 \) ] -then - echo openbsd-x86-32 -elif [ \( `uname -s` = OpenBSD \) -a \( `uname -m` = amd64 \) ] -then - echo openbsd-x86-64 -elif [ \( `uname -s` = NetBSD \) -a \( `uname -p` = i386 \) ] -then - echo netbsd-x86-32 -elif [ \( `uname -s` = NetBSD \) -a \( `uname -p` = x86_64 \) ] -then - echo netbsd-x86-64 -elif [ \( `uname -s` = Darwin \) -a \( `uname -p` = powerpc \) ] -then - echo macosx-ppc -elif [ `uname -s` = Darwin ] -then - echo macosx-x86-`./build-support/wordsize` -elif [ \( `uname -s` = Linux \) -a \( `uname -m` = i686 \) ] -then - echo linux-x86-32 -elif [ \( `uname -s` = Linux \) -a \( `uname -m` = x86_64 \) ] -then - echo linux-x86-64 -elif [ \( `uname -o` = Cygwin \) -a \( `uname -m` = i686 \) ] -then - echo winnt-x86-`./build-support/wordsize` -else - echo help +uname_s=`uname -s` +case $uname_s in + CYGWIN_NT-5.2-WOW64) OS=winnt;; + *CYGWIN_NT*) OS=winnt;; + *CYGWIN*) OS=winnt;; + *darwin*) OS=macosx;; + *Darwin*) OS=macosx;; + *linux*) OS=linux;; + *Linux*) OS=linux;; + *NetBSD*) OS=netbsd;; + *FreeBSD*) OS=freebsd;; + *OpenBSD*) OS=openbsd;; + *DragonFly*) OS=dragonflybsd;; +esac + +uname_m=`uname -m` +case $uname_m in + i386) ARCH=x86;; + i686) ARCH=x86;; + amd64) ARCH=x86;; + *86) ARCH=x86;; + *86_64) ARCH=x86;; + "Power Macintosh") ARCH=ppc;; +esac + +WORD=`./build-support/wordsize` + +MAKE_TARGET=$OS-$ARCH-$WORD +if [[ $OS == macosx && $ARCH == ppc ]] ; then + MAKE_TARGET=$OS-$ARCH fi +if [[ $OS == linux && $ARCH == ppc ]] ; then + MAKE_TARGET=$OS-$ARCH +fi + +echo $MAKE_TARGET From 9b7246555a8107262dc7e674d845ff3ac0d48300 Mon Sep 17 00:00:00 2001 From: Chris Double Date: Wed, 26 Mar 2008 15:26:54 +1300 Subject: [PATCH 08/15] Fix just parser in pegs --- extra/peg/parsers/parsers-tests.factor | 4 ++++ extra/peg/parsers/parsers.factor | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/extra/peg/parsers/parsers-tests.factor b/extra/peg/parsers/parsers-tests.factor index 08bde98419..e80baf3c4f 100644 --- a/extra/peg/parsers/parsers-tests.factor +++ b/extra/peg/parsers/parsers-tests.factor @@ -48,3 +48,7 @@ IN: peg.parsers.tests [ V{ } ] [ "" epsilon parse parse-result-ast ] unit-test + +{ "a" } [ + "a" "a" token just parse parse-result-ast +] unit-test \ No newline at end of file diff --git a/extra/peg/parsers/parsers.factor b/extra/peg/parsers/parsers.factor index 4bba60bb09..13509e81f7 100755 --- a/extra/peg/parsers/parsers.factor +++ b/extra/peg/parsers/parsers.factor @@ -3,14 +3,14 @@ USING: kernel sequences strings namespaces math assocs shuffle vectors arrays combinators.lib math.parser match unicode.categories sequences.deep peg peg.private - peg.search math.ranges ; + peg.search math.ranges words ; IN: peg.parsers TUPLE: just-parser p1 ; : just-pattern [ - dup [ + execute dup [ dup parse-result-remaining empty? [ drop f ] unless ] when ] ; From b1561de0f6636af53f2e53918b9f4e60265ad076 Mon Sep 17 00:00:00 2001 From: Chris Double Date: Wed, 26 Mar 2008 15:40:17 +1300 Subject: [PATCH 09/15] Reduce amount of generated code for peg token parser --- extra/peg/peg.factor | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/extra/peg/peg.factor b/extra/peg/peg.factor index 1707193e70..ae5ed2f8b2 100755 --- a/extra/peg/peg.factor +++ b/extra/peg/peg.factor @@ -43,17 +43,16 @@ TUPLE: token-parser symbol ; MATCH-VARS: ?token ; -: token-pattern ( -- quot ) - [ - ?token 2dup head? [ - dup >r length tail-slice r> - ] [ - 2drop f - ] if - ] ; - +: parse-token ( input string -- result ) + #! Parse the string, returning a parse result + 2dup head? [ + dup >r length tail-slice r> + ] [ + 2drop f + ] if ; + M: token-parser (compile) ( parser -- quot ) - token-parser-symbol \ ?token token-pattern match-replace ; + token-parser-symbol [ parse-token ] curry ; TUPLE: satisfy-parser quot ; From 4d8d25ecb336ff486755334d90f1f09b2f352463 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Tue, 25 Mar 2008 21:58:27 -0500 Subject: [PATCH 10/15] Update .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7e1e52d866..f2cf3de119 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,4 @@ factor temp logs work -buildsupport/wordsize +build-support/wordsize From 8569d18068dbaebeb28a4984af87dcbb3dda89ff Mon Sep 17 00:00:00 2001 From: Chris Double Date: Wed, 26 Mar 2008 16:08:14 +1300 Subject: [PATCH 11/15] Use new slots in peg --- extra/peg/peg.factor | 46 ++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/extra/peg/peg.factor b/extra/peg/peg.factor index ae5ed2f8b2..79c866c768 100755 --- a/extra/peg/peg.factor +++ b/extra/peg/peg.factor @@ -3,7 +3,7 @@ USING: kernel sequences strings namespaces math assocs shuffle vectors arrays combinators.lib math.parser match unicode.categories sequences.lib compiler.units parser - words quotations effects memoize ; + words quotations effects memoize accessors ; IN: peg TUPLE: parse-result remaining ast ; @@ -52,7 +52,7 @@ MATCH-VARS: ?token ; ] if ; M: token-parser (compile) ( parser -- quot ) - token-parser-symbol [ parse-token ] curry ; + symbol>> [ parse-token ] curry ; TUPLE: satisfy-parser quot ; @@ -72,7 +72,7 @@ MATCH-VARS: ?quot ; ] ; M: satisfy-parser (compile) ( parser -- quot ) - satisfy-parser-quot \ ?quot satisfy-pattern match-replace ; + quot>> \ ?quot satisfy-pattern match-replace ; TUPLE: range-parser min max ; @@ -100,12 +100,12 @@ TUPLE: seq-parser parsers ; : seq-pattern ( -- quot ) [ dup [ - dup parse-result-remaining ?quot [ - [ parse-result-remaining swap set-parse-result-remaining ] 2keep - parse-result-ast dup ignore = [ + dup remaining>> ?quot [ + [ remaining>> swap (>>remaining) ] 2keep + ast>> dup ignore = [ drop ] [ - swap [ parse-result-ast push ] keep + swap [ ast>> push ] keep ] if ] [ drop f @@ -118,7 +118,7 @@ TUPLE: seq-parser parsers ; M: seq-parser (compile) ( parser -- quot ) [ [ V{ } clone ] % - seq-parser-parsers [ compiled-parser \ ?quot seq-pattern match-replace % ] each + parsers>> [ compiled-parser \ ?quot seq-pattern match-replace % ] each ] [ ] make ; TUPLE: choice-parser parsers ; @@ -135,16 +135,16 @@ TUPLE: choice-parser parsers ; M: choice-parser (compile) ( parser -- quot ) [ f , - choice-parser-parsers [ compiled-parser \ ?quot choice-pattern match-replace % ] each + parsers>> [ compiled-parser \ ?quot choice-pattern match-replace % ] each \ nip , ] [ ] make ; TUPLE: repeat0-parser p1 ; : (repeat0) ( quot result -- result ) - 2dup parse-result-remaining swap call [ - [ parse-result-remaining swap set-parse-result-remaining ] 2keep - parse-result-ast swap [ parse-result-ast push ] keep + 2dup remaining>> swap call [ + [ remaining>> swap (>>remaining) ] 2keep + ast>> swap [ ast>> push ] keep (repeat0) ] [ nip @@ -158,7 +158,7 @@ TUPLE: repeat0-parser p1 ; M: repeat0-parser (compile) ( parser -- quot ) [ [ V{ } clone ] % - repeat0-parser-p1 compiled-parser \ ?quot repeat0-pattern match-replace % + p1>> compiled-parser \ ?quot repeat0-pattern match-replace % ] [ ] make ; TUPLE: repeat1-parser p1 ; @@ -166,7 +166,7 @@ TUPLE: repeat1-parser p1 ; : repeat1-pattern ( -- quot ) [ [ ?quot ] swap (repeat0) [ - dup parse-result-ast empty? [ + dup ast>> empty? [ drop f ] when ] [ @@ -177,7 +177,7 @@ TUPLE: repeat1-parser p1 ; M: repeat1-parser (compile) ( parser -- quot ) [ [ V{ } clone ] % - repeat1-parser-p1 compiled-parser \ ?quot repeat1-pattern match-replace % + p1>> compiled-parser \ ?quot repeat1-pattern match-replace % ] [ ] make ; TUPLE: optional-parser p1 ; @@ -188,7 +188,7 @@ TUPLE: optional-parser p1 ; ] ; M: optional-parser (compile) ( parser -- quot ) - optional-parser-p1 compiled-parser \ ?quot optional-pattern match-replace ; + p1>> compiled-parser \ ?quot optional-pattern match-replace ; TUPLE: ensure-parser p1 ; @@ -202,7 +202,7 @@ TUPLE: ensure-parser p1 ; ] ; M: ensure-parser (compile) ( parser -- quot ) - ensure-parser-p1 compiled-parser \ ?quot ensure-pattern match-replace ; + p1>> compiled-parser \ ?quot ensure-pattern match-replace ; TUPLE: ensure-not-parser p1 ; @@ -216,7 +216,7 @@ TUPLE: ensure-not-parser p1 ; ] ; M: ensure-not-parser (compile) ( parser -- quot ) - ensure-not-parser-p1 compiled-parser \ ?quot ensure-not-pattern match-replace ; + p1>> compiled-parser \ ?quot ensure-not-pattern match-replace ; TUPLE: action-parser p1 quot ; @@ -225,13 +225,13 @@ MATCH-VARS: ?action ; : action-pattern ( -- quot ) [ ?quot dup [ - dup parse-result-ast ?action call - swap [ set-parse-result-ast ] keep + dup ast>> ?action call + >>ast ] when ] ; M: action-parser (compile) ( parser -- quot ) - { action-parser-p1 action-parser-quot } get-slots [ compiled-parser ] dip + { p1>> quot>> } get-slots [ compiled-parser ] dip 2array { ?quot ?action } action-pattern match-replace ; : left-trim-slice ( string -- string ) @@ -245,7 +245,7 @@ TUPLE: sp-parser p1 ; M: sp-parser (compile) ( parser -- quot ) [ - \ left-trim-slice , sp-parser-p1 compiled-parser , + \ left-trim-slice , p1>> compiled-parser , ] [ ] make ; TUPLE: delay-parser quot ; @@ -255,7 +255,7 @@ M: delay-parser (compile) ( parser -- quot ) #! This way it is run only once and the #! parser constructed once at run time. [ - delay-parser-quot % \ compile , + quot>> % \ compile , ] [ ] make { } { "word" } memoize-quot [ % \ execute , ] [ ] make ; From 1ec945ba4c3b6380f3fe7a3a6d8decc5ffa315fb Mon Sep 17 00:00:00 2001 From: Chris Double Date: Wed, 26 Mar 2008 16:16:23 +1300 Subject: [PATCH 12/15] Use new slots in peg.ebnf --- extra/peg/ebnf/ebnf.factor | 40 +++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/extra/peg/ebnf/ebnf.factor b/extra/peg/ebnf/ebnf.factor index db478e571f..11e1e2ea64 100644 --- a/extra/peg/ebnf/ebnf.factor +++ b/extra/peg/ebnf/ebnf.factor @@ -3,7 +3,7 @@ USING: kernel compiler.units parser words arrays strings math.parser sequences quotations vectors namespaces math assocs continuations peg peg.parsers unicode.categories multiline combinators.lib - splitting ; + splitting accessors ; IN: peg.ebnf TUPLE: ebnf-non-terminal symbol ; @@ -16,7 +16,7 @@ TUPLE: ebnf-choice options ; TUPLE: ebnf-sequence elements ; TUPLE: ebnf-repeat0 group ; TUPLE: ebnf-repeat1 group ; -TUPLE: ebnf-optional elements ; +TUPLE: ebnf-optional group ; TUPLE: ebnf-rule symbol elements ; TUPLE: ebnf-action parser code ; TUPLE: ebnf rules ; @@ -198,7 +198,7 @@ DEFER: 'choice' : 'rule' ( -- parser ) [ - 'non-terminal' [ ebnf-non-terminal-symbol ] action , + 'non-terminal' [ symbol>> ] action , "=" syntax , 'choice' , ] seq* [ first2 ] action ; @@ -215,49 +215,53 @@ SYMBOL: main H{ } clone dup dup [ parser set swap (transform) main set ] bind ; M: ebnf (transform) ( ast -- parser ) - ebnf-rules [ (transform) ] map peek ; + rules>> [ (transform) ] map peek ; M: ebnf-rule (transform) ( ast -- parser ) - dup ebnf-rule-elements (transform) [ - swap ebnf-rule-symbol set + dup elements>> (transform) [ + swap symbol>> set ] keep ; M: ebnf-sequence (transform) ( ast -- parser ) - ebnf-sequence-elements [ (transform) ] map seq ; + elements>> [ (transform) ] map seq ; M: ebnf-choice (transform) ( ast -- parser ) - ebnf-choice-options [ (transform) ] map choice ; + options>> [ (transform) ] map choice ; M: ebnf-any-character (transform) ( ast -- parser ) drop any-char ; M: ebnf-range (transform) ( ast -- parser ) - ebnf-range-pattern range-pattern ; + pattern>> range-pattern ; + +: transform-group ( ast -- parser ) + #! convert a ast node with groups to a parser for that group + group>> (transform) ; M: ebnf-ensure (transform) ( ast -- parser ) - ebnf-ensure-group (transform) ensure ; + transform-group ensure ; M: ebnf-ensure-not (transform) ( ast -- parser ) - ebnf-ensure-not-group (transform) ensure-not ; + transform-group ensure-not ; M: ebnf-repeat0 (transform) ( ast -- parser ) - ebnf-repeat0-group (transform) repeat0 ; + transform-group repeat0 ; M: ebnf-repeat1 (transform) ( ast -- parser ) - ebnf-repeat1-group (transform) repeat1 ; + transform-group repeat1 ; M: ebnf-optional (transform) ( ast -- parser ) - ebnf-optional-elements (transform) optional ; + transform-group optional ; M: ebnf-action (transform) ( ast -- parser ) - [ ebnf-action-parser (transform) ] keep - ebnf-action-code string-lines [ parse-lines ] with-compilation-unit action ; + [ parser>> (transform) ] keep + code>> string-lines [ parse-lines ] with-compilation-unit action ; M: ebnf-terminal (transform) ( ast -- parser ) - ebnf-terminal-symbol token sp ; + symbol>> token sp ; M: ebnf-non-terminal (transform) ( ast -- parser ) - ebnf-non-terminal-symbol [ + symbol>> [ , parser get , \ at , ] [ ] make delay sp ; From de3e4e049fdaf177d85553508b888abd9b3a09cf Mon Sep 17 00:00:00 2001 From: Chris Double Date: Wed, 26 Mar 2008 16:21:33 +1300 Subject: [PATCH 13/15] Use cleave instead of get-slots in peg --- extra/peg/peg.factor | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extra/peg/peg.factor b/extra/peg/peg.factor index 79c866c768..00271a9ad3 100755 --- a/extra/peg/peg.factor +++ b/extra/peg/peg.factor @@ -3,7 +3,7 @@ USING: kernel sequences strings namespaces math assocs shuffle vectors arrays combinators.lib math.parser match unicode.categories sequences.lib compiler.units parser - words quotations effects memoize accessors ; + words quotations effects memoize accessors combinators.cleave ; IN: peg TUPLE: parse-result remaining ast ; @@ -231,7 +231,7 @@ MATCH-VARS: ?action ; ] ; M: action-parser (compile) ( parser -- quot ) - { p1>> quot>> } get-slots [ compiled-parser ] dip + { [ p1>> ] [ quot>> ] } cleave [ compiled-parser ] dip 2array { ?quot ?action } action-pattern match-replace ; : left-trim-slice ( string -- string ) From 2f73edb3a21a72ddc015c998a9bf29538f971547 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Tue, 25 Mar 2008 22:26:33 -0500 Subject: [PATCH 14/15] Fix stat on linux/x86.64 --- extra/unix/stat/linux/64/64.factor | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extra/unix/stat/linux/64/64.factor b/extra/unix/stat/linux/64/64.factor index be6ad1e3fc..a374551385 100644 --- a/extra/unix/stat/linux/64/64.factor +++ b/extra/unix/stat/linux/64/64.factor @@ -27,5 +27,5 @@ C-STRUCT: stat FUNCTION: int __xstat ( int ver, char* pathname, stat* buf ) ; FUNCTION: int __lxstat ( int ver, char* pathname, stat* buf ) ; -: stat ( pathname buf -- int ) 3 -rot __xstat ; -: lstat ( pathname buf -- int ) 3 -rot __lxstat ; \ No newline at end of file +: stat ( pathname buf -- int ) 1 -rot __xstat ; +: lstat ( pathname buf -- int ) 1 -rot __lxstat ; From 257a03ace5ca43014f3bae8322bbdf0c9b1aab26 Mon Sep 17 00:00:00 2001 From: Slava Pestov Date: Wed, 26 Mar 2008 01:30:44 -0500 Subject: [PATCH 15/15] Fix multi-methods load error --- extra/multi-methods/multi-methods.factor | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extra/multi-methods/multi-methods.factor b/extra/multi-methods/multi-methods.factor index 9a74cc65e8..42ade34186 100755 --- a/extra/multi-methods/multi-methods.factor +++ b/extra/multi-methods/multi-methods.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Slava Pestov. ! See http://factorcode.org/license.txt for BSD license. -USING: kernel math sequences vectors classes combinators -arrays words assocs parser namespaces definitions +USING: kernel math sequences vectors classes classes.algebra +combinators arrays words assocs parser namespaces definitions prettyprint prettyprint.backend quotations arrays.lib debugger io compiler.units kernel.private effects ; IN: multi-methods