diff --git a/core/io/encodings/binary/about.txt b/core/io/encodings/binary/about.txt new file mode 100644 index 0000000000..a1eb4bc664 --- /dev/null +++ b/core/io/encodings/binary/about.txt @@ -0,0 +1 @@ +Dummy encoding for binary I/O diff --git a/core/io/utf16/authors.txt b/core/io/encodings/binary/authors.txt similarity index 100% rename from core/io/utf16/authors.txt rename to core/io/encodings/binary/authors.txt diff --git a/core/io/encodings/binary/binary-docs.factor b/core/io/encodings/binary/binary-docs.factor new file mode 100644 index 0000000000..f8be5054df --- /dev/null +++ b/core/io/encodings/binary/binary-docs.factor @@ -0,0 +1,5 @@ +USING: help.syntax help.markup ; +IN: io.encodings.binary + +HELP: binary +{ $class-description "This is the encoding descriptor for binary I/O." } ; diff --git a/core/io/encodings/binary/binary.factor b/core/io/encodings/binary/binary.factor new file mode 100644 index 0000000000..b58f9836c0 --- /dev/null +++ b/core/io/encodings/binary/binary.factor @@ -0,0 +1,6 @@ +USING: kernel io.encodings ; + +TUPLE: binary ; + +M: binary init-decoding drop ; +M: binary init-encoding drop ; diff --git a/core/io/encodings/binary/tags.txt b/core/io/encodings/binary/tags.txt new file mode 100644 index 0000000000..8e27be7d61 --- /dev/null +++ b/core/io/encodings/binary/tags.txt @@ -0,0 +1 @@ +text diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index 767e9b266b..b27b89642d 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: math kernel sequences sbufs vectors -namespaces unicode.syntax ; +namespaces unicode.syntax growable strings io ; IN: io.encodings TUPLE: encode-error ; @@ -23,6 +23,42 @@ SYMBOL: begin : finish-decoding ( buf ch state -- str ) begin eq? [ decode-error ] unless drop "" like ; -: decode ( seq quot -- str ) - >r [ length <sbuf> 0 begin ] keep r> each +: start-decoding ( seq length -- buf ch state seq ) + <sbuf> 0 begin roll ; + +: decode ( seq quot -- string ) + >r dup length start-decoding r> + [ -rot ] swap compose each finish-decoding ; inline + +: space ( resizable -- room-left ) + dup underlying swap [ length ] 2apply - ; + +: full? ( resizable -- ? ) space zero? ; + +: end-read-loop ( buf ch state stream quot -- string/f ) + 2drop 2drop >string f like ; + +: under ( a b c -- c a b c ) + tuck >r swapd r> ; inline + +: decode-read-loop ( buf ch state stream quot -- string/f ) + >r >r pick r> r> rot full? [ end-read-loop ] [ + over stream-read1 [ + -rot tuck >r >r >r -rot r> call r> r> decode-read-loop + ] [ end-read-loop ] if* + ] if ; inline + +: decode-read ( length stream quot -- string ) + >r swap start-decoding r> + decode-read-loop ; inline + +GENERIC: init-decoding ( stream encoding -- decoded-stream ) + +: <decoding> ( stream decoding-class -- decoded-stream ) + construct-empty init-decoding ; + +GENERIC: init-encoding ( stream encoding -- encoded-stream ) + +: <encoding> ( stream encoding-class -- encoded-stream ) + construct-empty init-encoding ; diff --git a/core/io/encodings/latin1/about.txt b/core/io/encodings/latin1/about.txt new file mode 100644 index 0000000000..d40d628767 --- /dev/null +++ b/core/io/encodings/latin1/about.txt @@ -0,0 +1 @@ +ISO 8859-1 encoding/decoding diff --git a/core/io/utf8/authors.txt b/core/io/encodings/latin1/authors.txt similarity index 100% rename from core/io/utf8/authors.txt rename to core/io/encodings/latin1/authors.txt diff --git a/core/io/encodings/latin1/latin1-docs.factor b/core/io/encodings/latin1/latin1-docs.factor new file mode 100644 index 0000000000..5872b2bcfd --- /dev/null +++ b/core/io/encodings/latin1/latin1-docs.factor @@ -0,0 +1,5 @@ +USING: help.syntax help.markup ; +IN: io.encodings.latin1 + +HELP: latin1 +{ $class-description "This class is used for Latin 1 (ISO 8859-1) encoding and decoding" } ; diff --git a/core/io/encodings/latin1/latin1.factor b/core/io/encodings/latin1/latin1.factor new file mode 100644 index 0000000000..2c2aa8d60a --- /dev/null +++ b/core/io/encodings/latin1/latin1.factor @@ -0,0 +1,19 @@ +USING: io.encodings strings kernel ; +IN: io.encodings.latin1 + +TUPLE: latin1 stream ; + +M: latin1 init-decoding tuck set-latin1-stream ; +M: latin1 init-encoding drop ; + +M: latin1 stream-read1 + latin1-stream stream-read1 ; + +M: latin1 stream-read + latin1-stream stream-read >string ; + +M: latin1 stream-read-until + latin1-stream stream-read-until >string ; + +M: latin1 stream-readln + latin1-stream stream-readln >string ; diff --git a/core/io/encodings/latin1/tags.txt b/core/io/encodings/latin1/tags.txt new file mode 100644 index 0000000000..8e27be7d61 --- /dev/null +++ b/core/io/encodings/latin1/tags.txt @@ -0,0 +1 @@ +text diff --git a/core/io/encodings/utf16/.utf16.factor.swo b/core/io/encodings/utf16/.utf16.factor.swo new file mode 100644 index 0000000000..01be8fdab2 Binary files /dev/null and b/core/io/encodings/utf16/.utf16.factor.swo differ diff --git a/core/io/encodings/utf16/about.txt b/core/io/encodings/utf16/about.txt new file mode 100644 index 0000000000..ffb8ebf8f5 --- /dev/null +++ b/core/io/encodings/utf16/about.txt @@ -0,0 +1 @@ +UTF-16, UTF-16LE, UTF-16BE encoding and decoding diff --git a/core/io/encodings/utf16/authors.txt b/core/io/encodings/utf16/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/core/io/encodings/utf16/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/core/io/utf16/summary.txt b/core/io/encodings/utf16/summary.txt similarity index 100% rename from core/io/utf16/summary.txt rename to core/io/encodings/utf16/summary.txt diff --git a/core/io/encodings/utf16/tags.txt b/core/io/encodings/utf16/tags.txt new file mode 100644 index 0000000000..8e27be7d61 --- /dev/null +++ b/core/io/encodings/utf16/tags.txt @@ -0,0 +1 @@ +text diff --git a/core/io/utf16/utf16-docs.factor b/core/io/encodings/utf16/utf16-docs.factor similarity index 99% rename from core/io/utf16/utf16-docs.factor rename to core/io/encodings/utf16/utf16-docs.factor index 6d24f54694..c49c030ef3 100644 --- a/core/io/utf16/utf16-docs.factor +++ b/core/io/encodings/utf16/utf16-docs.factor @@ -1,5 +1,5 @@ USING: help.markup help.syntax io.encodings strings ; -IN: io.utf16 +IN: io.encodings.utf16 ARTICLE: "io.utf16" "Working with UTF16-encoded data" "The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences." diff --git a/core/io/utf16/utf16-tests.factor b/core/io/encodings/utf16/utf16-tests.factor similarity index 100% rename from core/io/utf16/utf16-tests.factor rename to core/io/encodings/utf16/utf16-tests.factor diff --git a/core/io/utf16/utf16.factor b/core/io/encodings/utf16/utf16.factor similarity index 73% rename from core/io/utf16/utf16.factor rename to core/io/encodings/utf16/utf16.factor index 19ebc1d43a..81c982dd55 100755 --- a/core/io/utf16/utf16.factor +++ b/core/io/encodings/utf16/utf16.factor @@ -30,7 +30,7 @@ SYMBOL: ignore >r 2 shift r> BIN: 11 bitand bitor quad3 ] [ 2drop do-ignore ] if ; -: (decode-utf16be) ( buf byte ch state -- buf ch state ) +: decode-utf16be-step ( buf byte ch state -- buf ch state ) { { begin [ drop begin-utf16be ] } { double [ end-multibyte ] } @@ -41,7 +41,7 @@ SYMBOL: ignore } case ; : decode-utf16be ( seq -- str ) - [ -rot (decode-utf16be) ] decode ; + [ decode-utf16be-step ] decode ; : handle-double ( buf byte ch -- buf ch state ) swap dup -3 shift BIN: 11011 = [ @@ -55,7 +55,7 @@ SYMBOL: ignore BIN: 11 bitand append-nums HEX: 10000 + decoded ] [ 2drop push-replacement ] if ; -: (decode-utf16le) ( buf byte ch state -- buf ch state ) +: decode-utf16le-step ( buf byte ch state -- buf ch state ) { { begin [ drop double ] } { double [ handle-double ] } @@ -65,7 +65,7 @@ SYMBOL: ignore } case ; : decode-utf16le ( seq -- str ) - [ -rot (decode-utf16le) ] decode ; + [ decode-utf16le-step ] decode ; : encode-first -10 shift @@ -114,3 +114,35 @@ SYMBOL: ignore { [ utf16be? ] [ decode-utf16be ] } { [ t ] [ decode-error ] } } cond ; + +! UTF16LE streams + +TUPLE: utf16le ; +: <utf16le> utf16le construct-delegate ; +! In the future, this should detect and ignore a BOM at the beginning + +M: utf16le init-decoding ( stream utf16le -- utf16le-stream ) + tuck set-delegate ; + +M: utf16le init-encoding ( stream utf16le -- utf16le-stream ) + tuck set-delegate ; + +M: utf16le stream-read1 1 swap stream-read ; + +M: utf16le stream-read + delegate [ decode-utf16le-step ] decode-read ; + +M: utf16le stream-read-partial stream-read ; + +M: utf16le stream-read-until + ! Copied from { c-reader stream-read-until }!!! + [ swap read-until-loop ] "" make + swap over empty? over not and [ 2drop f f ] when ; + +M: utf16le stream-write1 + >r 1string r> stream-write ; + +M: utf16le stream-write + >r encode-utf16le r> delegate stream-write ; + +M: utf16le dispose delegate dispose ; diff --git a/core/io/encodings/utf8/about.txt b/core/io/encodings/utf8/about.txt new file mode 100644 index 0000000000..7560b72db4 --- /dev/null +++ b/core/io/encodings/utf8/about.txt @@ -0,0 +1 @@ +UTF-8 encoding and decoding diff --git a/core/io/encodings/utf8/authors.txt b/core/io/encodings/utf8/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/core/io/encodings/utf8/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/core/io/utf8/summary.txt b/core/io/encodings/utf8/summary.txt similarity index 100% rename from core/io/utf8/summary.txt rename to core/io/encodings/utf8/summary.txt diff --git a/core/io/encodings/utf8/tags.txt b/core/io/encodings/utf8/tags.txt new file mode 100644 index 0000000000..8e27be7d61 --- /dev/null +++ b/core/io/encodings/utf8/tags.txt @@ -0,0 +1 @@ +text diff --git a/core/io/utf8/utf8-docs.factor b/core/io/encodings/utf8/utf8-docs.factor similarity index 97% rename from core/io/utf8/utf8-docs.factor rename to core/io/encodings/utf8/utf8-docs.factor index 28310b5d77..212552519c 100644 --- a/core/io/utf8/utf8-docs.factor +++ b/core/io/encodings/utf8/utf8-docs.factor @@ -1,5 +1,5 @@ USING: help.markup help.syntax io.encodings strings ; -IN: io.utf8 +IN: io.encodings.utf8 ARTICLE: "io.utf8" "Working with UTF8-encoded data" "The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences." diff --git a/core/io/encodings/utf8/utf8-tests.factor b/core/io/encodings/utf8/utf8-tests.factor new file mode 100644 index 0000000000..33c4ffbf12 --- /dev/null +++ b/core/io/encodings/utf8/utf8-tests.factor @@ -0,0 +1,23 @@ +USING: io.encodings.utf8 tools.test sbufs kernel io +sequences strings arrays unicode.syntax ; + +: decode-utf8-w/stream ( array -- newarray ) + >sbuf dup reverse-here <utf8> contents >array ; + +: encode-utf8-w/stream ( array -- newarray ) + SBUF" " clone tuck <utf8> write >array ; + +[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test + +[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test + +[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test + +[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test + +[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test + +[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test + +[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ] +[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test diff --git a/core/io/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor similarity index 63% rename from core/io/utf8/utf8.factor rename to core/io/encodings/utf8/utf8.factor index 213afb6eae..de3fd5b67b 100644 --- a/core/io/utf8/utf8.factor +++ b/core/io/encodings/utf8/utf8.factor @@ -1,8 +1,10 @@ ! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: math kernel sequences sbufs vectors -namespaces io.encodings combinators ; -IN: io.utf8 +USING: math kernel sequences sbufs vectors growable io continuations +namespaces io.encodings combinators strings io.streams.c ; +IN: io.encodings.utf8 + +! Decoding UTF-8 SYMBOL: double SYMBOL: triple @@ -31,7 +33,7 @@ SYMBOL: quad3 : end-multibyte ( buf byte ch -- buf ch state ) f append-nums [ decoded ] unless* ; -: (decode-utf8) ( buf byte ch state -- buf ch state ) +: decode-utf8-step ( buf byte ch state -- buf ch state ) { { begin [ drop begin-utf8 ] } { double [ end-multibyte ] } @@ -43,7 +45,9 @@ SYMBOL: quad3 } case ; : decode-utf8 ( seq -- str ) - [ -rot (decode-utf8) ] decode ; + [ decode-utf8-step ] decode ; + +! Encoding UTF-8 : encoded ( char -- ) BIN: 111111 bitand BIN: 10000000 bitor , ; @@ -70,3 +74,35 @@ SYMBOL: quad3 : encode-utf8 ( str -- seq ) [ [ char>utf8 ] each ] B{ } make ; + +! Interface for streams + +TUPLE: utf8 ; +: <utf8> utf8 construct-delegate ; +! In the future, this should detect and ignore a BOM at the beginning + +M: utf8 init-decoding ( stream utf8 -- utf8-stream ) + tuck set-delegate ; + +M: utf8 init-encoding ( stream utf8 -- utf8-stream ) + tuck set-delegate ; + +M: utf8 stream-read1 1 swap stream-read ; + +M: utf8 stream-read + delegate [ decode-utf8-step ] decode-read ; + +M: utf8 stream-read-partial stream-read ; + +M: utf8 stream-read-until + ! Copied from { c-reader stream-read-until }!!! + [ swap read-until-loop ] "" make + swap over empty? over not and [ 2drop f f ] when ; + +M: utf8 stream-write1 + >r 1string r> stream-write ; + +M: utf8 stream-write + >r encode-utf8 r> delegate stream-write ; + +M: utf8 dispose delegate dispose ; diff --git a/core/io/utf8/utf8-tests.factor b/core/io/utf8/utf8-tests.factor deleted file mode 100644 index 3576471586..0000000000 --- a/core/io/utf8/utf8-tests.factor +++ /dev/null @@ -1,16 +0,0 @@ -USING: io.utf8 tools.test strings arrays unicode.syntax ; - -[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test - -[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test - -[ "x" ] [ "x" decode-utf8 >string ] unit-test - -[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test - -[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test - -[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test - -[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ] -[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test diff --git a/extra/const/const.factor b/extra/const/const.factor deleted file mode 100644 index 8efef7e372..0000000000 --- a/extra/const/const.factor +++ /dev/null @@ -1,24 +0,0 @@ -USING: kernel parser words sequences ; -IN: const - -: define-const ( word value -- ) - [ parsed ] curry dupd define - t "parsing" set-word-prop ; - -: CONST: - CREATE scan-word dup parsing? - [ execute dup pop ] when define-const ; parsing - -: define-enum ( words -- ) - dup length [ define-const ] 2each ; - -: ENUM: - ";" parse-tokens [ create-in ] map define-enum ; parsing - -: define-value ( word -- ) - { f } clone [ first ] curry define ; - -: VALUE: CREATE define-value ; parsing - -: set-value ( value word -- ) - word-def first set-first ; diff --git a/extra/delegate/protocols/protocols.factor b/extra/delegate/protocols/protocols.factor index 37f3812d2d..f9b4c8648d 100755 --- a/extra/delegate/protocols/protocols.factor +++ b/extra/delegate/protocols/protocols.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2007 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. USING: delegate sequences.private sequences assocs prettyprint.sections -io definitions kernel ; +io definitions kernel continuations ; IN: delegate.protocols PROTOCOL: sequence-protocol @@ -15,7 +15,7 @@ PROTOCOL: assoc-protocol ! everything should work, just slower (with >alist) PROTOCOL: stream-protocol - stream-read1 stream-read stream-read-until + stream-read1 stream-read stream-read-until dispose stream-flush stream-write1 stream-write stream-format stream-nl make-span-stream make-block-stream stream-readln make-cell-stream stream-write-table ; diff --git a/extra/multiline/multiline.factor b/extra/multiline/multiline.factor index 4d777605cf..d32c11dd06 100755 --- a/extra/multiline/multiline.factor +++ b/extra/multiline/multiline.factor @@ -7,8 +7,11 @@ IN: multiline lexer get dup next-line lexer-line-text ; : (parse-here) ( -- ) - next-line-text dup ";" = - [ drop lexer get next-line ] [ % "\n" % (parse-here) ] if ; + next-line-text [ + dup ";" = + [ drop lexer get next-line ] + [ % "\n" % (parse-here) ] if + ] [ ";" unexpected-eof ] if* ; : parse-here ( -- str ) [ (parse-here) ] "" make 1 head* @@ -19,11 +22,13 @@ IN: multiline parse-here 1quotation define-inline ; parsing : (parse-multiline-string) ( start-index end-text -- end-index ) - lexer get lexer-line-text 2dup start - [ rot dupd >r >r swap subseq % r> r> length + ] [ - rot tail % "\n" % 0 - lexer get next-line swap (parse-multiline-string) - ] if* ; + lexer get lexer-line-text [ + 2dup start + [ rot dupd >r >r swap subseq % r> r> length + ] [ + rot tail % "\n" % 0 + lexer get next-line swap (parse-multiline-string) + ] if* + ] [ nip unexpected-eof ] if* ; : parse-multiline-string ( end-text -- str ) [ diff --git a/extra/unicode/breaks/breaks.factor b/extra/unicode/breaks/breaks.factor index 70a9c781a2..1014d3ad7e 100644 --- a/extra/unicode/breaks/breaks.factor +++ b/extra/unicode/breaks/breaks.factor @@ -1,7 +1,7 @@ USING: unicode.categories kernel math combinators splitting sequences math.parser io.files io assocs arrays namespaces combinators.lib assocs.lib math.ranges unicode.normalize -unicode.syntax unicode.data compiler.units alien.syntax const ; +unicode.syntax unicode.data compiler.units alien.syntax ; IN: unicode.breaks C-ENUM: Any L V T Extend Control CR LF graphemes ;