diff --git a/core/io/encodings/binary/tags.txt b/core/io/encodings/binary/tags.txt new file mode 100644 index 0000000000..8e27be7d61 --- /dev/null +++ b/core/io/encodings/binary/tags.txt @@ -0,0 +1 @@ +text diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index 767e9b266b..e6811b6e6d 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -23,6 +23,18 @@ SYMBOL: begin : finish-decoding ( buf ch state -- str ) begin eq? [ decode-error ] unless drop "" like ; -: decode ( seq quot -- str ) - >r [ length 0 begin ] keep r> each - finish-decoding ; inline +: decode ( ch state seq quot -- buf ch state ) + [ -rot ] swap compose each ; inline + +: start-decoding ( seq -- buf ch state seq ) + [ length 0 begin ] keep ; + +GENERIC: init-decoding ( stream encoding -- decoded-stream ) + +: ( stream decoding-class -- decoded-stream ) + construct-empty init-decoding ; + +GENERIC: init-encoding ( stream encoding -- encoded-stream ) + +: ( stream encoding-class -- encoded-stream ) + construct-empty init-encoding ; diff --git a/core/io/encodings/latin1/about.txt b/core/io/encodings/latin1/about.txt new file mode 100644 index 0000000000..d40d628767 --- /dev/null +++ b/core/io/encodings/latin1/about.txt @@ -0,0 +1 @@ +ISO 8859-1 encoding/decoding diff --git a/core/io/encodings/latin1/authors.txt b/core/io/encodings/latin1/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/core/io/encodings/latin1/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/core/io/encodings/latin1/latin1-docs.factor b/core/io/encodings/latin1/latin1-docs.factor new file mode 100644 index 0000000000..5872b2bcfd --- /dev/null +++ b/core/io/encodings/latin1/latin1-docs.factor @@ -0,0 +1,5 @@ +USING: help.syntax help.markup ; +IN: io.encodings.latin1 + +HELP: latin1 +{ $class-description "This class is used for Latin 1 (ISO 8859-1) encoding and decoding" } ; diff --git a/core/io/encodings/latin1/latin1.factor b/core/io/encodings/latin1/latin1.factor new file mode 100644 index 0000000000..2c2aa8d60a --- /dev/null +++ b/core/io/encodings/latin1/latin1.factor @@ -0,0 +1,19 @@ +USING: io.encodings strings kernel ; +IN: io.encodings.latin1 + +TUPLE: latin1 stream ; + +M: latin1 init-decoding tuck set-latin1-stream ; +M: latin1 init-encoding drop ; + +M: latin1 stream-read1 + latin1-stream stream-read1 ; + +M: latin1 stream-read + latin1-stream stream-read >string ; + +M: latin1 stream-read-until + latin1-stream stream-read-until >string ; + +M: latin1 stream-readln + latin1-stream stream-readln >string ; diff --git a/core/io/encodings/latin1/tags.txt b/core/io/encodings/latin1/tags.txt new file mode 100644 index 0000000000..8e27be7d61 --- /dev/null +++ b/core/io/encodings/latin1/tags.txt @@ -0,0 +1 @@ +text diff --git a/core/io/encodings/utf16/.utf16.factor.swo b/core/io/encodings/utf16/.utf16.factor.swo new file mode 100644 index 0000000000..01be8fdab2 Binary files /dev/null and b/core/io/encodings/utf16/.utf16.factor.swo differ diff --git a/core/io/encodings/utf16/about.txt b/core/io/encodings/utf16/about.txt new file mode 100644 index 0000000000..ffb8ebf8f5 --- /dev/null +++ b/core/io/encodings/utf16/about.txt @@ -0,0 +1 @@ +UTF-16, UTF-16LE, UTF-16BE encoding and decoding diff --git a/core/io/encodings/utf16/authors.txt b/core/io/encodings/utf16/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/core/io/encodings/utf16/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/core/io/encodings/utf16/summary.txt b/core/io/encodings/utf16/summary.txt new file mode 100644 index 0000000000..b2490675aa --- /dev/null +++ b/core/io/encodings/utf16/summary.txt @@ -0,0 +1 @@ +UTF16 encoding/decoding diff --git a/core/io/encodings/utf16/tags.txt b/core/io/encodings/utf16/tags.txt new file mode 100644 index 0000000000..8e27be7d61 --- /dev/null +++ b/core/io/encodings/utf16/tags.txt @@ -0,0 +1 @@ +text diff --git a/core/io/encodings/utf16/utf16-docs.factor b/core/io/encodings/utf16/utf16-docs.factor new file mode 100644 index 0000000000..c49c030ef3 --- /dev/null +++ b/core/io/encodings/utf16/utf16-docs.factor @@ -0,0 +1,45 @@ +USING: help.markup help.syntax io.encodings strings ; +IN: io.encodings.utf16 + +ARTICLE: "io.utf16" "Working with UTF16-encoded data" +"The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences." +{ $subsection encode-utf16le } +{ $subsection encode-utf16be } +{ $subsection decode-utf16le } +{ $subsection decode-utf16be } +"Support for UTF16 data with a byte order mark:" +{ $subsection encode-utf16 } +{ $subsection decode-utf16 } ; + +ABOUT: "io.utf16" + +HELP: decode-utf16 +{ $values { "seq" "a sequence of bytes" } { "str" string } } +{ $description "Decodes a sequence of bytes representing a Unicode string in UTF16 format. The bytes must begin with a UTF16 byte order mark, which determines if the input is in little or big endian. To decode data without a byte order mark, use " { $link decode-utf16le } " or " { $link decode-utf16be } "." } +{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; + +HELP: decode-utf16be +{ $values { "seq" "a sequence of bytes" } { "str" string } } +{ $description "Decodes a sequence of bytes representing a Unicode string in big endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." } +{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; + +HELP: decode-utf16le +{ $values { "seq" "a sequence of bytes" } { "str" string } } +{ $description "Decodes a sequence of bytes representing a Unicode string in little endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." } +{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; + +{ decode-utf16 decode-utf16le decode-utf16be } related-words + +HELP: encode-utf16be +{ $values { "str" string } { "seq" "a sequence of bytes" } } +{ $description "Encodes a Unicode string as a sequence of bytes in big endian UTF16 format." } ; + +HELP: encode-utf16le +{ $values { "str" string } { "seq" "a sequence of bytes" } } +{ $description "Encodes a Unicode string as a sequence of bytes in little endian UTF16 format." } ; + +HELP: encode-utf16 +{ $values { "str" string } { "seq" "a sequence of bytes" } } +{ $description "Encodes a Unicode string as a sequence of bytes in UTF16 format with a byte order mark." } ; + +{ encode-utf16 encode-utf16be encode-utf16le } related-words diff --git a/core/io/encodings/utf16/utf16-tests.factor b/core/io/encodings/utf16/utf16-tests.factor new file mode 100755 index 0000000000..9800a9827d --- /dev/null +++ b/core/io/encodings/utf16/utf16-tests.factor @@ -0,0 +1,15 @@ +USING: tools.test io.utf16 arrays unicode.syntax ; + +[ { CHAR: x } ] [ { 0 CHAR: x } decode-utf16be >array ] unit-test +[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test +[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test +[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test + +[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test + +[ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test +[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test +[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test +[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test + +[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test diff --git a/core/io/encodings/utf16/utf16.factor b/core/io/encodings/utf16/utf16.factor new file mode 100755 index 0000000000..c38e7845df --- /dev/null +++ b/core/io/encodings/utf16/utf16.factor @@ -0,0 +1,116 @@ +! Copyright (C) 2006, 2007 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: math kernel sequences sbufs vectors namespaces io.binary +io.encodings combinators splitting ; +IN: io.utf16 + +SYMBOL: double +SYMBOL: quad1 +SYMBOL: quad2 +SYMBOL: quad3 +SYMBOL: ignore + +: do-ignore ( -- ch state ) 0 ignore ; + +: append-nums ( byte ch -- ch ) + 8 shift bitor ; + +: end-multibyte ( buf byte ch -- buf ch state ) + append-nums decoded ; + +: begin-utf16be ( buf byte -- buf ch state ) + dup -3 shift BIN: 11011 number= [ + dup BIN: 00000100 bitand zero? + [ BIN: 11 bitand quad1 ] + [ drop do-ignore ] if + ] [ double ] if ; + +: handle-quad2be ( byte ch -- ch state ) + swap dup -2 shift BIN: 110111 number= [ + >r 2 shift r> BIN: 11 bitand bitor quad3 + ] [ 2drop do-ignore ] if ; + +: (decode-utf16be) ( buf byte ch state -- buf ch state ) + { + { begin [ drop begin-utf16be ] } + { double [ end-multibyte ] } + { quad1 [ append-nums quad2 ] } + { quad2 [ handle-quad2be ] } + { quad3 [ append-nums HEX: 10000 + decoded ] } + { ignore [ 2drop push-replacement ] } + } case ; + +: decode-utf16be ( seq -- str ) + [ (decode-utf16be) ] decode ; + +: handle-double ( buf byte ch -- buf ch state ) + swap dup -3 shift BIN: 11011 = [ + dup BIN: 100 bitand 0 number= + [ BIN: 11 bitand 8 shift bitor quad2 ] + [ 2drop push-replacement ] if + ] [ end-multibyte ] if ; + +: handle-quad3le ( buf byte ch -- buf ch state ) + swap dup -2 shift BIN: 110111 = [ + BIN: 11 bitand append-nums HEX: 10000 + decoded + ] [ 2drop push-replacement ] if ; + +: (decode-utf16le) ( buf byte ch state -- buf ch state ) + { + { begin [ drop double ] } + { double [ handle-double ] } + { quad1 [ append-nums quad2 ] } + { quad2 [ 10 shift bitor quad3 ] } + { quad3 [ handle-quad3le ] } + } case ; + +: decode-utf16le ( seq -- str ) + [ (decode-utf16le) ] decode ; + +: encode-first + -10 shift + dup -8 shift BIN: 11011000 bitor + swap HEX: FF bitand ; + +: encode-second + BIN: 1111111111 bitand + dup -8 shift BIN: 11011100 bitor + swap BIN: 11111111 bitand ; + +: char>utf16be ( char -- ) + dup HEX: FFFF > [ + HEX: 10000 - + dup encode-first swap , , + encode-second swap , , + ] [ h>b/b , , ] if ; + +: encode-utf16be ( str -- seq ) + [ [ char>utf16be ] each ] B{ } make ; + +: char>utf16le ( char -- ) + dup HEX: FFFF > [ + HEX: 10000 - + dup encode-first , , + encode-second , , + ] [ h>b/b swap , , ] if ; + +: encode-utf16le ( str -- seq ) + [ [ char>utf16le ] each ] B{ } make ; + +: bom-le B{ HEX: ff HEX: fe } ; inline + +: bom-be B{ HEX: fe HEX: ff } ; inline + +: encode-utf16 ( str -- seq ) + encode-utf16le bom-le swap append ; + +: utf16le? ( seq1 -- seq2 ? ) bom-le ?head ; + +: utf16be? ( seq1 -- seq2 ? ) bom-be ?head ; + +: decode-utf16 ( seq -- str ) + { + { [ utf16le? ] [ decode-utf16le ] } + { [ utf16be? ] [ decode-utf16be ] } + { [ t ] [ decode-error ] } + } cond ; diff --git a/core/io/encodings/utf8/about.txt b/core/io/encodings/utf8/about.txt new file mode 100644 index 0000000000..7560b72db4 --- /dev/null +++ b/core/io/encodings/utf8/about.txt @@ -0,0 +1 @@ +UTF-8 encoding and decoding diff --git a/core/io/encodings/utf8/authors.txt b/core/io/encodings/utf8/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/core/io/encodings/utf8/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/core/io/encodings/utf8/summary.txt b/core/io/encodings/utf8/summary.txt new file mode 100644 index 0000000000..afd259a56b --- /dev/null +++ b/core/io/encodings/utf8/summary.txt @@ -0,0 +1 @@ +UTF8 encoding/decoding diff --git a/core/io/encodings/utf8/tags.txt b/core/io/encodings/utf8/tags.txt new file mode 100644 index 0000000000..8e27be7d61 --- /dev/null +++ b/core/io/encodings/utf8/tags.txt @@ -0,0 +1 @@ +text diff --git a/core/io/encodings/utf8/utf8-docs.factor b/core/io/encodings/utf8/utf8-docs.factor new file mode 100644 index 0000000000..212552519c --- /dev/null +++ b/core/io/encodings/utf8/utf8-docs.factor @@ -0,0 +1,18 @@ +USING: help.markup help.syntax io.encodings strings ; +IN: io.encodings.utf8 + +ARTICLE: "io.utf8" "Working with UTF8-encoded data" +"The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences." +{ $subsection encode-utf8 } +{ $subsection decode-utf8 } ; + +ABOUT: "io.utf8" + +HELP: decode-utf8 +{ $values { "seq" "a sequence of bytes" } { "str" string } } +{ $description "Decodes a sequence of bytes representing a Unicode string in UTF8 format." } +{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; + +HELP: encode-utf8 +{ $values { "str" string } { "seq" "a sequence of bytes" } } +{ $description "Encodes a Unicode string as a sequence of bytes in UTF8 format." } ; diff --git a/core/io/encodings/utf8/utf8-tests.factor b/core/io/encodings/utf8/utf8-tests.factor new file mode 100644 index 0000000000..3576471586 --- /dev/null +++ b/core/io/encodings/utf8/utf8-tests.factor @@ -0,0 +1,16 @@ +USING: io.utf8 tools.test strings arrays unicode.syntax ; + +[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test + +[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test + +[ "x" ] [ "x" decode-utf8 >string ] unit-test + +[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test + +[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test + +[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test + +[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ] +[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor new file mode 100644 index 0000000000..90aec4623a --- /dev/null +++ b/core/io/encodings/utf8/utf8.factor @@ -0,0 +1,117 @@ +! Copyright (C) 2006, 2007 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: math kernel sequences sbufs vectors growable io +namespaces io.encodings combinators ; +IN: io.utf8 + +! Decoding UTF-8 + +SYMBOL: double +SYMBOL: triple +SYMBOL: triple2 +SYMBOL: quad +SYMBOL: quad2 +SYMBOL: quad3 + +: starts-2? ( char -- ? ) + -6 shift BIN: 10 number= ; + +: append-nums ( buf bottom top state-out -- buf num state ) + >r over starts-2? + [ 6 shift swap BIN: 111111 bitand bitor r> ] + [ r> 3drop push-replacement ] if ; + +: begin-utf8 ( buf byte -- buf ch state ) + { + { [ dup -7 shift zero? ] [ decoded ] } + { [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] } + { [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] } + { [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] } + { [ t ] [ drop push-replacement ] } + } cond ; + +: end-multibyte ( buf byte ch -- buf ch state ) + f append-nums [ decoded ] unless* ; + +: (decode-utf8) ( buf byte ch state -- buf ch state ) + { + { begin [ drop begin-utf8 ] } + { double [ end-multibyte ] } + { triple [ triple2 append-nums ] } + { triple2 [ end-multibyte ] } + { quad [ quad2 append-nums ] } + { quad2 [ quad3 append-nums ] } + { quad3 [ end-multibyte ] } + } case ; + +: decode-utf8-chunk ( ch state seq -- buf ch state ) + [ (decode-utf8) ] decode ; + +: decode-utf8 ( seq -- str ) + start-decoding decode-utf8-chunk finish-decoding ; + +! Encoding UTF-8 + +: encoded ( char -- ) + BIN: 111111 bitand BIN: 10000000 bitor , ; + +: char>utf8 ( char -- ) + { + { [ dup -7 shift zero? ] [ , ] } + { [ dup -11 shift zero? ] [ + dup -6 shift BIN: 11000000 bitor , + encoded + ] } + { [ dup -16 shift zero? ] [ + dup -12 shift BIN: 11100000 bitor , + dup -6 shift encoded + encoded + ] } + { [ t ] [ + dup -18 shift BIN: 11110000 bitor , + dup -12 shift encoded + dup -6 shift encoded + encoded + ] } + } cond ; + +: encode-utf8 ( str -- seq ) + [ [ char>utf8 ] each ] B{ } make ; + +! Interface for streams + +TUPLE: utf8 ; +! In the future, this should detect and ignore a BOM at the beginning + +M: utf8 init-decoding ( stream utf8 -- utf8-stream ) + tuck set-delegate ; + +M: utf8 init-encoding ( stream utf8 -- utf8-stream ) + tuck set-delegate ; + +M: utf8 stream-read1 1 swap stream-read ; + +: space ( resizable -- room-left ) + dup underlying swap [ length ] 2apply - ; + +: full? ( resizable -- ? ) space zero? ; + +: utf8-stream-read ( buf ch state stream -- string ) + >r pick full? [ r> 3drop >string ] [ + pick space r> [ stream-read decode-utf8-chunk ] keep + utf8-stream-read + ] if ; + +M: utf8 stream-read + >r start-decoding drop r> delegate utf8-stream-read ; + +M: utf8 stream-read-until + ! Copied from { c-reader stream-read-until }!!! + [ swap read-until-loop ] "" make + swap over empty? over not and [ 2drop f f ] when ; + +M: utf8 stream-write1 + >r 1string r> stream-write ; + +M: utf8 stream-write + >r encode-utf8 r> delegate stream-write ; diff --git a/extra/delegate/protocols/protocols.factor b/extra/delegate/protocols/protocols.factor index 1121883b7c..e6f794de53 100755 --- a/extra/delegate/protocols/protocols.factor +++ b/extra/delegate/protocols/protocols.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2007 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. USING: delegate sequences.private sequences assocs prettyprint.sections -io definitions kernel ; +io definitions kernel continuations ; IN: delegate.protocols PROTOCOL: sequence-protocol @@ -15,7 +15,7 @@ PROTOCOL: assoc-protocol ! everything should work, just slower (with >alist) PROTOCOL: stream-protocol - stream-read1 stream-read stream-read-until + stream-read1 stream-read stream-read-until dispose stream-flush stream-write1 stream-write stream-format stream-nl make-span-stream make-block-stream stream-readln make-cell-stream stream-write-table set-timeout ;