From 5c37add12a82472d40bd9b0d2e3bf977015c2be1 Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <ehrenbed@carleton.edu>
Date: Sat, 2 Feb 2008 13:23:54 -0600
Subject: [PATCH 1/8] Binary encoding through the new encoding protocol

---
 core/io/encodings/binary/about.txt          | 1 +
 core/io/encodings/binary/authors.txt        | 1 +
 core/io/encodings/binary/binary-docs.factor | 5 +++++
 core/io/encodings/binary/binary.factor      | 6 ++++++
 4 files changed, 13 insertions(+)
 create mode 100644 core/io/encodings/binary/about.txt
 create mode 100644 core/io/encodings/binary/authors.txt
 create mode 100644 core/io/encodings/binary/binary-docs.factor
 create mode 100644 core/io/encodings/binary/binary.factor

diff --git a/core/io/encodings/binary/about.txt b/core/io/encodings/binary/about.txt
new file mode 100644
index 0000000000..a1eb4bc664
--- /dev/null
+++ b/core/io/encodings/binary/about.txt
@@ -0,0 +1 @@
+Dummy encoding for binary I/O
diff --git a/core/io/encodings/binary/authors.txt b/core/io/encodings/binary/authors.txt
new file mode 100644
index 0000000000..f990dd0ed2
--- /dev/null
+++ b/core/io/encodings/binary/authors.txt
@@ -0,0 +1 @@
+Daniel Ehrenberg
diff --git a/core/io/encodings/binary/binary-docs.factor b/core/io/encodings/binary/binary-docs.factor
new file mode 100644
index 0000000000..f8be5054df
--- /dev/null
+++ b/core/io/encodings/binary/binary-docs.factor
@@ -0,0 +1,5 @@
+USING: help.syntax help.markup ;
+IN: io.encodings.binary
+
+HELP: binary
+{ $class-description "This is the encoding descriptor for binary I/O." } ;
diff --git a/core/io/encodings/binary/binary.factor b/core/io/encodings/binary/binary.factor
new file mode 100644
index 0000000000..b58f9836c0
--- /dev/null
+++ b/core/io/encodings/binary/binary.factor
@@ -0,0 +1,6 @@
+USING: kernel io.encodings ;
+
+TUPLE: binary ;
+
+M: binary init-decoding drop ;
+M: binary init-encoding drop ;

From b08907ef2737c942cf3953ba83ac4ccf5eb47621 Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <ehrenbed@carleton.edu>
Date: Thu, 7 Feb 2008 17:12:50 -0600
Subject: [PATCH 2/8] extra/multiline checks for EOF now

---
 extra/multiline/multiline.factor | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/extra/multiline/multiline.factor b/extra/multiline/multiline.factor
index 7f831e5351..0ed72f6936 100644
--- a/extra/multiline/multiline.factor
+++ b/extra/multiline/multiline.factor
@@ -7,8 +7,11 @@ IN: multiline
     lexer get dup next-line line-text ;
 
 : (parse-here) ( -- )
-    next-line-text dup ";" =
-    [ drop lexer get next-line ] [ % "\n" % (parse-here) ] if ;
+    next-line-text [
+        dup ";" =
+        [ drop lexer get next-line ]
+        [ % "\n" % (parse-here) ] if
+    ] [ ";" unexpected-eof ] if* ;
 
 : parse-here ( -- str )
     [ (parse-here) ] "" make 1 head*
@@ -19,11 +22,13 @@ IN: multiline
     parse-here 1quotation define ; parsing
 
 : (parse-multiline-string) ( start-index end-text -- end-index )
-    lexer get line-text 2dup start
-    [ rot dupd >r >r swap subseq % r> r> length + ] [
-        rot tail % "\n" % 0
-        lexer get next-line swap (parse-multiline-string)
-    ] if* ;
+    lexer get line-text [
+        2dup start
+        [ rot dupd >r >r swap subseq % r> r> length + ] [
+            rot tail % "\n" % 0
+            lexer get next-line swap (parse-multiline-string)
+        ] if*
+    ] [ nip unexpected-eof ] if* ;
 
 : parse-multiline-string ( end-text -- str )
     [

From 99ff43b404b53649ac994eca5af1f71a2cdb131c Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <ehrenbed@carleton.edu>
Date: Sun, 10 Feb 2008 23:14:42 -0600
Subject: [PATCH 3/8] Beginning of encoded streams

---
 core/io/encodings/binary/tags.txt           |   1 +
 core/io/encodings/encodings.factor          |  18 ++-
 core/io/encodings/latin1/about.txt          |   1 +
 core/io/encodings/latin1/authors.txt        |   1 +
 core/io/encodings/latin1/latin1-docs.factor |   5 +
 core/io/encodings/latin1/latin1.factor      |  19 ++++
 core/io/encodings/latin1/tags.txt           |   1 +
 core/io/encodings/utf16/.utf16.factor.swo   | Bin 0 -> 16384 bytes
 core/io/encodings/utf16/about.txt           |   1 +
 core/io/encodings/utf16/authors.txt         |   1 +
 core/io/encodings/utf16/summary.txt         |   1 +
 core/io/encodings/utf16/tags.txt            |   1 +
 core/io/encodings/utf16/utf16-docs.factor   |  45 ++++++++
 core/io/encodings/utf16/utf16-tests.factor  |  15 +++
 core/io/encodings/utf16/utf16.factor        | 116 +++++++++++++++++++
 core/io/encodings/utf8/about.txt            |   1 +
 core/io/encodings/utf8/authors.txt          |   1 +
 core/io/encodings/utf8/summary.txt          |   1 +
 core/io/encodings/utf8/tags.txt             |   1 +
 core/io/encodings/utf8/utf8-docs.factor     |  18 +++
 core/io/encodings/utf8/utf8-tests.factor    |  16 +++
 core/io/encodings/utf8/utf8.factor          | 117 ++++++++++++++++++++
 extra/delegate/protocols/protocols.factor   |   4 +-
 23 files changed, 380 insertions(+), 5 deletions(-)
 create mode 100644 core/io/encodings/binary/tags.txt
 create mode 100644 core/io/encodings/latin1/about.txt
 create mode 100644 core/io/encodings/latin1/authors.txt
 create mode 100644 core/io/encodings/latin1/latin1-docs.factor
 create mode 100644 core/io/encodings/latin1/latin1.factor
 create mode 100644 core/io/encodings/latin1/tags.txt
 create mode 100644 core/io/encodings/utf16/.utf16.factor.swo
 create mode 100644 core/io/encodings/utf16/about.txt
 create mode 100644 core/io/encodings/utf16/authors.txt
 create mode 100644 core/io/encodings/utf16/summary.txt
 create mode 100644 core/io/encodings/utf16/tags.txt
 create mode 100644 core/io/encodings/utf16/utf16-docs.factor
 create mode 100755 core/io/encodings/utf16/utf16-tests.factor
 create mode 100755 core/io/encodings/utf16/utf16.factor
 create mode 100644 core/io/encodings/utf8/about.txt
 create mode 100644 core/io/encodings/utf8/authors.txt
 create mode 100644 core/io/encodings/utf8/summary.txt
 create mode 100644 core/io/encodings/utf8/tags.txt
 create mode 100644 core/io/encodings/utf8/utf8-docs.factor
 create mode 100644 core/io/encodings/utf8/utf8-tests.factor
 create mode 100644 core/io/encodings/utf8/utf8.factor

diff --git a/core/io/encodings/binary/tags.txt b/core/io/encodings/binary/tags.txt
new file mode 100644
index 0000000000..8e27be7d61
--- /dev/null
+++ b/core/io/encodings/binary/tags.txt
@@ -0,0 +1 @@
+text
diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor
index 767e9b266b..e6811b6e6d 100755
--- a/core/io/encodings/encodings.factor
+++ b/core/io/encodings/encodings.factor
@@ -23,6 +23,18 @@ SYMBOL: begin
 : finish-decoding ( buf ch state -- str )
     begin eq? [ decode-error ] unless drop "" like ;
 
-: decode ( seq quot -- str )
-    >r [ length <sbuf> 0 begin ] keep r> each
-    finish-decoding ; inline
+: decode ( ch state seq quot -- buf ch state )
+    [ -rot ] swap compose each ; inline
+
+: start-decoding ( seq -- buf ch state seq )
+    [ length <sbuf> 0 begin ] keep ;
+
+GENERIC: init-decoding ( stream encoding -- decoded-stream )
+
+: <decoding> ( stream decoding-class -- decoded-stream )
+    construct-empty init-decoding ;
+
+GENERIC: init-encoding ( stream encoding -- encoded-stream )
+
+: <encoding> ( stream encoding-class -- encoded-stream )
+    construct-empty init-encoding ;
diff --git a/core/io/encodings/latin1/about.txt b/core/io/encodings/latin1/about.txt
new file mode 100644
index 0000000000..d40d628767
--- /dev/null
+++ b/core/io/encodings/latin1/about.txt
@@ -0,0 +1 @@
+ISO 8859-1 encoding/decoding
diff --git a/core/io/encodings/latin1/authors.txt b/core/io/encodings/latin1/authors.txt
new file mode 100644
index 0000000000..f990dd0ed2
--- /dev/null
+++ b/core/io/encodings/latin1/authors.txt
@@ -0,0 +1 @@
+Daniel Ehrenberg
diff --git a/core/io/encodings/latin1/latin1-docs.factor b/core/io/encodings/latin1/latin1-docs.factor
new file mode 100644
index 0000000000..5872b2bcfd
--- /dev/null
+++ b/core/io/encodings/latin1/latin1-docs.factor
@@ -0,0 +1,5 @@
+USING: help.syntax help.markup ;
+IN: io.encodings.latin1
+
+HELP: latin1
+{ $class-description "This class is used for Latin 1 (ISO 8859-1) encoding and decoding" } ;
diff --git a/core/io/encodings/latin1/latin1.factor b/core/io/encodings/latin1/latin1.factor
new file mode 100644
index 0000000000..2c2aa8d60a
--- /dev/null
+++ b/core/io/encodings/latin1/latin1.factor
@@ -0,0 +1,19 @@
+USING: io.encodings strings kernel ;
+IN: io.encodings.latin1
+
+TUPLE: latin1 stream ;
+
+M: latin1 init-decoding tuck set-latin1-stream ;
+M: latin1 init-encoding drop ;
+
+M: latin1 stream-read1
+    latin1-stream stream-read1 ;
+
+M: latin1 stream-read
+    latin1-stream stream-read >string ;
+
+M: latin1 stream-read-until
+    latin1-stream stream-read-until >string ;
+
+M: latin1 stream-readln
+    latin1-stream stream-readln >string ;
diff --git a/core/io/encodings/latin1/tags.txt b/core/io/encodings/latin1/tags.txt
new file mode 100644
index 0000000000..8e27be7d61
--- /dev/null
+++ b/core/io/encodings/latin1/tags.txt
@@ -0,0 +1 @@
+text
diff --git a/core/io/encodings/utf16/.utf16.factor.swo b/core/io/encodings/utf16/.utf16.factor.swo
new file mode 100644
index 0000000000000000000000000000000000000000..01be8fdab2946825bf902b05af5135496b67501b
GIT binary patch
literal 16384
zcmeI2O^h5z6~~JZJ2)Xgk^>yynghG?_Uz0o*wI+KE7mc=!kZA*YZ+E6rDwW&rnNoY
z<L;W+o7Eyr3?UI9P8>)iI3Oil$N>d};6Q}Pkxwa+V&wxN1rQP!j)@fhue!RYyLY^-
zk3rSeZ`RZO>Q%jey{hi6nr`#rxivb~XbK!36XLVK`tgImeC~w!?FWPi1EoUgxe;GG
z;8=5eL8Rhz;Iu#82;-g`PB;E);c&rsdn!&AdT}Bbf_Onjz1Rz)e!4JLe(PlR*~pBh
z`?6=P0#<=rRp9Q8#dfRt$>TG0-@O~RTG-lK1*`&A0jq#jz$#!BunJfOtOD<I1=Q#^
z@g8Eh4O|C!?bmz|e?JbsVZP&!J*)y&0jq#jz$#!BunJfOtO8a6tAJI&Dqt0OpDN&b
zLVN^mZ}S2G_WyYe;4dE);s@X>;4@$rd<uL5+zH-Afq#QH!5iQg;OF4`;OpQTcm$jR
zC&4lBx4VV-HFyT>fj;oT61WGv^<g1i0>1%If@|OsSOj;0x9<|-1#k^0kb-s40!<*m
zjXQ;S5quLo3eJF4a1Z#`F(G~iz5~7u*1;M$0gi$<K7{MwyWkr58h8{egC)=a1m3wr
zh`)ljz#qYj;Co;MVsHU0gE?>q_{UKp{s4XtehRLGC%^!F2`m8uN5S8ZAWrZycosYZ
z6iC5&@CD$2Bj9%M>g__j0)7IX2Tz0R;1ZYz$H2Q-$ax3+4m=H>0#`vBw7`GR?>E3}
z;8}1TJOM6(4WRP^@=lHpr6ac|X_gY129LYlP?F<N630U)l)j=lDBB_ZLk?U@RU+M?
zgAcDoElRJrqw+aAjreh>ac`SasT@y+vhOO13nec>6euVEl?bZNycU(9ep8=guAmWz
zb?eFO$0vt$J45FpQUgi;Sd9}okN*cLrGt1JdgMtZdy3r1qo5z**Dd7I>ceZ~DqW;2
z`#}`phc4p6#uv|@ou~vA;8)QVH>F-Y3ZzGV5)Ube8+p`e10mk8U7O0fXhU#SdMJ8f
zAW;|E;YJj9)1Hh_^DrJqieeugJxShp6b3z41#v`doe8vui)id=trjb`OPi!<3!hpS
z>BuCB6P(zqm}t4Xa;0o(&P}%sr`RGra=*KxlUJzSaA(NNoLJ~kPAQ`mSE(<x^fh_8
zO)E4<T_hjmSQ#MA5KL~(mEOb}br;qgx0YEtx^i4&hn&5fuI^|gwMLza6?;b8xQS?~
zZlY<%>Mni$?4=I*(){&lk4{q%g+T<}JGuA5&M)lCcHFE@{Ob+eWI1ox7D=}^fNm(f
zM?-g;M@qF}Xm`D7?27wzu-skfW>L-4JWd5Z8YcKMA4Y@}BLV@#Lg>@@K=$NU{UAwI
z<)RL@)x<%LF2UCN<mYhEGHs%Tl_+NQSJuu9)M}jQ%0t8ntApd5gjmE;jZk;*fnn~m
zxW!m?8#(k+tlH=5xraK`G6!`7#dAhB3|T$reoQF=W>aI=geByKU7YA(D&<=(<6)8s
zSs!MDTvurN<4{#`Pfn_GTFtC66Yn|)GUuB#Yi0_QqI2lG><r?mj_NEkE8`lug*@)L
zDd#bN;bWIBk6mw(6Be?B($Qa0I-f2kMr|17?aSJ1%I-I9!89AoPBb2-+OA!5>BVEd
zp%oRI?^S1d8i$RS#3QO%@I<yxkXh=Eed_MuwuZ5rD!#uV_xWV;-3$aCrETqaG{!F~
z6FCZTw;f8%<k@Hy96KK-1M&_nM^;MG6Q(ab!DVzN==^H$&o5fi1GOf3we1JA8gryG
z9EU2<an>4pU{Jbg+`P?tQ28j($(VOoVa|bMDrEV}L#RpU+pbKOwACcu#eMNs!!_M_
zsAy{`-<1cXd^SlPCgsfo!vRW_Z^8Kh`Cr{$%Wp2mL#EWpZ9Uu8ec#exsds(ZS!J;3
zFWF7DRO`Mm!-`gW;*ev;iyb3lE&rAi$}>J$u4maKyB2F%lb0#Y!g+qx#|q1cHk&by
zaWZXW7n23wcZ)nOW|DH_OIK?no%^swTZKU}4EPS*G`9&J3y;erUeRWH$WIvpW~}H~
zs(n+E|0ZL;oO#)N&o?lB6e-UGB|i}4Ca$W(oSl@N*HzVzT9clhWv>&lF$0(sz;odG
zm)BMwKHo8Cim!U`RKB$<-&)1Dtiy!hyEPuh^({`6pUUz4JnROMi~BO(0`Q2f6rS%P
z#K!u$haT+U2~7=XTPBeV`8jcn2M`HEO#LZ6hIa)n7P&*IU#e2BgS{m?c&J~Ok_JkR
zIt%9a^86g%h?D*T9^_?|%7(hC$j8(9>iPqepF_bj@n|Or`U6F?XXdEgY@VFw&r|e(
z8wF^`*+C*B%#r<u;P?Nx@qT>~@8JCY&wKuV#ykF>!1Lfa5QDSe2>2)7<@ufeb?`EH
z3VaJ}1AgDX7u<la*T75QN8pFxNucYPyhz&rSp}>DRspMkRlq7>6|f3e1*`&Af&T*q
z2nSXiYI_P;cqn$avTZ8oN)pt!y2@9w6;^m^;Vi#a!bSNyUNcLZWaTTZ(#miAm_%EK
zBek<a`u|a!ZBgY*O+NWpCb}|^5gc;cH?xI8-n4VDhZKZP7-Ok(i;f@HTMpVtzh@RZ
zRlC&Fn|!Fsv}015_GV^grnGcoQ3Q&0o`g36&6b>A2rcVo)|kyUpRRGL2<$p@T3ZO3
z;y_2kVUGgdgw(s6s1qVg_AV=%Q&^Dh4aU(n1lg8yq|2j|ba8(ihASuw2aCG(%kaWo
zw*^^6vEx?>AX_6TmN2qvc~!#{)e9b(<Jzt(?|o$r)@wbGLN|t&d&ZF`lW>Qt)~leK
U6w0WN#fU9xV|69mAqVFF1)sD~<p2Nx

literal 0
HcmV?d00001

diff --git a/core/io/encodings/utf16/about.txt b/core/io/encodings/utf16/about.txt
new file mode 100644
index 0000000000..ffb8ebf8f5
--- /dev/null
+++ b/core/io/encodings/utf16/about.txt
@@ -0,0 +1 @@
+UTF-16, UTF-16LE, UTF-16BE encoding and decoding
diff --git a/core/io/encodings/utf16/authors.txt b/core/io/encodings/utf16/authors.txt
new file mode 100644
index 0000000000..f990dd0ed2
--- /dev/null
+++ b/core/io/encodings/utf16/authors.txt
@@ -0,0 +1 @@
+Daniel Ehrenberg
diff --git a/core/io/encodings/utf16/summary.txt b/core/io/encodings/utf16/summary.txt
new file mode 100644
index 0000000000..b2490675aa
--- /dev/null
+++ b/core/io/encodings/utf16/summary.txt
@@ -0,0 +1 @@
+UTF16 encoding/decoding
diff --git a/core/io/encodings/utf16/tags.txt b/core/io/encodings/utf16/tags.txt
new file mode 100644
index 0000000000..8e27be7d61
--- /dev/null
+++ b/core/io/encodings/utf16/tags.txt
@@ -0,0 +1 @@
+text
diff --git a/core/io/encodings/utf16/utf16-docs.factor b/core/io/encodings/utf16/utf16-docs.factor
new file mode 100644
index 0000000000..c49c030ef3
--- /dev/null
+++ b/core/io/encodings/utf16/utf16-docs.factor
@@ -0,0 +1,45 @@
+USING: help.markup help.syntax io.encodings strings ;
+IN: io.encodings.utf16
+
+ARTICLE: "io.utf16" "Working with UTF16-encoded data"
+"The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences."
+{ $subsection encode-utf16le }
+{ $subsection encode-utf16be }
+{ $subsection decode-utf16le }
+{ $subsection decode-utf16be }
+"Support for UTF16 data with a byte order mark:"
+{ $subsection encode-utf16 }
+{ $subsection decode-utf16 } ;
+
+ABOUT: "io.utf16"
+
+HELP: decode-utf16
+{ $values { "seq" "a sequence of bytes" } { "str" string } }
+{ $description "Decodes a sequence of bytes representing a Unicode string in UTF16 format. The bytes must begin with a UTF16 byte order mark, which determines if the input is in little or big endian. To decode data without a byte order mark, use " { $link decode-utf16le } " or " { $link decode-utf16be } "." }
+{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
+
+HELP: decode-utf16be
+{ $values { "seq" "a sequence of bytes" } { "str" string } }
+{ $description "Decodes a sequence of bytes representing a Unicode string in big endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
+{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
+
+HELP: decode-utf16le
+{ $values { "seq" "a sequence of bytes" } { "str" string } }
+{ $description "Decodes a sequence of bytes representing a Unicode string in little endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
+{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
+
+{ decode-utf16 decode-utf16le decode-utf16be } related-words
+
+HELP: encode-utf16be
+{ $values { "str" string } { "seq" "a sequence of bytes" } }
+{ $description "Encodes a Unicode string as a sequence of bytes in big endian UTF16 format." } ;
+
+HELP: encode-utf16le
+{ $values { "str" string } { "seq" "a sequence of bytes" } }
+{ $description "Encodes a Unicode string as a sequence of bytes in little endian UTF16 format." } ;
+
+HELP: encode-utf16
+{ $values { "str" string } { "seq" "a sequence of bytes" } }
+{ $description "Encodes a Unicode string as a sequence of bytes in UTF16 format with a byte order mark." } ;
+
+{ encode-utf16 encode-utf16be encode-utf16le } related-words
diff --git a/core/io/encodings/utf16/utf16-tests.factor b/core/io/encodings/utf16/utf16-tests.factor
new file mode 100755
index 0000000000..9800a9827d
--- /dev/null
+++ b/core/io/encodings/utf16/utf16-tests.factor
@@ -0,0 +1,15 @@
+USING: tools.test io.utf16 arrays unicode.syntax ;
+
+[ { CHAR: x } ] [ { 0 CHAR: x } decode-utf16be >array ] unit-test
+[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test
+[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test
+[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test
+
+[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test
+
+[ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test
+[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test
+[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test
+[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test
+
+[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test
diff --git a/core/io/encodings/utf16/utf16.factor b/core/io/encodings/utf16/utf16.factor
new file mode 100755
index 0000000000..c38e7845df
--- /dev/null
+++ b/core/io/encodings/utf16/utf16.factor
@@ -0,0 +1,116 @@
+! Copyright (C) 2006, 2007 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: math kernel sequences sbufs vectors namespaces io.binary
+io.encodings combinators splitting ;
+IN: io.utf16
+
+SYMBOL: double
+SYMBOL: quad1
+SYMBOL: quad2
+SYMBOL: quad3
+SYMBOL: ignore
+
+: do-ignore ( -- ch state ) 0 ignore ;
+
+: append-nums ( byte ch -- ch )
+    8 shift bitor ;
+
+: end-multibyte ( buf byte ch -- buf ch state )
+    append-nums decoded ;
+
+: begin-utf16be ( buf byte -- buf ch state )
+    dup -3 shift BIN: 11011 number= [
+        dup BIN: 00000100 bitand zero?
+        [ BIN: 11 bitand quad1 ]
+        [ drop do-ignore ] if
+    ] [ double ] if ;
+
+: handle-quad2be ( byte ch -- ch state )
+    swap dup -2 shift BIN: 110111 number= [
+        >r 2 shift r> BIN: 11 bitand bitor quad3
+    ] [ 2drop do-ignore ] if ;
+
+: (decode-utf16be) ( buf byte ch state -- buf ch state )
+    {
+        { begin [ drop begin-utf16be ] }
+        { double [ end-multibyte ] }
+        { quad1 [ append-nums quad2 ] }
+        { quad2 [ handle-quad2be ] }
+        { quad3 [ append-nums HEX: 10000 + decoded ] }
+        { ignore [ 2drop push-replacement ] }
+    } case ;
+
+: decode-utf16be ( seq -- str )
+    [ (decode-utf16be) ] decode ;
+
+: handle-double ( buf byte ch -- buf ch state )
+    swap dup -3 shift BIN: 11011 = [
+        dup BIN: 100 bitand 0 number=
+        [ BIN: 11 bitand 8 shift bitor quad2 ]
+        [ 2drop push-replacement ] if
+    ] [ end-multibyte ] if ;
+
+: handle-quad3le ( buf byte ch -- buf ch state )
+    swap dup -2 shift BIN: 110111 = [
+        BIN: 11 bitand append-nums HEX: 10000 + decoded
+    ] [ 2drop push-replacement ] if ;
+
+: (decode-utf16le) ( buf byte ch state -- buf ch state )
+    {
+        { begin [ drop double ] }
+        { double [ handle-double ] }
+        { quad1 [ append-nums quad2 ] }
+        { quad2 [ 10 shift bitor quad3 ] }
+        { quad3 [ handle-quad3le ] }
+    } case ;
+
+: decode-utf16le ( seq -- str )
+    [ (decode-utf16le) ] decode ;
+
+: encode-first
+    -10 shift
+    dup -8 shift BIN: 11011000 bitor
+    swap HEX: FF bitand ;
+
+: encode-second
+    BIN: 1111111111 bitand
+    dup -8 shift BIN: 11011100 bitor
+    swap BIN: 11111111 bitand ;
+
+: char>utf16be ( char -- )
+    dup HEX: FFFF > [
+        HEX: 10000 -
+        dup encode-first swap , ,
+        encode-second swap , ,
+    ] [ h>b/b , , ] if ;
+
+: encode-utf16be ( str -- seq )
+    [ [ char>utf16be ] each ] B{ } make ;
+
+: char>utf16le ( char -- )
+    dup HEX: FFFF > [
+        HEX: 10000 -
+        dup encode-first , ,
+        encode-second , ,
+    ] [ h>b/b swap , , ] if ; 
+
+: encode-utf16le ( str -- seq )
+    [ [ char>utf16le ] each ] B{ } make ;
+
+: bom-le B{ HEX: ff HEX: fe } ; inline
+
+: bom-be B{ HEX: fe HEX: ff } ; inline
+
+: encode-utf16 ( str -- seq )
+    encode-utf16le bom-le swap append ;
+
+: utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
+
+: utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
+
+: decode-utf16 ( seq -- str )
+    {
+        { [ utf16le? ] [ decode-utf16le ] }
+        { [ utf16be? ] [ decode-utf16be ] }
+        { [ t ] [ decode-error ] }
+    } cond ;
diff --git a/core/io/encodings/utf8/about.txt b/core/io/encodings/utf8/about.txt
new file mode 100644
index 0000000000..7560b72db4
--- /dev/null
+++ b/core/io/encodings/utf8/about.txt
@@ -0,0 +1 @@
+UTF-8 encoding and decoding
diff --git a/core/io/encodings/utf8/authors.txt b/core/io/encodings/utf8/authors.txt
new file mode 100644
index 0000000000..f990dd0ed2
--- /dev/null
+++ b/core/io/encodings/utf8/authors.txt
@@ -0,0 +1 @@
+Daniel Ehrenberg
diff --git a/core/io/encodings/utf8/summary.txt b/core/io/encodings/utf8/summary.txt
new file mode 100644
index 0000000000..afd259a56b
--- /dev/null
+++ b/core/io/encodings/utf8/summary.txt
@@ -0,0 +1 @@
+UTF8 encoding/decoding
diff --git a/core/io/encodings/utf8/tags.txt b/core/io/encodings/utf8/tags.txt
new file mode 100644
index 0000000000..8e27be7d61
--- /dev/null
+++ b/core/io/encodings/utf8/tags.txt
@@ -0,0 +1 @@
+text
diff --git a/core/io/encodings/utf8/utf8-docs.factor b/core/io/encodings/utf8/utf8-docs.factor
new file mode 100644
index 0000000000..212552519c
--- /dev/null
+++ b/core/io/encodings/utf8/utf8-docs.factor
@@ -0,0 +1,18 @@
+USING: help.markup help.syntax io.encodings strings ;
+IN: io.encodings.utf8
+
+ARTICLE: "io.utf8" "Working with UTF8-encoded data"
+"The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences."
+{ $subsection encode-utf8 }
+{ $subsection decode-utf8 } ;
+
+ABOUT: "io.utf8"
+
+HELP: decode-utf8
+{ $values { "seq" "a sequence of bytes" } { "str" string } }
+{ $description "Decodes a sequence of bytes representing a Unicode string in UTF8 format." }
+{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
+
+HELP: encode-utf8
+{ $values { "str" string } { "seq" "a sequence of bytes" } }
+{ $description "Encodes a Unicode string as a sequence of bytes in UTF8 format." } ;
diff --git a/core/io/encodings/utf8/utf8-tests.factor b/core/io/encodings/utf8/utf8-tests.factor
new file mode 100644
index 0000000000..3576471586
--- /dev/null
+++ b/core/io/encodings/utf8/utf8-tests.factor
@@ -0,0 +1,16 @@
+USING: io.utf8 tools.test strings arrays unicode.syntax ;
+
+[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test
+
+[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
+
+[ "x" ] [ "x" decode-utf8 >string ] unit-test
+
+[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test
+
+[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test
+
+[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
+
+[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
+[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test
diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor
new file mode 100644
index 0000000000..90aec4623a
--- /dev/null
+++ b/core/io/encodings/utf8/utf8.factor
@@ -0,0 +1,117 @@
+! Copyright (C) 2006, 2007 Daniel Ehrenberg.
+! See http://factorcode.org/license.txt for BSD license.
+USING: math kernel sequences sbufs vectors growable io
+namespaces io.encodings combinators ;
+IN: io.utf8
+
+! Decoding UTF-8
+
+SYMBOL: double
+SYMBOL: triple
+SYMBOL: triple2
+SYMBOL: quad
+SYMBOL: quad2
+SYMBOL: quad3
+
+: starts-2? ( char -- ? )
+    -6 shift BIN: 10 number= ;
+
+: append-nums ( buf bottom top state-out -- buf num state )
+    >r over starts-2?
+    [ 6 shift swap BIN: 111111 bitand bitor r> ]
+    [ r> 3drop push-replacement ] if ;
+
+: begin-utf8 ( buf byte -- buf ch state )
+    {
+        { [ dup -7 shift zero? ] [ decoded ] }
+        { [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] }
+        { [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] }
+        { [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] }
+        { [ t ] [ drop push-replacement ] }
+    } cond ;
+
+: end-multibyte ( buf byte ch -- buf ch state )
+    f append-nums [ decoded ] unless* ;
+
+: (decode-utf8) ( buf byte ch state -- buf ch state )
+    {
+        { begin [ drop begin-utf8 ] }
+        { double [ end-multibyte ] }
+        { triple [ triple2 append-nums ] }
+        { triple2 [ end-multibyte ] }
+        { quad [ quad2 append-nums ] }
+        { quad2 [ quad3 append-nums ] }
+        { quad3 [ end-multibyte ] }
+    } case ;
+
+: decode-utf8-chunk ( ch state seq -- buf ch state )
+    [ (decode-utf8) ] decode ;
+
+: decode-utf8 ( seq -- str )
+    start-decoding decode-utf8-chunk finish-decoding ;
+
+! Encoding UTF-8
+
+: encoded ( char -- )
+    BIN: 111111 bitand BIN: 10000000 bitor , ;
+
+: char>utf8 ( char -- )
+    {
+        { [ dup -7 shift zero? ] [ , ] }
+        { [ dup -11 shift zero? ] [
+            dup -6 shift BIN: 11000000 bitor ,
+            encoded
+        ] }
+        { [ dup -16 shift zero? ] [
+            dup -12 shift BIN: 11100000 bitor ,
+            dup -6 shift encoded
+            encoded
+        ] }
+        { [ t ] [
+            dup -18 shift BIN: 11110000 bitor ,
+            dup -12 shift encoded
+            dup -6 shift encoded
+            encoded
+        ] }
+    } cond ;
+
+: encode-utf8 ( str -- seq )
+    [ [ char>utf8 ] each ] B{ } make ;
+
+! Interface for streams
+
+TUPLE: utf8 ;
+! In the future, this should detect and ignore a BOM at the beginning
+
+M: utf8 init-decoding ( stream utf8 -- utf8-stream )
+    tuck set-delegate ;
+
+M: utf8 init-encoding ( stream utf8 -- utf8-stream )
+    tuck set-delegate ;
+
+M: utf8 stream-read1 1 swap stream-read ;
+
+: space ( resizable -- room-left )
+    dup underlying swap [ length ] 2apply - ;
+
+: full? ( resizable -- ? ) space zero? ;
+
+: utf8-stream-read ( buf ch state stream -- string )
+    >r pick full? [ r> 3drop >string ]  [
+        pick space r> [ stream-read decode-utf8-chunk ] keep
+        utf8-stream-read
+    ] if ;
+
+M: utf8 stream-read
+    >r start-decoding drop r> delegate utf8-stream-read ;
+
+M: utf8 stream-read-until
+    ! Copied from { c-reader stream-read-until }!!!
+    [ swap read-until-loop ] "" make
+    swap over empty? over not and [ 2drop f f ] when ;
+
+M: utf8 stream-write1
+    >r 1string r> stream-write ;
+
+M: utf8 stream-write
+    >r encode-utf8 r> delegate stream-write ;
diff --git a/extra/delegate/protocols/protocols.factor b/extra/delegate/protocols/protocols.factor
index 1121883b7c..e6f794de53 100755
--- a/extra/delegate/protocols/protocols.factor
+++ b/extra/delegate/protocols/protocols.factor
@@ -1,7 +1,7 @@
 ! Copyright (C) 2007 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
 USING: delegate sequences.private sequences assocs prettyprint.sections 
-io definitions kernel ;
+io definitions kernel continuations ;
 IN: delegate.protocols
 
 PROTOCOL: sequence-protocol
@@ -15,7 +15,7 @@ PROTOCOL: assoc-protocol
     ! everything should work, just slower (with >alist)
 
 PROTOCOL: stream-protocol
-    stream-read1 stream-read stream-read-until
+    stream-read1 stream-read stream-read-until dispose
     stream-flush stream-write1 stream-write stream-format
     stream-nl make-span-stream make-block-stream stream-readln
     make-cell-stream stream-write-table set-timeout ;

From 8bbc144ce7bb7eddd0aa5a737595e9b4ad2850e9 Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <ehrenbed@carleton.edu>
Date: Mon, 11 Feb 2008 17:44:14 -0600
Subject: [PATCH 4/8] io/utf8 and 16 were moved to core/io/encodings

---
 core/io/utf16/authors.txt        |   1 -
 core/io/utf16/summary.txt        |   1 -
 core/io/utf16/utf16-docs.factor  |  45 ------------
 core/io/utf16/utf16-tests.factor |  15 ----
 core/io/utf16/utf16.factor       | 116 -------------------------------
 core/io/utf8/authors.txt         |   1 -
 core/io/utf8/summary.txt         |   1 -
 core/io/utf8/utf8-docs.factor    |  18 -----
 core/io/utf8/utf8-tests.factor   |  16 -----
 core/io/utf8/utf8.factor         |  72 -------------------
 10 files changed, 286 deletions(-)
 delete mode 100644 core/io/utf16/authors.txt
 delete mode 100644 core/io/utf16/summary.txt
 delete mode 100644 core/io/utf16/utf16-docs.factor
 delete mode 100755 core/io/utf16/utf16-tests.factor
 delete mode 100755 core/io/utf16/utf16.factor
 delete mode 100644 core/io/utf8/authors.txt
 delete mode 100644 core/io/utf8/summary.txt
 delete mode 100644 core/io/utf8/utf8-docs.factor
 delete mode 100644 core/io/utf8/utf8-tests.factor
 delete mode 100644 core/io/utf8/utf8.factor

diff --git a/core/io/utf16/authors.txt b/core/io/utf16/authors.txt
deleted file mode 100644
index f990dd0ed2..0000000000
--- a/core/io/utf16/authors.txt
+++ /dev/null
@@ -1 +0,0 @@
-Daniel Ehrenberg
diff --git a/core/io/utf16/summary.txt b/core/io/utf16/summary.txt
deleted file mode 100644
index b2490675aa..0000000000
--- a/core/io/utf16/summary.txt
+++ /dev/null
@@ -1 +0,0 @@
-UTF16 encoding/decoding
diff --git a/core/io/utf16/utf16-docs.factor b/core/io/utf16/utf16-docs.factor
deleted file mode 100644
index 6d24f54694..0000000000
--- a/core/io/utf16/utf16-docs.factor
+++ /dev/null
@@ -1,45 +0,0 @@
-USING: help.markup help.syntax io.encodings strings ;
-IN: io.utf16
-
-ARTICLE: "io.utf16" "Working with UTF16-encoded data"
-"The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences."
-{ $subsection encode-utf16le }
-{ $subsection encode-utf16be }
-{ $subsection decode-utf16le }
-{ $subsection decode-utf16be }
-"Support for UTF16 data with a byte order mark:"
-{ $subsection encode-utf16 }
-{ $subsection decode-utf16 } ;
-
-ABOUT: "io.utf16"
-
-HELP: decode-utf16
-{ $values { "seq" "a sequence of bytes" } { "str" string } }
-{ $description "Decodes a sequence of bytes representing a Unicode string in UTF16 format. The bytes must begin with a UTF16 byte order mark, which determines if the input is in little or big endian. To decode data without a byte order mark, use " { $link decode-utf16le } " or " { $link decode-utf16be } "." }
-{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
-
-HELP: decode-utf16be
-{ $values { "seq" "a sequence of bytes" } { "str" string } }
-{ $description "Decodes a sequence of bytes representing a Unicode string in big endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
-{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
-
-HELP: decode-utf16le
-{ $values { "seq" "a sequence of bytes" } { "str" string } }
-{ $description "Decodes a sequence of bytes representing a Unicode string in little endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." }
-{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
-
-{ decode-utf16 decode-utf16le decode-utf16be } related-words
-
-HELP: encode-utf16be
-{ $values { "str" string } { "seq" "a sequence of bytes" } }
-{ $description "Encodes a Unicode string as a sequence of bytes in big endian UTF16 format." } ;
-
-HELP: encode-utf16le
-{ $values { "str" string } { "seq" "a sequence of bytes" } }
-{ $description "Encodes a Unicode string as a sequence of bytes in little endian UTF16 format." } ;
-
-HELP: encode-utf16
-{ $values { "str" string } { "seq" "a sequence of bytes" } }
-{ $description "Encodes a Unicode string as a sequence of bytes in UTF16 format with a byte order mark." } ;
-
-{ encode-utf16 encode-utf16be encode-utf16le } related-words
diff --git a/core/io/utf16/utf16-tests.factor b/core/io/utf16/utf16-tests.factor
deleted file mode 100755
index 9800a9827d..0000000000
--- a/core/io/utf16/utf16-tests.factor
+++ /dev/null
@@ -1,15 +0,0 @@
-USING: tools.test io.utf16 arrays unicode.syntax ;
-
-[ { CHAR: x } ] [ { 0 CHAR: x } decode-utf16be >array ] unit-test
-[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test
-[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test
-[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test
-
-[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test
-
-[ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test
-[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test
-[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test
-[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test
-
-[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test
diff --git a/core/io/utf16/utf16.factor b/core/io/utf16/utf16.factor
deleted file mode 100755
index 19ebc1d43a..0000000000
--- a/core/io/utf16/utf16.factor
+++ /dev/null
@@ -1,116 +0,0 @@
-! Copyright (C) 2006, 2007 Daniel Ehrenberg.
-! See http://factorcode.org/license.txt for BSD license.
-USING: math kernel sequences sbufs vectors namespaces io.binary
-io.encodings combinators splitting ;
-IN: io.utf16
-
-SYMBOL: double
-SYMBOL: quad1
-SYMBOL: quad2
-SYMBOL: quad3
-SYMBOL: ignore
-
-: do-ignore ( -- ch state ) 0 ignore ;
-
-: append-nums ( byte ch -- ch )
-    8 shift bitor ;
-
-: end-multibyte ( buf byte ch -- buf ch state )
-    append-nums decoded ;
-
-: begin-utf16be ( buf byte -- buf ch state )
-    dup -3 shift BIN: 11011 number= [
-        dup BIN: 00000100 bitand zero?
-        [ BIN: 11 bitand quad1 ]
-        [ drop do-ignore ] if
-    ] [ double ] if ;
-
-: handle-quad2be ( byte ch -- ch state )
-    swap dup -2 shift BIN: 110111 number= [
-        >r 2 shift r> BIN: 11 bitand bitor quad3
-    ] [ 2drop do-ignore ] if ;
-
-: (decode-utf16be) ( buf byte ch state -- buf ch state )
-    {
-        { begin [ drop begin-utf16be ] }
-        { double [ end-multibyte ] }
-        { quad1 [ append-nums quad2 ] }
-        { quad2 [ handle-quad2be ] }
-        { quad3 [ append-nums HEX: 10000 + decoded ] }
-        { ignore [ 2drop push-replacement ] }
-    } case ;
-
-: decode-utf16be ( seq -- str )
-    [ -rot (decode-utf16be) ] decode ;
-
-: handle-double ( buf byte ch -- buf ch state )
-    swap dup -3 shift BIN: 11011 = [
-        dup BIN: 100 bitand 0 number=
-        [ BIN: 11 bitand 8 shift bitor quad2 ]
-        [ 2drop push-replacement ] if
-    ] [ end-multibyte ] if ;
-
-: handle-quad3le ( buf byte ch -- buf ch state )
-    swap dup -2 shift BIN: 110111 = [
-        BIN: 11 bitand append-nums HEX: 10000 + decoded
-    ] [ 2drop push-replacement ] if ;
-
-: (decode-utf16le) ( buf byte ch state -- buf ch state )
-    {
-        { begin [ drop double ] }
-        { double [ handle-double ] }
-        { quad1 [ append-nums quad2 ] }
-        { quad2 [ 10 shift bitor quad3 ] }
-        { quad3 [ handle-quad3le ] }
-    } case ;
-
-: decode-utf16le ( seq -- str )
-    [ -rot (decode-utf16le) ] decode ;
-
-: encode-first
-    -10 shift
-    dup -8 shift BIN: 11011000 bitor
-    swap HEX: FF bitand ;
-
-: encode-second
-    BIN: 1111111111 bitand
-    dup -8 shift BIN: 11011100 bitor
-    swap BIN: 11111111 bitand ;
-
-: char>utf16be ( char -- )
-    dup HEX: FFFF > [
-        HEX: 10000 -
-        dup encode-first swap , ,
-        encode-second swap , ,
-    ] [ h>b/b , , ] if ;
-
-: encode-utf16be ( str -- seq )
-    [ [ char>utf16be ] each ] B{ } make ;
-
-: char>utf16le ( char -- )
-    dup HEX: FFFF > [
-        HEX: 10000 -
-        dup encode-first , ,
-        encode-second , ,
-    ] [ h>b/b swap , , ] if ; 
-
-: encode-utf16le ( str -- seq )
-    [ [ char>utf16le ] each ] B{ } make ;
-
-: bom-le B{ HEX: ff HEX: fe } ; inline
-
-: bom-be B{ HEX: fe HEX: ff } ; inline
-
-: encode-utf16 ( str -- seq )
-    encode-utf16le bom-le swap append ;
-
-: utf16le? ( seq1 -- seq2 ? ) bom-le ?head ;
-
-: utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
-
-: decode-utf16 ( seq -- str )
-    {
-        { [ utf16le? ] [ decode-utf16le ] }
-        { [ utf16be? ] [ decode-utf16be ] }
-        { [ t ] [ decode-error ] }
-    } cond ;
diff --git a/core/io/utf8/authors.txt b/core/io/utf8/authors.txt
deleted file mode 100644
index f990dd0ed2..0000000000
--- a/core/io/utf8/authors.txt
+++ /dev/null
@@ -1 +0,0 @@
-Daniel Ehrenberg
diff --git a/core/io/utf8/summary.txt b/core/io/utf8/summary.txt
deleted file mode 100644
index afd259a56b..0000000000
--- a/core/io/utf8/summary.txt
+++ /dev/null
@@ -1 +0,0 @@
-UTF8 encoding/decoding
diff --git a/core/io/utf8/utf8-docs.factor b/core/io/utf8/utf8-docs.factor
deleted file mode 100644
index 28310b5d77..0000000000
--- a/core/io/utf8/utf8-docs.factor
+++ /dev/null
@@ -1,18 +0,0 @@
-USING: help.markup help.syntax io.encodings strings ;
-IN: io.utf8
-
-ARTICLE: "io.utf8" "Working with UTF8-encoded data"
-"The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences."
-{ $subsection encode-utf8 }
-{ $subsection decode-utf8 } ;
-
-ABOUT: "io.utf8"
-
-HELP: decode-utf8
-{ $values { "seq" "a sequence of bytes" } { "str" string } }
-{ $description "Decodes a sequence of bytes representing a Unicode string in UTF8 format." }
-{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ;
-
-HELP: encode-utf8
-{ $values { "str" string } { "seq" "a sequence of bytes" } }
-{ $description "Encodes a Unicode string as a sequence of bytes in UTF8 format." } ;
diff --git a/core/io/utf8/utf8-tests.factor b/core/io/utf8/utf8-tests.factor
deleted file mode 100644
index 3576471586..0000000000
--- a/core/io/utf8/utf8-tests.factor
+++ /dev/null
@@ -1,16 +0,0 @@
-USING: io.utf8 tools.test strings arrays unicode.syntax ;
-
-[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test
-
-[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
-
-[ "x" ] [ "x" decode-utf8 >string ] unit-test
-
-[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test
-
-[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test
-
-[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
-
-[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
-[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test
diff --git a/core/io/utf8/utf8.factor b/core/io/utf8/utf8.factor
deleted file mode 100644
index 213afb6eae..0000000000
--- a/core/io/utf8/utf8.factor
+++ /dev/null
@@ -1,72 +0,0 @@
-! Copyright (C) 2006, 2007 Daniel Ehrenberg.
-! See http://factorcode.org/license.txt for BSD license.
-USING: math kernel sequences sbufs vectors
-namespaces io.encodings combinators ;
-IN: io.utf8
-
-SYMBOL: double
-SYMBOL: triple
-SYMBOL: triple2
-SYMBOL: quad
-SYMBOL: quad2
-SYMBOL: quad3
-
-: starts-2? ( char -- ? )
-    -6 shift BIN: 10 number= ;
-
-: append-nums ( buf bottom top state-out -- buf num state )
-    >r over starts-2?
-    [ 6 shift swap BIN: 111111 bitand bitor r> ]
-    [ r> 3drop push-replacement ] if ;
-
-: begin-utf8 ( buf byte -- buf ch state )
-    {
-        { [ dup -7 shift zero? ] [ decoded ] }
-        { [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] }
-        { [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] }
-        { [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] }
-        { [ t ] [ drop push-replacement ] }
-    } cond ;
-
-: end-multibyte ( buf byte ch -- buf ch state )
-    f append-nums [ decoded ] unless* ;
-
-: (decode-utf8) ( buf byte ch state -- buf ch state )
-    {
-        { begin [ drop begin-utf8 ] }
-        { double [ end-multibyte ] }
-        { triple [ triple2 append-nums ] }
-        { triple2 [ end-multibyte ] }
-        { quad [ quad2 append-nums ] }
-        { quad2 [ quad3 append-nums ] }
-        { quad3 [ end-multibyte ] }
-    } case ;
-
-: decode-utf8 ( seq -- str )
-    [ -rot (decode-utf8) ] decode ;
-
-: encoded ( char -- )
-    BIN: 111111 bitand BIN: 10000000 bitor , ;
-
-: char>utf8 ( char -- )
-    {
-        { [ dup -7 shift zero? ] [ , ] }
-        { [ dup -11 shift zero? ] [
-            dup -6 shift BIN: 11000000 bitor ,
-            encoded
-        ] }
-        { [ dup -16 shift zero? ] [
-            dup -12 shift BIN: 11100000 bitor ,
-            dup -6 shift encoded
-            encoded
-        ] }
-        { [ t ] [
-            dup -18 shift BIN: 11110000 bitor ,
-            dup -12 shift encoded
-            dup -6 shift encoded
-            encoded
-        ] }
-    } cond ;
-
-: encode-utf8 ( str -- seq )
-    [ [ char>utf8 ] each ] B{ } make ;

From 014b79caada3316522876c098ead25450f978f60 Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <ehrenbed@carleton.edu>
Date: Wed, 13 Feb 2008 01:02:37 -0600
Subject: [PATCH 5/8] More unicode changes

---
 core/io/encodings/encodings.factor       | 26 +++++++++++++++++----
 core/io/encodings/utf16/utf16.factor     |  8 +++----
 core/io/encodings/utf8/utf8-tests.factor | 21 +++++++++++------
 core/io/encodings/utf8/utf8.factor       | 29 ++++++++----------------
 4 files changed, 48 insertions(+), 36 deletions(-)

diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor
index e6811b6e6d..dcc055f941 100755
--- a/core/io/encodings/encodings.factor
+++ b/core/io/encodings/encodings.factor
@@ -1,7 +1,7 @@
 ! Copyright (C) 2006, 2007 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: math kernel sequences sbufs vectors
-namespaces unicode.syntax ;
+namespaces unicode.syntax growable strings io ;
 IN: io.encodings
 
 TUPLE: encode-error ;
@@ -23,11 +23,27 @@ SYMBOL: begin
 : finish-decoding ( buf ch state -- str )
     begin eq? [ decode-error ] unless drop "" like ;
 
-: decode ( ch state seq quot -- buf ch state )
-    [ -rot ] swap compose each ; inline
+: start-decoding ( seq length -- buf ch state seq )
+    <sbuf> 0 begin roll ;
 
-: start-decoding ( seq -- buf ch state seq )
-    [ length <sbuf> 0 begin ] keep ;
+: decode ( seq quot -- string )
+    >r dup length start-decoding r>
+    [ -rot ] swap compose each
+    finish-decoding ; inline
+
+: space ( resizable -- room-left )
+    dup underlying swap [ length ] 2apply - ;
+
+: full? ( resizable -- ? ) space zero? ;
+
+: decode-part-loop ( buf ch state stream quot -- string )
+    >r >r pick r> r> rot full?
+    [ 2drop 2drop >string ]
+    [ [ >r stream-read1 -rot r> call ] 2keep decode-part-loop ] if ; inline
+
+: decode-part ( length stream quot -- string )
+    >r swap start-decoding r>
+    decode-part-loop ; inline
 
 GENERIC: init-decoding ( stream encoding -- decoded-stream )
 
diff --git a/core/io/encodings/utf16/utf16.factor b/core/io/encodings/utf16/utf16.factor
index c38e7845df..ccf76649e2 100755
--- a/core/io/encodings/utf16/utf16.factor
+++ b/core/io/encodings/utf16/utf16.factor
@@ -30,7 +30,7 @@ SYMBOL: ignore
         >r 2 shift r> BIN: 11 bitand bitor quad3
     ] [ 2drop do-ignore ] if ;
 
-: (decode-utf16be) ( buf byte ch state -- buf ch state )
+: decode-utf16be-step ( buf byte ch state -- buf ch state )
     {
         { begin [ drop begin-utf16be ] }
         { double [ end-multibyte ] }
@@ -41,7 +41,7 @@ SYMBOL: ignore
     } case ;
 
 : decode-utf16be ( seq -- str )
-    [ (decode-utf16be) ] decode ;
+    [ decode-utf16be-step ] decode ;
 
 : handle-double ( buf byte ch -- buf ch state )
     swap dup -3 shift BIN: 11011 = [
@@ -55,7 +55,7 @@ SYMBOL: ignore
         BIN: 11 bitand append-nums HEX: 10000 + decoded
     ] [ 2drop push-replacement ] if ;
 
-: (decode-utf16le) ( buf byte ch state -- buf ch state )
+: decode-utf16le-step ( buf byte ch state -- buf ch state )
     {
         { begin [ drop double ] }
         { double [ handle-double ] }
@@ -65,7 +65,7 @@ SYMBOL: ignore
     } case ;
 
 : decode-utf16le ( seq -- str )
-    [ (decode-utf16le) ] decode ;
+    [ decode-utf16le-step ] decode ;
 
 : encode-first
     -10 shift
diff --git a/core/io/encodings/utf8/utf8-tests.factor b/core/io/encodings/utf8/utf8-tests.factor
index 3576471586..33c4ffbf12 100644
--- a/core/io/encodings/utf8/utf8-tests.factor
+++ b/core/io/encodings/utf8/utf8-tests.factor
@@ -1,16 +1,23 @@
-USING: io.utf8 tools.test strings arrays unicode.syntax ;
+USING: io.encodings.utf8 tools.test sbufs kernel io
+sequences strings arrays unicode.syntax ;
 
-[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test
+: decode-utf8-w/stream ( array -- newarray )
+    >sbuf dup reverse-here <utf8> contents >array ;
 
-[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
+: encode-utf8-w/stream ( array -- newarray )
+    SBUF" " clone tuck <utf8> write >array ;
 
-[ "x" ] [ "x" decode-utf8 >string ] unit-test
+[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test
 
-[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test
+[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
 
-[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test
+[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test
 
-[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
+[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
+
+[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
+
+[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
 
 [ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
 [ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test
diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor
index 90aec4623a..c0fa66e553 100644
--- a/core/io/encodings/utf8/utf8.factor
+++ b/core/io/encodings/utf8/utf8.factor
@@ -1,8 +1,8 @@
 ! Copyright (C) 2006, 2007 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: math kernel sequences sbufs vectors growable io
-namespaces io.encodings combinators ;
-IN: io.utf8
+USING: math kernel sequences sbufs vectors growable io continuations
+namespaces io.encodings combinators strings io.streams.c ;
+IN: io.encodings.utf8
 
 ! Decoding UTF-8
 
@@ -33,7 +33,7 @@ SYMBOL: quad3
 : end-multibyte ( buf byte ch -- buf ch state )
     f append-nums [ decoded ] unless* ;
 
-: (decode-utf8) ( buf byte ch state -- buf ch state )
+: decode-utf8-step ( buf byte ch state -- buf ch state )
     {
         { begin [ drop begin-utf8 ] }
         { double [ end-multibyte ] }
@@ -44,11 +44,8 @@ SYMBOL: quad3
         { quad3 [ end-multibyte ] }
     } case ;
 
-: decode-utf8-chunk ( ch state seq -- buf ch state )
-    [ (decode-utf8) ] decode ;
-
 : decode-utf8 ( seq -- str )
-    start-decoding decode-utf8-chunk finish-decoding ;
+    [ decode-utf8-step ] decode ; 
 
 ! Encoding UTF-8
 
@@ -81,6 +78,7 @@ SYMBOL: quad3
 ! Interface for streams
 
 TUPLE: utf8 ;
+: <utf8> utf8 construct-delegate ;
 ! In the future, this should detect and ignore a BOM at the beginning
 
 M: utf8 init-decoding ( stream utf8 -- utf8-stream )
@@ -91,19 +89,8 @@ M: utf8 init-encoding ( stream utf8 -- utf8-stream )
 
 M: utf8 stream-read1 1 swap stream-read ;
 
-: space ( resizable -- room-left )
-    dup underlying swap [ length ] 2apply - ;
-
-: full? ( resizable -- ? ) space zero? ;
-
-: utf8-stream-read ( buf ch state stream -- string )
-    >r pick full? [ r> 3drop >string ]  [
-        pick space r> [ stream-read decode-utf8-chunk ] keep
-        utf8-stream-read
-    ] if ;
-
 M: utf8 stream-read
-    >r start-decoding drop r> delegate utf8-stream-read ;
+    [ decode-utf8-step ] decode-part ;
 
 M: utf8 stream-read-until
     ! Copied from { c-reader stream-read-until }!!!
@@ -115,3 +102,5 @@ M: utf8 stream-write1
 
 M: utf8 stream-write
     >r encode-utf8 r> delegate stream-write ;
+
+M: utf8 dispose delegate dispose ;

From 8d163c89369e79a918c06cdf3c614636654bd01e Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <ehrenbed@carleton.edu>
Date: Wed, 13 Feb 2008 01:03:55 -0600
Subject: [PATCH 6/8] Removing extra/const

---
 extra/const/const.factor | 24 ------------------------
 1 file changed, 24 deletions(-)
 delete mode 100644 extra/const/const.factor

diff --git a/extra/const/const.factor b/extra/const/const.factor
deleted file mode 100644
index 8efef7e372..0000000000
--- a/extra/const/const.factor
+++ /dev/null
@@ -1,24 +0,0 @@
-USING: kernel parser words sequences ;
-IN: const
-
-: define-const ( word value -- )
-    [ parsed ] curry dupd define
-    t "parsing" set-word-prop ;
-
-: CONST:
-    CREATE scan-word dup parsing?
-    [ execute dup pop ] when define-const ; parsing
-
-: define-enum ( words -- )
-    dup length [ define-const ] 2each ;
-
-: ENUM:
-    ";" parse-tokens [ create-in ] map define-enum ; parsing
-
-: define-value ( word -- )
-    { f } clone [ first ] curry define ;
-
-: VALUE: CREATE define-value ; parsing
-
-: set-value ( value word -- )
-    word-def first set-first ;

From 4833a6196dc2df53579a911143ce6862860f1aa2 Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <ehrenbed@carleton.edu>
Date: Wed, 13 Feb 2008 14:19:56 -0600
Subject: [PATCH 7/8] Fixing unicode.breaks

---
 extra/unicode/breaks/breaks.factor | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extra/unicode/breaks/breaks.factor b/extra/unicode/breaks/breaks.factor
index 70a9c781a2..1014d3ad7e 100644
--- a/extra/unicode/breaks/breaks.factor
+++ b/extra/unicode/breaks/breaks.factor
@@ -1,7 +1,7 @@
 USING: unicode.categories kernel math combinators splitting
 sequences math.parser io.files io assocs arrays namespaces
 combinators.lib assocs.lib math.ranges unicode.normalize
-unicode.syntax unicode.data compiler.units alien.syntax const ;
+unicode.syntax unicode.data compiler.units alien.syntax ;
 IN: unicode.breaks
 
 C-ENUM: Any L V T Extend Control CR LF graphemes ;

From 01e9a5cb1ac984e99dd68ac534ca8ab9403c766a Mon Sep 17 00:00:00 2001
From: Daniel Ehrenberg <ehrenbed@carleton.edu>
Date: Wed, 13 Feb 2008 17:53:10 -0600
Subject: [PATCH 8/8] utf8 and utf16le streams

---
 core/io/encodings/encodings.factor   | 20 +++++++++++------
 core/io/encodings/utf16/utf16.factor | 32 ++++++++++++++++++++++++++++
 core/io/encodings/utf8/utf8.factor   |  4 +++-
 3 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor
index dcc055f941..b27b89642d 100755
--- a/core/io/encodings/encodings.factor
+++ b/core/io/encodings/encodings.factor
@@ -36,14 +36,22 @@ SYMBOL: begin
 
 : full? ( resizable -- ? ) space zero? ;
 
-: decode-part-loop ( buf ch state stream quot -- string )
-    >r >r pick r> r> rot full?
-    [ 2drop 2drop >string ]
-    [ [ >r stream-read1 -rot r> call ] 2keep decode-part-loop ] if ; inline
+: end-read-loop ( buf ch state stream quot -- string/f )
+    2drop 2drop >string f like ;
 
-: decode-part ( length stream quot -- string )
+: under ( a b c -- c a b c )
+    tuck >r swapd r> ; inline
+
+: decode-read-loop ( buf ch state stream quot -- string/f )
+    >r >r pick r> r> rot full?  [ end-read-loop ] [
+        over stream-read1 [
+            -rot tuck >r >r >r -rot r> call r> r> decode-read-loop
+        ] [ end-read-loop ] if*
+    ] if ; inline
+
+: decode-read ( length stream quot -- string )
     >r swap start-decoding r>
-    decode-part-loop ; inline
+    decode-read-loop ; inline
 
 GENERIC: init-decoding ( stream encoding -- decoded-stream )
 
diff --git a/core/io/encodings/utf16/utf16.factor b/core/io/encodings/utf16/utf16.factor
index ccf76649e2..81c982dd55 100755
--- a/core/io/encodings/utf16/utf16.factor
+++ b/core/io/encodings/utf16/utf16.factor
@@ -114,3 +114,35 @@ SYMBOL: ignore
         { [ utf16be? ] [ decode-utf16be ] }
         { [ t ] [ decode-error ] }
     } cond ;
+
+! UTF16LE streams
+
+TUPLE: utf16le ;
+: <utf16le> utf16le construct-delegate ;
+! In the future, this should detect and ignore a BOM at the beginning
+
+M: utf16le init-decoding ( stream utf16le -- utf16le-stream )
+    tuck set-delegate ;
+
+M: utf16le init-encoding ( stream utf16le -- utf16le-stream )
+    tuck set-delegate ;
+
+M: utf16le stream-read1 1 swap stream-read ;
+
+M: utf16le stream-read
+    delegate [ decode-utf16le-step ] decode-read ;
+
+M: utf16le stream-read-partial stream-read ;
+
+M: utf16le stream-read-until
+    ! Copied from { c-reader stream-read-until }!!!
+    [ swap read-until-loop ] "" make
+    swap over empty? over not and [ 2drop f f ] when ;
+
+M: utf16le stream-write1
+    >r 1string r> stream-write ;
+
+M: utf16le stream-write
+    >r encode-utf16le r> delegate stream-write ;
+
+M: utf16le dispose delegate dispose ;
diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor
index c0fa66e553..de3fd5b67b 100644
--- a/core/io/encodings/utf8/utf8.factor
+++ b/core/io/encodings/utf8/utf8.factor
@@ -90,7 +90,9 @@ M: utf8 init-encoding ( stream utf8 -- utf8-stream )
 M: utf8 stream-read1 1 swap stream-read ;
 
 M: utf8 stream-read
-    [ decode-utf8-step ] decode-part ;
+    delegate [ decode-utf8-step ] decode-read ;
+
+M: utf8 stream-read-partial stream-read ;
 
 M: utf8 stream-read-until
     ! Copied from { c-reader stream-read-until }!!!