From 5c37add12a82472d40bd9b0d2e3bf977015c2be1 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Sat, 2 Feb 2008 13:23:54 -0600 Subject: [PATCH 1/8] Binary encoding through the new encoding protocol --- core/io/encodings/binary/about.txt | 1 + core/io/encodings/binary/authors.txt | 1 + core/io/encodings/binary/binary-docs.factor | 5 +++++ core/io/encodings/binary/binary.factor | 6 ++++++ 4 files changed, 13 insertions(+) create mode 100644 core/io/encodings/binary/about.txt create mode 100644 core/io/encodings/binary/authors.txt create mode 100644 core/io/encodings/binary/binary-docs.factor create mode 100644 core/io/encodings/binary/binary.factor diff --git a/core/io/encodings/binary/about.txt b/core/io/encodings/binary/about.txt new file mode 100644 index 0000000000..a1eb4bc664 --- /dev/null +++ b/core/io/encodings/binary/about.txt @@ -0,0 +1 @@ +Dummy encoding for binary I/O diff --git a/core/io/encodings/binary/authors.txt b/core/io/encodings/binary/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/core/io/encodings/binary/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/core/io/encodings/binary/binary-docs.factor b/core/io/encodings/binary/binary-docs.factor new file mode 100644 index 0000000000..f8be5054df --- /dev/null +++ b/core/io/encodings/binary/binary-docs.factor @@ -0,0 +1,5 @@ +USING: help.syntax help.markup ; +IN: io.encodings.binary + +HELP: binary +{ $class-description "This is the encoding descriptor for binary I/O." } ; diff --git a/core/io/encodings/binary/binary.factor b/core/io/encodings/binary/binary.factor new file mode 100644 index 0000000000..b58f9836c0 --- /dev/null +++ b/core/io/encodings/binary/binary.factor @@ -0,0 +1,6 @@ +USING: kernel io.encodings ; + +TUPLE: binary ; + +M: binary init-decoding drop ; +M: binary init-encoding drop ; From b08907ef2737c942cf3953ba83ac4ccf5eb47621 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Thu, 7 Feb 2008 17:12:50 -0600 Subject: [PATCH 2/8] extra/multiline checks for EOF now --- extra/multiline/multiline.factor | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/extra/multiline/multiline.factor b/extra/multiline/multiline.factor index 7f831e5351..0ed72f6936 100644 --- a/extra/multiline/multiline.factor +++ b/extra/multiline/multiline.factor @@ -7,8 +7,11 @@ IN: multiline lexer get dup next-line line-text ; : (parse-here) ( -- ) - next-line-text dup ";" = - [ drop lexer get next-line ] [ % "\n" % (parse-here) ] if ; + next-line-text [ + dup ";" = + [ drop lexer get next-line ] + [ % "\n" % (parse-here) ] if + ] [ ";" unexpected-eof ] if* ; : parse-here ( -- str ) [ (parse-here) ] "" make 1 head* @@ -19,11 +22,13 @@ IN: multiline parse-here 1quotation define ; parsing : (parse-multiline-string) ( start-index end-text -- end-index ) - lexer get line-text 2dup start - [ rot dupd >r >r swap subseq % r> r> length + ] [ - rot tail % "\n" % 0 - lexer get next-line swap (parse-multiline-string) - ] if* ; + lexer get line-text [ + 2dup start + [ rot dupd >r >r swap subseq % r> r> length + ] [ + rot tail % "\n" % 0 + lexer get next-line swap (parse-multiline-string) + ] if* + ] [ nip unexpected-eof ] if* ; : parse-multiline-string ( end-text -- str ) [ From 99ff43b404b53649ac994eca5af1f71a2cdb131c Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Sun, 10 Feb 2008 23:14:42 -0600 Subject: [PATCH 3/8] Beginning of encoded streams --- core/io/encodings/binary/tags.txt | 1 + core/io/encodings/encodings.factor | 18 ++- core/io/encodings/latin1/about.txt | 1 + core/io/encodings/latin1/authors.txt | 1 + core/io/encodings/latin1/latin1-docs.factor | 5 + core/io/encodings/latin1/latin1.factor | 19 ++++ core/io/encodings/latin1/tags.txt | 1 + core/io/encodings/utf16/.utf16.factor.swo | Bin 0 -> 16384 bytes core/io/encodings/utf16/about.txt | 1 + core/io/encodings/utf16/authors.txt | 1 + core/io/encodings/utf16/summary.txt | 1 + core/io/encodings/utf16/tags.txt | 1 + core/io/encodings/utf16/utf16-docs.factor | 45 ++++++++ core/io/encodings/utf16/utf16-tests.factor | 15 +++ core/io/encodings/utf16/utf16.factor | 116 +++++++++++++++++++ core/io/encodings/utf8/about.txt | 1 + core/io/encodings/utf8/authors.txt | 1 + core/io/encodings/utf8/summary.txt | 1 + core/io/encodings/utf8/tags.txt | 1 + core/io/encodings/utf8/utf8-docs.factor | 18 +++ core/io/encodings/utf8/utf8-tests.factor | 16 +++ core/io/encodings/utf8/utf8.factor | 117 ++++++++++++++++++++ extra/delegate/protocols/protocols.factor | 4 +- 23 files changed, 380 insertions(+), 5 deletions(-) create mode 100644 core/io/encodings/binary/tags.txt create mode 100644 core/io/encodings/latin1/about.txt create mode 100644 core/io/encodings/latin1/authors.txt create mode 100644 core/io/encodings/latin1/latin1-docs.factor create mode 100644 core/io/encodings/latin1/latin1.factor create mode 100644 core/io/encodings/latin1/tags.txt create mode 100644 core/io/encodings/utf16/.utf16.factor.swo create mode 100644 core/io/encodings/utf16/about.txt create mode 100644 core/io/encodings/utf16/authors.txt create mode 100644 core/io/encodings/utf16/summary.txt create mode 100644 core/io/encodings/utf16/tags.txt create mode 100644 core/io/encodings/utf16/utf16-docs.factor create mode 100755 core/io/encodings/utf16/utf16-tests.factor create mode 100755 core/io/encodings/utf16/utf16.factor create mode 100644 core/io/encodings/utf8/about.txt create mode 100644 core/io/encodings/utf8/authors.txt create mode 100644 core/io/encodings/utf8/summary.txt create mode 100644 core/io/encodings/utf8/tags.txt create mode 100644 core/io/encodings/utf8/utf8-docs.factor create mode 100644 core/io/encodings/utf8/utf8-tests.factor create mode 100644 core/io/encodings/utf8/utf8.factor diff --git a/core/io/encodings/binary/tags.txt b/core/io/encodings/binary/tags.txt new file mode 100644 index 0000000000..8e27be7d61 --- /dev/null +++ b/core/io/encodings/binary/tags.txt @@ -0,0 +1 @@ +text diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index 767e9b266b..e6811b6e6d 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -23,6 +23,18 @@ SYMBOL: begin : finish-decoding ( buf ch state -- str ) begin eq? [ decode-error ] unless drop "" like ; -: decode ( seq quot -- str ) - >r [ length 0 begin ] keep r> each - finish-decoding ; inline +: decode ( ch state seq quot -- buf ch state ) + [ -rot ] swap compose each ; inline + +: start-decoding ( seq -- buf ch state seq ) + [ length 0 begin ] keep ; + +GENERIC: init-decoding ( stream encoding -- decoded-stream ) + +: ( stream decoding-class -- decoded-stream ) + construct-empty init-decoding ; + +GENERIC: init-encoding ( stream encoding -- encoded-stream ) + +: ( stream encoding-class -- encoded-stream ) + construct-empty init-encoding ; diff --git a/core/io/encodings/latin1/about.txt b/core/io/encodings/latin1/about.txt new file mode 100644 index 0000000000..d40d628767 --- /dev/null +++ b/core/io/encodings/latin1/about.txt @@ -0,0 +1 @@ +ISO 8859-1 encoding/decoding diff --git a/core/io/encodings/latin1/authors.txt b/core/io/encodings/latin1/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/core/io/encodings/latin1/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/core/io/encodings/latin1/latin1-docs.factor b/core/io/encodings/latin1/latin1-docs.factor new file mode 100644 index 0000000000..5872b2bcfd --- /dev/null +++ b/core/io/encodings/latin1/latin1-docs.factor @@ -0,0 +1,5 @@ +USING: help.syntax help.markup ; +IN: io.encodings.latin1 + +HELP: latin1 +{ $class-description "This class is used for Latin 1 (ISO 8859-1) encoding and decoding" } ; diff --git a/core/io/encodings/latin1/latin1.factor b/core/io/encodings/latin1/latin1.factor new file mode 100644 index 0000000000..2c2aa8d60a --- /dev/null +++ b/core/io/encodings/latin1/latin1.factor @@ -0,0 +1,19 @@ +USING: io.encodings strings kernel ; +IN: io.encodings.latin1 + +TUPLE: latin1 stream ; + +M: latin1 init-decoding tuck set-latin1-stream ; +M: latin1 init-encoding drop ; + +M: latin1 stream-read1 + latin1-stream stream-read1 ; + +M: latin1 stream-read + latin1-stream stream-read >string ; + +M: latin1 stream-read-until + latin1-stream stream-read-until >string ; + +M: latin1 stream-readln + latin1-stream stream-readln >string ; diff --git a/core/io/encodings/latin1/tags.txt b/core/io/encodings/latin1/tags.txt new file mode 100644 index 0000000000..8e27be7d61 --- /dev/null +++ b/core/io/encodings/latin1/tags.txt @@ -0,0 +1 @@ +text diff --git a/core/io/encodings/utf16/.utf16.factor.swo b/core/io/encodings/utf16/.utf16.factor.swo new file mode 100644 index 0000000000000000000000000000000000000000..01be8fdab2946825bf902b05af5135496b67501b GIT binary patch literal 16384 zcmeI2O^h5z6~~JZJ2)Xgk^>yynghG?_Uz0o*wI+KE7mc=!kZA*YZ+E6rDwW&rnNoY z)iI3Oil$N>d};6Q}Pkxwa+V&wxN1rQP!j)@fhue!RYyLY^- zk3rSeZ`RZO>Q%jey{hi6nr`#rxivb~XbK!36XLVK`tgImeC~w!?FWPi1EoUgxe;GG z;8=5eL8Rhz;Iu#82;-g`PB;E);c&rsdn!&AdT}Bbf_Onjz1Rz)e!4JLe(PlR*~pBh z`?6=P0#<=rRp9Q8#dfRt$>TG0-@O~RTG-lK1*`&A0jq#jz$#!BunJfOtOD;4@$rdfj;oT61WGv^1%If@|OsSOj;0x9<|-1#k^0kb-s40!<*m zjXQ;S5quLo3eJF4a1Z#`F(G~iz5~7u*1;M$0gi$q_{UKp{s4XtehRLGC%^!F2`m8uN5S8ZAWrZycosYZ z6iC5&@CD$2Bj9%M>g__j0)7IX2Tz0R;1ZYz$H2Q-$ax3+4m=H>0#`vBw7`GR?>E3} z;8}1TJOM6(4WRP^@=lHpr6ac|X_gY129LYlP?Fac`SasT@y+vhOO13nec>6euVEl?bZNycU(9ep8=guAmWz zb?eFO$0vt$J45FpQUgi;Sd9}okN*cLrGt1JdgMtZdy3r1qo5z**Dd7I>ceZ~DqW;2 z`#}`phc4p6#uv|@ou~vA;8)QVH>F-Y3ZzGV5)Ube8+p`e10mk8U7O0fXhU#SdMJ8f zAW;|E;YJj9)1Hh_^DrJqieeugJxShp6b3z41#v`doe8vui)id=trjb`OPi!<3!hpS z>BuCB6P(zqm}t4Xa;0o(&P}%sr`RGra=*KxlUJzSaA(NNoLJ~kPAQ`mSE(Mni$?4=I*(){&lk4{q%g+T<}JGuA5&M)lCcHFE@{Ob+eWI1ox7D=}^fNm(f zM?-g;M@qF}Xm`D7?27wzu-skfW>L-4JWd5Z8YcKMA4Y@}BLV@#Lg>@@K=$NU{UAwI z<)RL@)x<%LF2UCNaI=geByKU7YA(D&<=(<6)8s zSs!MDTvurN<4{#`Pfn_GTFtC66Yn|)GUuB#Yi0_QqI2lG>BVEd zp%oRI?^S1d8i$RS#3QO%@I4pU{Jbg+`P?tQ28j($(VOoVa|bMDrEV}L#RpU+pbKOwACcu#eMNs!!_M_ zsAy{`-<1cXd^SlPCgsfo!vRW_Z^8Kh`Cr{$%Wp2mL#EWpZ9Uu8ec#exsds(ZS!J;3 zFWF7DRO`Mm!-`gW;*ev;iyb3lE&rAi$}>J$u4maKyB2F%lb0#Y!g+qx#|q1cHk&by zaWZXW7n23wcZ)nOW|DH_OIK?no%^swTZKU}4EPS*G`9&J3y;erUeRWH$WIvpW~}H~ zs(n+E|0ZL;oO#)N&o?lB6e-UGB|i}4Ca$W(oSl@N*HzVzT9clhWv>&lF$0(sz;odG zm)BMwKHo8Cim!U`RKB$<-&)1Dtiy!hyEPuh^({`6pUUz4JnROMi~BO(0`Q2f6rS%P z#K!u$haT+U2~7=XTPBeV`8jcn2M`HEO#LZ6hIa)n7P&*IU#e2BgS{m?c&J~Ok_JkR zIt%9a^86g%h?D*T9^_?|%7(hC$j8(9>iPqepF_bj@n|Or`U6F?XXdEgY@VFw&r|e( z8wF^`*+C*B%#r~@8JCY&wKuV#ykF>!1Lfa5QDSe2>2)7<@ufeb?`EH z3VaJ}1AgDX7uDRspMkRlq7>6|f3e1*`&Af&T*q z2nSXiYI_P;cqn$avTZ8oN)pt!y2@9w6;^m^;Vi#a!bSNyUNcLZWaTTZ(#miAm_%EK zBekA2rcVo)|kyUpRRGL2<$p@T3ZO3 z;y_2kVUGgdgw(s6s1qVg_AV=%Q&^Dh4aU(n1lg8yq|2j|ba8(ihASuw2aCG(%kaWo zw*^^6vEx?>AX_6TmN2qvc~!#{)e9b(Qt)~leK U6w0WN#fU9xV|69mAqVFF1)sD~array ] unit-test +[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test +[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test +[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test + +[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test + +[ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test +[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test +[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test +[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test + +[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test diff --git a/core/io/encodings/utf16/utf16.factor b/core/io/encodings/utf16/utf16.factor new file mode 100755 index 0000000000..c38e7845df --- /dev/null +++ b/core/io/encodings/utf16/utf16.factor @@ -0,0 +1,116 @@ +! Copyright (C) 2006, 2007 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: math kernel sequences sbufs vectors namespaces io.binary +io.encodings combinators splitting ; +IN: io.utf16 + +SYMBOL: double +SYMBOL: quad1 +SYMBOL: quad2 +SYMBOL: quad3 +SYMBOL: ignore + +: do-ignore ( -- ch state ) 0 ignore ; + +: append-nums ( byte ch -- ch ) + 8 shift bitor ; + +: end-multibyte ( buf byte ch -- buf ch state ) + append-nums decoded ; + +: begin-utf16be ( buf byte -- buf ch state ) + dup -3 shift BIN: 11011 number= [ + dup BIN: 00000100 bitand zero? + [ BIN: 11 bitand quad1 ] + [ drop do-ignore ] if + ] [ double ] if ; + +: handle-quad2be ( byte ch -- ch state ) + swap dup -2 shift BIN: 110111 number= [ + >r 2 shift r> BIN: 11 bitand bitor quad3 + ] [ 2drop do-ignore ] if ; + +: (decode-utf16be) ( buf byte ch state -- buf ch state ) + { + { begin [ drop begin-utf16be ] } + { double [ end-multibyte ] } + { quad1 [ append-nums quad2 ] } + { quad2 [ handle-quad2be ] } + { quad3 [ append-nums HEX: 10000 + decoded ] } + { ignore [ 2drop push-replacement ] } + } case ; + +: decode-utf16be ( seq -- str ) + [ (decode-utf16be) ] decode ; + +: handle-double ( buf byte ch -- buf ch state ) + swap dup -3 shift BIN: 11011 = [ + dup BIN: 100 bitand 0 number= + [ BIN: 11 bitand 8 shift bitor quad2 ] + [ 2drop push-replacement ] if + ] [ end-multibyte ] if ; + +: handle-quad3le ( buf byte ch -- buf ch state ) + swap dup -2 shift BIN: 110111 = [ + BIN: 11 bitand append-nums HEX: 10000 + decoded + ] [ 2drop push-replacement ] if ; + +: (decode-utf16le) ( buf byte ch state -- buf ch state ) + { + { begin [ drop double ] } + { double [ handle-double ] } + { quad1 [ append-nums quad2 ] } + { quad2 [ 10 shift bitor quad3 ] } + { quad3 [ handle-quad3le ] } + } case ; + +: decode-utf16le ( seq -- str ) + [ (decode-utf16le) ] decode ; + +: encode-first + -10 shift + dup -8 shift BIN: 11011000 bitor + swap HEX: FF bitand ; + +: encode-second + BIN: 1111111111 bitand + dup -8 shift BIN: 11011100 bitor + swap BIN: 11111111 bitand ; + +: char>utf16be ( char -- ) + dup HEX: FFFF > [ + HEX: 10000 - + dup encode-first swap , , + encode-second swap , , + ] [ h>b/b , , ] if ; + +: encode-utf16be ( str -- seq ) + [ [ char>utf16be ] each ] B{ } make ; + +: char>utf16le ( char -- ) + dup HEX: FFFF > [ + HEX: 10000 - + dup encode-first , , + encode-second , , + ] [ h>b/b swap , , ] if ; + +: encode-utf16le ( str -- seq ) + [ [ char>utf16le ] each ] B{ } make ; + +: bom-le B{ HEX: ff HEX: fe } ; inline + +: bom-be B{ HEX: fe HEX: ff } ; inline + +: encode-utf16 ( str -- seq ) + encode-utf16le bom-le swap append ; + +: utf16le? ( seq1 -- seq2 ? ) bom-le ?head ; + +: utf16be? ( seq1 -- seq2 ? ) bom-be ?head ; + +: decode-utf16 ( seq -- str ) + { + { [ utf16le? ] [ decode-utf16le ] } + { [ utf16be? ] [ decode-utf16be ] } + { [ t ] [ decode-error ] } + } cond ; diff --git a/core/io/encodings/utf8/about.txt b/core/io/encodings/utf8/about.txt new file mode 100644 index 0000000000..7560b72db4 --- /dev/null +++ b/core/io/encodings/utf8/about.txt @@ -0,0 +1 @@ +UTF-8 encoding and decoding diff --git a/core/io/encodings/utf8/authors.txt b/core/io/encodings/utf8/authors.txt new file mode 100644 index 0000000000..f990dd0ed2 --- /dev/null +++ b/core/io/encodings/utf8/authors.txt @@ -0,0 +1 @@ +Daniel Ehrenberg diff --git a/core/io/encodings/utf8/summary.txt b/core/io/encodings/utf8/summary.txt new file mode 100644 index 0000000000..afd259a56b --- /dev/null +++ b/core/io/encodings/utf8/summary.txt @@ -0,0 +1 @@ +UTF8 encoding/decoding diff --git a/core/io/encodings/utf8/tags.txt b/core/io/encodings/utf8/tags.txt new file mode 100644 index 0000000000..8e27be7d61 --- /dev/null +++ b/core/io/encodings/utf8/tags.txt @@ -0,0 +1 @@ +text diff --git a/core/io/encodings/utf8/utf8-docs.factor b/core/io/encodings/utf8/utf8-docs.factor new file mode 100644 index 0000000000..212552519c --- /dev/null +++ b/core/io/encodings/utf8/utf8-docs.factor @@ -0,0 +1,18 @@ +USING: help.markup help.syntax io.encodings strings ; +IN: io.encodings.utf8 + +ARTICLE: "io.utf8" "Working with UTF8-encoded data" +"The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences." +{ $subsection encode-utf8 } +{ $subsection decode-utf8 } ; + +ABOUT: "io.utf8" + +HELP: decode-utf8 +{ $values { "seq" "a sequence of bytes" } { "str" string } } +{ $description "Decodes a sequence of bytes representing a Unicode string in UTF8 format." } +{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; + +HELP: encode-utf8 +{ $values { "str" string } { "seq" "a sequence of bytes" } } +{ $description "Encodes a Unicode string as a sequence of bytes in UTF8 format." } ; diff --git a/core/io/encodings/utf8/utf8-tests.factor b/core/io/encodings/utf8/utf8-tests.factor new file mode 100644 index 0000000000..3576471586 --- /dev/null +++ b/core/io/encodings/utf8/utf8-tests.factor @@ -0,0 +1,16 @@ +USING: io.utf8 tools.test strings arrays unicode.syntax ; + +[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test + +[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test + +[ "x" ] [ "x" decode-utf8 >string ] unit-test + +[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test + +[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test + +[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test + +[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ] +[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor new file mode 100644 index 0000000000..90aec4623a --- /dev/null +++ b/core/io/encodings/utf8/utf8.factor @@ -0,0 +1,117 @@ +! Copyright (C) 2006, 2007 Daniel Ehrenberg. +! See http://factorcode.org/license.txt for BSD license. +USING: math kernel sequences sbufs vectors growable io +namespaces io.encodings combinators ; +IN: io.utf8 + +! Decoding UTF-8 + +SYMBOL: double +SYMBOL: triple +SYMBOL: triple2 +SYMBOL: quad +SYMBOL: quad2 +SYMBOL: quad3 + +: starts-2? ( char -- ? ) + -6 shift BIN: 10 number= ; + +: append-nums ( buf bottom top state-out -- buf num state ) + >r over starts-2? + [ 6 shift swap BIN: 111111 bitand bitor r> ] + [ r> 3drop push-replacement ] if ; + +: begin-utf8 ( buf byte -- buf ch state ) + { + { [ dup -7 shift zero? ] [ decoded ] } + { [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] } + { [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] } + { [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] } + { [ t ] [ drop push-replacement ] } + } cond ; + +: end-multibyte ( buf byte ch -- buf ch state ) + f append-nums [ decoded ] unless* ; + +: (decode-utf8) ( buf byte ch state -- buf ch state ) + { + { begin [ drop begin-utf8 ] } + { double [ end-multibyte ] } + { triple [ triple2 append-nums ] } + { triple2 [ end-multibyte ] } + { quad [ quad2 append-nums ] } + { quad2 [ quad3 append-nums ] } + { quad3 [ end-multibyte ] } + } case ; + +: decode-utf8-chunk ( ch state seq -- buf ch state ) + [ (decode-utf8) ] decode ; + +: decode-utf8 ( seq -- str ) + start-decoding decode-utf8-chunk finish-decoding ; + +! Encoding UTF-8 + +: encoded ( char -- ) + BIN: 111111 bitand BIN: 10000000 bitor , ; + +: char>utf8 ( char -- ) + { + { [ dup -7 shift zero? ] [ , ] } + { [ dup -11 shift zero? ] [ + dup -6 shift BIN: 11000000 bitor , + encoded + ] } + { [ dup -16 shift zero? ] [ + dup -12 shift BIN: 11100000 bitor , + dup -6 shift encoded + encoded + ] } + { [ t ] [ + dup -18 shift BIN: 11110000 bitor , + dup -12 shift encoded + dup -6 shift encoded + encoded + ] } + } cond ; + +: encode-utf8 ( str -- seq ) + [ [ char>utf8 ] each ] B{ } make ; + +! Interface for streams + +TUPLE: utf8 ; +! In the future, this should detect and ignore a BOM at the beginning + +M: utf8 init-decoding ( stream utf8 -- utf8-stream ) + tuck set-delegate ; + +M: utf8 init-encoding ( stream utf8 -- utf8-stream ) + tuck set-delegate ; + +M: utf8 stream-read1 1 swap stream-read ; + +: space ( resizable -- room-left ) + dup underlying swap [ length ] 2apply - ; + +: full? ( resizable -- ? ) space zero? ; + +: utf8-stream-read ( buf ch state stream -- string ) + >r pick full? [ r> 3drop >string ] [ + pick space r> [ stream-read decode-utf8-chunk ] keep + utf8-stream-read + ] if ; + +M: utf8 stream-read + >r start-decoding drop r> delegate utf8-stream-read ; + +M: utf8 stream-read-until + ! Copied from { c-reader stream-read-until }!!! + [ swap read-until-loop ] "" make + swap over empty? over not and [ 2drop f f ] when ; + +M: utf8 stream-write1 + >r 1string r> stream-write ; + +M: utf8 stream-write + >r encode-utf8 r> delegate stream-write ; diff --git a/extra/delegate/protocols/protocols.factor b/extra/delegate/protocols/protocols.factor index 1121883b7c..e6f794de53 100755 --- a/extra/delegate/protocols/protocols.factor +++ b/extra/delegate/protocols/protocols.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2007 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. USING: delegate sequences.private sequences assocs prettyprint.sections -io definitions kernel ; +io definitions kernel continuations ; IN: delegate.protocols PROTOCOL: sequence-protocol @@ -15,7 +15,7 @@ PROTOCOL: assoc-protocol ! everything should work, just slower (with >alist) PROTOCOL: stream-protocol - stream-read1 stream-read stream-read-until + stream-read1 stream-read stream-read-until dispose stream-flush stream-write1 stream-write stream-format stream-nl make-span-stream make-block-stream stream-readln make-cell-stream stream-write-table set-timeout ; From 8bbc144ce7bb7eddd0aa5a737595e9b4ad2850e9 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Mon, 11 Feb 2008 17:44:14 -0600 Subject: [PATCH 4/8] io/utf8 and 16 were moved to core/io/encodings --- core/io/utf16/authors.txt | 1 - core/io/utf16/summary.txt | 1 - core/io/utf16/utf16-docs.factor | 45 ------------ core/io/utf16/utf16-tests.factor | 15 ---- core/io/utf16/utf16.factor | 116 ------------------------------- core/io/utf8/authors.txt | 1 - core/io/utf8/summary.txt | 1 - core/io/utf8/utf8-docs.factor | 18 ----- core/io/utf8/utf8-tests.factor | 16 ----- core/io/utf8/utf8.factor | 72 ------------------- 10 files changed, 286 deletions(-) delete mode 100644 core/io/utf16/authors.txt delete mode 100644 core/io/utf16/summary.txt delete mode 100644 core/io/utf16/utf16-docs.factor delete mode 100755 core/io/utf16/utf16-tests.factor delete mode 100755 core/io/utf16/utf16.factor delete mode 100644 core/io/utf8/authors.txt delete mode 100644 core/io/utf8/summary.txt delete mode 100644 core/io/utf8/utf8-docs.factor delete mode 100644 core/io/utf8/utf8-tests.factor delete mode 100644 core/io/utf8/utf8.factor diff --git a/core/io/utf16/authors.txt b/core/io/utf16/authors.txt deleted file mode 100644 index f990dd0ed2..0000000000 --- a/core/io/utf16/authors.txt +++ /dev/null @@ -1 +0,0 @@ -Daniel Ehrenberg diff --git a/core/io/utf16/summary.txt b/core/io/utf16/summary.txt deleted file mode 100644 index b2490675aa..0000000000 --- a/core/io/utf16/summary.txt +++ /dev/null @@ -1 +0,0 @@ -UTF16 encoding/decoding diff --git a/core/io/utf16/utf16-docs.factor b/core/io/utf16/utf16-docs.factor deleted file mode 100644 index 6d24f54694..0000000000 --- a/core/io/utf16/utf16-docs.factor +++ /dev/null @@ -1,45 +0,0 @@ -USING: help.markup help.syntax io.encodings strings ; -IN: io.utf16 - -ARTICLE: "io.utf16" "Working with UTF16-encoded data" -"The UTF16 encoding is a variable-width encoding. Unicode code points are encoded as 2 or 4 byte sequences." -{ $subsection encode-utf16le } -{ $subsection encode-utf16be } -{ $subsection decode-utf16le } -{ $subsection decode-utf16be } -"Support for UTF16 data with a byte order mark:" -{ $subsection encode-utf16 } -{ $subsection decode-utf16 } ; - -ABOUT: "io.utf16" - -HELP: decode-utf16 -{ $values { "seq" "a sequence of bytes" } { "str" string } } -{ $description "Decodes a sequence of bytes representing a Unicode string in UTF16 format. The bytes must begin with a UTF16 byte order mark, which determines if the input is in little or big endian. To decode data without a byte order mark, use " { $link decode-utf16le } " or " { $link decode-utf16be } "." } -{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; - -HELP: decode-utf16be -{ $values { "seq" "a sequence of bytes" } { "str" string } } -{ $description "Decodes a sequence of bytes representing a Unicode string in big endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." } -{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; - -HELP: decode-utf16le -{ $values { "seq" "a sequence of bytes" } { "str" string } } -{ $description "Decodes a sequence of bytes representing a Unicode string in little endian UTF16 format. The bytes must not begin with a UTF16 byte order mark. To decode data with a byte order mark, use " { $link decode-utf16 } "." } -{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; - -{ decode-utf16 decode-utf16le decode-utf16be } related-words - -HELP: encode-utf16be -{ $values { "str" string } { "seq" "a sequence of bytes" } } -{ $description "Encodes a Unicode string as a sequence of bytes in big endian UTF16 format." } ; - -HELP: encode-utf16le -{ $values { "str" string } { "seq" "a sequence of bytes" } } -{ $description "Encodes a Unicode string as a sequence of bytes in little endian UTF16 format." } ; - -HELP: encode-utf16 -{ $values { "str" string } { "seq" "a sequence of bytes" } } -{ $description "Encodes a Unicode string as a sequence of bytes in UTF16 format with a byte order mark." } ; - -{ encode-utf16 encode-utf16be encode-utf16le } related-words diff --git a/core/io/utf16/utf16-tests.factor b/core/io/utf16/utf16-tests.factor deleted file mode 100755 index 9800a9827d..0000000000 --- a/core/io/utf16/utf16-tests.factor +++ /dev/null @@ -1,15 +0,0 @@ -USING: tools.test io.utf16 arrays unicode.syntax ; - -[ { CHAR: x } ] [ { 0 CHAR: x } decode-utf16be >array ] unit-test -[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test - -[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test - -[ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test -[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test - -[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test diff --git a/core/io/utf16/utf16.factor b/core/io/utf16/utf16.factor deleted file mode 100755 index 19ebc1d43a..0000000000 --- a/core/io/utf16/utf16.factor +++ /dev/null @@ -1,116 +0,0 @@ -! Copyright (C) 2006, 2007 Daniel Ehrenberg. -! See http://factorcode.org/license.txt for BSD license. -USING: math kernel sequences sbufs vectors namespaces io.binary -io.encodings combinators splitting ; -IN: io.utf16 - -SYMBOL: double -SYMBOL: quad1 -SYMBOL: quad2 -SYMBOL: quad3 -SYMBOL: ignore - -: do-ignore ( -- ch state ) 0 ignore ; - -: append-nums ( byte ch -- ch ) - 8 shift bitor ; - -: end-multibyte ( buf byte ch -- buf ch state ) - append-nums decoded ; - -: begin-utf16be ( buf byte -- buf ch state ) - dup -3 shift BIN: 11011 number= [ - dup BIN: 00000100 bitand zero? - [ BIN: 11 bitand quad1 ] - [ drop do-ignore ] if - ] [ double ] if ; - -: handle-quad2be ( byte ch -- ch state ) - swap dup -2 shift BIN: 110111 number= [ - >r 2 shift r> BIN: 11 bitand bitor quad3 - ] [ 2drop do-ignore ] if ; - -: (decode-utf16be) ( buf byte ch state -- buf ch state ) - { - { begin [ drop begin-utf16be ] } - { double [ end-multibyte ] } - { quad1 [ append-nums quad2 ] } - { quad2 [ handle-quad2be ] } - { quad3 [ append-nums HEX: 10000 + decoded ] } - { ignore [ 2drop push-replacement ] } - } case ; - -: decode-utf16be ( seq -- str ) - [ -rot (decode-utf16be) ] decode ; - -: handle-double ( buf byte ch -- buf ch state ) - swap dup -3 shift BIN: 11011 = [ - dup BIN: 100 bitand 0 number= - [ BIN: 11 bitand 8 shift bitor quad2 ] - [ 2drop push-replacement ] if - ] [ end-multibyte ] if ; - -: handle-quad3le ( buf byte ch -- buf ch state ) - swap dup -2 shift BIN: 110111 = [ - BIN: 11 bitand append-nums HEX: 10000 + decoded - ] [ 2drop push-replacement ] if ; - -: (decode-utf16le) ( buf byte ch state -- buf ch state ) - { - { begin [ drop double ] } - { double [ handle-double ] } - { quad1 [ append-nums quad2 ] } - { quad2 [ 10 shift bitor quad3 ] } - { quad3 [ handle-quad3le ] } - } case ; - -: decode-utf16le ( seq -- str ) - [ -rot (decode-utf16le) ] decode ; - -: encode-first - -10 shift - dup -8 shift BIN: 11011000 bitor - swap HEX: FF bitand ; - -: encode-second - BIN: 1111111111 bitand - dup -8 shift BIN: 11011100 bitor - swap BIN: 11111111 bitand ; - -: char>utf16be ( char -- ) - dup HEX: FFFF > [ - HEX: 10000 - - dup encode-first swap , , - encode-second swap , , - ] [ h>b/b , , ] if ; - -: encode-utf16be ( str -- seq ) - [ [ char>utf16be ] each ] B{ } make ; - -: char>utf16le ( char -- ) - dup HEX: FFFF > [ - HEX: 10000 - - dup encode-first , , - encode-second , , - ] [ h>b/b swap , , ] if ; - -: encode-utf16le ( str -- seq ) - [ [ char>utf16le ] each ] B{ } make ; - -: bom-le B{ HEX: ff HEX: fe } ; inline - -: bom-be B{ HEX: fe HEX: ff } ; inline - -: encode-utf16 ( str -- seq ) - encode-utf16le bom-le swap append ; - -: utf16le? ( seq1 -- seq2 ? ) bom-le ?head ; - -: utf16be? ( seq1 -- seq2 ? ) bom-be ?head ; - -: decode-utf16 ( seq -- str ) - { - { [ utf16le? ] [ decode-utf16le ] } - { [ utf16be? ] [ decode-utf16be ] } - { [ t ] [ decode-error ] } - } cond ; diff --git a/core/io/utf8/authors.txt b/core/io/utf8/authors.txt deleted file mode 100644 index f990dd0ed2..0000000000 --- a/core/io/utf8/authors.txt +++ /dev/null @@ -1 +0,0 @@ -Daniel Ehrenberg diff --git a/core/io/utf8/summary.txt b/core/io/utf8/summary.txt deleted file mode 100644 index afd259a56b..0000000000 --- a/core/io/utf8/summary.txt +++ /dev/null @@ -1 +0,0 @@ -UTF8 encoding/decoding diff --git a/core/io/utf8/utf8-docs.factor b/core/io/utf8/utf8-docs.factor deleted file mode 100644 index 28310b5d77..0000000000 --- a/core/io/utf8/utf8-docs.factor +++ /dev/null @@ -1,18 +0,0 @@ -USING: help.markup help.syntax io.encodings strings ; -IN: io.utf8 - -ARTICLE: "io.utf8" "Working with UTF8-encoded data" -"The UTF8 encoding is a variable-width encoding. 7-bit ASCII characters are encoded as single bytes, and other Unicode code points are encoded as 2 to 4 byte sequences." -{ $subsection encode-utf8 } -{ $subsection decode-utf8 } ; - -ABOUT: "io.utf8" - -HELP: decode-utf8 -{ $values { "seq" "a sequence of bytes" } { "str" string } } -{ $description "Decodes a sequence of bytes representing a Unicode string in UTF8 format." } -{ $errors "Throws a " { $link decode-error } " if the input is malformed." } ; - -HELP: encode-utf8 -{ $values { "str" string } { "seq" "a sequence of bytes" } } -{ $description "Encodes a Unicode string as a sequence of bytes in UTF8 format." } ; diff --git a/core/io/utf8/utf8-tests.factor b/core/io/utf8/utf8-tests.factor deleted file mode 100644 index 3576471586..0000000000 --- a/core/io/utf8/utf8-tests.factor +++ /dev/null @@ -1,16 +0,0 @@ -USING: io.utf8 tools.test strings arrays unicode.syntax ; - -[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test - -[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test - -[ "x" ] [ "x" decode-utf8 >string ] unit-test - -[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test - -[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test - -[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test - -[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ] -[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test diff --git a/core/io/utf8/utf8.factor b/core/io/utf8/utf8.factor deleted file mode 100644 index 213afb6eae..0000000000 --- a/core/io/utf8/utf8.factor +++ /dev/null @@ -1,72 +0,0 @@ -! Copyright (C) 2006, 2007 Daniel Ehrenberg. -! See http://factorcode.org/license.txt for BSD license. -USING: math kernel sequences sbufs vectors -namespaces io.encodings combinators ; -IN: io.utf8 - -SYMBOL: double -SYMBOL: triple -SYMBOL: triple2 -SYMBOL: quad -SYMBOL: quad2 -SYMBOL: quad3 - -: starts-2? ( char -- ? ) - -6 shift BIN: 10 number= ; - -: append-nums ( buf bottom top state-out -- buf num state ) - >r over starts-2? - [ 6 shift swap BIN: 111111 bitand bitor r> ] - [ r> 3drop push-replacement ] if ; - -: begin-utf8 ( buf byte -- buf ch state ) - { - { [ dup -7 shift zero? ] [ decoded ] } - { [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] } - { [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] } - { [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] } - { [ t ] [ drop push-replacement ] } - } cond ; - -: end-multibyte ( buf byte ch -- buf ch state ) - f append-nums [ decoded ] unless* ; - -: (decode-utf8) ( buf byte ch state -- buf ch state ) - { - { begin [ drop begin-utf8 ] } - { double [ end-multibyte ] } - { triple [ triple2 append-nums ] } - { triple2 [ end-multibyte ] } - { quad [ quad2 append-nums ] } - { quad2 [ quad3 append-nums ] } - { quad3 [ end-multibyte ] } - } case ; - -: decode-utf8 ( seq -- str ) - [ -rot (decode-utf8) ] decode ; - -: encoded ( char -- ) - BIN: 111111 bitand BIN: 10000000 bitor , ; - -: char>utf8 ( char -- ) - { - { [ dup -7 shift zero? ] [ , ] } - { [ dup -11 shift zero? ] [ - dup -6 shift BIN: 11000000 bitor , - encoded - ] } - { [ dup -16 shift zero? ] [ - dup -12 shift BIN: 11100000 bitor , - dup -6 shift encoded - encoded - ] } - { [ t ] [ - dup -18 shift BIN: 11110000 bitor , - dup -12 shift encoded - dup -6 shift encoded - encoded - ] } - } cond ; - -: encode-utf8 ( str -- seq ) - [ [ char>utf8 ] each ] B{ } make ; From 014b79caada3316522876c098ead25450f978f60 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 13 Feb 2008 01:02:37 -0600 Subject: [PATCH 5/8] More unicode changes --- core/io/encodings/encodings.factor | 26 +++++++++++++++++---- core/io/encodings/utf16/utf16.factor | 8 +++---- core/io/encodings/utf8/utf8-tests.factor | 21 +++++++++++------ core/io/encodings/utf8/utf8.factor | 29 ++++++++---------------- 4 files changed, 48 insertions(+), 36 deletions(-) diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index e6811b6e6d..dcc055f941 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: math kernel sequences sbufs vectors -namespaces unicode.syntax ; +namespaces unicode.syntax growable strings io ; IN: io.encodings TUPLE: encode-error ; @@ -23,11 +23,27 @@ SYMBOL: begin : finish-decoding ( buf ch state -- str ) begin eq? [ decode-error ] unless drop "" like ; -: decode ( ch state seq quot -- buf ch state ) - [ -rot ] swap compose each ; inline +: start-decoding ( seq length -- buf ch state seq ) + 0 begin roll ; -: start-decoding ( seq -- buf ch state seq ) - [ length 0 begin ] keep ; +: decode ( seq quot -- string ) + >r dup length start-decoding r> + [ -rot ] swap compose each + finish-decoding ; inline + +: space ( resizable -- room-left ) + dup underlying swap [ length ] 2apply - ; + +: full? ( resizable -- ? ) space zero? ; + +: decode-part-loop ( buf ch state stream quot -- string ) + >r >r pick r> r> rot full? + [ 2drop 2drop >string ] + [ [ >r stream-read1 -rot r> call ] 2keep decode-part-loop ] if ; inline + +: decode-part ( length stream quot -- string ) + >r swap start-decoding r> + decode-part-loop ; inline GENERIC: init-decoding ( stream encoding -- decoded-stream ) diff --git a/core/io/encodings/utf16/utf16.factor b/core/io/encodings/utf16/utf16.factor index c38e7845df..ccf76649e2 100755 --- a/core/io/encodings/utf16/utf16.factor +++ b/core/io/encodings/utf16/utf16.factor @@ -30,7 +30,7 @@ SYMBOL: ignore >r 2 shift r> BIN: 11 bitand bitor quad3 ] [ 2drop do-ignore ] if ; -: (decode-utf16be) ( buf byte ch state -- buf ch state ) +: decode-utf16be-step ( buf byte ch state -- buf ch state ) { { begin [ drop begin-utf16be ] } { double [ end-multibyte ] } @@ -41,7 +41,7 @@ SYMBOL: ignore } case ; : decode-utf16be ( seq -- str ) - [ (decode-utf16be) ] decode ; + [ decode-utf16be-step ] decode ; : handle-double ( buf byte ch -- buf ch state ) swap dup -3 shift BIN: 11011 = [ @@ -55,7 +55,7 @@ SYMBOL: ignore BIN: 11 bitand append-nums HEX: 10000 + decoded ] [ 2drop push-replacement ] if ; -: (decode-utf16le) ( buf byte ch state -- buf ch state ) +: decode-utf16le-step ( buf byte ch state -- buf ch state ) { { begin [ drop double ] } { double [ handle-double ] } @@ -65,7 +65,7 @@ SYMBOL: ignore } case ; : decode-utf16le ( seq -- str ) - [ (decode-utf16le) ] decode ; + [ decode-utf16le-step ] decode ; : encode-first -10 shift diff --git a/core/io/encodings/utf8/utf8-tests.factor b/core/io/encodings/utf8/utf8-tests.factor index 3576471586..33c4ffbf12 100644 --- a/core/io/encodings/utf8/utf8-tests.factor +++ b/core/io/encodings/utf8/utf8-tests.factor @@ -1,16 +1,23 @@ -USING: io.utf8 tools.test strings arrays unicode.syntax ; +USING: io.encodings.utf8 tools.test sbufs kernel io +sequences strings arrays unicode.syntax ; -[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test +: decode-utf8-w/stream ( array -- newarray ) + >sbuf dup reverse-here contents >array ; -[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test +: encode-utf8-w/stream ( array -- newarray ) + SBUF" " clone tuck write >array ; -[ "x" ] [ "x" decode-utf8 >string ] unit-test +[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test -[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test +[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test +[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test -[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test +[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test + +[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test + +[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test [ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ] [ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor index 90aec4623a..c0fa66e553 100644 --- a/core/io/encodings/utf8/utf8.factor +++ b/core/io/encodings/utf8/utf8.factor @@ -1,8 +1,8 @@ ! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: math kernel sequences sbufs vectors growable io -namespaces io.encodings combinators ; -IN: io.utf8 +USING: math kernel sequences sbufs vectors growable io continuations +namespaces io.encodings combinators strings io.streams.c ; +IN: io.encodings.utf8 ! Decoding UTF-8 @@ -33,7 +33,7 @@ SYMBOL: quad3 : end-multibyte ( buf byte ch -- buf ch state ) f append-nums [ decoded ] unless* ; -: (decode-utf8) ( buf byte ch state -- buf ch state ) +: decode-utf8-step ( buf byte ch state -- buf ch state ) { { begin [ drop begin-utf8 ] } { double [ end-multibyte ] } @@ -44,11 +44,8 @@ SYMBOL: quad3 { quad3 [ end-multibyte ] } } case ; -: decode-utf8-chunk ( ch state seq -- buf ch state ) - [ (decode-utf8) ] decode ; - : decode-utf8 ( seq -- str ) - start-decoding decode-utf8-chunk finish-decoding ; + [ decode-utf8-step ] decode ; ! Encoding UTF-8 @@ -81,6 +78,7 @@ SYMBOL: quad3 ! Interface for streams TUPLE: utf8 ; +: utf8 construct-delegate ; ! In the future, this should detect and ignore a BOM at the beginning M: utf8 init-decoding ( stream utf8 -- utf8-stream ) @@ -91,19 +89,8 @@ M: utf8 init-encoding ( stream utf8 -- utf8-stream ) M: utf8 stream-read1 1 swap stream-read ; -: space ( resizable -- room-left ) - dup underlying swap [ length ] 2apply - ; - -: full? ( resizable -- ? ) space zero? ; - -: utf8-stream-read ( buf ch state stream -- string ) - >r pick full? [ r> 3drop >string ] [ - pick space r> [ stream-read decode-utf8-chunk ] keep - utf8-stream-read - ] if ; - M: utf8 stream-read - >r start-decoding drop r> delegate utf8-stream-read ; + [ decode-utf8-step ] decode-part ; M: utf8 stream-read-until ! Copied from { c-reader stream-read-until }!!! @@ -115,3 +102,5 @@ M: utf8 stream-write1 M: utf8 stream-write >r encode-utf8 r> delegate stream-write ; + +M: utf8 dispose delegate dispose ; From 8d163c89369e79a918c06cdf3c614636654bd01e Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 13 Feb 2008 01:03:55 -0600 Subject: [PATCH 6/8] Removing extra/const --- extra/const/const.factor | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 extra/const/const.factor diff --git a/extra/const/const.factor b/extra/const/const.factor deleted file mode 100644 index 8efef7e372..0000000000 --- a/extra/const/const.factor +++ /dev/null @@ -1,24 +0,0 @@ -USING: kernel parser words sequences ; -IN: const - -: define-const ( word value -- ) - [ parsed ] curry dupd define - t "parsing" set-word-prop ; - -: CONST: - CREATE scan-word dup parsing? - [ execute dup pop ] when define-const ; parsing - -: define-enum ( words -- ) - dup length [ define-const ] 2each ; - -: ENUM: - ";" parse-tokens [ create-in ] map define-enum ; parsing - -: define-value ( word -- ) - { f } clone [ first ] curry define ; - -: VALUE: CREATE define-value ; parsing - -: set-value ( value word -- ) - word-def first set-first ; From 4833a6196dc2df53579a911143ce6862860f1aa2 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 13 Feb 2008 14:19:56 -0600 Subject: [PATCH 7/8] Fixing unicode.breaks --- extra/unicode/breaks/breaks.factor | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extra/unicode/breaks/breaks.factor b/extra/unicode/breaks/breaks.factor index 70a9c781a2..1014d3ad7e 100644 --- a/extra/unicode/breaks/breaks.factor +++ b/extra/unicode/breaks/breaks.factor @@ -1,7 +1,7 @@ USING: unicode.categories kernel math combinators splitting sequences math.parser io.files io assocs arrays namespaces combinators.lib assocs.lib math.ranges unicode.normalize -unicode.syntax unicode.data compiler.units alien.syntax const ; +unicode.syntax unicode.data compiler.units alien.syntax ; IN: unicode.breaks C-ENUM: Any L V T Extend Control CR LF graphemes ; From 01e9a5cb1ac984e99dd68ac534ca8ab9403c766a Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 13 Feb 2008 17:53:10 -0600 Subject: [PATCH 8/8] utf8 and utf16le streams --- core/io/encodings/encodings.factor | 20 +++++++++++------ core/io/encodings/utf16/utf16.factor | 32 ++++++++++++++++++++++++++++ core/io/encodings/utf8/utf8.factor | 4 +++- 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index dcc055f941..b27b89642d 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -36,14 +36,22 @@ SYMBOL: begin : full? ( resizable -- ? ) space zero? ; -: decode-part-loop ( buf ch state stream quot -- string ) - >r >r pick r> r> rot full? - [ 2drop 2drop >string ] - [ [ >r stream-read1 -rot r> call ] 2keep decode-part-loop ] if ; inline +: end-read-loop ( buf ch state stream quot -- string/f ) + 2drop 2drop >string f like ; -: decode-part ( length stream quot -- string ) +: under ( a b c -- c a b c ) + tuck >r swapd r> ; inline + +: decode-read-loop ( buf ch state stream quot -- string/f ) + >r >r pick r> r> rot full? [ end-read-loop ] [ + over stream-read1 [ + -rot tuck >r >r >r -rot r> call r> r> decode-read-loop + ] [ end-read-loop ] if* + ] if ; inline + +: decode-read ( length stream quot -- string ) >r swap start-decoding r> - decode-part-loop ; inline + decode-read-loop ; inline GENERIC: init-decoding ( stream encoding -- decoded-stream ) diff --git a/core/io/encodings/utf16/utf16.factor b/core/io/encodings/utf16/utf16.factor index ccf76649e2..81c982dd55 100755 --- a/core/io/encodings/utf16/utf16.factor +++ b/core/io/encodings/utf16/utf16.factor @@ -114,3 +114,35 @@ SYMBOL: ignore { [ utf16be? ] [ decode-utf16be ] } { [ t ] [ decode-error ] } } cond ; + +! UTF16LE streams + +TUPLE: utf16le ; +: utf16le construct-delegate ; +! In the future, this should detect and ignore a BOM at the beginning + +M: utf16le init-decoding ( stream utf16le -- utf16le-stream ) + tuck set-delegate ; + +M: utf16le init-encoding ( stream utf16le -- utf16le-stream ) + tuck set-delegate ; + +M: utf16le stream-read1 1 swap stream-read ; + +M: utf16le stream-read + delegate [ decode-utf16le-step ] decode-read ; + +M: utf16le stream-read-partial stream-read ; + +M: utf16le stream-read-until + ! Copied from { c-reader stream-read-until }!!! + [ swap read-until-loop ] "" make + swap over empty? over not and [ 2drop f f ] when ; + +M: utf16le stream-write1 + >r 1string r> stream-write ; + +M: utf16le stream-write + >r encode-utf16le r> delegate stream-write ; + +M: utf16le dispose delegate dispose ; diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor index c0fa66e553..de3fd5b67b 100644 --- a/core/io/encodings/utf8/utf8.factor +++ b/core/io/encodings/utf8/utf8.factor @@ -90,7 +90,9 @@ M: utf8 init-encoding ( stream utf8 -- utf8-stream ) M: utf8 stream-read1 1 swap stream-read ; M: utf8 stream-read - [ decode-utf8-step ] decode-part ; + delegate [ decode-utf8-step ] decode-read ; + +M: utf8 stream-read-partial stream-read ; M: utf8 stream-read-until ! Copied from { c-reader stream-read-until }!!!