From 4678da88613322e38acf4449d22accfdfbc84dea Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Fri, 15 Feb 2008 19:44:35 -0600 Subject: [PATCH] Working on unicode encodings --- core/io/encodings/encodings.factor | 24 ++++++------- core/io/encodings/utf16/utf16-tests.factor | 35 +++++++++++++------ core/io/encodings/utf16/utf16.factor | 40 +++++++++++++++++----- core/io/encodings/utf8/utf8-tests.factor | 10 +++--- core/io/encodings/utf8/utf8.factor | 1 - 5 files changed, 72 insertions(+), 38 deletions(-) diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index 27c74fc4bd..7e540a6aab 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -53,27 +53,17 @@ GENERIC: decode-step ( buf byte ch state encoding -- buf ch state ) >r swap start-decoding r> decode-read-loop ; -GENERIC: init-decoding ( stream encoding -- decoded-stream ) - : ( stream decoding-class -- decoded-stream ) - construct-empty init-decoding ; - -GENERIC: init-encoding ( stream encoding -- encoded-stream ) + construct-delegate ; : ( stream encoding-class -- encoded-stream ) - construct-empty init-encoding ; + construct-delegate ; GENERIC: encode-string ( string encoding -- byte-array ) M: tuple-class encode-string construct-empty encode-string ; MIXIN: encoding-stream -M: encoding-stream init-decoding ( stream encoding-stream -- encoding-stream ) - tuck set-delegate ; - -M: encoding-stream init-encoding ( stream encoding-stream -- encoding-stream ) - tuck set-delegate ; - M: encoding-stream stream-read1 1 swap stream-read ; M: encoding-stream stream-read @@ -93,3 +83,13 @@ M: encoding-stream stream-write [ encode-string ] keep delegate stream-write ; M: encoding-stream dispose delegate dispose ; + +GENERIC: underlying-stream ( encoded-stream -- delegate ) +M: encoding-stream underlying-stream delegate ; + +GENERIC: set-underlying-stream ( new-underlying stream -- ) +M: encoding-stream set-underlying-stream set-delegate ; + +: set-encoding ( encoding stream -- ) ! This doesn't work now + [ underlying-stream swap construct-delegate ] keep + set-underlying-stream ; diff --git a/core/io/encodings/utf16/utf16-tests.factor b/core/io/encodings/utf16/utf16-tests.factor index 9800a9827d..c5075d6c42 100755 --- a/core/io/encodings/utf16/utf16-tests.factor +++ b/core/io/encodings/utf16/utf16-tests.factor @@ -1,15 +1,28 @@ -USING: tools.test io.utf16 arrays unicode.syntax ; +USING: kernel tools.test io.encodings.utf16 arrays sbufs sequences io.encodings +io unicode.syntax ; -[ { CHAR: x } ] [ { 0 CHAR: x } decode-utf16be >array ] unit-test -[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test +: decode-w/stream ( array encoding -- newarray ) + >r >sbuf dup reverse-here r> contents >array ; -[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test +: encode-w/stream ( array encoding -- newarray ) + >r SBUF" " clone tuck r> stream-write >array ; -[ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test -[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test +[ { CHAR: x } ] [ { 0 CHAR: x } utf16be decode-w/stream ] unit-test +[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } utf16be decode-w/stream ] unit-test +[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } utf16be decode-w/stream ] unit-test +[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } utf16be decode-w/stream ] unit-test -[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test +[ { 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } utf16be encode-w/stream ] unit-test + +[ { CHAR: x } ] [ { CHAR: x 0 } utf16le decode-w/stream ] unit-test +[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test +[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } utf16le decode-w/stream ] unit-test +[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } utf16le decode-w/stream ] unit-test +[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } utf16le decode-w/stream ] unit-test + +[ { 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16le encode-w/stream ] unit-test + +[ { CHAR: x } ] [ { HEX: ff HEX: fe CHAR: x 0 } utf16 decode-w/stream ] unit-test +[ { CHAR: x } ] [ { HEX: fe HEX: ff 0 CHAR: x } utf16 decode-w/stream ] unit-test + +[ { HEX: ff HEX: fe 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } utf16 encode-w/stream ] unit-test diff --git a/core/io/encodings/utf16/utf16.factor b/core/io/encodings/utf16/utf16.factor index 99e98cd98c..35b6282e21 100755 --- a/core/io/encodings/utf16/utf16.factor +++ b/core/io/encodings/utf16/utf16.factor @@ -1,8 +1,8 @@ ! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: math kernel sequences sbufs vectors namespaces io.binary -io.encodings combinators splitting ; -IN: io.utf16 +io.encodings combinators splitting io byte-arrays ; +IN: io.encodings.utf16 SYMBOL: double SYMBOL: quad1 @@ -104,27 +104,49 @@ SYMBOL: ignore : encode-utf16 ( str -- seq ) encode-utf16le bom-le swap append ; -: utf16le? ( seq1 -- seq2 ? ) bom-le ?head ; +: start-utf16le? ( seq1 -- seq2 ? ) bom-le ?head ; -: utf16be? ( seq1 -- seq2 ? ) bom-be ?head ; +: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ; : decode-utf16 ( seq -- str ) { - { [ utf16le? ] [ decode-utf16le ] } - { [ utf16be? ] [ decode-utf16be ] } + { [ start-utf16le? ] [ decode-utf16le ] } + { [ start-utf16be? ] [ decode-utf16be ] } { [ t ] [ decode-error ] } } cond ; TUPLE: utf16le ; -: utf16le construct-delegate ; INSTANCE: utf16le encoding-stream M: utf16le encode-string drop encode-utf16le ; M: utf16le decode-step drop decode-utf16le-step ; TUPLE: utf16be ; -: utf16be construct-delegate ; INSTANCE: utf16be encoding-stream M: utf16be encode-string drop encode-utf16be ; -M: utf16le decode-step drop decode-utf16be-step ; +M: utf16be decode-step drop decode-utf16be-step ; + +TUPLE: utf16 encoding ; +INSTANCE: utf16 encoding-stream +M: utf16 underlying-stream delegate dup delegate [ ] [ ] ?if ; ! necessary? +M: utf16 set-underlying-stream delegate set-delegate ; ! necessary? + +M: utf16 encode-string + >r encode-utf16le r> + dup utf16-encoding [ drop ] + [ t swap set-utf16-encoding bom-le swap append ] if ; + +: bom>le/be ( bom -- le/be ) + dup bom-le sequence= [ drop utf16le ] [ + bom-be sequence= [ utf16be ] [ decode-error ] if + ] if ; + +: read-bom ( utf16 -- encoding ) + 2 over delegate stream-read bom>le/be construct-empty + [ swap set-utf16-encoding ] keep ; + +M: utf16 decode-step + ! inefficient: checks if bom is done many times + ! This should transform itself into utf16be or utf16le after reading BOM + dup utf16-encoding [ ] [ read-bom ] ?if decode-step ; diff --git a/core/io/encodings/utf8/utf8-tests.factor b/core/io/encodings/utf8/utf8-tests.factor index 33c4ffbf12..a1c9edbe49 100644 --- a/core/io/encodings/utf8/utf8-tests.factor +++ b/core/io/encodings/utf8/utf8-tests.factor @@ -1,11 +1,11 @@ -USING: io.encodings.utf8 tools.test sbufs kernel io +USING: io.encodings.utf8 tools.test sbufs kernel io io.encodings sequences strings arrays unicode.syntax ; : decode-utf8-w/stream ( array -- newarray ) - >sbuf dup reverse-here contents >array ; + >sbuf dup reverse-here utf8 contents ; : encode-utf8-w/stream ( array -- newarray ) - SBUF" " clone tuck write >array ; + SBUF" " clone tuck utf8 stream-write >array ; [ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test @@ -19,5 +19,5 @@ sequences strings arrays unicode.syntax ; [ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test -[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ] -[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test +[ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ] +[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8-w/stream ] unit-test diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor index f681b18142..6a3a8b8ec7 100644 --- a/core/io/encodings/utf8/utf8.factor +++ b/core/io/encodings/utf8/utf8.factor @@ -78,7 +78,6 @@ SYMBOL: quad3 ! Interface for streams TUPLE: utf8 ; -: utf8 construct-delegate ; INSTANCE: utf8 encoding-stream M: utf8 encode-string drop encode-utf8 ;