From 014b79caada3316522876c098ead25450f978f60 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Wed, 13 Feb 2008 01:02:37 -0600 Subject: [PATCH] More unicode changes --- core/io/encodings/encodings.factor | 26 +++++++++++++++++---- core/io/encodings/utf16/utf16.factor | 8 +++---- core/io/encodings/utf8/utf8-tests.factor | 21 +++++++++++------ core/io/encodings/utf8/utf8.factor | 29 ++++++++---------------- 4 files changed, 48 insertions(+), 36 deletions(-) diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index e6811b6e6d..dcc055f941 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: math kernel sequences sbufs vectors -namespaces unicode.syntax ; +namespaces unicode.syntax growable strings io ; IN: io.encodings TUPLE: encode-error ; @@ -23,11 +23,27 @@ SYMBOL: begin : finish-decoding ( buf ch state -- str ) begin eq? [ decode-error ] unless drop "" like ; -: decode ( ch state seq quot -- buf ch state ) - [ -rot ] swap compose each ; inline +: start-decoding ( seq length -- buf ch state seq ) + 0 begin roll ; -: start-decoding ( seq -- buf ch state seq ) - [ length 0 begin ] keep ; +: decode ( seq quot -- string ) + >r dup length start-decoding r> + [ -rot ] swap compose each + finish-decoding ; inline + +: space ( resizable -- room-left ) + dup underlying swap [ length ] 2apply - ; + +: full? ( resizable -- ? ) space zero? ; + +: decode-part-loop ( buf ch state stream quot -- string ) + >r >r pick r> r> rot full? + [ 2drop 2drop >string ] + [ [ >r stream-read1 -rot r> call ] 2keep decode-part-loop ] if ; inline + +: decode-part ( length stream quot -- string ) + >r swap start-decoding r> + decode-part-loop ; inline GENERIC: init-decoding ( stream encoding -- decoded-stream ) diff --git a/core/io/encodings/utf16/utf16.factor b/core/io/encodings/utf16/utf16.factor index c38e7845df..ccf76649e2 100755 --- a/core/io/encodings/utf16/utf16.factor +++ b/core/io/encodings/utf16/utf16.factor @@ -30,7 +30,7 @@ SYMBOL: ignore >r 2 shift r> BIN: 11 bitand bitor quad3 ] [ 2drop do-ignore ] if ; -: (decode-utf16be) ( buf byte ch state -- buf ch state ) +: decode-utf16be-step ( buf byte ch state -- buf ch state ) { { begin [ drop begin-utf16be ] } { double [ end-multibyte ] } @@ -41,7 +41,7 @@ SYMBOL: ignore } case ; : decode-utf16be ( seq -- str ) - [ (decode-utf16be) ] decode ; + [ decode-utf16be-step ] decode ; : handle-double ( buf byte ch -- buf ch state ) swap dup -3 shift BIN: 11011 = [ @@ -55,7 +55,7 @@ SYMBOL: ignore BIN: 11 bitand append-nums HEX: 10000 + decoded ] [ 2drop push-replacement ] if ; -: (decode-utf16le) ( buf byte ch state -- buf ch state ) +: decode-utf16le-step ( buf byte ch state -- buf ch state ) { { begin [ drop double ] } { double [ handle-double ] } @@ -65,7 +65,7 @@ SYMBOL: ignore } case ; : decode-utf16le ( seq -- str ) - [ (decode-utf16le) ] decode ; + [ decode-utf16le-step ] decode ; : encode-first -10 shift diff --git a/core/io/encodings/utf8/utf8-tests.factor b/core/io/encodings/utf8/utf8-tests.factor index 3576471586..33c4ffbf12 100644 --- a/core/io/encodings/utf8/utf8-tests.factor +++ b/core/io/encodings/utf8/utf8-tests.factor @@ -1,16 +1,23 @@ -USING: io.utf8 tools.test strings arrays unicode.syntax ; +USING: io.encodings.utf8 tools.test sbufs kernel io +sequences strings arrays unicode.syntax ; -[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test +: decode-utf8-w/stream ( array -- newarray ) + >sbuf dup reverse-here contents >array ; -[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test +: encode-utf8-w/stream ( array -- newarray ) + SBUF" " clone tuck write >array ; -[ "x" ] [ "x" decode-utf8 >string ] unit-test +[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test -[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test +[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test -[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test +[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test -[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test +[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test + +[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test + +[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test [ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ] [ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test diff --git a/core/io/encodings/utf8/utf8.factor b/core/io/encodings/utf8/utf8.factor index 90aec4623a..c0fa66e553 100644 --- a/core/io/encodings/utf8/utf8.factor +++ b/core/io/encodings/utf8/utf8.factor @@ -1,8 +1,8 @@ ! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. -USING: math kernel sequences sbufs vectors growable io -namespaces io.encodings combinators ; -IN: io.utf8 +USING: math kernel sequences sbufs vectors growable io continuations +namespaces io.encodings combinators strings io.streams.c ; +IN: io.encodings.utf8 ! Decoding UTF-8 @@ -33,7 +33,7 @@ SYMBOL: quad3 : end-multibyte ( buf byte ch -- buf ch state ) f append-nums [ decoded ] unless* ; -: (decode-utf8) ( buf byte ch state -- buf ch state ) +: decode-utf8-step ( buf byte ch state -- buf ch state ) { { begin [ drop begin-utf8 ] } { double [ end-multibyte ] } @@ -44,11 +44,8 @@ SYMBOL: quad3 { quad3 [ end-multibyte ] } } case ; -: decode-utf8-chunk ( ch state seq -- buf ch state ) - [ (decode-utf8) ] decode ; - : decode-utf8 ( seq -- str ) - start-decoding decode-utf8-chunk finish-decoding ; + [ decode-utf8-step ] decode ; ! Encoding UTF-8 @@ -81,6 +78,7 @@ SYMBOL: quad3 ! Interface for streams TUPLE: utf8 ; +: utf8 construct-delegate ; ! In the future, this should detect and ignore a BOM at the beginning M: utf8 init-decoding ( stream utf8 -- utf8-stream ) @@ -91,19 +89,8 @@ M: utf8 init-encoding ( stream utf8 -- utf8-stream ) M: utf8 stream-read1 1 swap stream-read ; -: space ( resizable -- room-left ) - dup underlying swap [ length ] 2apply - ; - -: full? ( resizable -- ? ) space zero? ; - -: utf8-stream-read ( buf ch state stream -- string ) - >r pick full? [ r> 3drop >string ] [ - pick space r> [ stream-read decode-utf8-chunk ] keep - utf8-stream-read - ] if ; - M: utf8 stream-read - >r start-decoding drop r> delegate utf8-stream-read ; + [ decode-utf8-step ] decode-part ; M: utf8 stream-read-until ! Copied from { c-reader stream-read-until }!!! @@ -115,3 +102,5 @@ M: utf8 stream-write1 M: utf8 stream-write >r encode-utf8 r> delegate stream-write ; + +M: utf8 dispose delegate dispose ;