More unicode changes

2008-02-13 01:02:37 -06:00 · 2008-02-13 01:02:37 -06:00 · 014b79caad
parent 8c63311a0f
commit 014b79caad
4 changed files with 48 additions and 36 deletions
--- a/core/io/encodings/encodings.factor
+++ b/core/io/encodings/encodings.factor
@ -1,7 +1,7 @@
 ! Copyright (C) 2006, 2007 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: math kernel sequences sbufs vectors
-namespaces unicode.syntax ;
+namespaces unicode.syntax growable strings io ;
 IN: io.encodings

 TUPLE: encode-error ;
@ -23,11 +23,27 @@ SYMBOL: begin
 : finish-decoding ( buf ch state -- str )
    begin eq? [ decode-error ] unless drop "" like ;

-: decode ( ch state seq quot -- buf ch state )
-    [ -rot ] swap compose each ; inline
+: start-decoding ( seq length -- buf ch state seq )
+    <sbuf> 0 begin roll ;

-: start-decoding ( seq -- buf ch state seq )
-    [ length <sbuf> 0 begin ] keep ;
+: decode ( seq quot -- string )
+    >r dup length start-decoding r>
+    [ -rot ] swap compose each
+    finish-decoding ; inline
+
+: space ( resizable -- room-left )
+    dup underlying swap [ length ] 2apply - ;
+
+: full? ( resizable -- ? ) space zero? ;
+
+: decode-part-loop ( buf ch state stream quot -- string )
+    >r >r pick r> r> rot full?
+    [ 2drop 2drop >string ]
+    [ [ >r stream-read1 -rot r> call ] 2keep decode-part-loop ] if ; inline
+
+: decode-part ( length stream quot -- string )
+    >r swap start-decoding r>
+    decode-part-loop ; inline

 GENERIC: init-decoding ( stream encoding -- decoded-stream )

--- a/core/io/encodings/utf16/utf16.factor
+++ b/core/io/encodings/utf16/utf16.factor
@ -30,7 +30,7 @@ SYMBOL: ignore
        >r 2 shift r> BIN: 11 bitand bitor quad3
    ] [ 2drop do-ignore ] if ;

-: (decode-utf16be) ( buf byte ch state -- buf ch state )
+: decode-utf16be-step ( buf byte ch state -- buf ch state )
    {
        { begin [ drop begin-utf16be ] }
        { double [ end-multibyte ] }
@ -41,7 +41,7 @@ SYMBOL: ignore
    } case ;

 : decode-utf16be ( seq -- str )
-    [ (decode-utf16be) ] decode ;
+    [ decode-utf16be-step ] decode ;

 : handle-double ( buf byte ch -- buf ch state )
    swap dup -3 shift BIN: 11011 = [
@ -55,7 +55,7 @@ SYMBOL: ignore
        BIN: 11 bitand append-nums HEX: 10000 + decoded
    ] [ 2drop push-replacement ] if ;

-: (decode-utf16le) ( buf byte ch state -- buf ch state )
+: decode-utf16le-step ( buf byte ch state -- buf ch state )
    {
        { begin [ drop double ] }
        { double [ handle-double ] }
@ -65,7 +65,7 @@ SYMBOL: ignore
    } case ;

 : decode-utf16le ( seq -- str )
-    [ (decode-utf16le) ] decode ;
+    [ decode-utf16le-step ] decode ;

 : encode-first
    -10 shift
--- a/core/io/encodings/utf8/utf8-tests.factor
+++ b/core/io/encodings/utf8/utf8-tests.factor
@ -1,16 +1,23 @@
-USING: io.utf8 tools.test strings arrays unicode.syntax ;
+USING: io.encodings.utf8 tools.test sbufs kernel io
+sequences strings arrays unicode.syntax ;

-[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test
+: decode-utf8-w/stream ( array -- newarray )
+    >sbuf dup reverse-here <utf8> contents >array ;

-[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
+: encode-utf8-w/stream ( array -- newarray )
+    SBUF" " clone tuck <utf8> write >array ;

-[ "x" ] [ "x" decode-utf8 >string ] unit-test
+[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test

-[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test
+[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test

-[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test
+[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test

-[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
+[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
+
+[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
+
+[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test

 [ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
 [ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test
--- a/core/io/encodings/utf8/utf8.factor
+++ b/core/io/encodings/utf8/utf8.factor
@ -1,8 +1,8 @@
 ! Copyright (C) 2006, 2007 Daniel Ehrenberg.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: math kernel sequences sbufs vectors growable io
-namespaces io.encodings combinators ;
-IN: io.utf8
+USING: math kernel sequences sbufs vectors growable io continuations
+namespaces io.encodings combinators strings io.streams.c ;
+IN: io.encodings.utf8

 ! Decoding UTF-8

@ -33,7 +33,7 @@ SYMBOL: quad3
 : end-multibyte ( buf byte ch -- buf ch state )
    f append-nums [ decoded ] unless* ;

-: (decode-utf8) ( buf byte ch state -- buf ch state )
+: decode-utf8-step ( buf byte ch state -- buf ch state )
    {
        { begin [ drop begin-utf8 ] }
        { double [ end-multibyte ] }
@ -44,11 +44,8 @@ SYMBOL: quad3
        { quad3 [ end-multibyte ] }
    } case ;

-: decode-utf8-chunk ( ch state seq -- buf ch state )
-    [ (decode-utf8) ] decode ;
-
 : decode-utf8 ( seq -- str )
-    start-decoding decode-utf8-chunk finish-decoding ;
+    [ decode-utf8-step ] decode ; 

 ! Encoding UTF-8

@ -81,6 +78,7 @@ SYMBOL: quad3
 ! Interface for streams

 TUPLE: utf8 ;
+: <utf8> utf8 construct-delegate ;
 ! In the future, this should detect and ignore a BOM at the beginning

 M: utf8 init-decoding ( stream utf8 -- utf8-stream )
@ -91,19 +89,8 @@ M: utf8 init-encoding ( stream utf8 -- utf8-stream )

 M: utf8 stream-read1 1 swap stream-read ;

-: space ( resizable -- room-left )
-    dup underlying swap [ length ] 2apply - ;
-
-: full? ( resizable -- ? ) space zero? ;
-
-: utf8-stream-read ( buf ch state stream -- string )
-    >r pick full? [ r> 3drop >string ]  [
-        pick space r> [ stream-read decode-utf8-chunk ] keep
-        utf8-stream-read
-    ] if ;
-
 M: utf8 stream-read
-    >r start-decoding drop r> delegate utf8-stream-read ;
+    [ decode-utf8-step ] decode-part ;

 M: utf8 stream-read-until
    ! Copied from { c-reader stream-read-until }!!!
@ -115,3 +102,5 @@ M: utf8 stream-write1

 M: utf8 stream-write
    >r encode-utf8 r> delegate stream-write ;
+
+M: utf8 dispose delegate dispose ;