More unicode changes

db4
Daniel Ehrenberg 2008-02-13 01:02:37 -06:00
parent 8c63311a0f
commit 014b79caad
4 changed files with 48 additions and 36 deletions

View File

@ -1,7 +1,7 @@
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors
namespaces unicode.syntax ;
namespaces unicode.syntax growable strings io ;
IN: io.encodings
TUPLE: encode-error ;
@ -23,11 +23,27 @@ SYMBOL: begin
: finish-decoding ( buf ch state -- str )
begin eq? [ decode-error ] unless drop "" like ;
: decode ( ch state seq quot -- buf ch state )
[ -rot ] swap compose each ; inline
: start-decoding ( seq length -- buf ch state seq )
<sbuf> 0 begin roll ;
: start-decoding ( seq -- buf ch state seq )
[ length <sbuf> 0 begin ] keep ;
: decode ( seq quot -- string )
>r dup length start-decoding r>
[ -rot ] swap compose each
finish-decoding ; inline
: space ( resizable -- room-left )
dup underlying swap [ length ] 2apply - ;
: full? ( resizable -- ? ) space zero? ;
: decode-part-loop ( buf ch state stream quot -- string )
>r >r pick r> r> rot full?
[ 2drop 2drop >string ]
[ [ >r stream-read1 -rot r> call ] 2keep decode-part-loop ] if ; inline
: decode-part ( length stream quot -- string )
>r swap start-decoding r>
decode-part-loop ; inline
GENERIC: init-decoding ( stream encoding -- decoded-stream )

View File

@ -30,7 +30,7 @@ SYMBOL: ignore
>r 2 shift r> BIN: 11 bitand bitor quad3
] [ 2drop do-ignore ] if ;
: (decode-utf16be) ( buf byte ch state -- buf ch state )
: decode-utf16be-step ( buf byte ch state -- buf ch state )
{
{ begin [ drop begin-utf16be ] }
{ double [ end-multibyte ] }
@ -41,7 +41,7 @@ SYMBOL: ignore
} case ;
: decode-utf16be ( seq -- str )
[ (decode-utf16be) ] decode ;
[ decode-utf16be-step ] decode ;
: handle-double ( buf byte ch -- buf ch state )
swap dup -3 shift BIN: 11011 = [
@ -55,7 +55,7 @@ SYMBOL: ignore
BIN: 11 bitand append-nums HEX: 10000 + decoded
] [ 2drop push-replacement ] if ;
: (decode-utf16le) ( buf byte ch state -- buf ch state )
: decode-utf16le-step ( buf byte ch state -- buf ch state )
{
{ begin [ drop double ] }
{ double [ handle-double ] }
@ -65,7 +65,7 @@ SYMBOL: ignore
} case ;
: decode-utf16le ( seq -- str )
[ (decode-utf16le) ] decode ;
[ decode-utf16le-step ] decode ;
: encode-first
-10 shift

View File

@ -1,16 +1,23 @@
USING: io.utf8 tools.test strings arrays unicode.syntax ;
USING: io.encodings.utf8 tools.test sbufs kernel io
sequences strings arrays unicode.syntax ;
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test
: decode-utf8-w/stream ( array -- newarray )
>sbuf dup reverse-here <utf8> contents >array ;
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
: encode-utf8-w/stream ( array -- newarray )
SBUF" " clone tuck <utf8> write >array ;
[ "x" ] [ "x" decode-utf8 >string ] unit-test
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test
[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test

View File

@ -1,8 +1,8 @@
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors growable io
namespaces io.encodings combinators ;
IN: io.utf8
USING: math kernel sequences sbufs vectors growable io continuations
namespaces io.encodings combinators strings io.streams.c ;
IN: io.encodings.utf8
! Decoding UTF-8
@ -33,7 +33,7 @@ SYMBOL: quad3
: end-multibyte ( buf byte ch -- buf ch state )
f append-nums [ decoded ] unless* ;
: (decode-utf8) ( buf byte ch state -- buf ch state )
: decode-utf8-step ( buf byte ch state -- buf ch state )
{
{ begin [ drop begin-utf8 ] }
{ double [ end-multibyte ] }
@ -44,11 +44,8 @@ SYMBOL: quad3
{ quad3 [ end-multibyte ] }
} case ;
: decode-utf8-chunk ( ch state seq -- buf ch state )
[ (decode-utf8) ] decode ;
: decode-utf8 ( seq -- str )
start-decoding decode-utf8-chunk finish-decoding ;
[ decode-utf8-step ] decode ;
! Encoding UTF-8
@ -81,6 +78,7 @@ SYMBOL: quad3
! Interface for streams
TUPLE: utf8 ;
: <utf8> utf8 construct-delegate ;
! In the future, this should detect and ignore a BOM at the beginning
M: utf8 init-decoding ( stream utf8 -- utf8-stream )
@ -91,19 +89,8 @@ M: utf8 init-encoding ( stream utf8 -- utf8-stream )
M: utf8 stream-read1 1 swap stream-read ;
: space ( resizable -- room-left )
dup underlying swap [ length ] 2apply - ;
: full? ( resizable -- ? ) space zero? ;
: utf8-stream-read ( buf ch state stream -- string )
>r pick full? [ r> 3drop >string ] [
pick space r> [ stream-read decode-utf8-chunk ] keep
utf8-stream-read
] if ;
M: utf8 stream-read
>r start-decoding drop r> delegate utf8-stream-read ;
[ decode-utf8-step ] decode-part ;
M: utf8 stream-read-until
! Copied from { c-reader stream-read-until }!!!
@ -115,3 +102,5 @@ M: utf8 stream-write1
M: utf8 stream-write
>r encode-utf8 r> delegate stream-write ;
M: utf8 dispose delegate dispose ;