More unicode changes

db4
Daniel Ehrenberg 2008-02-13 01:02:37 -06:00
parent 8c63311a0f
commit 014b79caad
4 changed files with 48 additions and 36 deletions

View File

@ -1,7 +1,7 @@
! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! Copyright (C) 2006, 2007 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors USING: math kernel sequences sbufs vectors
namespaces unicode.syntax ; namespaces unicode.syntax growable strings io ;
IN: io.encodings IN: io.encodings
TUPLE: encode-error ; TUPLE: encode-error ;
@ -23,11 +23,27 @@ SYMBOL: begin
: finish-decoding ( buf ch state -- str ) : finish-decoding ( buf ch state -- str )
begin eq? [ decode-error ] unless drop "" like ; begin eq? [ decode-error ] unless drop "" like ;
: decode ( ch state seq quot -- buf ch state ) : start-decoding ( seq length -- buf ch state seq )
[ -rot ] swap compose each ; inline <sbuf> 0 begin roll ;
: start-decoding ( seq -- buf ch state seq ) : decode ( seq quot -- string )
[ length <sbuf> 0 begin ] keep ; >r dup length start-decoding r>
[ -rot ] swap compose each
finish-decoding ; inline
: space ( resizable -- room-left )
dup underlying swap [ length ] 2apply - ;
: full? ( resizable -- ? ) space zero? ;
: decode-part-loop ( buf ch state stream quot -- string )
>r >r pick r> r> rot full?
[ 2drop 2drop >string ]
[ [ >r stream-read1 -rot r> call ] 2keep decode-part-loop ] if ; inline
: decode-part ( length stream quot -- string )
>r swap start-decoding r>
decode-part-loop ; inline
GENERIC: init-decoding ( stream encoding -- decoded-stream ) GENERIC: init-decoding ( stream encoding -- decoded-stream )

View File

@ -30,7 +30,7 @@ SYMBOL: ignore
>r 2 shift r> BIN: 11 bitand bitor quad3 >r 2 shift r> BIN: 11 bitand bitor quad3
] [ 2drop do-ignore ] if ; ] [ 2drop do-ignore ] if ;
: (decode-utf16be) ( buf byte ch state -- buf ch state ) : decode-utf16be-step ( buf byte ch state -- buf ch state )
{ {
{ begin [ drop begin-utf16be ] } { begin [ drop begin-utf16be ] }
{ double [ end-multibyte ] } { double [ end-multibyte ] }
@ -41,7 +41,7 @@ SYMBOL: ignore
} case ; } case ;
: decode-utf16be ( seq -- str ) : decode-utf16be ( seq -- str )
[ (decode-utf16be) ] decode ; [ decode-utf16be-step ] decode ;
: handle-double ( buf byte ch -- buf ch state ) : handle-double ( buf byte ch -- buf ch state )
swap dup -3 shift BIN: 11011 = [ swap dup -3 shift BIN: 11011 = [
@ -55,7 +55,7 @@ SYMBOL: ignore
BIN: 11 bitand append-nums HEX: 10000 + decoded BIN: 11 bitand append-nums HEX: 10000 + decoded
] [ 2drop push-replacement ] if ; ] [ 2drop push-replacement ] if ;
: (decode-utf16le) ( buf byte ch state -- buf ch state ) : decode-utf16le-step ( buf byte ch state -- buf ch state )
{ {
{ begin [ drop double ] } { begin [ drop double ] }
{ double [ handle-double ] } { double [ handle-double ] }
@ -65,7 +65,7 @@ SYMBOL: ignore
} case ; } case ;
: decode-utf16le ( seq -- str ) : decode-utf16le ( seq -- str )
[ (decode-utf16le) ] decode ; [ decode-utf16le-step ] decode ;
: encode-first : encode-first
-10 shift -10 shift

View File

@ -1,16 +1,23 @@
USING: io.utf8 tools.test strings arrays unicode.syntax ; USING: io.encodings.utf8 tools.test sbufs kernel io
sequences strings arrays unicode.syntax ;
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test : decode-utf8-w/stream ( array -- newarray )
>sbuf dup reverse-here <utf8> contents >array ;
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test : encode-utf8-w/stream ( array -- newarray )
SBUF" " clone tuck <utf8> write >array ;
[ "x" ] [ "x" decode-utf8 >string ] unit-test [ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test [ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test [ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test [ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ] [ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test [ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test

View File

@ -1,8 +1,8 @@
! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! Copyright (C) 2006, 2007 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors growable io USING: math kernel sequences sbufs vectors growable io continuations
namespaces io.encodings combinators ; namespaces io.encodings combinators strings io.streams.c ;
IN: io.utf8 IN: io.encodings.utf8
! Decoding UTF-8 ! Decoding UTF-8
@ -33,7 +33,7 @@ SYMBOL: quad3
: end-multibyte ( buf byte ch -- buf ch state ) : end-multibyte ( buf byte ch -- buf ch state )
f append-nums [ decoded ] unless* ; f append-nums [ decoded ] unless* ;
: (decode-utf8) ( buf byte ch state -- buf ch state ) : decode-utf8-step ( buf byte ch state -- buf ch state )
{ {
{ begin [ drop begin-utf8 ] } { begin [ drop begin-utf8 ] }
{ double [ end-multibyte ] } { double [ end-multibyte ] }
@ -44,11 +44,8 @@ SYMBOL: quad3
{ quad3 [ end-multibyte ] } { quad3 [ end-multibyte ] }
} case ; } case ;
: decode-utf8-chunk ( ch state seq -- buf ch state )
[ (decode-utf8) ] decode ;
: decode-utf8 ( seq -- str ) : decode-utf8 ( seq -- str )
start-decoding decode-utf8-chunk finish-decoding ; [ decode-utf8-step ] decode ;
! Encoding UTF-8 ! Encoding UTF-8
@ -81,6 +78,7 @@ SYMBOL: quad3
! Interface for streams ! Interface for streams
TUPLE: utf8 ; TUPLE: utf8 ;
: <utf8> utf8 construct-delegate ;
! In the future, this should detect and ignore a BOM at the beginning ! In the future, this should detect and ignore a BOM at the beginning
M: utf8 init-decoding ( stream utf8 -- utf8-stream ) M: utf8 init-decoding ( stream utf8 -- utf8-stream )
@ -91,19 +89,8 @@ M: utf8 init-encoding ( stream utf8 -- utf8-stream )
M: utf8 stream-read1 1 swap stream-read ; M: utf8 stream-read1 1 swap stream-read ;
: space ( resizable -- room-left )
dup underlying swap [ length ] 2apply - ;
: full? ( resizable -- ? ) space zero? ;
: utf8-stream-read ( buf ch state stream -- string )
>r pick full? [ r> 3drop >string ] [
pick space r> [ stream-read decode-utf8-chunk ] keep
utf8-stream-read
] if ;
M: utf8 stream-read M: utf8 stream-read
>r start-decoding drop r> delegate utf8-stream-read ; [ decode-utf8-step ] decode-part ;
M: utf8 stream-read-until M: utf8 stream-read-until
! Copied from { c-reader stream-read-until }!!! ! Copied from { c-reader stream-read-until }!!!
@ -115,3 +102,5 @@ M: utf8 stream-write1
M: utf8 stream-write M: utf8 stream-write
>r encode-utf8 r> delegate stream-write ; >r encode-utf8 r> delegate stream-write ;
M: utf8 dispose delegate dispose ;