More unicode changes
parent
8c63311a0f
commit
014b79caad
|
@ -1,7 +1,7 @@
|
|||
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: math kernel sequences sbufs vectors
|
||||
namespaces unicode.syntax ;
|
||||
namespaces unicode.syntax growable strings io ;
|
||||
IN: io.encodings
|
||||
|
||||
TUPLE: encode-error ;
|
||||
|
@ -23,11 +23,27 @@ SYMBOL: begin
|
|||
: finish-decoding ( buf ch state -- str )
|
||||
begin eq? [ decode-error ] unless drop "" like ;
|
||||
|
||||
: decode ( ch state seq quot -- buf ch state )
|
||||
[ -rot ] swap compose each ; inline
|
||||
: start-decoding ( seq length -- buf ch state seq )
|
||||
<sbuf> 0 begin roll ;
|
||||
|
||||
: start-decoding ( seq -- buf ch state seq )
|
||||
[ length <sbuf> 0 begin ] keep ;
|
||||
: decode ( seq quot -- string )
|
||||
>r dup length start-decoding r>
|
||||
[ -rot ] swap compose each
|
||||
finish-decoding ; inline
|
||||
|
||||
: space ( resizable -- room-left )
|
||||
dup underlying swap [ length ] 2apply - ;
|
||||
|
||||
: full? ( resizable -- ? ) space zero? ;
|
||||
|
||||
: decode-part-loop ( buf ch state stream quot -- string )
|
||||
>r >r pick r> r> rot full?
|
||||
[ 2drop 2drop >string ]
|
||||
[ [ >r stream-read1 -rot r> call ] 2keep decode-part-loop ] if ; inline
|
||||
|
||||
: decode-part ( length stream quot -- string )
|
||||
>r swap start-decoding r>
|
||||
decode-part-loop ; inline
|
||||
|
||||
GENERIC: init-decoding ( stream encoding -- decoded-stream )
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@ SYMBOL: ignore
|
|||
>r 2 shift r> BIN: 11 bitand bitor quad3
|
||||
] [ 2drop do-ignore ] if ;
|
||||
|
||||
: (decode-utf16be) ( buf byte ch state -- buf ch state )
|
||||
: decode-utf16be-step ( buf byte ch state -- buf ch state )
|
||||
{
|
||||
{ begin [ drop begin-utf16be ] }
|
||||
{ double [ end-multibyte ] }
|
||||
|
@ -41,7 +41,7 @@ SYMBOL: ignore
|
|||
} case ;
|
||||
|
||||
: decode-utf16be ( seq -- str )
|
||||
[ (decode-utf16be) ] decode ;
|
||||
[ decode-utf16be-step ] decode ;
|
||||
|
||||
: handle-double ( buf byte ch -- buf ch state )
|
||||
swap dup -3 shift BIN: 11011 = [
|
||||
|
@ -55,7 +55,7 @@ SYMBOL: ignore
|
|||
BIN: 11 bitand append-nums HEX: 10000 + decoded
|
||||
] [ 2drop push-replacement ] if ;
|
||||
|
||||
: (decode-utf16le) ( buf byte ch state -- buf ch state )
|
||||
: decode-utf16le-step ( buf byte ch state -- buf ch state )
|
||||
{
|
||||
{ begin [ drop double ] }
|
||||
{ double [ handle-double ] }
|
||||
|
@ -65,7 +65,7 @@ SYMBOL: ignore
|
|||
} case ;
|
||||
|
||||
: decode-utf16le ( seq -- str )
|
||||
[ (decode-utf16le) ] decode ;
|
||||
[ decode-utf16le-step ] decode ;
|
||||
|
||||
: encode-first
|
||||
-10 shift
|
||||
|
|
|
@ -1,16 +1,23 @@
|
|||
USING: io.utf8 tools.test strings arrays unicode.syntax ;
|
||||
USING: io.encodings.utf8 tools.test sbufs kernel io
|
||||
sequences strings arrays unicode.syntax ;
|
||||
|
||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test
|
||||
: decode-utf8-w/stream ( array -- newarray )
|
||||
>sbuf dup reverse-here <utf8> contents >array ;
|
||||
|
||||
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
|
||||
: encode-utf8-w/stream ( array -- newarray )
|
||||
SBUF" " clone tuck <utf8> write >array ;
|
||||
|
||||
[ "x" ] [ "x" decode-utf8 >string ] unit-test
|
||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test
|
||||
|
||||
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test
|
||||
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
|
||||
|
||||
[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test
|
||||
[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test
|
||||
|
||||
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
|
||||
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
|
||||
|
||||
[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
|
||||
|
||||
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
|
||||
|
||||
[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
|
||||
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: math kernel sequences sbufs vectors growable io
|
||||
namespaces io.encodings combinators ;
|
||||
IN: io.utf8
|
||||
USING: math kernel sequences sbufs vectors growable io continuations
|
||||
namespaces io.encodings combinators strings io.streams.c ;
|
||||
IN: io.encodings.utf8
|
||||
|
||||
! Decoding UTF-8
|
||||
|
||||
|
@ -33,7 +33,7 @@ SYMBOL: quad3
|
|||
: end-multibyte ( buf byte ch -- buf ch state )
|
||||
f append-nums [ decoded ] unless* ;
|
||||
|
||||
: (decode-utf8) ( buf byte ch state -- buf ch state )
|
||||
: decode-utf8-step ( buf byte ch state -- buf ch state )
|
||||
{
|
||||
{ begin [ drop begin-utf8 ] }
|
||||
{ double [ end-multibyte ] }
|
||||
|
@ -44,11 +44,8 @@ SYMBOL: quad3
|
|||
{ quad3 [ end-multibyte ] }
|
||||
} case ;
|
||||
|
||||
: decode-utf8-chunk ( ch state seq -- buf ch state )
|
||||
[ (decode-utf8) ] decode ;
|
||||
|
||||
: decode-utf8 ( seq -- str )
|
||||
start-decoding decode-utf8-chunk finish-decoding ;
|
||||
[ decode-utf8-step ] decode ;
|
||||
|
||||
! Encoding UTF-8
|
||||
|
||||
|
@ -81,6 +78,7 @@ SYMBOL: quad3
|
|||
! Interface for streams
|
||||
|
||||
TUPLE: utf8 ;
|
||||
: <utf8> utf8 construct-delegate ;
|
||||
! In the future, this should detect and ignore a BOM at the beginning
|
||||
|
||||
M: utf8 init-decoding ( stream utf8 -- utf8-stream )
|
||||
|
@ -91,19 +89,8 @@ M: utf8 init-encoding ( stream utf8 -- utf8-stream )
|
|||
|
||||
M: utf8 stream-read1 1 swap stream-read ;
|
||||
|
||||
: space ( resizable -- room-left )
|
||||
dup underlying swap [ length ] 2apply - ;
|
||||
|
||||
: full? ( resizable -- ? ) space zero? ;
|
||||
|
||||
: utf8-stream-read ( buf ch state stream -- string )
|
||||
>r pick full? [ r> 3drop >string ] [
|
||||
pick space r> [ stream-read decode-utf8-chunk ] keep
|
||||
utf8-stream-read
|
||||
] if ;
|
||||
|
||||
M: utf8 stream-read
|
||||
>r start-decoding drop r> delegate utf8-stream-read ;
|
||||
[ decode-utf8-step ] decode-part ;
|
||||
|
||||
M: utf8 stream-read-until
|
||||
! Copied from { c-reader stream-read-until }!!!
|
||||
|
@ -115,3 +102,5 @@ M: utf8 stream-write1
|
|||
|
||||
M: utf8 stream-write
|
||||
>r encode-utf8 r> delegate stream-write ;
|
||||
|
||||
M: utf8 dispose delegate dispose ;
|
||||
|
|
Loading…
Reference in New Issue