More unicode changes
parent
8c63311a0f
commit
014b79caad
|
@ -1,7 +1,7 @@
|
||||||
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
|
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: math kernel sequences sbufs vectors
|
USING: math kernel sequences sbufs vectors
|
||||||
namespaces unicode.syntax ;
|
namespaces unicode.syntax growable strings io ;
|
||||||
IN: io.encodings
|
IN: io.encodings
|
||||||
|
|
||||||
TUPLE: encode-error ;
|
TUPLE: encode-error ;
|
||||||
|
@ -23,11 +23,27 @@ SYMBOL: begin
|
||||||
: finish-decoding ( buf ch state -- str )
|
: finish-decoding ( buf ch state -- str )
|
||||||
begin eq? [ decode-error ] unless drop "" like ;
|
begin eq? [ decode-error ] unless drop "" like ;
|
||||||
|
|
||||||
: decode ( ch state seq quot -- buf ch state )
|
: start-decoding ( seq length -- buf ch state seq )
|
||||||
[ -rot ] swap compose each ; inline
|
<sbuf> 0 begin roll ;
|
||||||
|
|
||||||
: start-decoding ( seq -- buf ch state seq )
|
: decode ( seq quot -- string )
|
||||||
[ length <sbuf> 0 begin ] keep ;
|
>r dup length start-decoding r>
|
||||||
|
[ -rot ] swap compose each
|
||||||
|
finish-decoding ; inline
|
||||||
|
|
||||||
|
: space ( resizable -- room-left )
|
||||||
|
dup underlying swap [ length ] 2apply - ;
|
||||||
|
|
||||||
|
: full? ( resizable -- ? ) space zero? ;
|
||||||
|
|
||||||
|
: decode-part-loop ( buf ch state stream quot -- string )
|
||||||
|
>r >r pick r> r> rot full?
|
||||||
|
[ 2drop 2drop >string ]
|
||||||
|
[ [ >r stream-read1 -rot r> call ] 2keep decode-part-loop ] if ; inline
|
||||||
|
|
||||||
|
: decode-part ( length stream quot -- string )
|
||||||
|
>r swap start-decoding r>
|
||||||
|
decode-part-loop ; inline
|
||||||
|
|
||||||
GENERIC: init-decoding ( stream encoding -- decoded-stream )
|
GENERIC: init-decoding ( stream encoding -- decoded-stream )
|
||||||
|
|
||||||
|
|
|
@ -30,7 +30,7 @@ SYMBOL: ignore
|
||||||
>r 2 shift r> BIN: 11 bitand bitor quad3
|
>r 2 shift r> BIN: 11 bitand bitor quad3
|
||||||
] [ 2drop do-ignore ] if ;
|
] [ 2drop do-ignore ] if ;
|
||||||
|
|
||||||
: (decode-utf16be) ( buf byte ch state -- buf ch state )
|
: decode-utf16be-step ( buf byte ch state -- buf ch state )
|
||||||
{
|
{
|
||||||
{ begin [ drop begin-utf16be ] }
|
{ begin [ drop begin-utf16be ] }
|
||||||
{ double [ end-multibyte ] }
|
{ double [ end-multibyte ] }
|
||||||
|
@ -41,7 +41,7 @@ SYMBOL: ignore
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
: decode-utf16be ( seq -- str )
|
: decode-utf16be ( seq -- str )
|
||||||
[ (decode-utf16be) ] decode ;
|
[ decode-utf16be-step ] decode ;
|
||||||
|
|
||||||
: handle-double ( buf byte ch -- buf ch state )
|
: handle-double ( buf byte ch -- buf ch state )
|
||||||
swap dup -3 shift BIN: 11011 = [
|
swap dup -3 shift BIN: 11011 = [
|
||||||
|
@ -55,7 +55,7 @@ SYMBOL: ignore
|
||||||
BIN: 11 bitand append-nums HEX: 10000 + decoded
|
BIN: 11 bitand append-nums HEX: 10000 + decoded
|
||||||
] [ 2drop push-replacement ] if ;
|
] [ 2drop push-replacement ] if ;
|
||||||
|
|
||||||
: (decode-utf16le) ( buf byte ch state -- buf ch state )
|
: decode-utf16le-step ( buf byte ch state -- buf ch state )
|
||||||
{
|
{
|
||||||
{ begin [ drop double ] }
|
{ begin [ drop double ] }
|
||||||
{ double [ handle-double ] }
|
{ double [ handle-double ] }
|
||||||
|
@ -65,7 +65,7 @@ SYMBOL: ignore
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
: decode-utf16le ( seq -- str )
|
: decode-utf16le ( seq -- str )
|
||||||
[ (decode-utf16le) ] decode ;
|
[ decode-utf16le-step ] decode ;
|
||||||
|
|
||||||
: encode-first
|
: encode-first
|
||||||
-10 shift
|
-10 shift
|
||||||
|
|
|
@ -1,16 +1,23 @@
|
||||||
USING: io.utf8 tools.test strings arrays unicode.syntax ;
|
USING: io.encodings.utf8 tools.test sbufs kernel io
|
||||||
|
sequences strings arrays unicode.syntax ;
|
||||||
|
|
||||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test
|
: decode-utf8-w/stream ( array -- newarray )
|
||||||
|
>sbuf dup reverse-here <utf8> contents >array ;
|
||||||
|
|
||||||
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
|
: encode-utf8-w/stream ( array -- newarray )
|
||||||
|
SBUF" " clone tuck <utf8> write >array ;
|
||||||
|
|
||||||
[ "x" ] [ "x" decode-utf8 >string ] unit-test
|
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8-w/stream >array ] unit-test
|
||||||
|
|
||||||
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test
|
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
|
||||||
|
|
||||||
[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test
|
[ "x" ] [ "x" decode-utf8-w/stream >string ] unit-test
|
||||||
|
|
||||||
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
|
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
|
||||||
|
|
||||||
|
[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8-w/stream >array ] unit-test
|
||||||
|
|
||||||
|
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8-w/stream >array ] unit-test
|
||||||
|
|
||||||
[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
|
[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
|
||||||
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test
|
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
|
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: math kernel sequences sbufs vectors growable io
|
USING: math kernel sequences sbufs vectors growable io continuations
|
||||||
namespaces io.encodings combinators ;
|
namespaces io.encodings combinators strings io.streams.c ;
|
||||||
IN: io.utf8
|
IN: io.encodings.utf8
|
||||||
|
|
||||||
! Decoding UTF-8
|
! Decoding UTF-8
|
||||||
|
|
||||||
|
@ -33,7 +33,7 @@ SYMBOL: quad3
|
||||||
: end-multibyte ( buf byte ch -- buf ch state )
|
: end-multibyte ( buf byte ch -- buf ch state )
|
||||||
f append-nums [ decoded ] unless* ;
|
f append-nums [ decoded ] unless* ;
|
||||||
|
|
||||||
: (decode-utf8) ( buf byte ch state -- buf ch state )
|
: decode-utf8-step ( buf byte ch state -- buf ch state )
|
||||||
{
|
{
|
||||||
{ begin [ drop begin-utf8 ] }
|
{ begin [ drop begin-utf8 ] }
|
||||||
{ double [ end-multibyte ] }
|
{ double [ end-multibyte ] }
|
||||||
|
@ -44,11 +44,8 @@ SYMBOL: quad3
|
||||||
{ quad3 [ end-multibyte ] }
|
{ quad3 [ end-multibyte ] }
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
: decode-utf8-chunk ( ch state seq -- buf ch state )
|
|
||||||
[ (decode-utf8) ] decode ;
|
|
||||||
|
|
||||||
: decode-utf8 ( seq -- str )
|
: decode-utf8 ( seq -- str )
|
||||||
start-decoding decode-utf8-chunk finish-decoding ;
|
[ decode-utf8-step ] decode ;
|
||||||
|
|
||||||
! Encoding UTF-8
|
! Encoding UTF-8
|
||||||
|
|
||||||
|
@ -81,6 +78,7 @@ SYMBOL: quad3
|
||||||
! Interface for streams
|
! Interface for streams
|
||||||
|
|
||||||
TUPLE: utf8 ;
|
TUPLE: utf8 ;
|
||||||
|
: <utf8> utf8 construct-delegate ;
|
||||||
! In the future, this should detect and ignore a BOM at the beginning
|
! In the future, this should detect and ignore a BOM at the beginning
|
||||||
|
|
||||||
M: utf8 init-decoding ( stream utf8 -- utf8-stream )
|
M: utf8 init-decoding ( stream utf8 -- utf8-stream )
|
||||||
|
@ -91,19 +89,8 @@ M: utf8 init-encoding ( stream utf8 -- utf8-stream )
|
||||||
|
|
||||||
M: utf8 stream-read1 1 swap stream-read ;
|
M: utf8 stream-read1 1 swap stream-read ;
|
||||||
|
|
||||||
: space ( resizable -- room-left )
|
|
||||||
dup underlying swap [ length ] 2apply - ;
|
|
||||||
|
|
||||||
: full? ( resizable -- ? ) space zero? ;
|
|
||||||
|
|
||||||
: utf8-stream-read ( buf ch state stream -- string )
|
|
||||||
>r pick full? [ r> 3drop >string ] [
|
|
||||||
pick space r> [ stream-read decode-utf8-chunk ] keep
|
|
||||||
utf8-stream-read
|
|
||||||
] if ;
|
|
||||||
|
|
||||||
M: utf8 stream-read
|
M: utf8 stream-read
|
||||||
>r start-decoding drop r> delegate utf8-stream-read ;
|
[ decode-utf8-step ] decode-part ;
|
||||||
|
|
||||||
M: utf8 stream-read-until
|
M: utf8 stream-read-until
|
||||||
! Copied from { c-reader stream-read-until }!!!
|
! Copied from { c-reader stream-read-until }!!!
|
||||||
|
@ -115,3 +102,5 @@ M: utf8 stream-write1
|
||||||
|
|
||||||
M: utf8 stream-write
|
M: utf8 stream-write
|
||||||
>r encode-utf8 r> delegate stream-write ;
|
>r encode-utf8 r> delegate stream-write ;
|
||||||
|
|
||||||
|
M: utf8 dispose delegate dispose ;
|
||||||
|
|
Loading…
Reference in New Issue