Unicode encoding changes
parent
f378122dc5
commit
56afb67bfc
|
@ -1,3 +1,8 @@
|
||||||
! Copyright (C) 2008 Daniel Ehrenberg.
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
IN: io.encodings.binary SYMBOL: binary
|
USING: io.encodings kernel ;
|
||||||
|
IN: io.encodings.binary
|
||||||
|
|
||||||
|
TUPLE: binary ;
|
||||||
|
M: binary <encoder> drop ;
|
||||||
|
M: binary <decoder> drop ;
|
||||||
|
|
|
@ -2,62 +2,36 @@
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: math kernel sequences sbufs vectors namespaces
|
USING: math kernel sequences sbufs vectors namespaces
|
||||||
growable strings io classes continuations combinators
|
growable strings io classes continuations combinators
|
||||||
io.styles io.streams.plain io.encodings.binary splitting
|
io.styles io.streams.plain splitting
|
||||||
io.streams.duplex byte-arrays ;
|
io.streams.duplex byte-arrays sequences.private ;
|
||||||
IN: io.encodings
|
IN: io.encodings
|
||||||
|
|
||||||
! The encoding descriptor protocol
|
! The encoding descriptor protocol
|
||||||
|
|
||||||
GENERIC: decode-step ( buf char encoding -- )
|
GENERIC: decode-char ( stream encoding -- char/f )
|
||||||
M: object decode-step drop swap push ;
|
|
||||||
|
|
||||||
GENERIC: init-decoder ( stream encoding -- encoding )
|
GENERIC: encode-char ( char stream encoding -- )
|
||||||
M: tuple-class init-decoder construct-empty init-decoder ;
|
|
||||||
M: object init-decoder nip ;
|
|
||||||
|
|
||||||
GENERIC: stream-write-encoded ( string stream encoding -- byte-array )
|
GENERIC: <decoder> ( stream decoding -- newstream )
|
||||||
M: object stream-write-encoded drop stream-write ;
|
|
||||||
|
GENERIC: <encoder> ( stream encoding -- newstream )
|
||||||
|
|
||||||
|
: replacement-char HEX: fffd ;
|
||||||
|
|
||||||
! Decoding
|
! Decoding
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
TUPLE: decode-error ;
|
TUPLE: decode-error ;
|
||||||
|
|
||||||
: decode-error ( -- * ) \ decode-error construct-empty throw ;
|
: decode-error ( -- * ) \ decode-error construct-empty throw ;
|
||||||
|
|
||||||
SYMBOL: begin
|
TUPLE: decoder stream code cr ;
|
||||||
|
M: tuple-class <decoder> construct-empty <decoder> ;
|
||||||
|
M: tuple <decoder> f decoder construct-boa ;
|
||||||
|
|
||||||
: push-decoded ( buf ch -- buf ch state )
|
: >decoder< ( decoder -- stream encoding )
|
||||||
over push 0 begin ;
|
{ decoder-stream decoder-code } get-slots ;
|
||||||
|
|
||||||
: push-replacement ( buf -- buf ch state )
|
|
||||||
! This is the replacement character
|
|
||||||
HEX: fffd push-decoded ;
|
|
||||||
|
|
||||||
: space ( resizable -- room-left )
|
|
||||||
dup underlying swap [ length ] 2apply - ;
|
|
||||||
|
|
||||||
: full? ( resizable -- ? ) space zero? ;
|
|
||||||
|
|
||||||
: end-read-loop ( buf ch state stream quot -- string/f )
|
|
||||||
2drop 2drop >string f like ;
|
|
||||||
|
|
||||||
: decode-read-loop ( buf stream encoding -- string/f )
|
|
||||||
pick full? [ 2drop >string ] [
|
|
||||||
over stream-read1 [
|
|
||||||
-rot tuck >r >r >r dupd r> decode-step r> r>
|
|
||||||
decode-read-loop
|
|
||||||
] [ 2drop >string f like ] if*
|
|
||||||
] if ;
|
|
||||||
|
|
||||||
: decode-read ( length stream encoding -- string )
|
|
||||||
rot <sbuf> -rot decode-read-loop ;
|
|
||||||
|
|
||||||
TUPLE: decoder code cr ;
|
|
||||||
: <decoder> ( stream encoding -- newstream )
|
|
||||||
dup binary eq? [ drop ] [
|
|
||||||
dupd init-decoder { set-delegate set-decoder-code }
|
|
||||||
decoder construct
|
|
||||||
] if ;
|
|
||||||
|
|
||||||
: cr+ t swap set-decoder-cr ; inline
|
: cr+ t swap set-decoder-cr ; inline
|
||||||
|
|
||||||
|
@ -82,72 +56,78 @@ TUPLE: decoder code cr ;
|
||||||
over decoder-cr [
|
over decoder-cr [
|
||||||
over cr-
|
over cr-
|
||||||
"\n" ?head [
|
"\n" ?head [
|
||||||
swap stream-read1 [ add ] when*
|
over stream-read1 [ add ] when*
|
||||||
] [ nip ] if
|
] when
|
||||||
] [ nip ] if ;
|
] when nip ;
|
||||||
|
|
||||||
|
: read-loop ( n stream -- string )
|
||||||
|
over 0 <string> [
|
||||||
|
[
|
||||||
|
>r stream-read1 dup
|
||||||
|
[ swap r> set-nth-unsafe f ] [ r> 3drop t ] if
|
||||||
|
] 2curry find-integer
|
||||||
|
] keep swap [ head ] when* ;
|
||||||
|
|
||||||
M: decoder stream-read
|
M: decoder stream-read
|
||||||
tuck { delegate decoder-code } get-slots decode-read fix-read ;
|
tuck read-loop fix-read ;
|
||||||
|
|
||||||
M: decoder stream-read-partial stream-read ;
|
: (read-until) ( buf quot -- string/f sep/f )
|
||||||
|
! quot: -- char keep-going?
|
||||||
: decoder-read-until ( stream delim -- ch )
|
dup call
|
||||||
! Copied from { c-reader stream-read-until }!!!
|
[ >r drop "" like r> ]
|
||||||
over stream-read1 dup [
|
[ pick push (read-until) ] if ; inline
|
||||||
dup pick memq? [ 2nip ] [ , decoder-read-until ] if
|
|
||||||
] [
|
|
||||||
2nip
|
|
||||||
] if ;
|
|
||||||
|
|
||||||
M: decoder stream-read-until
|
M: decoder stream-read-until
|
||||||
! Copied from { c-reader stream-read-until }!!!
|
SBUF" " clone -rot >decoder<
|
||||||
[ swap decoder-read-until ] "" make
|
[ decode-char dup rot memq? ] 3curry (read-until) ;
|
||||||
swap over empty? over not and [ 2drop f f ] when ;
|
|
||||||
|
|
||||||
: fix-read1 ( stream char -- char )
|
: fix-read1 ( stream char -- char )
|
||||||
over decoder-cr [
|
over decoder-cr [
|
||||||
over cr-
|
over cr-
|
||||||
dup CHAR: \n = [
|
dup CHAR: \n = [
|
||||||
drop stream-read1
|
drop dup stream-read1
|
||||||
] [ nip ] if
|
] when
|
||||||
] [ nip ] if ;
|
] when nip ;
|
||||||
|
|
||||||
M: decoder stream-read1
|
M: decoder stream-read1
|
||||||
1 swap stream-read f like [ first ] [ f ] if* ;
|
dup >decoder< decode-char fix-read1 ;
|
||||||
|
|
||||||
M: decoder stream-readln ( stream -- str )
|
M: decoder stream-readln ( stream -- str )
|
||||||
"\r\n" over stream-read-until handle-readln ;
|
"\r\n" over stream-read-until handle-readln ;
|
||||||
|
|
||||||
|
M: decoder dispose decoder-stream dispose ;
|
||||||
|
|
||||||
! Encoding
|
! Encoding
|
||||||
|
|
||||||
TUPLE: encode-error ;
|
TUPLE: encode-error ;
|
||||||
|
|
||||||
: encode-error ( -- * ) \ encode-error construct-empty throw ;
|
: encode-error ( -- * ) \ encode-error construct-empty throw ;
|
||||||
|
|
||||||
TUPLE: encoder code ;
|
TUPLE: encoder stream code ;
|
||||||
: <encoder> ( stream encoding -- newstream )
|
M: tuple-class <encoder> construct-empty <encoder> ;
|
||||||
dup binary eq? [ drop ] [
|
M: tuple <encoder> encoder construct-boa ;
|
||||||
construct-empty { set-delegate set-encoder-code }
|
|
||||||
encoder construct
|
: >encoder< ( encoder -- stream encoding )
|
||||||
] if ;
|
{ encoder-stream encoder-code } get-slots ;
|
||||||
|
|
||||||
M: encoder stream-write1
|
M: encoder stream-write1
|
||||||
>r 1string r> stream-write ;
|
>encoder< encode-char ;
|
||||||
|
|
||||||
M: encoder stream-write
|
M: encoder stream-write
|
||||||
{ delegate encoder-code } get-slots stream-write-encoded ;
|
>encoder< [ encode-char ] 2curry each ;
|
||||||
|
|
||||||
M: encoder dispose delegate dispose ;
|
M: encoder dispose encoder-stream dispose ;
|
||||||
|
|
||||||
INSTANCE: encoder plain-writer
|
INSTANCE: encoder plain-writer
|
||||||
|
|
||||||
! Rebinding duplex streams which have not read anything yet
|
! Rebinding duplex streams which have not read anything yet
|
||||||
|
|
||||||
: reencode ( stream encoding -- newstream )
|
: reencode ( stream encoding -- newstream )
|
||||||
over encoder? [ >r delegate r> ] when <encoder> ;
|
over encoder? [ >r encoder-stream r> ] when <encoder> ;
|
||||||
|
|
||||||
: redecode ( stream encoding -- newstream )
|
: redecode ( stream encoding -- newstream )
|
||||||
over decoder? [ >r delegate r> ] when <decoder> ;
|
over decoder? [ >r decoder-stream r> ] when <decoder> ;
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
: <encoder-duplex> ( stream-in stream-out encoding -- duplex )
|
: <encoder-duplex> ( stream-in stream-out encoding -- duplex )
|
||||||
tuck reencode >r redecode r> <duplex-stream> ;
|
tuck reencode >r redecode r> <duplex-stream> ;
|
||||||
|
|
|
@ -6,82 +6,68 @@ IN: io.encodings.utf8
|
||||||
|
|
||||||
! Decoding UTF-8
|
! Decoding UTF-8
|
||||||
|
|
||||||
TUPLE: utf8 ch state ;
|
TUPLE: utf8 ;
|
||||||
|
|
||||||
SYMBOL: double
|
<PRIVATE
|
||||||
SYMBOL: triple
|
|
||||||
SYMBOL: triple2
|
|
||||||
SYMBOL: quad
|
|
||||||
SYMBOL: quad2
|
|
||||||
SYMBOL: quad3
|
|
||||||
|
|
||||||
: starts-2? ( char -- ? )
|
: starts-2? ( char -- ? )
|
||||||
-6 shift BIN: 10 number= ;
|
dup [ -6 shift BIN: 10 number= ] when ;
|
||||||
|
|
||||||
: append-nums ( buf bottom top state-out -- buf num state )
|
: append-nums ( stream byte -- stream char )
|
||||||
>r over starts-2?
|
over stream-read1 dup starts-2?
|
||||||
[ 6 shift swap BIN: 111111 bitand bitor r> ]
|
[ 6 shift swap BIN: 111111 bitand bitor ]
|
||||||
[ r> 3drop push-replacement ] if ;
|
[ 2drop replacement-char ] if ;
|
||||||
|
|
||||||
: begin-utf8 ( buf byte -- buf ch state )
|
: double ( stream byte -- stream char )
|
||||||
|
BIN: 11111 bitand append-nums ;
|
||||||
|
|
||||||
|
: triple ( stream byte -- stream char )
|
||||||
|
BIN: 1111 bitand append-nums append-nums ;
|
||||||
|
|
||||||
|
: quad ( stream byte -- stream char )
|
||||||
|
BIN: 111 bitand append-nums append-nums append-nums ;
|
||||||
|
|
||||||
|
: begin-utf8 ( stream byte -- stream char )
|
||||||
{
|
{
|
||||||
{ [ dup -7 shift zero? ] [ push-decoded ] }
|
{ [ dup -7 shift zero? ] [ ] }
|
||||||
{ [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] }
|
{ [ dup -5 shift BIN: 110 number= ] [ double ] }
|
||||||
{ [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] }
|
{ [ dup -4 shift BIN: 1110 number= ] [ triple ] }
|
||||||
{ [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] }
|
{ [ dup -3 shift BIN: 11110 number= ] [ quad ] }
|
||||||
{ [ t ] [ drop push-replacement ] }
|
{ [ t ] [ drop replacement-char ] }
|
||||||
} cond ;
|
} cond ;
|
||||||
|
|
||||||
: end-multibyte ( buf byte ch -- buf ch state )
|
: decode-utf8 ( stream -- char/f )
|
||||||
f append-nums [ push-decoded ] unless* ;
|
dup stream-read1 dup [ begin-utf8 ] when nip ;
|
||||||
|
|
||||||
: decode-utf8-step ( buf byte ch state -- buf ch state )
|
M: utf8 decode-char
|
||||||
{
|
drop decode-utf8 ;
|
||||||
{ begin [ drop begin-utf8 ] }
|
|
||||||
{ double [ end-multibyte ] }
|
|
||||||
{ triple [ triple2 append-nums ] }
|
|
||||||
{ triple2 [ end-multibyte ] }
|
|
||||||
{ quad [ quad2 append-nums ] }
|
|
||||||
{ quad2 [ quad3 append-nums ] }
|
|
||||||
{ quad3 [ end-multibyte ] }
|
|
||||||
} case ;
|
|
||||||
|
|
||||||
: unpack-state ( encoding -- ch state )
|
|
||||||
{ utf8-ch utf8-state } get-slots ;
|
|
||||||
|
|
||||||
: pack-state ( ch state encoding -- )
|
|
||||||
{ set-utf8-ch set-utf8-state } set-slots ;
|
|
||||||
|
|
||||||
M: utf8 decode-step ( buf char encoding -- )
|
|
||||||
[ unpack-state decode-utf8-step ] keep pack-state drop ;
|
|
||||||
|
|
||||||
M: utf8 init-decoder nip begin over set-utf8-state ;
|
|
||||||
|
|
||||||
! Encoding UTF-8
|
! Encoding UTF-8
|
||||||
|
|
||||||
: encoded ( char -- )
|
: encoded ( stream char -- )
|
||||||
BIN: 111111 bitand BIN: 10000000 bitor write1 ;
|
BIN: 111111 bitand BIN: 10000000 bitor swap stream-write1 ;
|
||||||
|
|
||||||
: char>utf8 ( char -- )
|
: char>utf8 ( stream char -- )
|
||||||
{
|
{
|
||||||
{ [ dup -7 shift zero? ] [ write1 ] }
|
{ [ dup -7 shift zero? ] [ swap stream-write1 ] }
|
||||||
{ [ dup -11 shift zero? ] [
|
{ [ dup -11 shift zero? ] [
|
||||||
dup -6 shift BIN: 11000000 bitor write1
|
2dup -6 shift BIN: 11000000 bitor swap stream-write1
|
||||||
encoded
|
encoded
|
||||||
] }
|
] }
|
||||||
{ [ dup -16 shift zero? ] [
|
{ [ dup -16 shift zero? ] [
|
||||||
dup -12 shift BIN: 11100000 bitor write1
|
2dup -12 shift BIN: 11100000 bitor swap stream-write1
|
||||||
dup -6 shift encoded
|
2dup -6 shift encoded
|
||||||
encoded
|
encoded
|
||||||
] }
|
] }
|
||||||
{ [ t ] [
|
{ [ t ] [
|
||||||
dup -18 shift BIN: 11110000 bitor write1
|
2dup -18 shift BIN: 11110000 bitor swap stream-write1
|
||||||
dup -12 shift encoded
|
2dup -12 shift encoded
|
||||||
dup -6 shift encoded
|
2dup -6 shift encoded
|
||||||
encoded
|
encoded
|
||||||
] }
|
] }
|
||||||
} cond ;
|
} cond ;
|
||||||
|
|
||||||
M: utf8 stream-write-encoded
|
M: utf8 encode-char
|
||||||
! For efficiency, this should be modified to avoid variable reads
|
drop swap char>utf8 ;
|
||||||
drop [ [ char>utf8 ] each ] with-stream* ;
|
|
||||||
|
PRIVATE>
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
! Copyright (C) 2003, 2007 Slava Pestov.
|
! Copyright (C) 2003, 2007 Slava Pestov.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
IN: io.streams.string
|
|
||||||
USING: io kernel math namespaces sequences sbufs strings
|
USING: io kernel math namespaces sequences sbufs strings
|
||||||
generic splitting growable continuations io.streams.plain
|
generic splitting growable continuations io.streams.plain
|
||||||
io.encodings ;
|
io.encodings io.encodings.private ;
|
||||||
|
IN: io.streams.string
|
||||||
|
|
||||||
M: growable dispose drop ;
|
M: growable dispose drop ;
|
||||||
|
|
||||||
|
|
|
@ -1,18 +1,20 @@
|
||||||
! Copyright (C) 2008 Daniel Ehrenberg.
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: io io.encodings strings kernel math sequences byte-arrays io.encodings ;
|
USING: io io.encodings kernel math ;
|
||||||
IN: io.encodings.ascii
|
IN: io.encodings.ascii
|
||||||
|
|
||||||
: encode-check< ( string stream max -- )
|
<PRIVATE
|
||||||
[ pick <= [ encode-error ] [ stream-write1 ] if ] 2curry each ;
|
: encode-if< ( char stream encoding max -- )
|
||||||
|
nip pick > [ encode-error ] [ stream-write1 ] if ;
|
||||||
|
|
||||||
: push-if< ( sbuf character max -- )
|
: decode-if< ( stream encoding max -- character )
|
||||||
over <= [ drop HEX: fffd ] when swap push ;
|
nip swap stream-read1 tuck > [ drop replacement-character ] unless ;
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
TUPLE: ascii ;
|
TUPLE: ascii ;
|
||||||
|
|
||||||
M: ascii stream-write-encoded ( string stream encoding -- )
|
M: ascii encode-char
|
||||||
drop 128 encode-check< ;
|
128 encode-if< ;
|
||||||
|
|
||||||
M: ascii decode-step
|
M: ascii decode-char
|
||||||
drop 128 push-if< ;
|
128 decode-if< ;
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
! Copyright (C) 2008 Daniel Ehrenberg.
|
! Copyright (C) 2008 Daniel Ehrenberg.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: io io.encodings strings kernel io.encodings.ascii sequences math ;
|
USING: io io.encodings kernel io.encodings.ascii.private ;
|
||||||
IN: io.encodings.latin1
|
IN: io.encodings.latin1
|
||||||
|
|
||||||
TUPLE: latin1 ;
|
TUPLE: latin1 ;
|
||||||
|
|
||||||
M: latin1 stream-write-encoded
|
M: latin1 encode-char
|
||||||
drop 256 encode-check< ;
|
256 encode-if< ;
|
||||||
|
|
||||||
M: latin1 decode-step
|
M: latin1 decode-char
|
||||||
drop swap push ;
|
drop stream-read1 ;
|
||||||
|
|
|
@ -4,92 +4,71 @@ USING: math kernel sequences sbufs vectors namespaces io.binary
|
||||||
io.encodings combinators splitting io byte-arrays ;
|
io.encodings combinators splitting io byte-arrays ;
|
||||||
IN: io.encodings.utf16
|
IN: io.encodings.utf16
|
||||||
|
|
||||||
! UTF-16BE decoding
|
TUPLE: utf16be ;
|
||||||
|
|
||||||
TUPLE: utf16be ch state ;
|
|
||||||
|
|
||||||
SYMBOL: double
|
|
||||||
SYMBOL: quad1
|
|
||||||
SYMBOL: quad2
|
|
||||||
SYMBOL: quad3
|
|
||||||
SYMBOL: ignore
|
|
||||||
|
|
||||||
: do-ignore ( -- ch state ) 0 ignore ;
|
|
||||||
|
|
||||||
: append-nums ( byte ch -- ch )
|
|
||||||
8 shift bitor ;
|
|
||||||
|
|
||||||
: end-multibyte ( buf byte ch -- buf ch state )
|
|
||||||
append-nums push-decoded ;
|
|
||||||
|
|
||||||
: begin-utf16be ( buf byte -- buf ch state )
|
|
||||||
dup -3 shift BIN: 11011 number= [
|
|
||||||
dup BIN: 00000100 bitand zero?
|
|
||||||
[ BIN: 11 bitand quad1 ]
|
|
||||||
[ drop do-ignore ] if
|
|
||||||
] [ double ] if ;
|
|
||||||
|
|
||||||
: handle-quad2be ( byte ch -- ch state )
|
|
||||||
swap dup -2 shift BIN: 110111 number= [
|
|
||||||
>r 2 shift r> BIN: 11 bitand bitor quad3
|
|
||||||
] [ 2drop do-ignore ] if ;
|
|
||||||
|
|
||||||
: decode-utf16be-step ( buf byte ch state -- buf ch state )
|
|
||||||
{
|
|
||||||
{ begin [ drop begin-utf16be ] }
|
|
||||||
{ double [ end-multibyte ] }
|
|
||||||
{ quad1 [ append-nums quad2 ] }
|
|
||||||
{ quad2 [ handle-quad2be ] }
|
|
||||||
{ quad3 [ append-nums HEX: 10000 + push-decoded ] }
|
|
||||||
{ ignore [ 2drop push-replacement ] }
|
|
||||||
} case ;
|
|
||||||
|
|
||||||
: unpack-state-be ( encoding -- ch state )
|
|
||||||
{ utf16be-ch utf16be-state } get-slots ;
|
|
||||||
|
|
||||||
: pack-state-be ( ch state encoding -- )
|
|
||||||
{ set-utf16be-ch set-utf16be-state } set-slots ;
|
|
||||||
|
|
||||||
M: utf16be decode-step
|
|
||||||
[ unpack-state-be decode-utf16be-step ] keep pack-state-be drop ;
|
|
||||||
|
|
||||||
M: utf16be init-decoder nip begin over set-utf16be-state ;
|
|
||||||
|
|
||||||
! UTF-16LE decoding
|
|
||||||
|
|
||||||
TUPLE: utf16le ch state ;
|
TUPLE: utf16le ch state ;
|
||||||
|
|
||||||
: handle-double ( buf byte ch -- buf ch state )
|
TUPLE: utf16 started? ;
|
||||||
swap dup -3 shift BIN: 11011 = [
|
|
||||||
dup BIN: 100 bitand 0 number=
|
|
||||||
[ BIN: 11 bitand 8 shift bitor quad2 ]
|
|
||||||
[ 2drop push-replacement ] if
|
|
||||||
] [ end-multibyte ] if ;
|
|
||||||
|
|
||||||
: handle-quad3le ( buf byte ch -- buf ch state )
|
<PRIVATE
|
||||||
swap dup -2 shift BIN: 110111 = [
|
|
||||||
BIN: 11 bitand append-nums HEX: 10000 + push-decoded
|
! UTF-16BE decoding
|
||||||
] [ 2drop push-replacement ] if ;
|
|
||||||
|
: append-nums ( byte ch -- ch )
|
||||||
|
over [ 8 shift bitor ] [ 2drop replacement-char ] if ;
|
||||||
|
|
||||||
|
: double-be ( stream byte -- stream char )
|
||||||
|
over stream-read1 swap append-nums ;
|
||||||
|
|
||||||
|
: quad-be ( stream byte -- stream char )
|
||||||
|
double-be over stream-read1 dup [
|
||||||
|
dup -2 shift BIN: 110111 number= [
|
||||||
|
>r 2 shift r> BIN: 11 bitand bitor
|
||||||
|
over stream-read1 swap append-nums HEX: 10000 +
|
||||||
|
] [ 2drop replacement-char ] if
|
||||||
|
] when ;
|
||||||
|
|
||||||
|
: ignore ( stream -- stream char )
|
||||||
|
dup stream-read1 drop replacement-char ;
|
||||||
|
|
||||||
|
: begin-utf16be ( stream byte -- stream char )
|
||||||
|
dup -3 shift BIN: 11011 number= [
|
||||||
|
dup BIN: 00000100 bitand zero?
|
||||||
|
[ BIN: 11 bitand quad-be ]
|
||||||
|
[ drop ignore ] if
|
||||||
|
] [ double-be ] if ;
|
||||||
|
|
||||||
|
M: decode-char
|
||||||
|
drop dup stream-read1 dup [ begin-utf16be ] when nip ;
|
||||||
|
|
||||||
|
! UTF-16LE decoding
|
||||||
|
|
||||||
|
: quad-le ( stream ch -- stream char )
|
||||||
|
over stream-read1 swap 10 shift bitor
|
||||||
|
over stream-read1 dup -2 shift BIN: 110111 = [
|
||||||
|
BIN: 11 bitand append-nums HEX: 10000 +
|
||||||
|
] [ 2drop replacement-char ] if ;
|
||||||
|
|
||||||
|
: double-le ( stream byte1 byte2 -- stream char )
|
||||||
|
dup -3 shift BIN: 11011 = [
|
||||||
|
dup BIN: 100 bitand 0 number=
|
||||||
|
[ BIN: 11 bitand 8 shift bitor quad-le ]
|
||||||
|
[ 2drop replacement-char ] if
|
||||||
|
] [ swap append-nums ] if ;
|
||||||
|
|
||||||
: decode-utf16le-step ( buf byte ch state -- buf ch state )
|
: decode-utf16le-step ( buf byte ch state -- buf ch state )
|
||||||
{
|
{
|
||||||
{ begin [ drop double ] }
|
{ begin [ drop double ] }
|
||||||
{ double [ handle-double ] }
|
{ double [ handle-double ] }
|
||||||
{ quad1 [ append-nums quad2 ] }
|
|
||||||
{ quad2 [ 10 shift bitor quad3 ] }
|
{ quad2 [ 10 shift bitor quad3 ] }
|
||||||
{ quad3 [ handle-quad3le ] }
|
{ quad3 [ handle-quad3le ] }
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
: unpack-state-le ( encoding -- ch state )
|
: begin-utf16le ( stream byte -- stream char )
|
||||||
{ utf16le-ch utf16le-state } get-slots ;
|
over stream-read1 [ double-le ] [ drop replacement-char ] if*
|
||||||
|
|
||||||
: pack-state-le ( ch state encoding -- )
|
M: decode-char
|
||||||
{ set-utf16le-ch set-utf16le-state } set-slots ;
|
drop dup stream-read1 dup [ begin-utf16le ] when nip ;
|
||||||
|
|
||||||
M: utf16le decode-step
|
|
||||||
[ unpack-state-le decode-utf16le-step ] keep pack-state-le drop ;
|
|
||||||
|
|
||||||
M: utf16le init-decoder nip begin over set-utf16le-state ;
|
|
||||||
|
|
||||||
! UTF-16LE/BE encoding
|
! UTF-16LE/BE encoding
|
||||||
|
|
||||||
|
@ -103,25 +82,25 @@ M: utf16le init-decoder nip begin over set-utf16le-state ;
|
||||||
dup -8 shift BIN: 11011100 bitor
|
dup -8 shift BIN: 11011100 bitor
|
||||||
swap BIN: 11111111 bitand ;
|
swap BIN: 11111111 bitand ;
|
||||||
|
|
||||||
: char>utf16be ( char -- )
|
: stream-write2 ( stream char1 char2 -- )
|
||||||
|
rot [ stream-write1 ] 2apply ;
|
||||||
|
|
||||||
|
: char>utf16be ( stream char -- )
|
||||||
dup HEX: FFFF > [
|
dup HEX: FFFF > [
|
||||||
HEX: 10000 -
|
HEX: 10000 -
|
||||||
dup encode-first swap write1 write1
|
dup encode-first stream-write2
|
||||||
encode-second swap write1 write1
|
encode-second stream-write2
|
||||||
] [ h>b/b write1 write1 ] if ;
|
] [ h>b/b swap stream-write2 ] if ;
|
||||||
|
|
||||||
: stream-write-utf16be ( string stream -- )
|
M: utf16be encode-char ( char stream encoding -- )
|
||||||
[ [ char>utf16be ] each ] with-stream* ;
|
drop char>utf16be ;
|
||||||
|
|
||||||
M: utf16be stream-write-encoded ( string stream encoding -- )
|
|
||||||
drop stream-write-utf16be ;
|
|
||||||
|
|
||||||
: char>utf16le ( char -- )
|
: char>utf16le ( char -- )
|
||||||
dup HEX: FFFF > [
|
dup HEX: FFFF > [
|
||||||
HEX: 10000 -
|
HEX: 10000 -
|
||||||
dup encode-first write1 write1
|
dup encode-first swap stream-write2
|
||||||
encode-second write1 write1
|
encode-second swap stream-write2
|
||||||
] [ h>b/b swap write1 write1 ] if ;
|
] [ h>b/b stream-write2 ] if ;
|
||||||
|
|
||||||
: stream-write-utf16le ( string stream -- )
|
: stream-write-utf16le ( string stream -- )
|
||||||
[ [ char>utf16le ] each ] with-stream* ;
|
[ [ char>utf16le ] each ] with-stream* ;
|
||||||
|
@ -139,17 +118,15 @@ M: utf16le stream-write-encoded ( string stream encoding -- )
|
||||||
|
|
||||||
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
|
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
|
||||||
|
|
||||||
TUPLE: utf16 started? ;
|
|
||||||
|
|
||||||
M: utf16 stream-write-encoded
|
|
||||||
dup utf16-started? [ drop ]
|
|
||||||
[ t swap set-utf16-started? bom-le over stream-write ] if
|
|
||||||
stream-write-utf16le ;
|
|
||||||
|
|
||||||
: bom>le/be ( bom -- le/be )
|
: bom>le/be ( bom -- le/be )
|
||||||
dup bom-le sequence= [ drop utf16le ] [
|
dup bom-le sequence= [ drop utf16le ] [
|
||||||
bom-be sequence= [ utf16be ] [ decode-error ] if
|
bom-be sequence= [ utf16be ] [ decode-error ] if
|
||||||
] if ;
|
] if ;
|
||||||
|
|
||||||
M: utf16 init-decoder ( stream encoding -- newencoding )
|
M: utf16 <decoder> ( stream utf16 -- decoder )
|
||||||
2 rot stream-read bom>le/be construct-empty init-decoder ;
|
2 rot stream-read bom>le/be <decoder> ;
|
||||||
|
|
||||||
|
M: utf16 <encoder> ( stream utf16 -- encoder )
|
||||||
|
drop bom-le over stream-write utf16le <encoder> ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
Loading…
Reference in New Issue