Unicode encoding changes

db4
Daniel Ehrenberg 2008-03-14 03:09:51 -05:00
parent f378122dc5
commit 56afb67bfc
7 changed files with 187 additions and 237 deletions

View File

@ -1,3 +1,8 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
IN: io.encodings.binary SYMBOL: binary
USING: io.encodings kernel ;
IN: io.encodings.binary
TUPLE: binary ;
M: binary <encoder> drop ;
M: binary <decoder> drop ;

View File

@ -2,62 +2,36 @@
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors namespaces
growable strings io classes continuations combinators
io.styles io.streams.plain io.encodings.binary splitting
io.streams.duplex byte-arrays ;
io.styles io.streams.plain splitting
io.streams.duplex byte-arrays sequences.private ;
IN: io.encodings
! The encoding descriptor protocol
GENERIC: decode-step ( buf char encoding -- )
M: object decode-step drop swap push ;
GENERIC: decode-char ( stream encoding -- char/f )
GENERIC: init-decoder ( stream encoding -- encoding )
M: tuple-class init-decoder construct-empty init-decoder ;
M: object init-decoder nip ;
GENERIC: encode-char ( char stream encoding -- )
GENERIC: stream-write-encoded ( string stream encoding -- byte-array )
M: object stream-write-encoded drop stream-write ;
GENERIC: <decoder> ( stream decoding -- newstream )
GENERIC: <encoder> ( stream encoding -- newstream )
: replacement-char HEX: fffd ;
! Decoding
<PRIVATE
TUPLE: decode-error ;
: decode-error ( -- * ) \ decode-error construct-empty throw ;
SYMBOL: begin
TUPLE: decoder stream code cr ;
M: tuple-class <decoder> construct-empty <decoder> ;
M: tuple <decoder> f decoder construct-boa ;
: push-decoded ( buf ch -- buf ch state )
over push 0 begin ;
: push-replacement ( buf -- buf ch state )
! This is the replacement character
HEX: fffd push-decoded ;
: space ( resizable -- room-left )
dup underlying swap [ length ] 2apply - ;
: full? ( resizable -- ? ) space zero? ;
: end-read-loop ( buf ch state stream quot -- string/f )
2drop 2drop >string f like ;
: decode-read-loop ( buf stream encoding -- string/f )
pick full? [ 2drop >string ] [
over stream-read1 [
-rot tuck >r >r >r dupd r> decode-step r> r>
decode-read-loop
] [ 2drop >string f like ] if*
] if ;
: decode-read ( length stream encoding -- string )
rot <sbuf> -rot decode-read-loop ;
TUPLE: decoder code cr ;
: <decoder> ( stream encoding -- newstream )
dup binary eq? [ drop ] [
dupd init-decoder { set-delegate set-decoder-code }
decoder construct
] if ;
: >decoder< ( decoder -- stream encoding )
{ decoder-stream decoder-code } get-slots ;
: cr+ t swap set-decoder-cr ; inline
@ -82,72 +56,78 @@ TUPLE: decoder code cr ;
over decoder-cr [
over cr-
"\n" ?head [
swap stream-read1 [ add ] when*
] [ nip ] if
] [ nip ] if ;
over stream-read1 [ add ] when*
] when
] when nip ;
: read-loop ( n stream -- string )
over 0 <string> [
[
>r stream-read1 dup
[ swap r> set-nth-unsafe f ] [ r> 3drop t ] if
] 2curry find-integer
] keep swap [ head ] when* ;
M: decoder stream-read
tuck { delegate decoder-code } get-slots decode-read fix-read ;
tuck read-loop fix-read ;
M: decoder stream-read-partial stream-read ;
: decoder-read-until ( stream delim -- ch )
! Copied from { c-reader stream-read-until }!!!
over stream-read1 dup [
dup pick memq? [ 2nip ] [ , decoder-read-until ] if
] [
2nip
] if ;
: (read-until) ( buf quot -- string/f sep/f )
! quot: -- char keep-going?
dup call
[ >r drop "" like r> ]
[ pick push (read-until) ] if ; inline
M: decoder stream-read-until
! Copied from { c-reader stream-read-until }!!!
[ swap decoder-read-until ] "" make
swap over empty? over not and [ 2drop f f ] when ;
SBUF" " clone -rot >decoder<
[ decode-char dup rot memq? ] 3curry (read-until) ;
: fix-read1 ( stream char -- char )
over decoder-cr [
over cr-
dup CHAR: \n = [
drop stream-read1
] [ nip ] if
] [ nip ] if ;
drop dup stream-read1
] when
] when nip ;
M: decoder stream-read1
1 swap stream-read f like [ first ] [ f ] if* ;
dup >decoder< decode-char fix-read1 ;
M: decoder stream-readln ( stream -- str )
"\r\n" over stream-read-until handle-readln ;
M: decoder dispose decoder-stream dispose ;
! Encoding
TUPLE: encode-error ;
: encode-error ( -- * ) \ encode-error construct-empty throw ;
TUPLE: encoder code ;
: <encoder> ( stream encoding -- newstream )
dup binary eq? [ drop ] [
construct-empty { set-delegate set-encoder-code }
encoder construct
] if ;
TUPLE: encoder stream code ;
M: tuple-class <encoder> construct-empty <encoder> ;
M: tuple <encoder> encoder construct-boa ;
: >encoder< ( encoder -- stream encoding )
{ encoder-stream encoder-code } get-slots ;
M: encoder stream-write1
>r 1string r> stream-write ;
>encoder< encode-char ;
M: encoder stream-write
{ delegate encoder-code } get-slots stream-write-encoded ;
>encoder< [ encode-char ] 2curry each ;
M: encoder dispose delegate dispose ;
M: encoder dispose encoder-stream dispose ;
INSTANCE: encoder plain-writer
! Rebinding duplex streams which have not read anything yet
: reencode ( stream encoding -- newstream )
over encoder? [ >r delegate r> ] when <encoder> ;
over encoder? [ >r encoder-stream r> ] when <encoder> ;
: redecode ( stream encoding -- newstream )
over decoder? [ >r delegate r> ] when <decoder> ;
over decoder? [ >r decoder-stream r> ] when <decoder> ;
PRIVATE>
: <encoder-duplex> ( stream-in stream-out encoding -- duplex )
tuck reencode >r redecode r> <duplex-stream> ;

View File

@ -6,82 +6,68 @@ IN: io.encodings.utf8
! Decoding UTF-8
TUPLE: utf8 ch state ;
TUPLE: utf8 ;
SYMBOL: double
SYMBOL: triple
SYMBOL: triple2
SYMBOL: quad
SYMBOL: quad2
SYMBOL: quad3
<PRIVATE
: starts-2? ( char -- ? )
-6 shift BIN: 10 number= ;
dup [ -6 shift BIN: 10 number= ] when ;
: append-nums ( buf bottom top state-out -- buf num state )
>r over starts-2?
[ 6 shift swap BIN: 111111 bitand bitor r> ]
[ r> 3drop push-replacement ] if ;
: append-nums ( stream byte -- stream char )
over stream-read1 dup starts-2?
[ 6 shift swap BIN: 111111 bitand bitor ]
[ 2drop replacement-char ] if ;
: begin-utf8 ( buf byte -- buf ch state )
: double ( stream byte -- stream char )
BIN: 11111 bitand append-nums ;
: triple ( stream byte -- stream char )
BIN: 1111 bitand append-nums append-nums ;
: quad ( stream byte -- stream char )
BIN: 111 bitand append-nums append-nums append-nums ;
: begin-utf8 ( stream byte -- stream char )
{
{ [ dup -7 shift zero? ] [ push-decoded ] }
{ [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] }
{ [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] }
{ [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] }
{ [ t ] [ drop push-replacement ] }
{ [ dup -7 shift zero? ] [ ] }
{ [ dup -5 shift BIN: 110 number= ] [ double ] }
{ [ dup -4 shift BIN: 1110 number= ] [ triple ] }
{ [ dup -3 shift BIN: 11110 number= ] [ quad ] }
{ [ t ] [ drop replacement-char ] }
} cond ;
: end-multibyte ( buf byte ch -- buf ch state )
f append-nums [ push-decoded ] unless* ;
: decode-utf8 ( stream -- char/f )
dup stream-read1 dup [ begin-utf8 ] when nip ;
: decode-utf8-step ( buf byte ch state -- buf ch state )
{
{ begin [ drop begin-utf8 ] }
{ double [ end-multibyte ] }
{ triple [ triple2 append-nums ] }
{ triple2 [ end-multibyte ] }
{ quad [ quad2 append-nums ] }
{ quad2 [ quad3 append-nums ] }
{ quad3 [ end-multibyte ] }
} case ;
: unpack-state ( encoding -- ch state )
{ utf8-ch utf8-state } get-slots ;
: pack-state ( ch state encoding -- )
{ set-utf8-ch set-utf8-state } set-slots ;
M: utf8 decode-step ( buf char encoding -- )
[ unpack-state decode-utf8-step ] keep pack-state drop ;
M: utf8 init-decoder nip begin over set-utf8-state ;
M: utf8 decode-char
drop decode-utf8 ;
! Encoding UTF-8
: encoded ( char -- )
BIN: 111111 bitand BIN: 10000000 bitor write1 ;
: encoded ( stream char -- )
BIN: 111111 bitand BIN: 10000000 bitor swap stream-write1 ;
: char>utf8 ( char -- )
: char>utf8 ( stream char -- )
{
{ [ dup -7 shift zero? ] [ write1 ] }
{ [ dup -7 shift zero? ] [ swap stream-write1 ] }
{ [ dup -11 shift zero? ] [
dup -6 shift BIN: 11000000 bitor write1
2dup -6 shift BIN: 11000000 bitor swap stream-write1
encoded
] }
{ [ dup -16 shift zero? ] [
dup -12 shift BIN: 11100000 bitor write1
dup -6 shift encoded
2dup -12 shift BIN: 11100000 bitor swap stream-write1
2dup -6 shift encoded
encoded
] }
{ [ t ] [
dup -18 shift BIN: 11110000 bitor write1
dup -12 shift encoded
dup -6 shift encoded
2dup -18 shift BIN: 11110000 bitor swap stream-write1
2dup -12 shift encoded
2dup -6 shift encoded
encoded
] }
} cond ;
M: utf8 stream-write-encoded
! For efficiency, this should be modified to avoid variable reads
drop [ [ char>utf8 ] each ] with-stream* ;
M: utf8 encode-char
drop swap char>utf8 ;
PRIVATE>

View File

@ -1,9 +1,9 @@
! Copyright (C) 2003, 2007 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license.
IN: io.streams.string
USING: io kernel math namespaces sequences sbufs strings
generic splitting growable continuations io.streams.plain
io.encodings ;
io.encodings io.encodings.private ;
IN: io.streams.string
M: growable dispose drop ;

View File

@ -1,18 +1,20 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: io io.encodings strings kernel math sequences byte-arrays io.encodings ;
USING: io io.encodings kernel math ;
IN: io.encodings.ascii
: encode-check< ( string stream max -- )
[ pick <= [ encode-error ] [ stream-write1 ] if ] 2curry each ;
<PRIVATE
: encode-if< ( char stream encoding max -- )
nip pick > [ encode-error ] [ stream-write1 ] if ;
: push-if< ( sbuf character max -- )
over <= [ drop HEX: fffd ] when swap push ;
: decode-if< ( stream encoding max -- character )
nip swap stream-read1 tuck > [ drop replacement-character ] unless ;
PRIVATE>
TUPLE: ascii ;
M: ascii stream-write-encoded ( string stream encoding -- )
drop 128 encode-check< ;
M: ascii encode-char
128 encode-if< ;
M: ascii decode-step
drop 128 push-if< ;
M: ascii decode-char
128 decode-if< ;

View File

@ -1,12 +1,12 @@
! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: io io.encodings strings kernel io.encodings.ascii sequences math ;
USING: io io.encodings kernel io.encodings.ascii.private ;
IN: io.encodings.latin1
TUPLE: latin1 ;
M: latin1 stream-write-encoded
drop 256 encode-check< ;
M: latin1 encode-char
256 encode-if< ;
M: latin1 decode-step
drop swap push ;
M: latin1 decode-char
drop stream-read1 ;

View File

@ -4,92 +4,71 @@ USING: math kernel sequences sbufs vectors namespaces io.binary
io.encodings combinators splitting io byte-arrays ;
IN: io.encodings.utf16
! UTF-16BE decoding
TUPLE: utf16be ch state ;
SYMBOL: double
SYMBOL: quad1
SYMBOL: quad2
SYMBOL: quad3
SYMBOL: ignore
: do-ignore ( -- ch state ) 0 ignore ;
: append-nums ( byte ch -- ch )
8 shift bitor ;
: end-multibyte ( buf byte ch -- buf ch state )
append-nums push-decoded ;
: begin-utf16be ( buf byte -- buf ch state )
dup -3 shift BIN: 11011 number= [
dup BIN: 00000100 bitand zero?
[ BIN: 11 bitand quad1 ]
[ drop do-ignore ] if
] [ double ] if ;
: handle-quad2be ( byte ch -- ch state )
swap dup -2 shift BIN: 110111 number= [
>r 2 shift r> BIN: 11 bitand bitor quad3
] [ 2drop do-ignore ] if ;
: decode-utf16be-step ( buf byte ch state -- buf ch state )
{
{ begin [ drop begin-utf16be ] }
{ double [ end-multibyte ] }
{ quad1 [ append-nums quad2 ] }
{ quad2 [ handle-quad2be ] }
{ quad3 [ append-nums HEX: 10000 + push-decoded ] }
{ ignore [ 2drop push-replacement ] }
} case ;
: unpack-state-be ( encoding -- ch state )
{ utf16be-ch utf16be-state } get-slots ;
: pack-state-be ( ch state encoding -- )
{ set-utf16be-ch set-utf16be-state } set-slots ;
M: utf16be decode-step
[ unpack-state-be decode-utf16be-step ] keep pack-state-be drop ;
M: utf16be init-decoder nip begin over set-utf16be-state ;
! UTF-16LE decoding
TUPLE: utf16be ;
TUPLE: utf16le ch state ;
: handle-double ( buf byte ch -- buf ch state )
swap dup -3 shift BIN: 11011 = [
dup BIN: 100 bitand 0 number=
[ BIN: 11 bitand 8 shift bitor quad2 ]
[ 2drop push-replacement ] if
] [ end-multibyte ] if ;
TUPLE: utf16 started? ;
: handle-quad3le ( buf byte ch -- buf ch state )
swap dup -2 shift BIN: 110111 = [
BIN: 11 bitand append-nums HEX: 10000 + push-decoded
] [ 2drop push-replacement ] if ;
<PRIVATE
! UTF-16BE decoding
: append-nums ( byte ch -- ch )
over [ 8 shift bitor ] [ 2drop replacement-char ] if ;
: double-be ( stream byte -- stream char )
over stream-read1 swap append-nums ;
: quad-be ( stream byte -- stream char )
double-be over stream-read1 dup [
dup -2 shift BIN: 110111 number= [
>r 2 shift r> BIN: 11 bitand bitor
over stream-read1 swap append-nums HEX: 10000 +
] [ 2drop replacement-char ] if
] when ;
: ignore ( stream -- stream char )
dup stream-read1 drop replacement-char ;
: begin-utf16be ( stream byte -- stream char )
dup -3 shift BIN: 11011 number= [
dup BIN: 00000100 bitand zero?
[ BIN: 11 bitand quad-be ]
[ drop ignore ] if
] [ double-be ] if ;
M: decode-char
drop dup stream-read1 dup [ begin-utf16be ] when nip ;
! UTF-16LE decoding
: quad-le ( stream ch -- stream char )
over stream-read1 swap 10 shift bitor
over stream-read1 dup -2 shift BIN: 110111 = [
BIN: 11 bitand append-nums HEX: 10000 +
] [ 2drop replacement-char ] if ;
: double-le ( stream byte1 byte2 -- stream char )
dup -3 shift BIN: 11011 = [
dup BIN: 100 bitand 0 number=
[ BIN: 11 bitand 8 shift bitor quad-le ]
[ 2drop replacement-char ] if
] [ swap append-nums ] if ;
: decode-utf16le-step ( buf byte ch state -- buf ch state )
{
{ begin [ drop double ] }
{ double [ handle-double ] }
{ quad1 [ append-nums quad2 ] }
{ quad2 [ 10 shift bitor quad3 ] }
{ quad3 [ handle-quad3le ] }
} case ;
: unpack-state-le ( encoding -- ch state )
{ utf16le-ch utf16le-state } get-slots ;
: begin-utf16le ( stream byte -- stream char )
over stream-read1 [ double-le ] [ drop replacement-char ] if*
: pack-state-le ( ch state encoding -- )
{ set-utf16le-ch set-utf16le-state } set-slots ;
M: utf16le decode-step
[ unpack-state-le decode-utf16le-step ] keep pack-state-le drop ;
M: utf16le init-decoder nip begin over set-utf16le-state ;
M: decode-char
drop dup stream-read1 dup [ begin-utf16le ] when nip ;
! UTF-16LE/BE encoding
@ -103,25 +82,25 @@ M: utf16le init-decoder nip begin over set-utf16le-state ;
dup -8 shift BIN: 11011100 bitor
swap BIN: 11111111 bitand ;
: char>utf16be ( char -- )
: stream-write2 ( stream char1 char2 -- )
rot [ stream-write1 ] 2apply ;
: char>utf16be ( stream char -- )
dup HEX: FFFF > [
HEX: 10000 -
dup encode-first swap write1 write1
encode-second swap write1 write1
] [ h>b/b write1 write1 ] if ;
dup encode-first stream-write2
encode-second stream-write2
] [ h>b/b swap stream-write2 ] if ;
: stream-write-utf16be ( string stream -- )
[ [ char>utf16be ] each ] with-stream* ;
M: utf16be stream-write-encoded ( string stream encoding -- )
drop stream-write-utf16be ;
M: utf16be encode-char ( char stream encoding -- )
drop char>utf16be ;
: char>utf16le ( char -- )
dup HEX: FFFF > [
HEX: 10000 -
dup encode-first write1 write1
encode-second write1 write1
] [ h>b/b swap write1 write1 ] if ;
dup encode-first swap stream-write2
encode-second swap stream-write2
] [ h>b/b stream-write2 ] if ;
: stream-write-utf16le ( string stream -- )
[ [ char>utf16le ] each ] with-stream* ;
@ -139,17 +118,15 @@ M: utf16le stream-write-encoded ( string stream encoding -- )
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
TUPLE: utf16 started? ;
M: utf16 stream-write-encoded
dup utf16-started? [ drop ]
[ t swap set-utf16-started? bom-le over stream-write ] if
stream-write-utf16le ;
: bom>le/be ( bom -- le/be )
dup bom-le sequence= [ drop utf16le ] [
bom-be sequence= [ utf16be ] [ decode-error ] if
] if ;
M: utf16 init-decoder ( stream encoding -- newencoding )
2 rot stream-read bom>le/be construct-empty init-decoder ;
M: utf16 <decoder> ( stream utf16 -- decoder )
2 rot stream-read bom>le/be <decoder> ;
M: utf16 <encoder> ( stream utf16 -- encoder )
drop bom-le over stream-write utf16le <encoder> ;
PRIVATE>