Unicode encoding changes

db4
Daniel Ehrenberg 2008-03-14 03:09:51 -05:00
parent f378122dc5
commit 56afb67bfc
7 changed files with 187 additions and 237 deletions

View File

@ -1,3 +1,8 @@
! Copyright (C) 2008 Daniel Ehrenberg. ! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
IN: io.encodings.binary SYMBOL: binary USING: io.encodings kernel ;
IN: io.encodings.binary
TUPLE: binary ;
M: binary <encoder> drop ;
M: binary <decoder> drop ;

View File

@ -2,62 +2,36 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors namespaces USING: math kernel sequences sbufs vectors namespaces
growable strings io classes continuations combinators growable strings io classes continuations combinators
io.styles io.streams.plain io.encodings.binary splitting io.styles io.streams.plain splitting
io.streams.duplex byte-arrays ; io.streams.duplex byte-arrays sequences.private ;
IN: io.encodings IN: io.encodings
! The encoding descriptor protocol ! The encoding descriptor protocol
GENERIC: decode-step ( buf char encoding -- ) GENERIC: decode-char ( stream encoding -- char/f )
M: object decode-step drop swap push ;
GENERIC: init-decoder ( stream encoding -- encoding ) GENERIC: encode-char ( char stream encoding -- )
M: tuple-class init-decoder construct-empty init-decoder ;
M: object init-decoder nip ;
GENERIC: stream-write-encoded ( string stream encoding -- byte-array ) GENERIC: <decoder> ( stream decoding -- newstream )
M: object stream-write-encoded drop stream-write ;
GENERIC: <encoder> ( stream encoding -- newstream )
: replacement-char HEX: fffd ;
! Decoding ! Decoding
<PRIVATE
TUPLE: decode-error ; TUPLE: decode-error ;
: decode-error ( -- * ) \ decode-error construct-empty throw ; : decode-error ( -- * ) \ decode-error construct-empty throw ;
SYMBOL: begin TUPLE: decoder stream code cr ;
M: tuple-class <decoder> construct-empty <decoder> ;
M: tuple <decoder> f decoder construct-boa ;
: push-decoded ( buf ch -- buf ch state ) : >decoder< ( decoder -- stream encoding )
over push 0 begin ; { decoder-stream decoder-code } get-slots ;
: push-replacement ( buf -- buf ch state )
! This is the replacement character
HEX: fffd push-decoded ;
: space ( resizable -- room-left )
dup underlying swap [ length ] 2apply - ;
: full? ( resizable -- ? ) space zero? ;
: end-read-loop ( buf ch state stream quot -- string/f )
2drop 2drop >string f like ;
: decode-read-loop ( buf stream encoding -- string/f )
pick full? [ 2drop >string ] [
over stream-read1 [
-rot tuck >r >r >r dupd r> decode-step r> r>
decode-read-loop
] [ 2drop >string f like ] if*
] if ;
: decode-read ( length stream encoding -- string )
rot <sbuf> -rot decode-read-loop ;
TUPLE: decoder code cr ;
: <decoder> ( stream encoding -- newstream )
dup binary eq? [ drop ] [
dupd init-decoder { set-delegate set-decoder-code }
decoder construct
] if ;
: cr+ t swap set-decoder-cr ; inline : cr+ t swap set-decoder-cr ; inline
@ -82,72 +56,78 @@ TUPLE: decoder code cr ;
over decoder-cr [ over decoder-cr [
over cr- over cr-
"\n" ?head [ "\n" ?head [
swap stream-read1 [ add ] when* over stream-read1 [ add ] when*
] [ nip ] if ] when
] [ nip ] if ; ] when nip ;
: read-loop ( n stream -- string )
over 0 <string> [
[
>r stream-read1 dup
[ swap r> set-nth-unsafe f ] [ r> 3drop t ] if
] 2curry find-integer
] keep swap [ head ] when* ;
M: decoder stream-read M: decoder stream-read
tuck { delegate decoder-code } get-slots decode-read fix-read ; tuck read-loop fix-read ;
M: decoder stream-read-partial stream-read ; : (read-until) ( buf quot -- string/f sep/f )
! quot: -- char keep-going?
: decoder-read-until ( stream delim -- ch ) dup call
! Copied from { c-reader stream-read-until }!!! [ >r drop "" like r> ]
over stream-read1 dup [ [ pick push (read-until) ] if ; inline
dup pick memq? [ 2nip ] [ , decoder-read-until ] if
] [
2nip
] if ;
M: decoder stream-read-until M: decoder stream-read-until
! Copied from { c-reader stream-read-until }!!! SBUF" " clone -rot >decoder<
[ swap decoder-read-until ] "" make [ decode-char dup rot memq? ] 3curry (read-until) ;
swap over empty? over not and [ 2drop f f ] when ;
: fix-read1 ( stream char -- char ) : fix-read1 ( stream char -- char )
over decoder-cr [ over decoder-cr [
over cr- over cr-
dup CHAR: \n = [ dup CHAR: \n = [
drop stream-read1 drop dup stream-read1
] [ nip ] if ] when
] [ nip ] if ; ] when nip ;
M: decoder stream-read1 M: decoder stream-read1
1 swap stream-read f like [ first ] [ f ] if* ; dup >decoder< decode-char fix-read1 ;
M: decoder stream-readln ( stream -- str ) M: decoder stream-readln ( stream -- str )
"\r\n" over stream-read-until handle-readln ; "\r\n" over stream-read-until handle-readln ;
M: decoder dispose decoder-stream dispose ;
! Encoding ! Encoding
TUPLE: encode-error ; TUPLE: encode-error ;
: encode-error ( -- * ) \ encode-error construct-empty throw ; : encode-error ( -- * ) \ encode-error construct-empty throw ;
TUPLE: encoder code ; TUPLE: encoder stream code ;
: <encoder> ( stream encoding -- newstream ) M: tuple-class <encoder> construct-empty <encoder> ;
dup binary eq? [ drop ] [ M: tuple <encoder> encoder construct-boa ;
construct-empty { set-delegate set-encoder-code }
encoder construct : >encoder< ( encoder -- stream encoding )
] if ; { encoder-stream encoder-code } get-slots ;
M: encoder stream-write1 M: encoder stream-write1
>r 1string r> stream-write ; >encoder< encode-char ;
M: encoder stream-write M: encoder stream-write
{ delegate encoder-code } get-slots stream-write-encoded ; >encoder< [ encode-char ] 2curry each ;
M: encoder dispose delegate dispose ; M: encoder dispose encoder-stream dispose ;
INSTANCE: encoder plain-writer INSTANCE: encoder plain-writer
! Rebinding duplex streams which have not read anything yet ! Rebinding duplex streams which have not read anything yet
: reencode ( stream encoding -- newstream ) : reencode ( stream encoding -- newstream )
over encoder? [ >r delegate r> ] when <encoder> ; over encoder? [ >r encoder-stream r> ] when <encoder> ;
: redecode ( stream encoding -- newstream ) : redecode ( stream encoding -- newstream )
over decoder? [ >r delegate r> ] when <decoder> ; over decoder? [ >r decoder-stream r> ] when <decoder> ;
PRIVATE>
: <encoder-duplex> ( stream-in stream-out encoding -- duplex ) : <encoder-duplex> ( stream-in stream-out encoding -- duplex )
tuck reencode >r redecode r> <duplex-stream> ; tuck reencode >r redecode r> <duplex-stream> ;

View File

@ -6,82 +6,68 @@ IN: io.encodings.utf8
! Decoding UTF-8 ! Decoding UTF-8
TUPLE: utf8 ch state ; TUPLE: utf8 ;
SYMBOL: double <PRIVATE
SYMBOL: triple
SYMBOL: triple2
SYMBOL: quad
SYMBOL: quad2
SYMBOL: quad3
: starts-2? ( char -- ? ) : starts-2? ( char -- ? )
-6 shift BIN: 10 number= ; dup [ -6 shift BIN: 10 number= ] when ;
: append-nums ( buf bottom top state-out -- buf num state ) : append-nums ( stream byte -- stream char )
>r over starts-2? over stream-read1 dup starts-2?
[ 6 shift swap BIN: 111111 bitand bitor r> ] [ 6 shift swap BIN: 111111 bitand bitor ]
[ r> 3drop push-replacement ] if ; [ 2drop replacement-char ] if ;
: begin-utf8 ( buf byte -- buf ch state ) : double ( stream byte -- stream char )
BIN: 11111 bitand append-nums ;
: triple ( stream byte -- stream char )
BIN: 1111 bitand append-nums append-nums ;
: quad ( stream byte -- stream char )
BIN: 111 bitand append-nums append-nums append-nums ;
: begin-utf8 ( stream byte -- stream char )
{ {
{ [ dup -7 shift zero? ] [ push-decoded ] } { [ dup -7 shift zero? ] [ ] }
{ [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] } { [ dup -5 shift BIN: 110 number= ] [ double ] }
{ [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] } { [ dup -4 shift BIN: 1110 number= ] [ triple ] }
{ [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] } { [ dup -3 shift BIN: 11110 number= ] [ quad ] }
{ [ t ] [ drop push-replacement ] } { [ t ] [ drop replacement-char ] }
} cond ; } cond ;
: end-multibyte ( buf byte ch -- buf ch state ) : decode-utf8 ( stream -- char/f )
f append-nums [ push-decoded ] unless* ; dup stream-read1 dup [ begin-utf8 ] when nip ;
: decode-utf8-step ( buf byte ch state -- buf ch state ) M: utf8 decode-char
{ drop decode-utf8 ;
{ begin [ drop begin-utf8 ] }
{ double [ end-multibyte ] }
{ triple [ triple2 append-nums ] }
{ triple2 [ end-multibyte ] }
{ quad [ quad2 append-nums ] }
{ quad2 [ quad3 append-nums ] }
{ quad3 [ end-multibyte ] }
} case ;
: unpack-state ( encoding -- ch state )
{ utf8-ch utf8-state } get-slots ;
: pack-state ( ch state encoding -- )
{ set-utf8-ch set-utf8-state } set-slots ;
M: utf8 decode-step ( buf char encoding -- )
[ unpack-state decode-utf8-step ] keep pack-state drop ;
M: utf8 init-decoder nip begin over set-utf8-state ;
! Encoding UTF-8 ! Encoding UTF-8
: encoded ( char -- ) : encoded ( stream char -- )
BIN: 111111 bitand BIN: 10000000 bitor write1 ; BIN: 111111 bitand BIN: 10000000 bitor swap stream-write1 ;
: char>utf8 ( char -- ) : char>utf8 ( stream char -- )
{ {
{ [ dup -7 shift zero? ] [ write1 ] } { [ dup -7 shift zero? ] [ swap stream-write1 ] }
{ [ dup -11 shift zero? ] [ { [ dup -11 shift zero? ] [
dup -6 shift BIN: 11000000 bitor write1 2dup -6 shift BIN: 11000000 bitor swap stream-write1
encoded encoded
] } ] }
{ [ dup -16 shift zero? ] [ { [ dup -16 shift zero? ] [
dup -12 shift BIN: 11100000 bitor write1 2dup -12 shift BIN: 11100000 bitor swap stream-write1
dup -6 shift encoded 2dup -6 shift encoded
encoded encoded
] } ] }
{ [ t ] [ { [ t ] [
dup -18 shift BIN: 11110000 bitor write1 2dup -18 shift BIN: 11110000 bitor swap stream-write1
dup -12 shift encoded 2dup -12 shift encoded
dup -6 shift encoded 2dup -6 shift encoded
encoded encoded
] } ] }
} cond ; } cond ;
M: utf8 stream-write-encoded M: utf8 encode-char
! For efficiency, this should be modified to avoid variable reads drop swap char>utf8 ;
drop [ [ char>utf8 ] each ] with-stream* ;
PRIVATE>

View File

@ -1,9 +1,9 @@
! Copyright (C) 2003, 2007 Slava Pestov. ! Copyright (C) 2003, 2007 Slava Pestov.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
IN: io.streams.string
USING: io kernel math namespaces sequences sbufs strings USING: io kernel math namespaces sequences sbufs strings
generic splitting growable continuations io.streams.plain generic splitting growable continuations io.streams.plain
io.encodings ; io.encodings io.encodings.private ;
IN: io.streams.string
M: growable dispose drop ; M: growable dispose drop ;

View File

@ -1,18 +1,20 @@
! Copyright (C) 2008 Daniel Ehrenberg. ! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: io io.encodings strings kernel math sequences byte-arrays io.encodings ; USING: io io.encodings kernel math ;
IN: io.encodings.ascii IN: io.encodings.ascii
: encode-check< ( string stream max -- ) <PRIVATE
[ pick <= [ encode-error ] [ stream-write1 ] if ] 2curry each ; : encode-if< ( char stream encoding max -- )
nip pick > [ encode-error ] [ stream-write1 ] if ;
: push-if< ( sbuf character max -- ) : decode-if< ( stream encoding max -- character )
over <= [ drop HEX: fffd ] when swap push ; nip swap stream-read1 tuck > [ drop replacement-character ] unless ;
PRIVATE>
TUPLE: ascii ; TUPLE: ascii ;
M: ascii stream-write-encoded ( string stream encoding -- ) M: ascii encode-char
drop 128 encode-check< ; 128 encode-if< ;
M: ascii decode-step M: ascii decode-char
drop 128 push-if< ; 128 decode-if< ;

View File

@ -1,12 +1,12 @@
! Copyright (C) 2008 Daniel Ehrenberg. ! Copyright (C) 2008 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: io io.encodings strings kernel io.encodings.ascii sequences math ; USING: io io.encodings kernel io.encodings.ascii.private ;
IN: io.encodings.latin1 IN: io.encodings.latin1
TUPLE: latin1 ; TUPLE: latin1 ;
M: latin1 stream-write-encoded M: latin1 encode-char
drop 256 encode-check< ; 256 encode-if< ;
M: latin1 decode-step M: latin1 decode-char
drop swap push ; drop stream-read1 ;

View File

@ -4,92 +4,71 @@ USING: math kernel sequences sbufs vectors namespaces io.binary
io.encodings combinators splitting io byte-arrays ; io.encodings combinators splitting io byte-arrays ;
IN: io.encodings.utf16 IN: io.encodings.utf16
! UTF-16BE decoding TUPLE: utf16be ;
TUPLE: utf16be ch state ;
SYMBOL: double
SYMBOL: quad1
SYMBOL: quad2
SYMBOL: quad3
SYMBOL: ignore
: do-ignore ( -- ch state ) 0 ignore ;
: append-nums ( byte ch -- ch )
8 shift bitor ;
: end-multibyte ( buf byte ch -- buf ch state )
append-nums push-decoded ;
: begin-utf16be ( buf byte -- buf ch state )
dup -3 shift BIN: 11011 number= [
dup BIN: 00000100 bitand zero?
[ BIN: 11 bitand quad1 ]
[ drop do-ignore ] if
] [ double ] if ;
: handle-quad2be ( byte ch -- ch state )
swap dup -2 shift BIN: 110111 number= [
>r 2 shift r> BIN: 11 bitand bitor quad3
] [ 2drop do-ignore ] if ;
: decode-utf16be-step ( buf byte ch state -- buf ch state )
{
{ begin [ drop begin-utf16be ] }
{ double [ end-multibyte ] }
{ quad1 [ append-nums quad2 ] }
{ quad2 [ handle-quad2be ] }
{ quad3 [ append-nums HEX: 10000 + push-decoded ] }
{ ignore [ 2drop push-replacement ] }
} case ;
: unpack-state-be ( encoding -- ch state )
{ utf16be-ch utf16be-state } get-slots ;
: pack-state-be ( ch state encoding -- )
{ set-utf16be-ch set-utf16be-state } set-slots ;
M: utf16be decode-step
[ unpack-state-be decode-utf16be-step ] keep pack-state-be drop ;
M: utf16be init-decoder nip begin over set-utf16be-state ;
! UTF-16LE decoding
TUPLE: utf16le ch state ; TUPLE: utf16le ch state ;
: handle-double ( buf byte ch -- buf ch state ) TUPLE: utf16 started? ;
swap dup -3 shift BIN: 11011 = [
dup BIN: 100 bitand 0 number=
[ BIN: 11 bitand 8 shift bitor quad2 ]
[ 2drop push-replacement ] if
] [ end-multibyte ] if ;
: handle-quad3le ( buf byte ch -- buf ch state ) <PRIVATE
swap dup -2 shift BIN: 110111 = [
BIN: 11 bitand append-nums HEX: 10000 + push-decoded ! UTF-16BE decoding
] [ 2drop push-replacement ] if ;
: append-nums ( byte ch -- ch )
over [ 8 shift bitor ] [ 2drop replacement-char ] if ;
: double-be ( stream byte -- stream char )
over stream-read1 swap append-nums ;
: quad-be ( stream byte -- stream char )
double-be over stream-read1 dup [
dup -2 shift BIN: 110111 number= [
>r 2 shift r> BIN: 11 bitand bitor
over stream-read1 swap append-nums HEX: 10000 +
] [ 2drop replacement-char ] if
] when ;
: ignore ( stream -- stream char )
dup stream-read1 drop replacement-char ;
: begin-utf16be ( stream byte -- stream char )
dup -3 shift BIN: 11011 number= [
dup BIN: 00000100 bitand zero?
[ BIN: 11 bitand quad-be ]
[ drop ignore ] if
] [ double-be ] if ;
M: decode-char
drop dup stream-read1 dup [ begin-utf16be ] when nip ;
! UTF-16LE decoding
: quad-le ( stream ch -- stream char )
over stream-read1 swap 10 shift bitor
over stream-read1 dup -2 shift BIN: 110111 = [
BIN: 11 bitand append-nums HEX: 10000 +
] [ 2drop replacement-char ] if ;
: double-le ( stream byte1 byte2 -- stream char )
dup -3 shift BIN: 11011 = [
dup BIN: 100 bitand 0 number=
[ BIN: 11 bitand 8 shift bitor quad-le ]
[ 2drop replacement-char ] if
] [ swap append-nums ] if ;
: decode-utf16le-step ( buf byte ch state -- buf ch state ) : decode-utf16le-step ( buf byte ch state -- buf ch state )
{ {
{ begin [ drop double ] } { begin [ drop double ] }
{ double [ handle-double ] } { double [ handle-double ] }
{ quad1 [ append-nums quad2 ] }
{ quad2 [ 10 shift bitor quad3 ] } { quad2 [ 10 shift bitor quad3 ] }
{ quad3 [ handle-quad3le ] } { quad3 [ handle-quad3le ] }
} case ; } case ;
: unpack-state-le ( encoding -- ch state ) : begin-utf16le ( stream byte -- stream char )
{ utf16le-ch utf16le-state } get-slots ; over stream-read1 [ double-le ] [ drop replacement-char ] if*
: pack-state-le ( ch state encoding -- ) M: decode-char
{ set-utf16le-ch set-utf16le-state } set-slots ; drop dup stream-read1 dup [ begin-utf16le ] when nip ;
M: utf16le decode-step
[ unpack-state-le decode-utf16le-step ] keep pack-state-le drop ;
M: utf16le init-decoder nip begin over set-utf16le-state ;
! UTF-16LE/BE encoding ! UTF-16LE/BE encoding
@ -103,25 +82,25 @@ M: utf16le init-decoder nip begin over set-utf16le-state ;
dup -8 shift BIN: 11011100 bitor dup -8 shift BIN: 11011100 bitor
swap BIN: 11111111 bitand ; swap BIN: 11111111 bitand ;
: char>utf16be ( char -- ) : stream-write2 ( stream char1 char2 -- )
rot [ stream-write1 ] 2apply ;
: char>utf16be ( stream char -- )
dup HEX: FFFF > [ dup HEX: FFFF > [
HEX: 10000 - HEX: 10000 -
dup encode-first swap write1 write1 dup encode-first stream-write2
encode-second swap write1 write1 encode-second stream-write2
] [ h>b/b write1 write1 ] if ; ] [ h>b/b swap stream-write2 ] if ;
: stream-write-utf16be ( string stream -- ) M: utf16be encode-char ( char stream encoding -- )
[ [ char>utf16be ] each ] with-stream* ; drop char>utf16be ;
M: utf16be stream-write-encoded ( string stream encoding -- )
drop stream-write-utf16be ;
: char>utf16le ( char -- ) : char>utf16le ( char -- )
dup HEX: FFFF > [ dup HEX: FFFF > [
HEX: 10000 - HEX: 10000 -
dup encode-first write1 write1 dup encode-first swap stream-write2
encode-second write1 write1 encode-second swap stream-write2
] [ h>b/b swap write1 write1 ] if ; ] [ h>b/b stream-write2 ] if ;
: stream-write-utf16le ( string stream -- ) : stream-write-utf16le ( string stream -- )
[ [ char>utf16le ] each ] with-stream* ; [ [ char>utf16le ] each ] with-stream* ;
@ -139,17 +118,15 @@ M: utf16le stream-write-encoded ( string stream encoding -- )
: start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ; : start-utf16be? ( seq1 -- seq2 ? ) bom-be ?head ;
TUPLE: utf16 started? ;
M: utf16 stream-write-encoded
dup utf16-started? [ drop ]
[ t swap set-utf16-started? bom-le over stream-write ] if
stream-write-utf16le ;
: bom>le/be ( bom -- le/be ) : bom>le/be ( bom -- le/be )
dup bom-le sequence= [ drop utf16le ] [ dup bom-le sequence= [ drop utf16le ] [
bom-be sequence= [ utf16be ] [ decode-error ] if bom-be sequence= [ utf16be ] [ decode-error ] if
] if ; ] if ;
M: utf16 init-decoder ( stream encoding -- newencoding ) M: utf16 <decoder> ( stream utf16 -- decoder )
2 rot stream-read bom>le/be construct-empty init-decoder ; 2 rot stream-read bom>le/be <decoder> ;
M: utf16 <encoder> ( stream utf16 -- encoder )
drop bom-le over stream-write utf16le <encoder> ;
PRIVATE>