factor/basis/io/encodings/utf16/utf16.factor

126 lines
3.3 KiB
Factor
Raw Normal View History

2009-02-03 18:32:05 -05:00
! Copyright (C) 2006, 2009 Daniel Ehrenberg.
2008-02-11 00:14:42 -05:00
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors namespaces io.binary
2009-02-03 18:32:05 -05:00
io.encodings combinators splitting io byte-arrays io.encodings.iana ;
2008-02-15 20:44:35 -05:00
IN: io.encodings.utf16
2008-02-11 00:14:42 -05:00
! Encoding descriptor for big-endian UTF-16, registered under the
! name "UTF-16BE".
SINGLETON: utf16be

utf16be "UTF-16BE" register-encoding
! Encoding descriptor for little-endian UTF-16, registered under the
! name "UTF-16LE".
SINGLETON: utf16le

utf16le "UTF-16LE" register-encoding
! Encoding descriptor for UTF-16 with a byte-order mark, registered
! under the name "UTF-16". Its <decoder> picks utf16le or utf16be from
! the first two bytes; its <encoder> writes bom-le and then encodes as
! utf16le.
SINGLETON: utf16

utf16 "UTF-16" register-encoding
2008-07-28 23:03:13 -04:00
! Thrown by bom>le/be when the bytes it is given equal neither bom-le
! nor bom-be.
ERROR: missing-bom ;
2008-03-14 04:09:51 -04:00
<PRIVATE
2008-02-11 00:14:42 -05:00
2008-03-14 04:09:51 -04:00
! UTF-16BE decoding
2008-02-11 00:14:42 -05:00
2008-03-14 04:09:51 -04:00
! Shift the accumulated value `ch` left by eight bits and merge `byte`
! into the low bits. When `byte` is f (callers pass the result of
! stream-read1 here) both inputs are discarded and replacement-char is
! answered instead.
: append-nums ( byte ch -- ch )
    swap [ swap 8 shift bitor ] [ drop replacement-char ] if* ;
2008-02-11 00:14:42 -05:00
2008-03-14 04:09:51 -04:00
! Read one more element from `stream` and combine it with `byte` via
! append-nums, with `byte` as the high part and the new element as the
! low part.
: double-be ( stream byte -- stream char )
    [ dup stream-read1 ] dip append-nums ;
2008-03-14 04:09:51 -04:00
! Decode the rest of a big-endian surrogate pair. `byte` holds the two
! payload bits taken from the first byte of the leading code unit.
!
! Steps: finish the leading unit with double-be, read the first byte of
! the trailing unit and check that its top six bits are 110111, then
! read the final byte and add HEX: 10000 to the assembled 20-bit value.
!
! replacement-char is answered when
!   - the stream ends before the trailing unit starts,
!   - the trailing unit does not start with 110111 (its second byte is
!     still consumed), or
!   - the stream ends before the final byte.
: quad-be ( stream byte -- stream char )
    double-be over stream-read1 [
        dup -2 shift BIN: 110111 number= [
            [ 2 shift ] dip BIN: 11 bitand bitor
            ! Only add the supplementary-plane offset when the last
            ! byte was really read.
            over stream-read1
            [ swap 8 shift bitor HEX: 10000 + ]
            [ drop replacement-char ] if*
        ] [ 2drop dup stream-read1 drop replacement-char ] if
    ] [ drop replacement-char ] if* ;
2008-03-14 04:09:51 -04:00
! Consume one element from `stream`, throw it away, and answer
! replacement-char.
: ignore ( stream -- stream char )
    replacement-char over stream-read1 drop ;
2008-03-14 04:09:51 -04:00
! Dispatch on the first byte of a big-endian code unit.
!   - top five bits are not 11011: ordinary unit, finish with double-be
!   - 11011 with bit 2 clear: leading surrogate, continue with quad-be
!     on the two low payload bits
!   - 11011 with bit 2 set: skip one more byte and answer
!     replacement-char (see ignore)
: begin-utf16be ( stream byte -- stream char )
    {
        { [ dup -3 shift BIN: 11011 = not ] [ double-be ] }
        { [ dup BIN: 100 bitand zero? ] [ BIN: 11 bitand quad-be ] }
        [ drop ignore ]
    } cond ;
2008-03-18 17:01:14 -04:00
! Decode one character from a big-endian UTF-16 byte stream. Answers f
! when the first stream-read1 yields f; otherwise hands the byte to
! begin-utf16be.
M: utf16be decode-char ( stream encoding -- char/f )
    drop dup stream-read1 dup [ begin-utf16be ] when nip ;
! UTF-16LE decoding
2008-03-14 04:09:51 -04:00
! Decode the trailing half of a little-endian surrogate pair. `ch` is
! the 10-bit payload of the leading surrogate.
!
! Reads the low byte and then the high byte of the trailing code unit.
! If the high byte's top six bits are 110111, its two low bits and the
! low byte are merged under `ch` and HEX: 10000 is added.
!
! replacement-char is answered when either read yields f or the high
! byte does not have the 110111 prefix.
: quad-le ( stream ch -- stream char )
    over stream-read1 [
        swap 10 shift bitor
        over stream-read1 [
            dup -2 shift BIN: 110111 = [
                ! The accumulated value is always a number here, so the
                ! f check in append-nums is unnecessary.
                BIN: 11 bitand 8 shift bitor HEX: 10000 +
            ] [ 2drop replacement-char ] if
        ] [ drop replacement-char ] if*
    ] [ drop replacement-char ] if* ;
2008-02-11 00:14:42 -05:00
2008-03-14 04:09:51 -04:00
! Finish a little-endian code unit from its low byte `byte1` and high
! byte `byte2`.
!   - byte2's top five bits are not 11011: combine the two bytes with
!     append-nums
!   - 11011 with bit 2 clear: leading surrogate; build its 10-bit
!     payload and continue with quad-le
!   - 11011 with bit 2 set: answer replacement-char
: double-le ( stream byte1 byte2 -- stream char )
    dup -3 shift BIN: 11011 = [
        dup BIN: 100 bitand 0 number=
        [ BIN: 11 bitand 8 shift bitor quad-le ]
        [ 2drop replacement-char ] if
    ] [ append-nums ] if ;
2008-02-11 00:14:42 -05:00
2008-03-14 04:09:51 -04:00
! Given the first byte of a little-endian code unit, read the second
! byte and continue with double-le. If that read yields f, answer
! replacement-char.
: begin-utf16le ( stream byte -- stream char )
    over stream-read1 [ double-le ] [ drop replacement-char ] if* ;
2008-03-18 17:01:14 -04:00
! Decode one character from a little-endian UTF-16 byte stream. Answers
! f when the first stream-read1 yields f; otherwise hands the byte to
! begin-utf16le.
M: utf16le decode-char ( stream encoding -- char/f )
    drop dup stream-read1 dup [ begin-utf16le ] when nip ;
! UTF-16LE/BE encoding
2008-02-11 00:14:42 -05:00
2008-03-18 17:01:14 -04:00
! Bytes of the leading surrogate. `char` is the code point with
! HEX: 10000 already subtracted (see char>utf16be / char>utf16le).
! byte1 is the high byte (110110 prefix plus the top two payload bits),
! byte2 is the low byte.
: encode-first ( char -- byte1 byte2 )
    -10 shift
    dup -8 shift BIN: 11011000 bitor
    swap HEX: FF bitand ;
2008-03-18 17:01:14 -04:00
! Bytes of the trailing surrogate, built from the low ten bits of
! `char`. byte3 is the high byte (110111 prefix plus the top two
! payload bits), byte4 is the low byte.
: encode-second ( char -- byte3 byte4 )
    BIN: 1111111111 bitand
    dup -8 shift BIN: 11011100 bitor
    swap BIN: 11111111 bitand ;
2008-03-14 04:09:51 -04:00
! Write `char1` and then `char2` to `stream`, one stream-write1 call
! each.
: stream-write2 ( stream char1 char2 -- )
    rot [ stream-write1 ] curry bi@ ;
2008-03-14 04:09:51 -04:00
! Write `char` to `stream` as big-endian UTF-16.
! Code points above HEX: FFFF are reduced by HEX: 10000 and written as
! two surrogates, each high byte first. Everything else is written as a
! single unit; the h>b/b result is swapped before writing (presumably
! h>b/b leaves the low byte first -- confirm against io.binary).
: char>utf16be ( stream char -- )
    dup HEX: FFFF > [
        HEX: 10000 -
        2dup encode-first stream-write2
        encode-second stream-write2
    ] [ h>b/b swap stream-write2 ] if ;
2008-02-11 00:14:42 -05:00
2008-03-14 04:09:51 -04:00
! Encode one character as big-endian UTF-16: drop the encoding
! descriptor, reorder to ( stream char ) and delegate to char>utf16be.
M: utf16be encode-char ( char stream encoding -- )
    drop swap char>utf16be ;
2008-02-11 00:14:42 -05:00
2008-03-18 17:01:14 -04:00
! Write `char` to `stream` as little-endian UTF-16.
! The stream is below the character on the stack, the same order as
! char>utf16be; the utf16le encode-char method swaps its inputs before
! calling this word.
! Code points above HEX: FFFF are reduced by HEX: 10000 and written as
! two surrogates, each with its bytes swapped so the low byte goes
! first. Everything else is written as the two bytes from h>b/b in the
! order that word leaves them.
: char>utf16le ( stream char -- )
    dup HEX: FFFF > [
        HEX: 10000 -
        2dup encode-first swap stream-write2
        encode-second swap stream-write2
    ] [ h>b/b stream-write2 ] if ;
2008-02-11 00:14:42 -05:00
2008-03-18 17:01:14 -04:00
! Encode one character as little-endian UTF-16: bring the character to
! the top, discard the encoding descriptor and delegate to
! char>utf16le with ( stream char ) on the stack.
M: utf16le encode-char ( char stream encoding -- )
    rot nip char>utf16le ;
! UTF-16
! Two-byte mark that bom>le/be maps to utf16le; the utf16 <encoder>
! writes it at the start of its output.
CONSTANT: bom-le B{ HEX: ff HEX: fe }
2008-02-11 00:14:42 -05:00
! Two-byte mark that bom>le/be maps to utf16be.
CONSTANT: bom-be B{ HEX: fe HEX: ff }
2008-02-11 00:14:42 -05:00
2008-02-15 20:44:35 -05:00
! Map a byte-order mark to the matching encoding descriptor:
! bom-le gives utf16le, bom-be gives utf16be, anything else throws
! missing-bom.
: bom>le/be ( bom -- le/be )
    dup bom-le sequence= [ drop utf16le ] [
        bom-be sequence= [ utf16be ] [ missing-bom ] if
    ] if ;
2008-03-14 04:09:51 -04:00
! Build a decoder for BOM-prefixed UTF-16: read two bytes from the
! stream, choose utf16le or utf16be with bom>le/be (which throws
! missing-bom on anything else), and create that encoding's decoder on
! the same stream.
M: utf16 <decoder> ( stream utf16 -- decoder )
    drop 2 over stream-read bom>le/be <decoder> ;
2008-03-14 04:09:51 -04:00
! Build an encoder for BOM-prefixed UTF-16: write bom-le to the stream
! first, then create a utf16le encoder on that stream.
M: utf16 <encoder> ( stream utf16 -- encoder )
    drop dup bom-le swap stream-write utf16le <encoder> ;
PRIVATE>