factor/core/io/utf8/utf8.factor

73 lines
1.9 KiB
Factor

! Copyright (C) 2006, 2007 Daniel Ehrenberg.
! See http://factorcode.org/license.txt for BSD license.
USING: math kernel sequences sbufs vectors
namespaces io.encodings combinators ;
IN: io.utf8
SYMBOL: double
SYMBOL: triple
SYMBOL: triple2
SYMBOL: quad
SYMBOL: quad2
SYMBOL: quad3
: starts-2? ( char -- ? )
-6 shift BIN: 10 number= ;
: append-nums ( bottom top -- num )
over starts-2?
[ 6 shift swap BIN: 111111 bitand bitor ]
[ decode-error ] if ;
: begin-utf8 ( buf byte -- buf ch state )
{
{ [ dup -7 shift zero? ] [ decoded ] }
{ [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] }
{ [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] }
{ [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] }
{ [ t ] [ decode-error ] }
} cond ;
: end-multibyte ( buf byte ch -- buf ch state )
append-nums decoded ;
: (decode-utf8) ( buf byte ch state -- buf ch state )
{
{ begin [ drop begin-utf8 ] }
{ double [ end-multibyte ] }
{ triple [ append-nums triple2 ] }
{ triple2 [ end-multibyte ] }
{ quad [ append-nums quad2 ] }
{ quad2 [ append-nums quad3 ] }
{ quad3 [ end-multibyte ] }
} case ;
: decode-utf8 ( seq -- str )
[ -rot (decode-utf8) ] decode ;
: encoded ( char -- )
BIN: 111111 bitand BIN: 10000000 bitor , ;
: char>utf8 ( char -- )
{
{ [ dup -7 shift zero? ] [ , ] }
{ [ dup -11 shift zero? ] [
dup -6 shift BIN: 11000000 bitor ,
encoded
] }
{ [ dup -16 shift zero? ] [
dup -12 shift BIN: 11100000 bitor ,
dup -6 shift encoded
encoded
] }
{ [ t ] [
dup -18 shift BIN: 11110000 bitor ,
dup -12 shift encoded
dup -6 shift encoded
encoded
] }
} cond ;
: encode-utf8 ( str -- seq )
[ [ char>utf8 ] each ] B{ } make ;