From c66b264af57cf61a2d37e34eb93a91d2e10f80b1 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Fri, 1 Feb 2008 17:45:35 -0600 Subject: [PATCH] Incomplete update of UTF decoder --- core/io/encodings/encodings.factor | 7 +++++-- core/io/utf8/utf8.factor | 18 +++++++++--------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index 5bc679cd27..956c512780 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: math kernel sequences sbufs vectors -namespaces ; +namespaces unicode.syntax ; IN: io.encodings TUPLE: encode-error ; @@ -10,13 +10,16 @@ TUPLE: encode-error ; TUPLE: decode-error ; -: decode-error ( -- * ) \ decode-error construct-empty throw ; +: decode-error ( -- * ) \ encode-error construct-empty throw ; SYMBOL: begin : decoded ( buf ch -- buf ch state ) over push 0 begin ; +: push-replacement ( buf -- buf ch state ) + UNICHAR: replacement-character decoded ; + : finish-decoding ( buf ch state -- str ) begin eq? [ decode-error ] unless drop "" like ; diff --git a/core/io/utf8/utf8.factor b/core/io/utf8/utf8.factor index 0269e20e93..321469378d 100644 --- a/core/io/utf8/utf8.factor +++ b/core/io/utf8/utf8.factor @@ -14,10 +14,10 @@ SYMBOL: quad3 : starts-2? ( char -- ? ) -6 shift BIN: 10 number= ; -: append-nums ( bottom top -- num ) - over starts-2? - [ 6 shift swap BIN: 111111 bitand bitor ] - [ decode-error ] if ; +: append-nums ( buf bottom top state-out -- buf num state ) + >r over starts-2? + [ 6 shift swap BIN: 111111 bitand bitor r> ] + [ r> 3drop push-replacement ] if ; : begin-utf8 ( buf byte -- buf ch state ) { @@ -25,20 +25,20 @@ SYMBOL: quad3 { [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] } { [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] } { [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] } - { [ t ] [ decode-error ] } + { [ t ] [ drop push-replacement ] } } cond ; : end-multibyte ( buf byte ch -- buf ch state ) - append-nums decoded ; + begin append-nums decoded ; : (decode-utf8) ( buf byte ch state -- buf ch state ) { { begin [ drop begin-utf8 ] } { double [ end-multibyte ] } - { triple [ append-nums triple2 ] } + { triple [ triple2 append-nums ] } { triple2 [ end-multibyte ] } - { quad [ append-nums quad2 ] } - { quad2 [ append-nums quad3 ] } + { quad [ quad2 append-nums ] } + { quad2 [ quad3 append-nums ] } { quad3 [ end-multibyte ] } } case ;