From c584e50c04dc4ce9d5e78988dc6f07e5839b6fe8 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Fri, 1 Feb 2008 23:59:46 -0600 Subject: [PATCH] Finishing updating UTF --- core/io/encodings/encodings.factor | 2 +- core/io/utf16/utf16-tests.factor | 14 +++++++------- core/io/utf16/utf16.factor | 18 +++++++++++------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index 956c512780..767e9b266b 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -10,7 +10,7 @@ TUPLE: encode-error ; TUPLE: decode-error ; -: decode-error ( -- * ) \ encode-error construct-empty throw ; +: decode-error ( -- * ) \ decode-error construct-empty throw ; SYMBOL: begin diff --git a/core/io/utf16/utf16-tests.factor b/core/io/utf16/utf16-tests.factor index 7a4b766941..9800a9827d 100755 --- a/core/io/utf16/utf16-tests.factor +++ b/core/io/utf16/utf16-tests.factor @@ -1,15 +1,15 @@ -USING: tools.test io.utf16 ; +USING: tools.test io.utf16 arrays unicode.syntax ; [ { CHAR: x } ] [ { 0 CHAR: x } decode-utf16be >array ] unit-test [ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test -[ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test-fails -[ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test-fails +[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test +[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test -[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be >array ] unit-test +[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test [ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test [ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test -[ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test-fails -[ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test-fails +[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test +[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test -[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le >array ] unit-test +[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test diff --git a/core/io/utf16/utf16.factor b/core/io/utf16/utf16.factor index d6b160e156..19ebc1d43a 100755 --- a/core/io/utf16/utf16.factor +++ b/core/io/utf16/utf16.factor @@ -8,6 +8,9 @@ SYMBOL: double SYMBOL: quad1 SYMBOL: quad2 SYMBOL: quad3 +SYMBOL: ignore + +: do-ignore ( -- ch state ) 0 ignore ; : append-nums ( byte ch -- ch ) 8 shift bitor ; @@ -19,21 +22,22 @@ SYMBOL: quad3 dup -3 shift BIN: 11011 number= [ dup BIN: 00000100 bitand zero? [ BIN: 11 bitand quad1 ] - [ decode-error ] if + [ drop do-ignore ] if ] [ double ] if ; -: handle-quad2be ( byte ch -- ch ) +: handle-quad2be ( byte ch -- ch state ) swap dup -2 shift BIN: 110111 number= [ - >r 2 shift r> BIN: 11 bitand bitor - ] [ decode-error ] if ; + >r 2 shift r> BIN: 11 bitand bitor quad3 + ] [ 2drop do-ignore ] if ; : (decode-utf16be) ( buf byte ch state -- buf ch state ) { { begin [ drop begin-utf16be ] } { double [ end-multibyte ] } { quad1 [ append-nums quad2 ] } - { quad2 [ handle-quad2be quad3 ] } + { quad2 [ handle-quad2be ] } { quad3 [ append-nums HEX: 10000 + decoded ] } + { ignore [ 2drop push-replacement ] } } case ; : decode-utf16be ( seq -- str ) @@ -43,13 +47,13 @@ SYMBOL: quad3 swap dup -3 shift BIN: 11011 = [ dup BIN: 100 bitand 0 number= [ BIN: 11 bitand 8 shift bitor quad2 ] - [ decode-error ] if + [ 2drop push-replacement ] if ] [ end-multibyte ] if ; : handle-quad3le ( buf byte ch -- buf ch state ) swap dup -2 shift BIN: 110111 = [ BIN: 11 bitand append-nums HEX: 10000 + decoded - ] [ decode-error ] if ; + ] [ 2drop push-replacement ] if ; : (decode-utf16le) ( buf byte ch state -- buf ch state ) {