From c66b264af57cf61a2d37e34eb93a91d2e10f80b1 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Fri, 1 Feb 2008 17:45:35 -0600 Subject: [PATCH 1/3] Incomplete update of UTF decoder --- core/io/encodings/encodings.factor | 7 +++++-- core/io/utf8/utf8.factor | 18 +++++++++--------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index 5bc679cd27..956c512780 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2006, 2007 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: math kernel sequences sbufs vectors -namespaces ; +namespaces unicode.syntax ; IN: io.encodings TUPLE: encode-error ; @@ -10,13 +10,16 @@ TUPLE: encode-error ; TUPLE: decode-error ; -: decode-error ( -- * ) \ decode-error construct-empty throw ; +: decode-error ( -- * ) \ encode-error construct-empty throw ; SYMBOL: begin : decoded ( buf ch -- buf ch state ) over push 0 begin ; +: push-replacement ( buf -- buf ch state ) + UNICHAR: replacement-character decoded ; + : finish-decoding ( buf ch state -- str ) begin eq? [ decode-error ] unless drop "" like ; diff --git a/core/io/utf8/utf8.factor b/core/io/utf8/utf8.factor index 0269e20e93..321469378d 100644 --- a/core/io/utf8/utf8.factor +++ b/core/io/utf8/utf8.factor @@ -14,10 +14,10 @@ SYMBOL: quad3 : starts-2? ( char -- ? ) -6 shift BIN: 10 number= ; -: append-nums ( bottom top -- num ) - over starts-2? - [ 6 shift swap BIN: 111111 bitand bitor ] - [ decode-error ] if ; +: append-nums ( buf bottom top state-out -- buf num state ) + >r over starts-2? + [ 6 shift swap BIN: 111111 bitand bitor r> ] + [ r> 3drop push-replacement ] if ; : begin-utf8 ( buf byte -- buf ch state ) { @@ -25,20 +25,20 @@ SYMBOL: quad3 { [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] } { [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] } { [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] } - { [ t ] [ decode-error ] } + { [ t ] [ drop push-replacement ] } } cond ; : end-multibyte ( buf byte ch -- buf ch state ) - append-nums decoded ; + begin append-nums decoded ; : (decode-utf8) ( buf byte ch state -- buf ch state ) { { begin [ drop begin-utf8 ] } { double [ end-multibyte ] } - { triple [ append-nums triple2 ] } + { triple [ triple2 append-nums ] } { triple2 [ end-multibyte ] } - { quad [ append-nums quad2 ] } - { quad2 [ append-nums quad3 ] } + { quad [ quad2 append-nums ] } + { quad2 [ quad3 append-nums ] } { quad3 [ end-multibyte ] } } case ; From 64650d8500e99b88fcbb19570537a2232fab77da Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Fri, 1 Feb 2008 22:50:30 -0600 Subject: [PATCH 2/3] Fixing UTF-8 to put the replacement character for malformed stuff --- core/io/utf8/utf8-tests.factor | 12 ++++++------ core/io/utf8/utf8.factor | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/core/io/utf8/utf8-tests.factor b/core/io/utf8/utf8-tests.factor index d120b6243d..3576471586 100644 --- a/core/io/utf8/utf8-tests.factor +++ b/core/io/utf8/utf8-tests.factor @@ -1,16 +1,16 @@ -USING: io.utf8 tools.test strings ; +USING: io.utf8 tools.test strings arrays unicode.syntax ; -[ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 ] unit-test-fails +[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test -[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 ] unit-test +[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test [ "x" ] [ "x" decode-utf8 >string ] unit-test -[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 ] unit-test +[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test -[ { BIN: 10000000 } decode-utf8 ] unit-test-fails +[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test -[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 ] unit-test +[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test [ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ] [ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test diff --git a/core/io/utf8/utf8.factor b/core/io/utf8/utf8.factor index 321469378d..213afb6eae 100644 --- a/core/io/utf8/utf8.factor +++ b/core/io/utf8/utf8.factor @@ -29,7 +29,7 @@ SYMBOL: quad3 } cond ; : end-multibyte ( buf byte ch -- buf ch state ) - begin append-nums decoded ; + f append-nums [ decoded ] unless* ; : (decode-utf8) ( buf byte ch state -- buf ch state ) { From c584e50c04dc4ce9d5e78988dc6f07e5839b6fe8 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Fri, 1 Feb 2008 23:59:46 -0600 Subject: [PATCH 3/3] Finishing updating UTF --- core/io/encodings/encodings.factor | 2 +- core/io/utf16/utf16-tests.factor | 14 +++++++------- core/io/utf16/utf16.factor | 18 +++++++++++------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/core/io/encodings/encodings.factor b/core/io/encodings/encodings.factor index 956c512780..767e9b266b 100755 --- a/core/io/encodings/encodings.factor +++ b/core/io/encodings/encodings.factor @@ -10,7 +10,7 @@ TUPLE: encode-error ; TUPLE: decode-error ; -: decode-error ( -- * ) \ encode-error construct-empty throw ; +: decode-error ( -- * ) \ decode-error construct-empty throw ; SYMBOL: begin diff --git a/core/io/utf16/utf16-tests.factor b/core/io/utf16/utf16-tests.factor index 7a4b766941..9800a9827d 100755 --- a/core/io/utf16/utf16-tests.factor +++ b/core/io/utf16/utf16-tests.factor @@ -1,15 +1,15 @@ -USING: tools.test io.utf16 ; +USING: tools.test io.utf16 arrays unicode.syntax ; [ { CHAR: x } ] [ { 0 CHAR: x } decode-utf16be >array ] unit-test [ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test -[ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test-fails -[ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test-fails +[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test +[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test -[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be >array ] unit-test +[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test [ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test [ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test -[ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test-fails -[ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test-fails +[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test +[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test -[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le >array ] unit-test +[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test diff --git a/core/io/utf16/utf16.factor b/core/io/utf16/utf16.factor index d6b160e156..19ebc1d43a 100755 --- a/core/io/utf16/utf16.factor +++ b/core/io/utf16/utf16.factor @@ -8,6 +8,9 @@ SYMBOL: double SYMBOL: quad1 SYMBOL: quad2 SYMBOL: quad3 +SYMBOL: ignore + +: do-ignore ( -- ch state ) 0 ignore ; : append-nums ( byte ch -- ch ) 8 shift bitor ; @@ -19,21 +22,22 @@ SYMBOL: quad3 dup -3 shift BIN: 11011 number= [ dup BIN: 00000100 bitand zero? [ BIN: 11 bitand quad1 ] - [ decode-error ] if + [ drop do-ignore ] if ] [ double ] if ; -: handle-quad2be ( byte ch -- ch ) +: handle-quad2be ( byte ch -- ch state ) swap dup -2 shift BIN: 110111 number= [ - >r 2 shift r> BIN: 11 bitand bitor - ] [ decode-error ] if ; + >r 2 shift r> BIN: 11 bitand bitor quad3 + ] [ 2drop do-ignore ] if ; : (decode-utf16be) ( buf byte ch state -- buf ch state ) { { begin [ drop begin-utf16be ] } { double [ end-multibyte ] } { quad1 [ append-nums quad2 ] } - { quad2 [ handle-quad2be quad3 ] } + { quad2 [ handle-quad2be ] } { quad3 [ append-nums HEX: 10000 + decoded ] } + { ignore [ 2drop push-replacement ] } } case ; : decode-utf16be ( seq -- str ) @@ -43,13 +47,13 @@ SYMBOL: quad3 swap dup -3 shift BIN: 11011 = [ dup BIN: 100 bitand 0 number= [ BIN: 11 bitand 8 shift bitor quad2 ] - [ decode-error ] if + [ 2drop push-replacement ] if ] [ end-multibyte ] if ; : handle-quad3le ( buf byte ch -- buf ch state ) swap dup -2 shift BIN: 110111 = [ BIN: 11 bitand append-nums HEX: 10000 + decoded - ] [ decode-error ] if ; + ] [ 2drop push-replacement ] if ; : (decode-utf16le) ( buf byte ch state -- buf ch state ) {