Merge branch 'master' of git://littledan.onigirihouse.com/git/littledan
						commit
						10986f0c77
					
				| 
						 | 
				
			
			@ -1,7 +1,7 @@
 | 
			
		|||
! Copyright (C) 2006, 2007 Daniel Ehrenberg.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: math kernel sequences sbufs vectors
 | 
			
		||||
namespaces ;
 | 
			
		||||
namespaces unicode.syntax ;
 | 
			
		||||
IN: io.encodings
 | 
			
		||||
 | 
			
		||||
TUPLE: encode-error ;
 | 
			
		||||
| 
						 | 
				
			
			@ -17,6 +17,9 @@ SYMBOL: begin
 | 
			
		|||
: decoded ( buf ch -- buf ch state )
 | 
			
		||||
    over push 0 begin ;
 | 
			
		||||
 | 
			
		||||
: push-replacement ( buf -- buf ch state )
 | 
			
		||||
    UNICHAR: replacement-character decoded ;
 | 
			
		||||
 | 
			
		||||
: finish-decoding ( buf ch state -- str )
 | 
			
		||||
    begin eq? [ decode-error ] unless drop "" like ;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,15 +1,15 @@
 | 
			
		|||
USING: tools.test io.utf16 ;
 | 
			
		||||
USING: tools.test io.utf16 arrays unicode.syntax ;
 | 
			
		||||
 | 
			
		||||
[ { CHAR: x } ] [ { 0 CHAR: x } decode-utf16be >array ] unit-test
 | 
			
		||||
[ { HEX: 1D11E } ] [ { HEX: D8 HEX: 34 HEX: DD HEX: 1E } decode-utf16be >array ] unit-test
 | 
			
		||||
[ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test-fails
 | 
			
		||||
[ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test-fails
 | 
			
		||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11011111 CHAR: q } decode-utf16be >array ] unit-test
 | 
			
		||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11011011 CHAR: x BIN: 11011011 CHAR: x } decode-utf16be >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be >array ] unit-test
 | 
			
		||||
[ B{ 0 120 216 52 221 30 } ] [ { CHAR: x HEX: 1d11e } encode-utf16be ] unit-test
 | 
			
		||||
 | 
			
		||||
[ { CHAR: x } ] [ { CHAR: x 0 } decode-utf16le >array ] unit-test
 | 
			
		||||
[ { 119070 } ] [ { HEX: 34 HEX: D8 HEX: 1E HEX: DD } decode-utf16le >array ] unit-test
 | 
			
		||||
[ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test-fails
 | 
			
		||||
[ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test-fails
 | 
			
		||||
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011111 } decode-utf16le >array ] unit-test
 | 
			
		||||
[ { UNICHAR: replacement-character } ] [ { 0 BIN: 11011011 0 0 } decode-utf16le >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le >array ] unit-test
 | 
			
		||||
[ B{ 120 0 52 216 30 221 } ] [ { CHAR: x HEX: 1d11e } encode-utf16le ] unit-test
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -8,6 +8,9 @@ SYMBOL: double
 | 
			
		|||
SYMBOL: quad1
 | 
			
		||||
SYMBOL: quad2
 | 
			
		||||
SYMBOL: quad3
 | 
			
		||||
SYMBOL: ignore
 | 
			
		||||
 | 
			
		||||
: do-ignore ( -- ch state ) 0 ignore ;
 | 
			
		||||
 | 
			
		||||
: append-nums ( byte ch -- ch )
 | 
			
		||||
    8 shift bitor ;
 | 
			
		||||
| 
						 | 
				
			
			@ -19,21 +22,22 @@ SYMBOL: quad3
 | 
			
		|||
    dup -3 shift BIN: 11011 number= [
 | 
			
		||||
        dup BIN: 00000100 bitand zero?
 | 
			
		||||
        [ BIN: 11 bitand quad1 ]
 | 
			
		||||
        [ decode-error ] if
 | 
			
		||||
        [ drop do-ignore ] if
 | 
			
		||||
    ] [ double ] if ;
 | 
			
		||||
 | 
			
		||||
: handle-quad2be ( byte ch -- ch )
 | 
			
		||||
: handle-quad2be ( byte ch -- ch state )
 | 
			
		||||
    swap dup -2 shift BIN: 110111 number= [
 | 
			
		||||
        >r 2 shift r> BIN: 11 bitand bitor
 | 
			
		||||
    ] [ decode-error ] if ;
 | 
			
		||||
        >r 2 shift r> BIN: 11 bitand bitor quad3
 | 
			
		||||
    ] [ 2drop do-ignore ] if ;
 | 
			
		||||
 | 
			
		||||
: (decode-utf16be) ( buf byte ch state -- buf ch state )
 | 
			
		||||
    {
 | 
			
		||||
        { begin [ drop begin-utf16be ] }
 | 
			
		||||
        { double [ end-multibyte ] }
 | 
			
		||||
        { quad1 [ append-nums quad2 ] }
 | 
			
		||||
        { quad2 [ handle-quad2be quad3 ] }
 | 
			
		||||
        { quad2 [ handle-quad2be ] }
 | 
			
		||||
        { quad3 [ append-nums HEX: 10000 + decoded ] }
 | 
			
		||||
        { ignore [ 2drop push-replacement ] }
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
: decode-utf16be ( seq -- str )
 | 
			
		||||
| 
						 | 
				
			
			@ -43,13 +47,13 @@ SYMBOL: quad3
 | 
			
		|||
    swap dup -3 shift BIN: 11011 = [
 | 
			
		||||
        dup BIN: 100 bitand 0 number=
 | 
			
		||||
        [ BIN: 11 bitand 8 shift bitor quad2 ]
 | 
			
		||||
        [ decode-error ] if
 | 
			
		||||
        [ 2drop push-replacement ] if
 | 
			
		||||
    ] [ end-multibyte ] if ;
 | 
			
		||||
 | 
			
		||||
: handle-quad3le ( buf byte ch -- buf ch state )
 | 
			
		||||
    swap dup -2 shift BIN: 110111 = [
 | 
			
		||||
        BIN: 11 bitand append-nums HEX: 10000 + decoded
 | 
			
		||||
    ] [ decode-error ] if ;
 | 
			
		||||
    ] [ 2drop push-replacement ] if ;
 | 
			
		||||
 | 
			
		||||
: (decode-utf16le) ( buf byte ch state -- buf ch state )
 | 
			
		||||
    {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,16 +1,16 @@
 | 
			
		|||
USING: io.utf8 tools.test strings ;
 | 
			
		||||
USING: io.utf8 tools.test strings arrays unicode.syntax ;
 | 
			
		||||
 | 
			
		||||
[ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 ] unit-test-fails
 | 
			
		||||
[ { UNICHAR: replacement-character } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 11111111 } decode-utf8 >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 ] unit-test
 | 
			
		||||
[ { BIN: 101111111000000111111 } ] [ { BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ "x" ] [ "x" decode-utf8 >string ] unit-test
 | 
			
		||||
 | 
			
		||||
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 ] unit-test
 | 
			
		||||
[ { BIN: 11111000000 } ] [ { BIN: 11011111 BIN: 10000000 } decode-utf8 >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ { BIN: 10000000 } decode-utf8 ] unit-test-fails
 | 
			
		||||
[ { UNICHAR: replacement-character } ] [ { BIN: 10000000 } decode-utf8 >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 ] unit-test
 | 
			
		||||
[ { BIN: 1111000000111111 } ] [ { BIN: 11101111 BIN: 10000000 BIN: 10111111 } decode-utf8 >array ] unit-test
 | 
			
		||||
 | 
			
		||||
[ B{ BIN: 11110101 BIN: 10111111 BIN: 10000000 BIN: 10111111 BIN: 11101111 BIN: 10000000 BIN: 10111111 BIN: 11011111 BIN: 10000000 CHAR: x } ]
 | 
			
		||||
[ { BIN: 101111111000000111111 BIN: 1111000000111111 BIN: 11111000000 CHAR: x } encode-utf8 ] unit-test
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -14,10 +14,10 @@ SYMBOL: quad3
 | 
			
		|||
: starts-2? ( char -- ? )
 | 
			
		||||
    -6 shift BIN: 10 number= ;
 | 
			
		||||
 | 
			
		||||
: append-nums ( bottom top -- num )
 | 
			
		||||
    over starts-2?
 | 
			
		||||
    [ 6 shift swap BIN: 111111 bitand bitor ]
 | 
			
		||||
    [ decode-error ] if ;
 | 
			
		||||
: append-nums ( buf bottom top state-out -- buf num state )
 | 
			
		||||
    >r over starts-2?
 | 
			
		||||
    [ 6 shift swap BIN: 111111 bitand bitor r> ]
 | 
			
		||||
    [ r> 3drop push-replacement ] if ;
 | 
			
		||||
 | 
			
		||||
: begin-utf8 ( buf byte -- buf ch state )
 | 
			
		||||
    {
 | 
			
		||||
| 
						 | 
				
			
			@ -25,20 +25,20 @@ SYMBOL: quad3
 | 
			
		|||
        { [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] }
 | 
			
		||||
        { [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] }
 | 
			
		||||
        { [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] }
 | 
			
		||||
        { [ t ] [ decode-error ] }
 | 
			
		||||
        { [ t ] [ drop push-replacement ] }
 | 
			
		||||
    } cond ;
 | 
			
		||||
 | 
			
		||||
: end-multibyte ( buf byte ch -- buf ch state )
 | 
			
		||||
    append-nums decoded ;
 | 
			
		||||
    f append-nums [ decoded ] unless* ;
 | 
			
		||||
 | 
			
		||||
: (decode-utf8) ( buf byte ch state -- buf ch state )
 | 
			
		||||
    {
 | 
			
		||||
        { begin [ drop begin-utf8 ] }
 | 
			
		||||
        { double [ end-multibyte ] }
 | 
			
		||||
        { triple [ append-nums triple2 ] }
 | 
			
		||||
        { triple [ triple2 append-nums ] }
 | 
			
		||||
        { triple2 [ end-multibyte ] }
 | 
			
		||||
        { quad [ append-nums quad2 ] }
 | 
			
		||||
        { quad2 [ append-nums quad3 ] }
 | 
			
		||||
        { quad [ quad2 append-nums ] }
 | 
			
		||||
        { quad2 [ quad3 append-nums ] }
 | 
			
		||||
        { quad3 [ end-multibyte ] }
 | 
			
		||||
    } case ;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue