73 lines
		
	
	
		
			1.9 KiB
		
	
	
	
		
			Factor
		
	
	
		
		
			
		
	
	
			73 lines
		
	
	
		
			1.9 KiB
		
	
	
	
		
			Factor
		
	
	
|  | ! Copyright (C) 2006, 2007 Daniel Ehrenberg. | ||
|  | ! See http://factorcode.org/license.txt for BSD license. | ||
|  | USING: math kernel sequences sbufs vectors | ||
|  | namespaces io.encodings combinators ;
 | ||
|  | IN: io.utf8 | ||
|  | 
 | ||
|  | SYMBOL: double | ||
|  | SYMBOL: triple | ||
|  | SYMBOL: triple2 | ||
|  | SYMBOL: quad | ||
|  | SYMBOL: quad2 | ||
|  | SYMBOL: quad3 | ||
|  | 
 | ||
|  | : starts-2? ( char -- ? )
 | ||
|  |     -6 shift BIN: 10 number= ;
 | ||
|  | 
 | ||
|  | : append-nums ( bottom top -- num )
 | ||
|  |     over starts-2? | ||
|  |     [ 6 shift swap BIN: 111111 bitand bitor ] | ||
|  |     [ decode-error ] if ;
 | ||
|  | 
 | ||
|  | : begin-utf8 ( buf byte -- buf ch state )
 | ||
|  |     { | ||
|  |         { [ dup -7 shift zero? ] [ decoded ] } | ||
|  |         { [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] } | ||
|  |         { [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] } | ||
|  |         { [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] } | ||
|  |         { [ t ] [ decode-error ] } | ||
|  |     } cond ;
 | ||
|  | 
 | ||
|  | : end-multibyte ( buf byte ch -- buf ch state )
 | ||
|  |     append-nums decoded ;
 | ||
|  | 
 | ||
|  | : (decode-utf8) ( buf byte ch state -- buf ch state )
 | ||
|  |     { | ||
|  |         { begin [ drop begin-utf8 ] } | ||
|  |         { double [ end-multibyte ] } | ||
|  |         { triple [ append-nums triple2 ] } | ||
|  |         { triple2 [ end-multibyte ] } | ||
|  |         { quad [ append-nums quad2 ] } | ||
|  |         { quad2 [ append-nums quad3 ] } | ||
|  |         { quad3 [ end-multibyte ] } | ||
|  |     } case ;
 | ||
|  | 
 | ||
|  | : decode-utf8 ( seq -- str )
 | ||
|  |     [ -rot (decode-utf8) ] decode ;
 | ||
|  | 
 | ||
|  | : encoded ( char -- )
 | ||
|  |     BIN: 111111 bitand BIN: 10000000 bitor , ;
 | ||
|  | 
 | ||
|  | : char>utf8 ( char -- )
 | ||
|  |     { | ||
|  |         { [ dup -7 shift zero? ] [ , ] } | ||
|  |         { [ dup -11 shift zero? ] [ | ||
|  |             dup -6 shift BIN: 11000000 bitor , | ||
|  |             encoded | ||
|  |         ] } | ||
|  |         { [ dup -16 shift zero? ] [ | ||
|  |             dup -12 shift BIN: 11100000 bitor , | ||
|  |             dup -6 shift encoded | ||
|  |             encoded | ||
|  |         ] } | ||
|  |         { [ t ] [ | ||
|  |             dup -18 shift BIN: 11110000 bitor , | ||
|  |             dup -12 shift encoded | ||
|  |             dup -6 shift encoded | ||
|  |             encoded | ||
|  |         ] } | ||
|  |     } cond ;
 | ||
|  | 
 | ||
|  | : encode-utf8 ( str -- seq )
 | ||
|  |     [ [ char>utf8 ] each ] B{ } make ;
 |