73 lines
		
	
	
		
			1.9 KiB
		
	
	
	
		
			Factor
		
	
	
		
		
			
		
	
	
			73 lines
		
	
	
		
			1.9 KiB
		
	
	
	
		
			Factor
		
	
	
| 
								 | 
							
								! Copyright (C) 2006, 2007 Daniel Ehrenberg.
							 | 
						||
| 
								 | 
							
								! See http://factorcode.org/license.txt for BSD license.
							 | 
						||
| 
								 | 
							
								USING: math kernel sequences sbufs vectors
							 | 
						||
| 
								 | 
							
								namespaces io.encodings combinators ;
							 | 
						||
| 
								 | 
							
								IN: io.utf8
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								SYMBOL: double
							 | 
						||
| 
								 | 
							
								SYMBOL: triple
							 | 
						||
| 
								 | 
							
								SYMBOL: triple2
							 | 
						||
| 
								 | 
							
								SYMBOL: quad
							 | 
						||
| 
								 | 
							
								SYMBOL: quad2
							 | 
						||
| 
								 | 
							
								SYMBOL: quad3
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								: starts-2? ( char -- ? )
							 | 
						||
| 
								 | 
							
								    -6 shift BIN: 10 number= ;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								: append-nums ( bottom top -- num )
							 | 
						||
| 
								 | 
							
								    over starts-2?
							 | 
						||
| 
								 | 
							
								    [ 6 shift swap BIN: 111111 bitand bitor ]
							 | 
						||
| 
								 | 
							
								    [ decode-error ] if ;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								: begin-utf8 ( buf byte -- buf ch state )
							 | 
						||
| 
								 | 
							
								    {
							 | 
						||
| 
								 | 
							
								        { [ dup -7 shift zero? ] [ decoded ] }
							 | 
						||
| 
								 | 
							
								        { [ dup -5 shift BIN: 110 number= ] [ BIN: 11111 bitand double ] }
							 | 
						||
| 
								 | 
							
								        { [ dup -4 shift BIN: 1110 number= ] [ BIN: 1111 bitand triple ] }
							 | 
						||
| 
								 | 
							
								        { [ dup -3 shift BIN: 11110 number= ] [ BIN: 111 bitand quad ] }
							 | 
						||
| 
								 | 
							
								        { [ t ] [ decode-error ] }
							 | 
						||
| 
								 | 
							
								    } cond ;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								: end-multibyte ( buf byte ch -- buf ch state )
							 | 
						||
| 
								 | 
							
								    append-nums decoded ;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								: (decode-utf8) ( buf byte ch state -- buf ch state )
							 | 
						||
| 
								 | 
							
								    {
							 | 
						||
| 
								 | 
							
								        { begin [ drop begin-utf8 ] }
							 | 
						||
| 
								 | 
							
								        { double [ end-multibyte ] }
							 | 
						||
| 
								 | 
							
								        { triple [ append-nums triple2 ] }
							 | 
						||
| 
								 | 
							
								        { triple2 [ end-multibyte ] }
							 | 
						||
| 
								 | 
							
								        { quad [ append-nums quad2 ] }
							 | 
						||
| 
								 | 
							
								        { quad2 [ append-nums quad3 ] }
							 | 
						||
| 
								 | 
							
								        { quad3 [ end-multibyte ] }
							 | 
						||
| 
								 | 
							
								    } case ;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								: decode-utf8 ( seq -- str )
							 | 
						||
| 
								 | 
							
								    [ -rot (decode-utf8) ] decode ;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								: encoded ( char -- )
							 | 
						||
| 
								 | 
							
								    BIN: 111111 bitand BIN: 10000000 bitor , ;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								: char>utf8 ( char -- )
							 | 
						||
| 
								 | 
							
								    {
							 | 
						||
| 
								 | 
							
								        { [ dup -7 shift zero? ] [ , ] }
							 | 
						||
| 
								 | 
							
								        { [ dup -11 shift zero? ] [
							 | 
						||
| 
								 | 
							
								            dup -6 shift BIN: 11000000 bitor ,
							 | 
						||
| 
								 | 
							
								            encoded
							 | 
						||
| 
								 | 
							
								        ] }
							 | 
						||
| 
								 | 
							
								        { [ dup -16 shift zero? ] [
							 | 
						||
| 
								 | 
							
								            dup -12 shift BIN: 11100000 bitor ,
							 | 
						||
| 
								 | 
							
								            dup -6 shift encoded
							 | 
						||
| 
								 | 
							
								            encoded
							 | 
						||
| 
								 | 
							
								        ] }
							 | 
						||
| 
								 | 
							
								        { [ t ] [
							 | 
						||
| 
								 | 
							
								            dup -18 shift BIN: 11110000 bitor ,
							 | 
						||
| 
								 | 
							
								            dup -12 shift encoded
							 | 
						||
| 
								 | 
							
								            dup -6 shift encoded
							 | 
						||
| 
								 | 
							
								            encoded
							 | 
						||
| 
								 | 
							
								        ] }
							 | 
						||
| 
								 | 
							
								    } cond ;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								: encode-utf8 ( str -- seq )
							 | 
						||
| 
								 | 
							
								    [ [ char>utf8 ] each ] B{ } make ;
							 |