| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  | ! Copyright (C) 2005, 2009 Daniel Ehrenberg | 
					
						
							|  |  |  | ! See http://factorcode.org/license.txt for BSD license. | 
					
						
							|  |  |  | USING: kernel namespaces xml.name io.encodings.utf8 xml.elements | 
					
						
							|  |  |  | io.encodings.utf16 xml.tokenize xml.state math ascii sequences | 
					
						
							| 
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 |  |  | io.encodings.string io.encodings combinators accessors | 
					
						
							| 
									
										
										
										
											2009-03-18 19:32:34 -04:00
										 |  |  | xml.data io.encodings.iana xml.errors ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  | IN: xml.autoencoding | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 |  |  | : decode-stream ( encoding -- )
 | 
					
						
							|  |  |  |     spot get [ swap re-decode ] change-stream drop ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  | : continue-make-tag ( str -- tag )
 | 
					
						
							|  |  |  |     parse-name-starting middle-tag end-tag ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : start-utf16le ( -- tag )
 | 
					
						
							| 
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 |  |  |     utf16le decode-stream | 
					
						
							| 
									
										
										
										
											2009-01-23 16:29:28 -05:00
										 |  |  |     "?\0" expect | 
					
						
							|  |  |  |     check instruct ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : 10xxxxxx? ( ch -- ? )
 | 
					
						
							|  |  |  |     -6 shift 3 bitand 2 = ;
 | 
					
						
							| 
									
										
										
										
											2011-09-30 15:47:38 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  | : start<name ( ch -- tag )
 | 
					
						
							| 
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 |  |  |     ! This is unfortunate, and exists for the corner case | 
					
						
							|  |  |  |     ! that the first letter of the document is < and second is | 
					
						
							|  |  |  |     ! not ASCII | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  |     ascii? | 
					
						
							| 
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 |  |  |     [ utf8 decode-stream next make-tag ] [ | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  |         next | 
					
						
							| 
									
										
										
										
											2011-09-30 15:47:38 -04:00
										 |  |  |         [ drop get-next 10xxxxxx? not ] take-until | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  |         get-char suffix utf8 decode | 
					
						
							| 
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 |  |  |         utf8 decode-stream next | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  |         continue-make-tag | 
					
						
							|  |  |  |     ] if ;
 | 
					
						
							| 
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : prolog-encoding ( prolog -- )
 | 
					
						
							|  |  |  |     encoding>> dup "UTF-16" =
 | 
					
						
							| 
									
										
										
										
											2009-03-18 19:32:34 -04:00
										 |  |  |     [ drop ] [ | 
					
						
							|  |  |  |         dup name>encoding | 
					
						
							|  |  |  |         [ decode-stream ] [ bad-encoding ] ?if
 | 
					
						
							|  |  |  |     ] if ;
 | 
					
						
							| 
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : instruct-encoding ( instruct/prolog -- )
 | 
					
						
							|  |  |  |     dup prolog? | 
					
						
							|  |  |  |     [ prolog-encoding ] | 
					
						
							| 
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 |  |  |     [ drop utf8 decode-stream ] if ;
 | 
					
						
							| 
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-29 19:25:23 -05:00
										 |  |  | : go-utf8 ( -- )
 | 
					
						
							| 
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 |  |  |     check utf8 decode-stream next next ;
 | 
					
						
							| 
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  | : start< ( -- tag )
 | 
					
						
							| 
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 |  |  |     ! What if first letter of processing instruction is non-ASCII? | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  |     get-next { | 
					
						
							|  |  |  |         { 0 [ next next start-utf16le ] } | 
					
						
							| 
									
										
										
										
											2009-01-29 19:25:23 -05:00
										 |  |  |         { CHAR: ? [ go-utf8 instruct dup instruct-encoding ] } | 
					
						
							|  |  |  |         { CHAR: ! [ go-utf8 direct ] } | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  |         [ check start<name ] | 
					
						
							|  |  |  |     } case ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : skip-utf8-bom ( -- tag )
 | 
					
						
							| 
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 |  |  |     "\u0000bb\u0000bf" expect utf8 decode-stream | 
					
						
							| 
									
										
										
										
											2009-01-23 16:29:28 -05:00
										 |  |  |     "<" expect check make-tag ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : decode-expecting ( encoding string -- tag )
 | 
					
						
							| 
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 |  |  |     [ decode-stream next ] [ expect ] bi* check make-tag ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | : start-utf16be ( -- tag )
 | 
					
						
							|  |  |  |     utf16be "<" decode-expecting ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : skip-utf16le-bom ( -- tag )
 | 
					
						
							|  |  |  |     utf16le "\u0000fe<" decode-expecting ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : skip-utf16be-bom ( -- tag )
 | 
					
						
							|  |  |  |     utf16be "\u0000ff<" decode-expecting ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : start-document ( -- tag )
 | 
					
						
							|  |  |  |     get-char { | 
					
						
							|  |  |  |         { CHAR: < [ start< ] } | 
					
						
							|  |  |  |         { 0 [ start-utf16be ] } | 
					
						
							| 
									
										
										
										
											2011-11-23 21:49:33 -05:00
										 |  |  |         { 0xEF [ skip-utf8-bom ] } | 
					
						
							|  |  |  |         { 0xFF [ skip-utf16le-bom ] } | 
					
						
							|  |  |  |         { 0xFE [ skip-utf16be-bom ] } | 
					
						
							| 
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 |  |  |         [ drop utf8 decode-stream check f ] | 
					
						
							| 
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 |  |  |     } case ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  | 
 |