2009-01-21 19:16:51 -05:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								! Copyright (C) 2005, 2009 Daniel Ehrenberg
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								! See http://factorcode.org/license.txt for BSD license.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								USING: kernel namespaces xml.name io.encodings.utf8 xml.elements
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								io.encodings.utf16 xml.tokenize xml.state math ascii sequences
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								io.encodings.string io.encodings combinators accessors
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-18 19:32:34 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								xml.data io.encodings.iana xml.errors ;
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								IN: xml.autoencoding
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								: decode-stream ( encoding -- )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    spot get [ swap re-decode ] change-stream drop ;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: continue-make-tag ( str -- tag )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    parse-name-starting middle-tag end-tag ;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: start-utf16le ( -- tag )
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    utf16le decode-stream
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-23 16:29:28 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    "?\0" expect
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    check instruct ;
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: 10xxxxxx? ( ch -- ? )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    -6 shift 3 bitand 2 = ;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								          
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: start<name ( ch -- tag )
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    ! This is unfortunate, and exists for the corner case
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ! that the first letter of the document is < and second is
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ! not ASCII
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ascii?
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    [ utf8 decode-stream next make-tag ] [
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        next
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        [ get-next 10xxxxxx? not ] take-until
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        get-char suffix utf8 decode
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        utf8 decode-stream next
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        continue-make-tag
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ] if ;
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: prolog-encoding ( prolog -- )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    encoding>> dup "UTF-16" =
							 | 
						
					
						
							
								
									
										
										
										
											2009-03-18 19:32:34 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    [ drop ] [
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        dup name>encoding
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        [ decode-stream ] [ bad-encoding ] ?if
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ] if ;
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: instruct-encoding ( instruct/prolog -- )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    dup prolog?
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    [ prolog-encoding ]
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    [ drop utf8 decode-stream ] if ;
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 19:25:23 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								: go-utf8 ( -- )
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    check utf8 decode-stream next next ;
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: start< ( -- tag )
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    ! What if first letter of processing instruction is non-ASCII?
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    get-next {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        { 0 [ next next start-utf16le ] }
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 19:25:23 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        { CHAR: ? [ go-utf8 instruct dup instruct-encoding ] }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        { CHAR: ! [ go-utf8 direct ] }
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        [ check start<name ]
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    } case ;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: skip-utf8-bom ( -- tag )
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    "\u0000bb\u0000bf" expect utf8 decode-stream
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-23 16:29:28 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    "<" expect check make-tag ;
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: decode-expecting ( encoding string -- tag )
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    [ decode-stream next ] [ expect ] bi* check make-tag ;
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: start-utf16be ( -- tag )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    utf16be "<" decode-expecting ;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: skip-utf16le-bom ( -- tag )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    utf16le "\u0000fe<" decode-expecting ;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: skip-utf16be-bom ( -- tag )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    utf16be "\u0000ff<" decode-expecting ;
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								: start-document ( -- tag )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    get-char {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        { CHAR: < [ start< ] }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        { 0 [ start-utf16be ] }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        { HEX: EF [ skip-utf8-bom ] }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        { HEX: FF [ skip-utf16le-bom ] }
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        { HEX: FE [ skip-utf16be-bom ] }
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        [ drop utf8 decode-stream check f ]
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-29 17:57:13 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    } case ;
							 | 
						
					
						
							
								
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 |