diff --git a/basis/xml/tests/encodings.factor b/basis/xml/tests/encodings.factor
index 92fea88b4a..720b04ca42 100644
--- a/basis/xml/tests/encodings.factor
+++ b/basis/xml/tests/encodings.factor
@@ -1,7 +1,8 @@
-USING: xml xml.data xml.utilities tools.test ;
+USING: xml xml.data xml.utilities tools.test accessors kernel ;
[ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/spaces.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf8.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be.xml" file>xml children>string ] unit-test
@@ -10,3 +11,4 @@ USING: xml xml.data xml.utilities tools.test ;
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le-bom.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/prologless.xml" file>xml children>string ] unit-test
[ "e" ] [ "resource:basis/xml/tests/ascii.xml" file>xml children>string ] unit-test
+[ "\u0000e9" "x" ] [ "resource:basis/xml/tests/unitag.xml" file>xml [ name>> main>> ] [ children>string ] bi ] unit-test
\ No newline at end of file
diff --git a/basis/xml/tests/spaces.xml b/basis/xml/tests/spaces.xml
new file mode 100644
index 0000000000..dd194ff218
--- /dev/null
+++ b/basis/xml/tests/spaces.xml
@@ -0,0 +1,3 @@
+
+
+é
diff --git a/basis/xml/tests/unitag.xml b/basis/xml/tests/unitag.xml
new file mode 100644
index 0000000000..b7ef6ade3c
--- /dev/null
+++ b/basis/xml/tests/unitag.xml
@@ -0,0 +1 @@
+<é>xé>
\ No newline at end of file
diff --git a/basis/xml/tokenize/tokenize.factor b/basis/xml/tokenize/tokenize.factor
index 26b04310d6..f2bb5a2e97 100644
--- a/basis/xml/tokenize/tokenize.factor
+++ b/basis/xml/tokenize/tokenize.factor
@@ -1,9 +1,10 @@
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays ascii assocs combinators fry io.encodings
-io.encodings.iana io.encodings.utf16 io.encodings.utf8 kernel
-make math.parser namespaces sequences sets splitting state-parser
-xml.char-classes xml.data xml.entities xml.errors strings ;
+USING: accessors arrays ascii assocs combinators
+combinators.short-circuit fry io.encodings io.encodings.iana
+io.encodings.string io.encodings.utf16 io.encodings.utf8 kernel make
+math math.parser namespaces sequences sets splitting state-parser
+strings xml.char-classes xml.data xml.entities xml.errors ;
IN: xml.tokenize
! XML namespace processing: ns = namespace
@@ -53,17 +54,23 @@ SYMBOL: ns-stack
! version=1.0? is calculated once and passed around for efficiency
-: (parse-name) ( -- str )
- version=1.0? dup
- get-char name-start? [
- [ dup get-char name-char? not ] take-until nip
- ] [
- "Malformed name" xml-string-error
- ] if ;
+: assure-name ( str version=1.0? -- str )
+ over {
+ [ first name-start? ]
+ [ rest-slice [ name-char? ] with all? ]
+ } 2&& [ "Malformed name" xml-string-error ] unless ;
+
+: (parse-name) ( start -- str )
+ version=1.0?
+ [ [ get-char name-char? not ] curry take-until append ]
+ [ assure-name ] bi ;
+
+: parse-name-starting ( start -- name )
+ (parse-name) get-char CHAR: : =
+ [ next "" (parse-name) ] [ "" swap ] if f ;
: parse-name ( -- name )
- (parse-name) get-char CHAR: : =
- [ next (parse-name) ] [ "" swap ] if f ;
+ "" parse-name-starting ;
! -- Parsing strings
@@ -99,7 +106,7 @@ SYMBOL: ns-stack
: parse-text ( -- string )
CHAR: < parse-char ;
-
+
! Parsing tags
: start-tag ( -- name ? )
@@ -274,7 +281,7 @@ SYMBOL: string-input?
dup prolog-data set ;
: instruct ( -- instruction )
- (parse-name) dup "xml" =
+ "" (parse-name) dup "xml" =
[ drop parse-prolog ] [
dup >lower "xml" =
[ capitalized-prolog ]
@@ -294,17 +301,33 @@ SYMBOL: string-input?
! Autodetecting encodings
+: continue-make-tag ( str -- tag )
+ parse-name-starting middle-tag end-tag CHAR: > expect ;
+
: start-utf16le ( -- tag )
utf16le decode-input-if
CHAR: ? expect
0 expect instruct ;
+: 10xxxxxx? ( ch -- ? )
+ -6 shift 3 bitand 2 = ;
+
+: start