diff --git a/basis/xml/tests/ascii.xml b/basis/xml/tests/ascii.xml
new file mode 100644
index 0000000000..ca1c355c81
--- /dev/null
+++ b/basis/xml/tests/ascii.xml
@@ -0,0 +1 @@
+e
\ No newline at end of file
diff --git a/basis/xml/tests/encodings.factor b/basis/xml/tests/encodings.factor
new file mode 100644
index 0000000000..92fea88b4a
--- /dev/null
+++ b/basis/xml/tests/encodings.factor
@@ -0,0 +1,12 @@
+USING: xml xml.data xml.utilities tools.test ;
+
+[ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/utf8.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be-bom.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le-bom.xml" file>xml children>string ] unit-test
+[ "\u0000e9" ] [ "resource:basis/xml/tests/prologless.xml" file>xml children>string ] unit-test
+[ "e" ] [ "resource:basis/xml/tests/ascii.xml" file>xml children>string ] unit-test
diff --git a/basis/xml/tests/latin1.xml b/basis/xml/tests/latin1.xml
new file mode 100644
index 0000000000..f8bc6bcd61
--- /dev/null
+++ b/basis/xml/tests/latin1.xml
@@ -0,0 +1 @@
+é
\ No newline at end of file
diff --git a/basis/xml/tests/latin5.xml b/basis/xml/tests/latin5.xml
new file mode 100644
index 0000000000..afbcf09fc7
--- /dev/null
+++ b/basis/xml/tests/latin5.xml
@@ -0,0 +1 @@
+ý
\ No newline at end of file
diff --git a/basis/xml/tests/prologless.xml b/basis/xml/tests/prologless.xml
new file mode 100644
index 0000000000..a60ed31cc6
--- /dev/null
+++ b/basis/xml/tests/prologless.xml
@@ -0,0 +1 @@
+é
\ No newline at end of file
diff --git a/basis/xml/tests/test.factor b/basis/xml/tests/test.factor
index f37c3fa7ac..a565df6a9d 100644
--- a/basis/xml/tests/test.factor
+++ b/basis/xml/tests/test.factor
@@ -53,12 +53,12 @@ SYMBOL: xml-file
[ " bar " string>xml pprint-xml>string ] unit-test
[ "" string>xml ] must-fail
[ ] [ "" string>xml drop ] unit-test
-[ T{ element-decl f "br" "EMPTY" } ] [ "" string>xml-chunk second ] unit-test
-[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "" string>xml-chunk second ] unit-test
-[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "" string>xml-chunk second ] unit-test
-[ T{ element-decl f "container" "ANY" } ] [ "" string>xml-chunk second ] unit-test
-[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk second ] unit-test
-[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk second ] unit-test
-[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk second ] unit-test
-[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk second ] unit-test
+[ T{ element-decl f "br" "EMPTY" } ] [ "" string>xml-chunk first ] unit-test
+[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "" string>xml-chunk first ] unit-test
+[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "" string>xml-chunk first ] unit-test
+[ T{ element-decl f "container" "ANY" } ] [ "" string>xml-chunk first ] unit-test
+[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk first ] unit-test
+[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk first ] unit-test
+[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test
+[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test
[ t ] [ "" dup string>xml-chunk [ write-xml-chunk ] with-string-writer = ] unit-test
diff --git a/basis/xml/tests/utf16.xml b/basis/xml/tests/utf16.xml
new file mode 100644
index 0000000000..d8775098a1
Binary files /dev/null and b/basis/xml/tests/utf16.xml differ
diff --git a/basis/xml/tests/utf16be-bom.xml b/basis/xml/tests/utf16be-bom.xml
new file mode 100644
index 0000000000..4a6f3e255c
Binary files /dev/null and b/basis/xml/tests/utf16be-bom.xml differ
diff --git a/basis/xml/tests/utf16be.xml b/basis/xml/tests/utf16be.xml
new file mode 100644
index 0000000000..c97bff7593
Binary files /dev/null and b/basis/xml/tests/utf16be.xml differ
diff --git a/basis/xml/tests/utf16le-bom.xml b/basis/xml/tests/utf16le-bom.xml
new file mode 100644
index 0000000000..ac7d8b8c70
Binary files /dev/null and b/basis/xml/tests/utf16le-bom.xml differ
diff --git a/basis/xml/tests/utf16le.xml b/basis/xml/tests/utf16le.xml
new file mode 100644
index 0000000000..5a0c7d9551
Binary files /dev/null and b/basis/xml/tests/utf16le.xml differ
diff --git a/basis/xml/tests/utf8-bom.xml b/basis/xml/tests/utf8-bom.xml
new file mode 100644
index 0000000000..5486916f73
--- /dev/null
+++ b/basis/xml/tests/utf8-bom.xml
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/basis/xml/tests/utf8.xml b/basis/xml/tests/utf8.xml
new file mode 100644
index 0000000000..83b3e2d501
--- /dev/null
+++ b/basis/xml/tests/utf8.xml
@@ -0,0 +1 @@
+é
\ No newline at end of file
diff --git a/basis/xml/tokenize/tokenize.factor b/basis/xml/tokenize/tokenize.factor
index 0c475c108d..26b04310d6 100644
--- a/basis/xml/tokenize/tokenize.factor
+++ b/basis/xml/tokenize/tokenize.factor
@@ -1,9 +1,9 @@
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
-USING: xml.errors xml.data xml.utilities xml.char-classes sets
-xml.entities kernel state-parser kernel namespaces make strings
-math math.parser sequences assocs arrays splitting combinators
-unicode.case accessors fry ascii ;
+USING: accessors arrays ascii assocs combinators fry io.encodings
+io.encodings.iana io.encodings.utf16 io.encodings.utf8 kernel
+make math.parser namespaces sequences sets splitting state-parser
+xml.char-classes xml.data xml.entities xml.errors strings ;
IN: xml.tokenize
! XML namespace processing: ns = namespace
@@ -262,9 +262,15 @@ DEFER: direct
[ yes/no>bool ] [ f ] if*
;
+SYMBOL: string-input?
+: decode-input-if ( encoding -- )
+ string-input? get [ drop ] [ decode-input ] if ;
+
: parse-prolog ( -- prolog )
pass-blank middle-tag "?>" expect-string
dup assure-no-extra prolog-attrs
+ dup encoding>> dup "UTF-16" =
+ [ drop ] [ name>encoding [ decode-input-if ] when* ] if
dup prolog-data set ;
: instruct ( -- instruction )
@@ -285,3 +291,50 @@ DEFER: direct
CHAR: > expect
]
} cond ;
+
+! Autodetecting encodings
+
+: start-utf16le ( -- tag )
+ utf16le decode-input-if
+ CHAR: ? expect
+ 0 expect instruct ;
+
+: start< ( -- tag )
+ get-next {
+ { 0 [ next next start-utf16le ] }
+ { CHAR: ? [ next next instruct ] } ! XML prolog parsing sets the encoding
+ [ drop utf8 decode-input-if next make-tag ]
+ ! That is a hack. It fails if you have > [
@@ -133,11 +133,12 @@ TUPLE: pull-xml scope ;
: sax ( stream quot: ( xml-elem -- ) -- )
swap [
reset-prolog init-ns-stack
- prolog-data get call-under
+ start-document call-under
sax-loop
] state-parse ; inline recursive
: (read-xml) ( -- )
+ start-document process
[ process ] sax-loop ; inline
: (read-xml-chunk) ( stream -- prolog seq )
@@ -159,11 +160,12 @@ TUPLE: pull-xml scope ;
read-xml ;
: string>xml-chunk ( string -- xml )
- read-xml-chunk ;
+ t string-input?
+ [ read-xml-chunk ] with-variable ;
: file>xml ( filename -- xml )
! Autodetect encoding!
- utf8 read-xml ;
+ binary read-xml ;
: xml-reprint ( string -- )
string>xml print-xml ;