diff --git a/basis/io/streams/duplex/duplex-docs.factor b/basis/io/streams/duplex/duplex-docs.factor index 48afafeec7..5bf33e9002 100644 --- a/basis/io/streams/duplex/duplex-docs.factor +++ b/basis/io/streams/duplex/duplex-docs.factor @@ -20,11 +20,11 @@ HELP: HELP: with-stream { $values { "stream" duplex-stream } { "quot" quotation } } -{ $description "Calls the quotation in a new dynamic scope, with both " { $link input-stream } " and " { $link output-stream } " rebound to " { $snippet "stream" } ". The stream is closed if the quotation returns or throws an error." } ; +{ $description "Calls the quotation in a new dynamic scope, with both " { $link input-stream } " and " { $link output-stream } " rebound to " { $snippet "stream" } ", which must be a duplex stream. The stream is closed if the quotation returns or throws an error." } ; HELP: with-stream* { $values { "stream" duplex-stream } { "quot" quotation } } -{ $description "Calls the quotation in a new dynamic scope, with both " { $link input-stream } " and " { $link output-stream } " rebound to " { $snippet "stream" } "." } +{ $description "Calls the quotation in a new dynamic scope, with both " { $link input-stream } " and " { $link output-stream } " rebound to " { $snippet "stream" } ", which must be a duplex stream." } { $notes "This word does not close the stream. Compare with " { $link with-stream } "." } ; HELP: diff --git a/basis/xml/tests/ascii.xml b/basis/xml/tests/ascii.xml new file mode 100644 index 0000000000..ca1c355c81 --- /dev/null +++ b/basis/xml/tests/ascii.xml @@ -0,0 +1 @@ +e \ No newline at end of file diff --git a/basis/xml/tests/encodings.factor b/basis/xml/tests/encodings.factor new file mode 100644 index 0000000000..92fea88b4a --- /dev/null +++ b/basis/xml/tests/encodings.factor @@ -0,0 +1,12 @@ +USING: xml xml.data xml.utilities tools.test ; + +[ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf8.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be-bom.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le-bom.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/prologless.xml" file>xml children>string ] unit-test +[ "e" ] [ "resource:basis/xml/tests/ascii.xml" file>xml children>string ] unit-test diff --git a/basis/xml/tests/latin1.xml b/basis/xml/tests/latin1.xml new file mode 100644 index 0000000000..f8bc6bcd61 --- /dev/null +++ b/basis/xml/tests/latin1.xml @@ -0,0 +1 @@ +é \ No newline at end of file diff --git a/basis/xml/tests/latin5.xml b/basis/xml/tests/latin5.xml new file mode 100644 index 0000000000..afbcf09fc7 --- /dev/null +++ b/basis/xml/tests/latin5.xml @@ -0,0 +1 @@ +ý \ No newline at end of file diff --git a/basis/xml/tests/prologless.xml b/basis/xml/tests/prologless.xml new file mode 100644 index 0000000000..a60ed31cc6 --- /dev/null +++ b/basis/xml/tests/prologless.xml @@ -0,0 +1 @@ +é \ No newline at end of file diff --git a/basis/xml/tests/test.factor b/basis/xml/tests/test.factor index f37c3fa7ac..a565df6a9d 100644 --- a/basis/xml/tests/test.factor +++ b/basis/xml/tests/test.factor @@ -53,12 +53,12 @@ SYMBOL: xml-file [ " bar " string>xml pprint-xml>string ] unit-test [ "" string>xml ] must-fail [ ] [ "" string>xml drop ] unit-test -[ T{ element-decl f "br" "EMPTY" } ] [ "" string>xml-chunk second ] unit-test -[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "" string>xml-chunk second ] unit-test -[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "" string>xml-chunk second ] unit-test -[ T{ element-decl f "container" "ANY" } ] [ "" string>xml-chunk second ] unit-test -[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk second ] unit-test -[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk second ] unit-test -[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk second ] unit-test -[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk second ] unit-test +[ T{ element-decl f "br" "EMPTY" } ] [ "" string>xml-chunk first ] unit-test +[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "" string>xml-chunk first ] unit-test +[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "" string>xml-chunk first ] unit-test +[ T{ element-decl f "container" "ANY" } ] [ "" string>xml-chunk first ] unit-test +[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk first ] unit-test +[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk first ] unit-test +[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test +[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test [ t ] [ "" dup string>xml-chunk [ write-xml-chunk ] with-string-writer = ] unit-test diff --git a/basis/xml/tests/utf16.xml b/basis/xml/tests/utf16.xml new file mode 100644 index 0000000000..d8775098a1 Binary files /dev/null and b/basis/xml/tests/utf16.xml differ diff --git a/basis/xml/tests/utf16be-bom.xml b/basis/xml/tests/utf16be-bom.xml new file mode 100644 index 0000000000..4a6f3e255c Binary files /dev/null and b/basis/xml/tests/utf16be-bom.xml differ diff --git a/basis/xml/tests/utf16be.xml b/basis/xml/tests/utf16be.xml new file mode 100644 index 0000000000..c97bff7593 Binary files /dev/null and b/basis/xml/tests/utf16be.xml differ diff --git a/basis/xml/tests/utf16le-bom.xml b/basis/xml/tests/utf16le-bom.xml new file mode 100644 index 0000000000..ac7d8b8c70 Binary files /dev/null and b/basis/xml/tests/utf16le-bom.xml differ diff --git a/basis/xml/tests/utf16le.xml b/basis/xml/tests/utf16le.xml new file mode 100644 index 0000000000..5a0c7d9551 Binary files /dev/null and b/basis/xml/tests/utf16le.xml differ diff --git a/basis/xml/tests/utf8-bom.xml b/basis/xml/tests/utf8-bom.xml new file mode 100644 index 0000000000..5486916f73 --- /dev/null +++ b/basis/xml/tests/utf8-bom.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/basis/xml/tests/utf8.xml b/basis/xml/tests/utf8.xml new file mode 100644 index 0000000000..83b3e2d501 --- /dev/null +++ b/basis/xml/tests/utf8.xml @@ -0,0 +1 @@ +é \ No newline at end of file diff --git a/basis/xml/tokenize/tokenize.factor b/basis/xml/tokenize/tokenize.factor index 0c475c108d..26b04310d6 100644 --- a/basis/xml/tokenize/tokenize.factor +++ b/basis/xml/tokenize/tokenize.factor @@ -1,9 +1,9 @@ ! Copyright (C) 2005, 2006 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: xml.errors xml.data xml.utilities xml.char-classes sets -xml.entities kernel state-parser kernel namespaces make strings -math math.parser sequences assocs arrays splitting combinators -unicode.case accessors fry ascii ; +USING: accessors arrays ascii assocs combinators fry io.encodings +io.encodings.iana io.encodings.utf16 io.encodings.utf8 kernel +make math.parser namespaces sequences sets splitting state-parser +xml.char-classes xml.data xml.entities xml.errors strings ; IN: xml.tokenize ! XML namespace processing: ns = namespace @@ -262,9 +262,15 @@ DEFER: direct [ yes/no>bool ] [ f ] if* ; +SYMBOL: string-input? +: decode-input-if ( encoding -- ) + string-input? get [ drop ] [ decode-input ] if ; + : parse-prolog ( -- prolog ) pass-blank middle-tag "?>" expect-string dup assure-no-extra prolog-attrs + dup encoding>> dup "UTF-16" = + [ drop ] [ name>encoding [ decode-input-if ] when* ] if dup prolog-data set ; : instruct ( -- instruction ) @@ -285,3 +291,50 @@ DEFER: direct CHAR: > expect ] } cond ; + +! Autodetecting encodings + +: start-utf16le ( -- tag ) + utf16le decode-input-if + CHAR: ? expect + 0 expect instruct ; + +: start< ( -- tag ) + get-next { + { 0 [ next next start-utf16le ] } + { CHAR: ? [ next next instruct ] } ! XML prolog parsing sets the encoding + [ drop utf8 decode-input-if next make-tag ] + ! That is a hack. It fails if you have > [ @@ -133,11 +133,12 @@ TUPLE: pull-xml scope ; : sax ( stream quot: ( xml-elem -- ) -- ) swap [ reset-prolog init-ns-stack - prolog-data get call-under + start-document call-under sax-loop ] state-parse ; inline recursive : (read-xml) ( -- ) + start-document process [ process ] sax-loop ; inline : (read-xml-chunk) ( stream -- prolog seq ) @@ -159,11 +160,12 @@ TUPLE: pull-xml scope ; read-xml ; : string>xml-chunk ( string -- xml ) - read-xml-chunk ; + t string-input? + [ read-xml-chunk ] with-variable ; : file>xml ( filename -- xml ) ! Autodetect encoding! - utf8 read-xml ; + binary read-xml ; : xml-reprint ( string -- ) string>xml print-xml ;