From b9d773b3f0363fdf335dee762e64fdd8e70fdee8 Mon Sep 17 00:00:00 2001 From: Daniel Ehrenberg Date: Thu, 15 Jan 2009 15:25:00 -0600 Subject: [PATCH] Fix subtle Unicode encodings autodetection bug --- basis/xml/tests/encodings.factor | 4 ++- basis/xml/tests/spaces.xml | 3 ++ basis/xml/tests/unitag.xml | 1 + basis/xml/tokenize/tokenize.factor | 57 +++++++++++++++++++++--------- 4 files changed, 47 insertions(+), 18 deletions(-) create mode 100644 basis/xml/tests/spaces.xml create mode 100644 basis/xml/tests/unitag.xml diff --git a/basis/xml/tests/encodings.factor b/basis/xml/tests/encodings.factor index 92fea88b4a..720b04ca42 100644 --- a/basis/xml/tests/encodings.factor +++ b/basis/xml/tests/encodings.factor @@ -1,7 +1,8 @@ -USING: xml xml.data xml.utilities tools.test ; +USING: xml xml.data xml.utilities tools.test accessors kernel ; [ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test [ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test +[ "\u0000e9" ] [ "resource:basis/xml/tests/spaces.xml" file>xml children>string ] unit-test [ "\u0000e9" ] [ "resource:basis/xml/tests/utf8.xml" file>xml children>string ] unit-test [ "\u0000e9" ] [ "resource:basis/xml/tests/utf16.xml" file>xml children>string ] unit-test [ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be.xml" file>xml children>string ] unit-test @@ -10,3 +11,4 @@ USING: xml xml.data xml.utilities tools.test ; [ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le-bom.xml" file>xml children>string ] unit-test [ "\u0000e9" ] [ "resource:basis/xml/tests/prologless.xml" file>xml children>string ] unit-test [ "e" ] [ "resource:basis/xml/tests/ascii.xml" file>xml children>string ] unit-test +[ "\u0000e9" "x" ] [ "resource:basis/xml/tests/unitag.xml" file>xml [ name>> main>> ] [ children>string ] bi ] unit-test \ No newline at end of file diff --git a/basis/xml/tests/spaces.xml b/basis/xml/tests/spaces.xml new file mode 100644 index 0000000000..dd194ff218 --- /dev/null +++ b/basis/xml/tests/spaces.xml @@ -0,0 +1,3 @@ + + +é diff --git a/basis/xml/tests/unitag.xml b/basis/xml/tests/unitag.xml new file mode 100644 index 0000000000..b7ef6ade3c --- /dev/null +++ b/basis/xml/tests/unitag.xml @@ -0,0 +1 @@ +<é>x \ No newline at end of file diff --git a/basis/xml/tokenize/tokenize.factor b/basis/xml/tokenize/tokenize.factor index 26b04310d6..f2bb5a2e97 100644 --- a/basis/xml/tokenize/tokenize.factor +++ b/basis/xml/tokenize/tokenize.factor @@ -1,9 +1,10 @@ ! Copyright (C) 2005, 2006 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays ascii assocs combinators fry io.encodings -io.encodings.iana io.encodings.utf16 io.encodings.utf8 kernel -make math.parser namespaces sequences sets splitting state-parser -xml.char-classes xml.data xml.entities xml.errors strings ; +USING: accessors arrays ascii assocs combinators +combinators.short-circuit fry io.encodings io.encodings.iana +io.encodings.string io.encodings.utf16 io.encodings.utf8 kernel make +math math.parser namespaces sequences sets splitting state-parser +strings xml.char-classes xml.data xml.entities xml.errors ; IN: xml.tokenize ! XML namespace processing: ns = namespace @@ -53,17 +54,23 @@ SYMBOL: ns-stack ! version=1.0? is calculated once and passed around for efficiency -: (parse-name) ( -- str ) - version=1.0? dup - get-char name-start? [ - [ dup get-char name-char? not ] take-until nip - ] [ - "Malformed name" xml-string-error - ] if ; +: assure-name ( str version=1.0? -- str ) + over { + [ first name-start? ] + [ rest-slice [ name-char? ] with all? ] + } 2&& [ "Malformed name" xml-string-error ] unless ; + +: (parse-name) ( start -- str ) + version=1.0? + [ [ get-char name-char? not ] curry take-until append ] + [ assure-name ] bi ; + +: parse-name-starting ( start -- name ) + (parse-name) get-char CHAR: : = + [ next "" (parse-name) ] [ "" swap ] if f ; : parse-name ( -- name ) - (parse-name) get-char CHAR: : = - [ next (parse-name) ] [ "" swap ] if f ; + "" parse-name-starting ; ! -- Parsing strings @@ -99,7 +106,7 @@ SYMBOL: ns-stack : parse-text ( -- string ) CHAR: < parse-char ; - + ! Parsing tags : start-tag ( -- name ? ) @@ -274,7 +281,7 @@ SYMBOL: string-input? dup prolog-data set ; : instruct ( -- instruction ) - (parse-name) dup "xml" = + "" (parse-name) dup "xml" = [ drop parse-prolog ] [ dup >lower "xml" = [ capitalized-prolog ] @@ -294,17 +301,33 @@ SYMBOL: string-input? ! Autodetecting encodings +: continue-make-tag ( str -- tag ) + parse-name-starting middle-tag end-tag CHAR: > expect ; + : start-utf16le ( -- tag ) utf16le decode-input-if CHAR: ? expect 0 expect instruct ; +: 10xxxxxx? ( ch -- ? ) + -6 shift 3 bitand 2 = ; + +: start