Fix subtle Unicode encodings autodetection bug
parent
db0fee9e3c
commit
b9d773b3f0
|
@ -1,7 +1,8 @@
|
||||||
USING: xml xml.data xml.utilities tools.test ;
|
USING: xml xml.data xml.utilities tools.test accessors kernel ;
|
||||||
|
|
||||||
[ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test
|
[ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test
|
||||||
[ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test
|
[ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test
|
||||||
|
[ "\u0000e9" ] [ "resource:basis/xml/tests/spaces.xml" file>xml children>string ] unit-test
|
||||||
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf8.xml" file>xml children>string ] unit-test
|
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf8.xml" file>xml children>string ] unit-test
|
||||||
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16.xml" file>xml children>string ] unit-test
|
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16.xml" file>xml children>string ] unit-test
|
||||||
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be.xml" file>xml children>string ] unit-test
|
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be.xml" file>xml children>string ] unit-test
|
||||||
|
@ -10,3 +11,4 @@ USING: xml xml.data xml.utilities tools.test ;
|
||||||
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le-bom.xml" file>xml children>string ] unit-test
|
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le-bom.xml" file>xml children>string ] unit-test
|
||||||
[ "\u0000e9" ] [ "resource:basis/xml/tests/prologless.xml" file>xml children>string ] unit-test
|
[ "\u0000e9" ] [ "resource:basis/xml/tests/prologless.xml" file>xml children>string ] unit-test
|
||||||
[ "e" ] [ "resource:basis/xml/tests/ascii.xml" file>xml children>string ] unit-test
|
[ "e" ] [ "resource:basis/xml/tests/ascii.xml" file>xml children>string ] unit-test
|
||||||
|
[ "\u0000e9" "x" ] [ "resource:basis/xml/tests/unitag.xml" file>xml [ name>> main>> ] [ children>string ] bi ] unit-test
|
|
@ -0,0 +1,3 @@
|
||||||
|
|
||||||
|
|
||||||
|
<x>é</x>
|
|
@ -0,0 +1 @@
|
||||||
|
<é>x</é>
|
|
@ -1,9 +1,10 @@
|
||||||
! Copyright (C) 2005, 2006 Daniel Ehrenberg
|
! Copyright (C) 2005, 2006 Daniel Ehrenberg
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors arrays ascii assocs combinators fry io.encodings
|
USING: accessors arrays ascii assocs combinators
|
||||||
io.encodings.iana io.encodings.utf16 io.encodings.utf8 kernel
|
combinators.short-circuit fry io.encodings io.encodings.iana
|
||||||
make math.parser namespaces sequences sets splitting state-parser
|
io.encodings.string io.encodings.utf16 io.encodings.utf8 kernel make
|
||||||
xml.char-classes xml.data xml.entities xml.errors strings ;
|
math math.parser namespaces sequences sets splitting state-parser
|
||||||
|
strings xml.char-classes xml.data xml.entities xml.errors ;
|
||||||
IN: xml.tokenize
|
IN: xml.tokenize
|
||||||
|
|
||||||
! XML namespace processing: ns = namespace
|
! XML namespace processing: ns = namespace
|
||||||
|
@ -53,17 +54,23 @@ SYMBOL: ns-stack
|
||||||
|
|
||||||
! version=1.0? is calculated once and passed around for efficiency
|
! version=1.0? is calculated once and passed around for efficiency
|
||||||
|
|
||||||
: (parse-name) ( -- str )
|
: assure-name ( str version=1.0? -- str )
|
||||||
version=1.0? dup
|
over {
|
||||||
get-char name-start? [
|
[ first name-start? ]
|
||||||
[ dup get-char name-char? not ] take-until nip
|
[ rest-slice [ name-char? ] with all? ]
|
||||||
] [
|
} 2&& [ "Malformed name" xml-string-error ] unless ;
|
||||||
"Malformed name" xml-string-error
|
|
||||||
] if ;
|
: (parse-name) ( start -- str )
|
||||||
|
version=1.0?
|
||||||
|
[ [ get-char name-char? not ] curry take-until append ]
|
||||||
|
[ assure-name ] bi ;
|
||||||
|
|
||||||
|
: parse-name-starting ( start -- name )
|
||||||
|
(parse-name) get-char CHAR: : =
|
||||||
|
[ next "" (parse-name) ] [ "" swap ] if f <name> ;
|
||||||
|
|
||||||
: parse-name ( -- name )
|
: parse-name ( -- name )
|
||||||
(parse-name) get-char CHAR: : =
|
"" parse-name-starting ;
|
||||||
[ next (parse-name) ] [ "" swap ] if f <name> ;
|
|
||||||
|
|
||||||
! -- Parsing strings
|
! -- Parsing strings
|
||||||
|
|
||||||
|
@ -99,7 +106,7 @@ SYMBOL: ns-stack
|
||||||
|
|
||||||
: parse-text ( -- string )
|
: parse-text ( -- string )
|
||||||
CHAR: < parse-char ;
|
CHAR: < parse-char ;
|
||||||
|
|
||||||
! Parsing tags
|
! Parsing tags
|
||||||
|
|
||||||
: start-tag ( -- name ? )
|
: start-tag ( -- name ? )
|
||||||
|
@ -274,7 +281,7 @@ SYMBOL: string-input?
|
||||||
dup prolog-data set ;
|
dup prolog-data set ;
|
||||||
|
|
||||||
: instruct ( -- instruction )
|
: instruct ( -- instruction )
|
||||||
(parse-name) dup "xml" =
|
"" (parse-name) dup "xml" =
|
||||||
[ drop parse-prolog ] [
|
[ drop parse-prolog ] [
|
||||||
dup >lower "xml" =
|
dup >lower "xml" =
|
||||||
[ capitalized-prolog ]
|
[ capitalized-prolog ]
|
||||||
|
@ -294,17 +301,33 @@ SYMBOL: string-input?
|
||||||
|
|
||||||
! Autodetecting encodings
|
! Autodetecting encodings
|
||||||
|
|
||||||
|
: continue-make-tag ( str -- tag )
|
||||||
|
parse-name-starting middle-tag end-tag CHAR: > expect ;
|
||||||
|
|
||||||
: start-utf16le ( -- tag )
|
: start-utf16le ( -- tag )
|
||||||
utf16le decode-input-if
|
utf16le decode-input-if
|
||||||
CHAR: ? expect
|
CHAR: ? expect
|
||||||
0 expect instruct ;
|
0 expect instruct ;
|
||||||
|
|
||||||
|
: 10xxxxxx? ( ch -- ? )
|
||||||
|
-6 shift 3 bitand 2 = ;
|
||||||
|
|
||||||
|
: start<name ( ch -- tag )
|
||||||
|
ascii?
|
||||||
|
[ utf8 decode-input-if next make-tag ] [
|
||||||
|
next
|
||||||
|
[ get-next 10xxxxxx? not ] take-until
|
||||||
|
get-char suffix utf8 decode
|
||||||
|
utf8 decode-input-if next
|
||||||
|
continue-make-tag
|
||||||
|
] if ;
|
||||||
|
|
||||||
: start< ( -- tag )
|
: start< ( -- tag )
|
||||||
get-next {
|
get-next {
|
||||||
{ 0 [ next next start-utf16le ] }
|
{ 0 [ next next start-utf16le ] }
|
||||||
{ CHAR: ? [ next next instruct ] } ! XML prolog parsing sets the encoding
|
{ CHAR: ? [ next next instruct ] } ! XML prolog parsing sets the encoding
|
||||||
[ drop utf8 decode-input-if next make-tag ]
|
{ CHAR: ! [ utf8 decode-input next next direct ] }
|
||||||
! That is a hack. It fails if you have <nonascii
|
[ start<name ]
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
: skip-utf8-bom ( -- tag )
|
: skip-utf8-bom ( -- tag )
|
||||||
|
|
Loading…
Reference in New Issue