Merge branch 'master' of git://factorcode.org/git/factor

db4
John Benediktsson 2009-01-15 08:44:19 -08:00
commit f29f625787
16 changed files with 96 additions and 23 deletions

View File

@ -20,11 +20,11 @@ HELP: <duplex-stream>
HELP: with-stream
{ $values { "stream" duplex-stream } { "quot" quotation } }
{ $description "Calls the quotation in a new dynamic scope, with both " { $link input-stream } " and " { $link output-stream } " rebound to " { $snippet "stream" } ". The stream is closed if the quotation returns or throws an error." } ;
{ $description "Calls the quotation in a new dynamic scope, with both " { $link input-stream } " and " { $link output-stream } " rebound to " { $snippet "stream" } ", which must be a duplex stream. The stream is closed if the quotation returns or throws an error." } ;
HELP: with-stream*
{ $values { "stream" duplex-stream } { "quot" quotation } }
{ $description "Calls the quotation in a new dynamic scope, with both " { $link input-stream } " and " { $link output-stream } " rebound to " { $snippet "stream" } "." }
{ $description "Calls the quotation in a new dynamic scope, with both " { $link input-stream } " and " { $link output-stream } " rebound to " { $snippet "stream" } ", which must be a duplex stream." }
{ $notes "This word does not close the stream. Compare with " { $link with-stream } "." } ;
HELP: <encoder-duplex>

View File

@ -0,0 +1 @@
<?xml version='1.0' encoding='ASCII'?><x>e</x>

View File

@ -0,0 +1,12 @@
USING: xml xml.data xml.utilities tools.test ;
[ "\u000131" ] [ "resource:basis/xml/tests/latin5.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/latin1.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf8.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16be-bom.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/utf16le-bom.xml" file>xml children>string ] unit-test
[ "\u0000e9" ] [ "resource:basis/xml/tests/prologless.xml" file>xml children>string ] unit-test
[ "e" ] [ "resource:basis/xml/tests/ascii.xml" file>xml children>string ] unit-test

View File

@ -0,0 +1 @@
<?xml version='1.0' encoding='ISO-8859-1'?><x>é</x>

View File

@ -0,0 +1 @@
<?xml version='1.0' encoding='ISO-8859-9'?><x>ý</x>

View File

@ -0,0 +1 @@
<x>é</x>

View File

@ -53,12 +53,12 @@ SYMBOL: xml-file
[ "<foo> bar </foo>" string>xml pprint-xml>string ] unit-test
[ "<!-- B+, B, or B--->" string>xml ] must-fail
[ ] [ "<?xml version='1.0'?><!-- declarations for <head> & <body> --><foo/>" string>xml drop ] unit-test
[ T{ element-decl f "br" "EMPTY" } ] [ "<!ELEMENT br EMPTY>" string>xml-chunk second ] unit-test
[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "<!ELEMENT p (#PCDATA|emph)*>" string>xml-chunk second ] unit-test
[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "<!ELEMENT %name.para; %content.para;>" string>xml-chunk second ] unit-test
[ T{ element-decl f "container" "ANY" } ] [ "<!ELEMENT container ANY>" string>xml-chunk second ] unit-test
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo>" string>xml-chunk second ] unit-test
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo >" string>xml-chunk second ] unit-test
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM 'blah.dtd'>" string>xml-chunk second ] unit-test
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM \"blah.dtd\" >" string>xml-chunk second ] unit-test
[ T{ element-decl f "br" "EMPTY" } ] [ "<!ELEMENT br EMPTY>" string>xml-chunk first ] unit-test
[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "<!ELEMENT p (#PCDATA|emph)*>" string>xml-chunk first ] unit-test
[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "<!ELEMENT %name.para; %content.para;>" string>xml-chunk first ] unit-test
[ T{ element-decl f "container" "ANY" } ] [ "<!ELEMENT container ANY>" string>xml-chunk first ] unit-test
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo>" string>xml-chunk first ] unit-test
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo >" string>xml-chunk first ] unit-test
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM 'blah.dtd'>" string>xml-chunk first ] unit-test
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM \"blah.dtd\" >" string>xml-chunk first ] unit-test
[ t ] [ "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' >" dup string>xml-chunk [ write-xml-chunk ] with-string-writer = ] unit-test

BIN
basis/xml/tests/utf16.xml Normal file

Binary file not shown.

Binary file not shown.

BIN
basis/xml/tests/utf16be.xml Normal file

Binary file not shown.

Binary file not shown.

BIN
basis/xml/tests/utf16le.xml Normal file

Binary file not shown.

View File

@ -0,0 +1 @@
<?xml version='1.0' encoding='UTF-8'?><x/>

1
basis/xml/tests/utf8.xml Normal file
View File

@ -0,0 +1 @@
<?xml version='1.0' encoding='UTF-8'?><x>é</x>

View File

@ -1,9 +1,9 @@
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: xml.errors xml.data xml.utilities xml.char-classes sets
xml.entities kernel state-parser kernel namespaces make strings
math math.parser sequences assocs arrays splitting combinators
unicode.case accessors fry ascii ;
USING: accessors arrays ascii assocs combinators fry io.encodings
io.encodings.iana io.encodings.utf16 io.encodings.utf8 kernel
make math.parser namespaces sequences sets splitting state-parser
xml.char-classes xml.data xml.entities xml.errors strings ;
IN: xml.tokenize
! XML namespace processing: ns = namespace
@ -262,9 +262,15 @@ DEFER: direct
[ yes/no>bool ] [ f ] if*
<prolog> ;
SYMBOL: string-input?
: decode-input-if ( encoding -- )
string-input? get [ drop ] [ decode-input ] if ;
: parse-prolog ( -- prolog )
pass-blank middle-tag "?>" expect-string
dup assure-no-extra prolog-attrs
dup encoding>> dup "UTF-16" =
[ drop ] [ name>encoding [ decode-input-if ] when* ] if
dup prolog-data set ;
: instruct ( -- instruction )
@ -285,3 +291,50 @@ DEFER: direct
CHAR: > expect
]
} cond ;
! Autodetecting encodings
: start-utf16le ( -- tag )
utf16le decode-input-if
CHAR: ? expect
0 expect instruct ;
: start< ( -- tag )
get-next {
{ 0 [ next next start-utf16le ] }
{ CHAR: ? [ next next instruct ] } ! XML prolog parsing sets the encoding
[ drop utf8 decode-input-if next make-tag ]
! That is a hack. It fails if you have <nonascii
} case ;
: skip-utf8-bom ( -- tag )
"\u0000bb\u0000bf" expect utf8 decode-input
CHAR: < expect make-tag ;
: start-utf16be ( -- tag )
utf16be decode-input-if
next CHAR: < expect make-tag ;
: skip-utf16le-bom ( -- tag )
utf16le decode-input-if
next HEX: FE expect
CHAR: < expect make-tag ;
: skip-utf16be-bom ( -- tag )
utf16be decode-input-if
next HEX: FF expect
CHAR: < expect make-tag ;
: start-document ( -- tag )
get-char {
{ CHAR: < [ start< ] }
{ 0 [ start-utf16be ] }
{ HEX: EF [ skip-utf8-bom ] }
{ HEX: FF [ skip-utf16le-bom ] }
{ HEX: FE [ skip-utf16be-bom ] }
{ f [ "" ] }
[ dup blank?
[ drop pass-blank utf8 decode-input-if CHAR: < expect make-tag ]
[ 1string ] if ! Replace with proper error
]
} case ;

View File

@ -1,10 +1,9 @@
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: io io.streams.string io.files kernel math namespaces
prettyprint sequences arrays generic strings vectors
xml.char-classes xml.data xml.errors xml.tokenize xml.writer
xml.utilities state-parser assocs ascii io.encodings.utf8
accessors xml.backend ;
USING: accessors arrays io io.encodings.binary io.files
io.streams.string kernel namespaces sequences state-parser strings
xml.backend xml.data xml.errors xml.tokenize ascii
xml.writer ;
IN: xml
! -- Overall parser with data tree
@ -23,7 +22,7 @@ GENERIC: process ( object -- )
M: object process add-child ;
M: prolog process
xml-stack get V{ { f V{ "" } } } =
xml-stack get V{ { f V{ } } } =
[ bad-prolog ] unless drop ;
M: instruction process
@ -101,6 +100,7 @@ TUPLE: pull-xml scope ;
text-now? on
] H{ } make-assoc
pull-xml boa ;
! pull-xml needs to call start-document somewhere
: pull-event ( pull -- xml-event/f )
scope>> [
@ -133,11 +133,12 @@ TUPLE: pull-xml scope ;
: sax ( stream quot: ( xml-elem -- ) -- )
swap [
reset-prolog init-ns-stack
prolog-data get call-under
start-document call-under
sax-loop
] state-parse ; inline recursive
: (read-xml) ( -- )
start-document process
[ process ] sax-loop ; inline
: (read-xml-chunk) ( stream -- prolog seq )
@ -159,11 +160,12 @@ TUPLE: pull-xml scope ;
<string-reader> read-xml ;
: string>xml-chunk ( string -- xml )
<string-reader> read-xml-chunk ;
t string-input?
[ <string-reader> read-xml-chunk ] with-variable ;
: file>xml ( filename -- xml )
! Autodetect encoding!
utf8 <file-reader> read-xml ;
binary <file-reader> read-xml ;
: xml-reprint ( string -- )
string>xml print-xml ;