diff --git a/basis/xml/autoencoding/autoencoding.factor b/basis/xml/autoencoding/autoencoding.factor index 5dc32958d4..d78342a08c 100644 --- a/basis/xml/autoencoding/autoencoding.factor +++ b/basis/xml/autoencoding/autoencoding.factor @@ -2,14 +2,15 @@ ! See http://factorcode.org/license.txt for BSD license. USING: kernel namespaces xml.name io.encodings.utf8 xml.elements io.encodings.utf16 xml.tokenize xml.state math ascii sequences -io.encodings.string io.encodings combinators ; +io.encodings.string io.encodings combinators accessors +xml.data io.encodings.iana ; IN: xml.autoencoding : continue-make-tag ( str -- tag ) parse-name-starting middle-tag end-tag ; : start-utf16le ( -- tag ) - utf16le decode-input-if + utf16le decode-input "?\0" expect check instruct ; @@ -17,20 +18,36 @@ IN: xml.autoencoding -6 shift 3 bitand 2 = ; : start> dup "UTF-16" = + [ drop ] [ name>encoding [ decode-input ] when* ] if ; + +: instruct-encoding ( instruct/prolog -- ) + dup prolog? + [ prolog-encoding ] + [ drop utf8 decode-input ] if ; + +: something ( -- ) + check utf8 decode-input next next ; + : start< ( -- tag ) + ! What if first letter of processing instruction is non-ASCII? get-next { { 0 [ next next start-utf16le ] } - { CHAR: ? [ check next next instruct ] } ! XML prolog parsing sets the encoding - { CHAR: ! [ check utf8 decode-input next next direct ] } + { CHAR: ? [ something instruct dup instruct-encoding ] } + { CHAR: ! [ something direct ] } [ check start, in the case of XML chunks? - } case check ; + [ drop utf8 decode-input check f ] + } case ; diff --git a/basis/xml/char-classes/char-classes.factor b/basis/xml/char-classes/char-classes.factor index 03e85e3ea3..b47d4c66df 100644 --- a/basis/xml/char-classes/char-classes.factor +++ b/basis/xml/char-classes/char-classes.factor @@ -26,7 +26,7 @@ CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ; ! 1.1: ! [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] { - { [ dup HEX: 20 < ] [ "\t\r\n" member? and ] } + { [ dup HEX: 20 < ] [ swap [ "\t\r\n" member? ] [ zero? not ] if ] } { [ nip dup HEX: D800 < ] [ drop t ] } { [ dup HEX: E000 < ] [ drop f ] } [ { HEX: FFFE HEX: FFFF } member? not ] diff --git a/basis/xml/elements/elements.factor b/basis/xml/elements/elements.factor index 116acb076b..b927947329 100644 --- a/basis/xml/elements/elements.factor +++ b/basis/xml/elements/elements.factor @@ -29,7 +29,7 @@ IN: xml.elements parse-name swap ; : (middle-tag) ( -- ) - pass-blank version=1.0? get-char name-start? + pass-blank version-1.0? get-char name-start? [ parse-attr (middle-tag) ] when ; : assure-no-duplicates ( attrs-alist -- attrs-alist ) @@ -66,7 +66,8 @@ IN: xml.elements : prolog-version ( alist -- version ) T{ name { space "" } { main "version" } } swap at - [ good-version ] [ versionless-prolog ] if* ; + [ good-version ] [ versionless-prolog ] if* + dup set-version ; : prolog-encoding ( alist -- encoding ) T{ name { space "" } { main "encoding" } } swap at @@ -89,16 +90,9 @@ IN: xml.elements [ prolog-standalone ] tri ; -SYMBOL: string-input? -: decode-input-if ( encoding -- ) - string-input? get [ drop ] [ decode-input ] if ; - : parse-prolog ( -- prolog ) pass-blank middle-tag "?>" expect - dup assure-no-extra prolog-attrs - dup encoding>> dup "UTF-16" = - [ drop ] [ name>encoding [ decode-input-if ] when* ] if - dup prolog-data set ; + dup assure-no-extra prolog-attrs ; : instruct ( -- instruction ) take-name { diff --git a/basis/xml/state/state.factor b/basis/xml/state/state.factor index 059d8267a0..eba94220e3 100644 --- a/basis/xml/state/state.factor +++ b/basis/xml/state/state.factor @@ -3,7 +3,7 @@ USING: accessors kernel namespaces io ; IN: xml.state -TUPLE: spot char line column next check ; +TUPLE: spot char line column next check version-1.0? ; C: spot @@ -17,11 +17,12 @@ C: spot : set-next ( char -- ) spot get swap >>next drop ; : get-check ( -- ? ) spot get check>> ; : check ( -- ) spot get t >>check drop ; +: version-1.0? ( -- ? ) spot get version-1.0?>> ; +: set-version ( string -- ) + spot get swap "1.0" = >>version-1.0? drop ; SYMBOL: xml-stack -SYMBOL: prolog-data - SYMBOL: depth SYMBOL: interpolating? diff --git a/basis/xml/tests/test.factor b/basis/xml/tests/test.factor index 97793f2ab2..337c19bfe1 100644 --- a/basis/xml/tests/test.factor +++ b/basis/xml/tests/test.factor @@ -51,14 +51,18 @@ SYMBOL: xml-file [ "foo" ] [ "" string>xml children>string ] unit-test [ "" string>xml ] must-fail [ ] [ "" string>xml drop ] unit-test -[ T{ element-decl f "br" "EMPTY" } ] [ "" string>dtd directives>> first ] unit-test -[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "" string>dtd directives>> first ] unit-test -[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "" string>dtd directives>> first ] unit-test -[ T{ element-decl f "container" "ANY" } ] [ "" string>dtd directives>> first ] unit-test -[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk first ] unit-test -[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk first ] unit-test -[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test -[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first ] unit-test + +: first-thing ( seq -- elt ) + [ "" = not ] filter first ; + +[ T{ element-decl f "br" "EMPTY" } ] [ "" string>dtd directives>> first-thing ] unit-test +[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "" string>dtd directives>> first-thing ] unit-test +[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "" string>dtd directives>> first-thing ] unit-test +[ T{ element-decl f "container" "ANY" } ] [ "" string>dtd directives>> first-thing ] unit-test +[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk first-thing ] unit-test +[ T{ doctype-decl f "foo" } ] [ "" string>xml-chunk first-thing ] unit-test +[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first-thing ] unit-test +[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "" string>xml-chunk first-thing ] unit-test [ 958 ] [ [ "ξ" string>xml-chunk ] with-html-entities first first ] unit-test [ "x" "<" ] [ "" string>xml [ name>> main>> ] [ "value" attr ] bi ] unit-test [ "foo" ] [ "]>&bar;" string>xml children>string ] unit-test diff --git a/basis/xml/tokenize/tokenize.factor b/basis/xml/tokenize/tokenize.factor index b629d46455..50ab43ca7b 100644 --- a/basis/xml/tokenize/tokenize.factor +++ b/basis/xml/tokenize/tokenize.factor @@ -6,12 +6,9 @@ circular xml.entities assocs make splitting math.parser locals combinators arrays ; IN: xml.tokenize -: version=1.0? ( -- ? ) - prolog-data get [ version>> "1.0" = ] [ t ] if* ; - : assure-good-char ( ch -- ch ) [ - version=1.0? over text? not get-check and + version-1.0? over text? not get-check and [ disallowed-char ] when ] [ f ] if* ; @@ -36,7 +33,7 @@ IN: xml.tokenize get-char [ unexpected-end ] unless (next) record ; : init-parser ( -- ) - 0 1 0 f f spot set + 0 1 0 f f t spot set read1 set-next next ; : with-state ( stream quot -- ) diff --git a/basis/xml/xml.factor b/basis/xml/xml.factor index 5369b04d9c..6b297918c3 100644 --- a/basis/xml/xml.factor +++ b/basis/xml/xml.factor @@ -22,7 +22,7 @@ GENERIC: process ( object -- ) M: object process add-child ; M: prolog process - xml-stack get V{ { f V{ } } } = + xml-stack get { V{ { f V{ "" } } } V{ { f V{ } } } } member? [ bad-prolog ] unless drop ; M: directive process @@ -49,17 +49,14 @@ M: closer process : init-xml-stack ( -- ) V{ } clone xml-stack set - extra-entities [ H{ } assoc-like ] change f push-xml ; : default-prolog ( -- prolog ) "1.0" "UTF-8" f ; -: reset-prolog ( -- ) - default-prolog prolog-data set ; - : init-xml ( -- ) - reset-prolog init-xml-stack init-ns-stack ; + init-ns-stack + extra-entities [ H{ } assoc-like ] change ; : assert-blanks ( seq pre? -- ) swap [ string? ] filter @@ -80,7 +77,11 @@ M: closer process ! this does *not* affect the contents of the stack [ notags ] unless* ; -: make-xml-doc ( prolog seq -- xml-doc ) +: get-prolog ( seq -- prolog ) + first dup prolog? [ drop default-prolog ] unless ; + +: make-xml-doc ( seq -- xml-doc ) + [ get-prolog ] keep dup [ tag? ] find [ assure-tags cut rest no-pre/post no-post-tags ] dip swap ; @@ -95,8 +96,7 @@ TUPLE: pull-xml scope ; : ( -- pull-xml ) [ input-stream [ ] change ! bring var in this scope - init-parser reset-prolog init-ns-stack - text-now? on + init-xml text-now? on ] H{ } make-assoc pull-xml boa ; ! pull-xml needs to call start-document somewhere @@ -135,50 +135,43 @@ PRIVATE> get-char [ make-tag call-under xml-loop ] [ drop ] if ; inline recursive +: read-seq ( stream quot n -- seq ) + rot [ + depth set + init-xml init-xml-stack + call + [ process ] xml-loop + done? [ unclosed ] unless + xml-stack get first second + ] with-state ; inline + PRIVATE> : each-element ( stream quot: ( xml-elem -- ) -- ) swap [ - reset-prolog init-ns-stack + init-xml start-document [ call-under ] when* xml-loop ] with-state ; inline -: (read-xml) ( -- ) - start-document [ process ] when* - [ process ] xml-loop ; inline - -: (read-xml-chunk) ( stream -- prolog seq ) - [ - init-xml (read-xml) - done? [ unclosed ] unless - xml-stack get first second - prolog-data get swap - ] with-state ; - : read-xml ( stream -- xml ) - 0 depth - [ (read-xml-chunk) make-xml-doc ] with-variable ; + [ start-document [ process ] when* ] + 0 read-seq make-xml-doc ; : read-xml-chunk ( stream -- seq ) - 1 depth - [ (read-xml-chunk) nip ] with-variable - ; + [ check ] 1 read-seq ; : string>xml ( string -- xml ) - t string-input? - [ read-xml ] with-variable ; + [ check ] 0 read-seq make-xml-doc ; : string>xml-chunk ( string -- xml ) - t string-input? - [ read-xml-chunk ] with-variable ; + read-xml-chunk ; : file>xml ( filename -- xml ) binary read-xml ; : read-dtd ( stream -- dtd ) [ - reset-prolog H{ } clone extra-entities set take-internal-subset ] with-state ;