diff --git a/basis/xml/char-classes/char-classes.factor b/basis/xml/char-classes/char-classes.factor index 6cba618682..03e85e3ea3 100644 --- a/basis/xml/char-classes/char-classes.factor +++ b/basis/xml/char-classes/char-classes.factor @@ -3,16 +3,16 @@ USING: kernel sequences unicode.syntax math math.order combinators ; IN: xml.char-classes -CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_ ; +CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_: ; : 1.0name-start? ( char -- ? ) dup 1.0name-start*? [ drop t ] [ HEX: 2BB HEX: 2C1 between? ] if ; -CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387 ; +CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387: ; -CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _ ; +CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _: ; -CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7 ; +CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ; : name-start? ( 1.0? char -- ? ) swap [ 1.0name-start? ] [ 1.1name-start? ] if ; diff --git a/basis/xml/data/data.factor b/basis/xml/data/data.factor index bf4e2047a7..8e6ff4bf09 100644 --- a/basis/xml/data/data.factor +++ b/basis/xml/data/data.factor @@ -17,10 +17,13 @@ C: name [ [ main>> ] bi@ ?= ] 2tri and and ; : ( string -- name ) + "" swap f ; + +: ( string -- name ) f swap f ; : assure-name ( string/name -- name ) - dup name? [ ] unless ; + dup name? [ ] unless ; TUPLE: opener name attrs ; C: opener @@ -54,6 +57,9 @@ C: public-id TUPLE: doctype-decl < directive name external-id internal-subset ; C: doctype-decl +TUPLE: notation-decl < directive name id ; +C: notation-decl + TUPLE: instruction text ; C: instruction diff --git a/basis/xml/errors/errors-tests.factor b/basis/xml/errors/errors-tests.factor index 25ec54814f..72d928568b 100644 --- a/basis/xml/errors/errors-tests.factor +++ b/basis/xml/errors/errors-tests.factor @@ -25,11 +25,12 @@ T{ capitalized-prolog f 1 6 "XmL" } "" xml-error-test T{ pre/post-content f "x" t } "x" xml-error-test T{ versionless-prolog f 1 8 } "" xml-error-test -T{ bad-instruction f 1 11 T{ instruction f "xsl" } } - "" xml-error-test T{ unclosed-quote f 1 13 } "" xml-error-test T{ text-w/]]> f 1 6 } "]]>" xml-error-test T{ duplicate-attr f 1 21 T{ name { space "" } { main "this" } } V{ "a" "b" } } "" xml-error-test +T{ bad-cdata f 1 3 } "" xml-error-test +T{ bad-cdata f 1 7 } "" xml-error-test +T{ pre/post-content f "&" t } "&32;" xml-error-test diff --git a/basis/xml/errors/errors.factor b/basis/xml/errors/errors.factor index 83b48f1da1..0be9ade1cf 100644 --- a/basis/xml/errors/errors.factor +++ b/basis/xml/errors/errors.factor @@ -170,18 +170,6 @@ M: versionless-prolog summary ( obj -- str ) "XML prolog lacks a version declaration" print ] with-string-writer ; -TUPLE: bad-instruction < parsing-error instruction ; - -: bad-instruction ( instruction -- * ) - \ bad-instruction parsing-error swap >>instruction throw ; - -M: bad-instruction summary ( obj -- str ) - [ - dup call-next-method write - "Misplaced processor instruction:" print - instruction>> write-xml-chunk nl - ] with-string-writer ; - TUPLE: bad-directive < parsing-error dir ; : bad-directive ( directive -- * ) @@ -286,9 +274,17 @@ TUPLE: duplicate-attr < parsing-error key values ; M: duplicate-attr summary call-next-method "\nDuplicate attribute" append ; +TUPLE: bad-cdata < parsing-error ; + +: bad-cdata ( -- * ) + \ bad-cdata parsing-error throw ; + +M: bad-cdata summary + call-next-method "\nCDATA occurs before or after main tag" append ; + UNION: xml-parse-error multitags notags extra-attrs nonexist-ns bad-decl not-yes/no unclosed mismatched expected no-entity - bad-prolog versionless-prolog capitalized-prolog bad-instruction + bad-prolog versionless-prolog capitalized-prolog bad-directive bad-name unclosed-quote quoteless-attr attr-w/< text-w/]]> duplicate-attr ; diff --git a/basis/xml/tests/xmltest.factor b/basis/xml/tests/xmltest.factor index 4cae34a6dd..8caa5e8a75 100644 --- a/basis/xml/tests/xmltest.factor +++ b/basis/xml/tests/xmltest.factor @@ -1,12 +1,12 @@ USING: accessors assocs combinators continuations fry generalizations io.pathnames kernel macros sequences stack-checker tools.test xml -xml.utilities xml.writer ; +xml.utilities xml.writer arrays ; IN: xml.tests.suite -TUPLE: test id uri sections description type ; +TUPLE: xml-test id uri sections description type ; -: >test ( tag -- test ) - test new swap { +: >xml-test ( tag -- test ) + xml-test new swap { [ "TYPE" swap at >>type ] [ "ID" swap at >>id ] [ "URI" swap at >>uri ] @@ -15,7 +15,7 @@ TUPLE: test id uri sections description type ; } cleave ; : parse-tests ( xml -- tests ) - "TEST" tags-named [ >test ] map ; + "TEST" tags-named [ >xml-test ] map ; : base "resource:basis/xml/tests/xmltest/" ; @@ -32,11 +32,22 @@ MACRO: drop-input ( quot -- newquot ) : well-formed? ( uri -- answer ) [ file>xml ] fails? "not-wf" "valid" ? ; -: run-test ( test -- ) +: test-quots ( test -- result quot ) [ type>> '[ _ ] ] - [ '[ _ uri>> base swap append-path well-formed? ] ] bi - unit-test ; + [ '[ _ uri>> base swap append-path well-formed? ] ] bi ; -: run-tests ( -- ) +: xml-tests ( -- tests ) base "xmltest.xml" append-path file>xml - parse-tests [ run-test ] each ; \ No newline at end of file + parse-tests [ test-quots 2array ] map ; + +: run-xml-tests ( -- ) + xml-tests [ unit-test ] assoc-each ; + +: works? ( result quot -- ? ) + [ first ] [ call ] bi* = ; + +: partition-xml-tests ( -- successes failures ) + xml-tests [ first2 works? ] partition ; + +: failing-valids ( -- tests ) + partition-xml-tests nip [ second first ] map [ type>> "valid" = ] filter ; diff --git a/basis/xml/tokenize/tokenize.factor b/basis/xml/tokenize/tokenize.factor index dc9f7ae719..2747959831 100644 --- a/basis/xml/tokenize/tokenize.factor +++ b/basis/xml/tokenize/tokenize.factor @@ -13,6 +13,8 @@ IN: xml.tokenize ! A stack of hashtables SYMBOL: ns-stack +SYMBOL: depth + : attrs>ns ( attrs-alist -- hash ) ! this should check to make sure URIs are valid [ @@ -50,25 +52,37 @@ SYMBOL: ns-stack ! Parsing names -! version=1.0? is calculated once and passed around for efficiency +: valid-name? ( str -- ? ) + [ f ] [ + version=1.0? swap { + [ first name-start? ] + [ rest-slice [ name-char? ] with all? ] + } 2&& + ] if-empty ; -: assure-name ( str version=1.0? -- str ) - over { - [ first name-start? ] - [ rest-slice [ name-char? ] with all? ] - } 2&& [ bad-name ] unless ; +: prefixed-name ( str -- name/f ) + ":" split dup length 2 = [ + [ [ valid-name? ] all? ] + [ first2 f ] bi and + ] [ drop f ] if ; -: (parse-name) ( start -- str ) - version=1.0? - [ [ get-char name-char? not ] curry take-until append ] - [ assure-name ] bi ; +: interpret-name ( str -- name ) + dup prefixed-name [ ] [ + dup valid-name? + [ ] [ bad-name ] if + ] ?if ; -: parse-name-starting ( start -- name ) - (parse-name) get-char CHAR: : = - [ next "" (parse-name) ] [ "" swap ] if f ; +: take-name ( -- string ) + version=1.0? '[ _ get-char name-char? not ] take-until ; : parse-name ( -- name ) - "" parse-name-starting ; + take-name interpret-name ; + +: parse-name-starting ( string -- name ) + take-name append interpret-name ; + +: parse-simple-name ( -- name ) + take-name ; ! -- Parsing strings @@ -99,11 +113,15 @@ SYMBOL: ns-stack : assure-no-]]> ( circular -- ) "]]>" sequence= [ text-w/]]> ] when ; -: parse-text ( -- string ) - 3 f '[ - _ [ push-circular ] - [ nip assure-no-]]> ] - [ drop CHAR: < = ] 2tri +:: parse-text ( -- string ) + 3 f :> circ + depth get zero? :> no-text [| char | + char circ push-circular + circ assure-no-]]> + no-text [ char blank? char CHAR: < = or [ + char 1string t pre/post-content + ] unless ] when + char CHAR: < = ] parse-char ; ! Parsing tags @@ -131,7 +149,7 @@ SYMBOL: ns-stack [ dup "\t\r\n" member? [ drop CHAR: \s ] when ] map ; : parse-attr ( -- ) - parse-name CHAR: = expect + parse-name pass-blank CHAR: = expect pass-blank t parse-quote* normalize-quot 2array , ; : (middle-tag) ( -- ) @@ -148,9 +166,13 @@ SYMBOL: ns-stack [ (middle-tag) ] f make pass-blank assure-no-duplicates ; +: close ( -- ) + pass-blank CHAR: > expect ; + : end-tag ( name attrs-alist -- tag ) tag-ns pass-blank get-char CHAR: / = - [ pop-ns next ] [ ] if ; + [ pop-ns next CHAR: > expect ] + [ depth inc close ] if ; : take-comment ( -- comment ) "--" expect-string @@ -159,6 +181,7 @@ SYMBOL: ns-stack CHAR: > expect ; : take-cdata ( -- string ) + depth get zero? [ bad-cdata ] when "[CDATA[" expect-string "]]>" take-string ; : take-word ( -- string ) @@ -173,19 +196,17 @@ SYMBOL: ns-stack : take-attlist-decl ( -- doctype-decl ) take-decl-contents ; +: take-notation-decl ( -- notation-decl ) + take-decl-contents ; + : take-until-one-of ( seps -- str sep ) '[ get-char _ member? ] take-until get-char ; -: expect-> ( -- ) - pass-blank CHAR: > expect ; - : take-system-id ( -- system-id ) - parse-quote - expect-> ; + parse-quote close ; : take-public-id ( -- public-id ) - parse-quote parse-quote - expect-> ; + parse-quote parse-quote close ; DEFER: direct @@ -216,7 +237,7 @@ DEFER: direct { CHAR: \s [ pass-blank get-char CHAR: [ = [ next take-internal-subset f swap - expect-> + close ] [ " >" take-until-one-of { { CHAR: \s [ (take-external-id) ] } @@ -235,21 +256,22 @@ DEFER: direct } case ; : associate-entity ( entity-name entity-def -- ) - swap extra-entities [ ?set-at ] change ; + swap extra-entities get set-at ; : take-entity-decl ( -- entity-decl ) pass-blank get-char { { CHAR: % [ next pass-blank take-entity-def ] } [ drop take-entity-def 2dup associate-entity ] } case - expect-> ; + close ; : take-directive ( -- directive ) - take-word { + take-name { { "ELEMENT" [ take-element-decl ] } { "ATTLIST" [ take-attlist-decl ] } { "DOCTYPE" [ take-doctype-decl ] } { "ENTITY" [ take-entity-decl ] } + { "NOTATION" [ take-notation-decl ] } [ bad-directive ] } case ; @@ -307,28 +329,27 @@ SYMBOL: string-input? dup prolog-data set ; : instruct ( -- instruction ) - "" (parse-name) dup "xml" = - [ drop parse-prolog ] [ - dup >lower "xml" = - [ capitalized-prolog ] - [ "?>" take-string append ] if - ] if ; + take-name { + { [ dup "xml" = ] [ drop parse-prolog ] } + { [ dup >lower "xml" = ] [ capitalized-prolog ] } + { [ dup valid-name? not ] [ bad-name ] } + [ "?>" take-string append ] + } cond ; : make-tag ( -- tag ) { { [ get-char dup CHAR: ! = ] [ drop next direct ] } { [ CHAR: ? = ] [ next instruct ] } [ - start-tag [ dup add-ns pop-ns ] + start-tag [ dup add-ns pop-ns depth dec close ] [ middle-tag end-tag ] if - CHAR: > expect ] } cond ; ! Autodetecting encodings : continue-make-tag ( str -- tag ) - parse-name-starting middle-tag end-tag CHAR: > expect ; + parse-name-starting middle-tag end-tag ; : start-utf16le ( -- tag ) utf16le decode-input-if diff --git a/basis/xml/xml.factor b/basis/xml/xml.factor index 53d186dcd2..8ee35001a6 100644 --- a/basis/xml/xml.factor +++ b/basis/xml/xml.factor @@ -2,8 +2,8 @@ ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays io io.encodings.binary io.files io.streams.string kernel namespaces sequences xml.state-parser strings -xml.backend xml.data xml.errors xml.tokenize ascii -xml.writer ; +xml.backend xml.data xml.errors xml.tokenize ascii xml.entities +xml.writer assocs ; IN: xml ! -- Overall parser with data tree @@ -25,11 +25,6 @@ M: prolog process xml-stack get V{ { f V{ } } } = [ bad-prolog ] unless drop ; -M: instruction process - xml-stack get length 1 = - [ bad-instruction ] unless - add-child ; - M: directive process xml-stack get dup length 1 = swap first second [ tag? ] contains? not and @@ -53,7 +48,9 @@ M: closer process add-child ; : init-xml-stack ( -- ) - V{ } clone xml-stack set f push-xml ; + V{ } clone xml-stack set + extra-entities [ H{ } assoc-like ] change + f push-xml ; : default-prolog ( -- prolog ) "1.0" "UTF-8" f ; @@ -150,11 +147,12 @@ TUPLE: pull-xml scope ; ] state-parse ; : read-xml ( stream -- xml ) - #! Produces a tree of XML nodes - (read-xml-chunk) make-xml-doc ; + 0 depth + [ (read-xml-chunk) make-xml-doc ] with-variable ; : read-xml-chunk ( stream -- seq ) - (read-xml-chunk) nip ; + 1 depth + [ (read-xml-chunk) nip ] with-variable ; : string>xml ( string -- xml ) read-xml ;