Going further towards conformance

db4
Daniel Ehrenberg 2009-01-20 15:37:21 -06:00
parent 8f44f5e4b3
commit 67dd4ca4a4
7 changed files with 115 additions and 82 deletions

View File

@ -3,16 +3,16 @@
USING: kernel sequences unicode.syntax math math.order combinators ; USING: kernel sequences unicode.syntax math math.order combinators ;
IN: xml.char-classes IN: xml.char-classes
CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_ ; CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_: ;
: 1.0name-start? ( char -- ? ) : 1.0name-start? ( char -- ? )
dup 1.0name-start*? [ drop t ] dup 1.0name-start*? [ drop t ]
[ HEX: 2BB HEX: 2C1 between? ] if ; [ HEX: 2BB HEX: 2C1 between? ] if ;
CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387 ; CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387: ;
CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _ ; CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _: ;
CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7 ; CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ;
: name-start? ( 1.0? char -- ? ) : name-start? ( 1.0? char -- ? )
swap [ 1.0name-start? ] [ 1.1name-start? ] if ; swap [ 1.0name-start? ] [ 1.1name-start? ] if ;

View File

@ -17,10 +17,13 @@ C: <name> name
[ [ main>> ] bi@ ?= ] 2tri and and ; [ [ main>> ] bi@ ?= ] 2tri and and ;
: <simple-name> ( string -- name ) : <simple-name> ( string -- name )
"" swap f <name> ;
: <null-name> ( string -- name )
f swap f <name> ; f swap f <name> ;
: assure-name ( string/name -- name ) : assure-name ( string/name -- name )
dup name? [ <simple-name> ] unless ; dup name? [ <null-name> ] unless ;
TUPLE: opener name attrs ; TUPLE: opener name attrs ;
C: <opener> opener C: <opener> opener
@ -54,6 +57,9 @@ C: <public-id> public-id
TUPLE: doctype-decl < directive name external-id internal-subset ; TUPLE: doctype-decl < directive name external-id internal-subset ;
C: <doctype-decl> doctype-decl C: <doctype-decl> doctype-decl
TUPLE: notation-decl < directive name id ;
C: <notation-decl> notation-decl
TUPLE: instruction text ; TUPLE: instruction text ;
C: <instruction> instruction C: <instruction> instruction

View File

@ -25,11 +25,12 @@ T{ capitalized-prolog f 1 6 "XmL" } "<?XmL version='1.0'?><x/>"
xml-error-test xml-error-test
T{ pre/post-content f "x" t } "x<y/>" xml-error-test T{ pre/post-content f "x" t } "x<y/>" xml-error-test
T{ versionless-prolog f 1 8 } "<?xml?><x/>" xml-error-test T{ versionless-prolog f 1 8 } "<?xml?><x/>" xml-error-test
T{ bad-instruction f 1 11 T{ instruction f "xsl" } }
"<x><?xsl?></x>" xml-error-test
T{ unclosed-quote f 1 13 } "<x value='/>" xml-error-test T{ unclosed-quote f 1 13 } "<x value='/>" xml-error-test
T{ bad-name f 1 3 "-" } "<-/>" xml-error-test T{ bad-name f 1 3 "-" } "<-/>" xml-error-test
T{ quoteless-attr f 1 10 } "<x value=3/>" xml-error-test T{ quoteless-attr f 1 10 } "<x value=3/>" xml-error-test
T{ attr-w/< f 1 11 } "<x value='<'/>" xml-error-test T{ attr-w/< f 1 11 } "<x value='<'/>" xml-error-test
T{ text-w/]]> f 1 6 } "<x>]]></x>" xml-error-test T{ text-w/]]> f 1 6 } "<x>]]></x>" xml-error-test
T{ duplicate-attr f 1 21 T{ name { space "" } { main "this" } } V{ "a" "b" } } "<x this='a' this='b'/>" xml-error-test T{ duplicate-attr f 1 21 T{ name { space "" } { main "this" } } V{ "a" "b" } } "<x this='a' this='b'/>" xml-error-test
T{ bad-cdata f 1 3 } "<![CDATA[]]><x/>" xml-error-test
T{ bad-cdata f 1 7 } "<x/><![CDATA[]]>" xml-error-test
T{ pre/post-content f "&" t } "&32;<x/>" xml-error-test

View File

@ -170,18 +170,6 @@ M: versionless-prolog summary ( obj -- str )
"XML prolog lacks a version declaration" print "XML prolog lacks a version declaration" print
] with-string-writer ; ] with-string-writer ;
TUPLE: bad-instruction < parsing-error instruction ;
: bad-instruction ( instruction -- * )
\ bad-instruction parsing-error swap >>instruction throw ;
M: bad-instruction summary ( obj -- str )
[
dup call-next-method write
"Misplaced processor instruction:" print
instruction>> write-xml-chunk nl
] with-string-writer ;
TUPLE: bad-directive < parsing-error dir ; TUPLE: bad-directive < parsing-error dir ;
: bad-directive ( directive -- * ) : bad-directive ( directive -- * )
@ -286,9 +274,17 @@ TUPLE: duplicate-attr < parsing-error key values ;
M: duplicate-attr summary M: duplicate-attr summary
call-next-method "\nDuplicate attribute" append ; call-next-method "\nDuplicate attribute" append ;
TUPLE: bad-cdata < parsing-error ;
: bad-cdata ( -- * )
\ bad-cdata parsing-error throw ;
M: bad-cdata summary
call-next-method "\nCDATA occurs before or after main tag" append ;
UNION: xml-parse-error UNION: xml-parse-error
multitags notags extra-attrs nonexist-ns bad-decl multitags notags extra-attrs nonexist-ns bad-decl
not-yes/no unclosed mismatched expected no-entity not-yes/no unclosed mismatched expected no-entity
bad-prolog versionless-prolog capitalized-prolog bad-instruction bad-prolog versionless-prolog capitalized-prolog
bad-directive bad-name unclosed-quote quoteless-attr bad-directive bad-name unclosed-quote quoteless-attr
attr-w/< text-w/]]> duplicate-attr ; attr-w/< text-w/]]> duplicate-attr ;

View File

@ -1,12 +1,12 @@
USING: accessors assocs combinators continuations fry generalizations USING: accessors assocs combinators continuations fry generalizations
io.pathnames kernel macros sequences stack-checker tools.test xml io.pathnames kernel macros sequences stack-checker tools.test xml
xml.utilities xml.writer ; xml.utilities xml.writer arrays ;
IN: xml.tests.suite IN: xml.tests.suite
TUPLE: test id uri sections description type ; TUPLE: xml-test id uri sections description type ;
: >test ( tag -- test ) : >xml-test ( tag -- test )
test new swap { xml-test new swap {
[ "TYPE" swap at >>type ] [ "TYPE" swap at >>type ]
[ "ID" swap at >>id ] [ "ID" swap at >>id ]
[ "URI" swap at >>uri ] [ "URI" swap at >>uri ]
@ -15,7 +15,7 @@ TUPLE: test id uri sections description type ;
} cleave ; } cleave ;
: parse-tests ( xml -- tests ) : parse-tests ( xml -- tests )
"TEST" tags-named [ >test ] map ; "TEST" tags-named [ >xml-test ] map ;
: base "resource:basis/xml/tests/xmltest/" ; : base "resource:basis/xml/tests/xmltest/" ;
@ -32,11 +32,22 @@ MACRO: drop-input ( quot -- newquot )
: well-formed? ( uri -- answer ) : well-formed? ( uri -- answer )
[ file>xml ] fails? "not-wf" "valid" ? ; [ file>xml ] fails? "not-wf" "valid" ? ;
: run-test ( test -- ) : test-quots ( test -- result quot )
[ type>> '[ _ ] ] [ type>> '[ _ ] ]
[ '[ _ uri>> base swap append-path well-formed? ] ] bi [ '[ _ uri>> base swap append-path well-formed? ] ] bi ;
unit-test ;
: run-tests ( -- ) : xml-tests ( -- tests )
base "xmltest.xml" append-path file>xml base "xmltest.xml" append-path file>xml
parse-tests [ run-test ] each ; parse-tests [ test-quots 2array ] map ;
: run-xml-tests ( -- )
xml-tests [ unit-test ] assoc-each ;
: works? ( result quot -- ? )
[ first ] [ call ] bi* = ;
: partition-xml-tests ( -- successes failures )
xml-tests [ first2 works? ] partition ;
: failing-valids ( -- tests )
partition-xml-tests nip [ second first ] map [ type>> "valid" = ] filter ;

View File

@ -13,6 +13,8 @@ IN: xml.tokenize
! A stack of hashtables ! A stack of hashtables
SYMBOL: ns-stack SYMBOL: ns-stack
SYMBOL: depth
: attrs>ns ( attrs-alist -- hash ) : attrs>ns ( attrs-alist -- hash )
! this should check to make sure URIs are valid ! this should check to make sure URIs are valid
[ [
@ -50,25 +52,37 @@ SYMBOL: ns-stack
! Parsing names ! Parsing names
! version=1.0? is calculated once and passed around for efficiency : valid-name? ( str -- ? )
[ f ] [
version=1.0? swap {
[ first name-start? ]
[ rest-slice [ name-char? ] with all? ]
} 2&&
] if-empty ;
: assure-name ( str version=1.0? -- str ) : prefixed-name ( str -- name/f )
over { ":" split dup length 2 = [
[ first name-start? ] [ [ valid-name? ] all? ]
[ rest-slice [ name-char? ] with all? ] [ first2 f <name> ] bi and
} 2&& [ bad-name ] unless ; ] [ drop f ] if ;
: (parse-name) ( start -- str ) : interpret-name ( str -- name )
version=1.0? dup prefixed-name [ ] [
[ [ get-char name-char? not ] curry take-until append ] dup valid-name?
[ assure-name ] bi ; [ <simple-name> ] [ bad-name ] if
] ?if ;
: parse-name-starting ( start -- name ) : take-name ( -- string )
(parse-name) get-char CHAR: : = version=1.0? '[ _ get-char name-char? not ] take-until ;
[ next "" (parse-name) ] [ "" swap ] if f <name> ;
: parse-name ( -- name ) : parse-name ( -- name )
"" parse-name-starting ; take-name interpret-name ;
: parse-name-starting ( string -- name )
take-name append interpret-name ;
: parse-simple-name ( -- name )
take-name <simple-name> ;
! -- Parsing strings ! -- Parsing strings
@ -99,11 +113,15 @@ SYMBOL: ns-stack
: assure-no-]]> ( circular -- ) : assure-no-]]> ( circular -- )
"]]>" sequence= [ text-w/]]> ] when ; "]]>" sequence= [ text-w/]]> ] when ;
: parse-text ( -- string ) :: parse-text ( -- string )
3 f <array> <circular> '[ 3 f <array> <circular> :> circ
_ [ push-circular ] depth get zero? :> no-text [| char |
[ nip assure-no-]]> ] char circ push-circular
[ drop CHAR: < = ] 2tri circ assure-no-]]>
no-text [ char blank? char CHAR: < = or [
char 1string t pre/post-content
] unless ] when
char CHAR: < =
] parse-char ; ] parse-char ;
! Parsing tags ! Parsing tags
@ -131,7 +149,7 @@ SYMBOL: ns-stack
[ dup "\t\r\n" member? [ drop CHAR: \s ] when ] map ; [ dup "\t\r\n" member? [ drop CHAR: \s ] when ] map ;
: parse-attr ( -- ) : parse-attr ( -- )
parse-name CHAR: = expect parse-name pass-blank CHAR: = expect pass-blank
t parse-quote* normalize-quot 2array , ; t parse-quote* normalize-quot 2array , ;
: (middle-tag) ( -- ) : (middle-tag) ( -- )
@ -148,9 +166,13 @@ SYMBOL: ns-stack
[ (middle-tag) ] f make pass-blank [ (middle-tag) ] f make pass-blank
assure-no-duplicates ; assure-no-duplicates ;
: close ( -- )
pass-blank CHAR: > expect ;
: end-tag ( name attrs-alist -- tag ) : end-tag ( name attrs-alist -- tag )
tag-ns pass-blank get-char CHAR: / = tag-ns pass-blank get-char CHAR: / =
[ pop-ns <contained> next ] [ <opener> ] if ; [ pop-ns <contained> next CHAR: > expect ]
[ depth inc <opener> close ] if ;
: take-comment ( -- comment ) : take-comment ( -- comment )
"--" expect-string "--" expect-string
@ -159,6 +181,7 @@ SYMBOL: ns-stack
CHAR: > expect ; CHAR: > expect ;
: take-cdata ( -- string ) : take-cdata ( -- string )
depth get zero? [ bad-cdata ] when
"[CDATA[" expect-string "]]>" take-string ; "[CDATA[" expect-string "]]>" take-string ;
: take-word ( -- string ) : take-word ( -- string )
@ -173,19 +196,17 @@ SYMBOL: ns-stack
: take-attlist-decl ( -- doctype-decl ) : take-attlist-decl ( -- doctype-decl )
take-decl-contents <attlist-decl> ; take-decl-contents <attlist-decl> ;
: take-notation-decl ( -- notation-decl )
take-decl-contents <notation-decl> ;
: take-until-one-of ( seps -- str sep ) : take-until-one-of ( seps -- str sep )
'[ get-char _ member? ] take-until get-char ; '[ get-char _ member? ] take-until get-char ;
: expect-> ( -- )
pass-blank CHAR: > expect ;
: take-system-id ( -- system-id ) : take-system-id ( -- system-id )
parse-quote <system-id> parse-quote <system-id> close ;
expect-> ;
: take-public-id ( -- public-id ) : take-public-id ( -- public-id )
parse-quote parse-quote <public-id> parse-quote parse-quote <public-id> close ;
expect-> ;
DEFER: direct DEFER: direct
@ -216,7 +237,7 @@ DEFER: direct
{ CHAR: \s [ { CHAR: \s [
pass-blank get-char CHAR: [ = [ pass-blank get-char CHAR: [ = [
next take-internal-subset f swap next take-internal-subset f swap
expect-> close
] [ ] [
" >" take-until-one-of { " >" take-until-one-of {
{ CHAR: \s [ (take-external-id) ] } { CHAR: \s [ (take-external-id) ] }
@ -235,21 +256,22 @@ DEFER: direct
} case ; } case ;
: associate-entity ( entity-name entity-def -- ) : associate-entity ( entity-name entity-def -- )
swap extra-entities [ ?set-at ] change ; swap extra-entities get set-at ;
: take-entity-decl ( -- entity-decl ) : take-entity-decl ( -- entity-decl )
pass-blank get-char { pass-blank get-char {
{ CHAR: % [ next pass-blank take-entity-def ] } { CHAR: % [ next pass-blank take-entity-def ] }
[ drop take-entity-def 2dup associate-entity ] [ drop take-entity-def 2dup associate-entity ]
} case } case
expect-> <entity-decl> ; close <entity-decl> ;
: take-directive ( -- directive ) : take-directive ( -- directive )
take-word { take-name {
{ "ELEMENT" [ take-element-decl ] } { "ELEMENT" [ take-element-decl ] }
{ "ATTLIST" [ take-attlist-decl ] } { "ATTLIST" [ take-attlist-decl ] }
{ "DOCTYPE" [ take-doctype-decl ] } { "DOCTYPE" [ take-doctype-decl ] }
{ "ENTITY" [ take-entity-decl ] } { "ENTITY" [ take-entity-decl ] }
{ "NOTATION" [ take-notation-decl ] }
[ bad-directive ] [ bad-directive ]
} case ; } case ;
@ -307,28 +329,27 @@ SYMBOL: string-input?
dup prolog-data set ; dup prolog-data set ;
: instruct ( -- instruction ) : instruct ( -- instruction )
"" (parse-name) dup "xml" = take-name {
[ drop parse-prolog ] [ { [ dup "xml" = ] [ drop parse-prolog ] }
dup >lower "xml" = { [ dup >lower "xml" = ] [ capitalized-prolog ] }
[ capitalized-prolog ] { [ dup valid-name? not ] [ bad-name ] }
[ "?>" take-string append <instruction> ] if [ "?>" take-string append <instruction> ]
] if ; } cond ;
: make-tag ( -- tag ) : make-tag ( -- tag )
{ {
{ [ get-char dup CHAR: ! = ] [ drop next direct ] } { [ get-char dup CHAR: ! = ] [ drop next direct ] }
{ [ CHAR: ? = ] [ next instruct ] } { [ CHAR: ? = ] [ next instruct ] }
[ [
start-tag [ dup add-ns pop-ns <closer> ] start-tag [ dup add-ns pop-ns <closer> depth dec close ]
[ middle-tag end-tag ] if [ middle-tag end-tag ] if
CHAR: > expect
] ]
} cond ; } cond ;
! Autodetecting encodings ! Autodetecting encodings
: continue-make-tag ( str -- tag ) : continue-make-tag ( str -- tag )
parse-name-starting middle-tag end-tag CHAR: > expect ; parse-name-starting middle-tag end-tag ;
: start-utf16le ( -- tag ) : start-utf16le ( -- tag )
utf16le decode-input-if utf16le decode-input-if

View File

@ -2,8 +2,8 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays io io.encodings.binary io.files USING: accessors arrays io io.encodings.binary io.files
io.streams.string kernel namespaces sequences xml.state-parser strings io.streams.string kernel namespaces sequences xml.state-parser strings
xml.backend xml.data xml.errors xml.tokenize ascii xml.backend xml.data xml.errors xml.tokenize ascii xml.entities
xml.writer ; xml.writer assocs ;
IN: xml IN: xml
! -- Overall parser with data tree ! -- Overall parser with data tree
@ -25,11 +25,6 @@ M: prolog process
xml-stack get V{ { f V{ } } } = xml-stack get V{ { f V{ } } } =
[ bad-prolog ] unless drop ; [ bad-prolog ] unless drop ;
M: instruction process
xml-stack get length 1 =
[ bad-instruction ] unless
add-child ;
M: directive process M: directive process
xml-stack get dup length 1 = xml-stack get dup length 1 =
swap first second [ tag? ] contains? not and swap first second [ tag? ] contains? not and
@ -53,7 +48,9 @@ M: closer process
<tag> add-child ; <tag> add-child ;
: init-xml-stack ( -- ) : init-xml-stack ( -- )
V{ } clone xml-stack set f push-xml ; V{ } clone xml-stack set
extra-entities [ H{ } assoc-like ] change
f push-xml ;
: default-prolog ( -- prolog ) : default-prolog ( -- prolog )
"1.0" "UTF-8" f <prolog> ; "1.0" "UTF-8" f <prolog> ;
@ -150,11 +147,12 @@ TUPLE: pull-xml scope ;
] state-parse ; ] state-parse ;
: read-xml ( stream -- xml ) : read-xml ( stream -- xml )
#! Produces a tree of XML nodes 0 depth
(read-xml-chunk) make-xml-doc ; [ (read-xml-chunk) make-xml-doc ] with-variable ;
: read-xml-chunk ( stream -- seq ) : read-xml-chunk ( stream -- seq )
(read-xml-chunk) nip ; 1 depth
[ (read-xml-chunk) nip ] with-variable ;
: string>xml ( string -- xml ) : string>xml ( string -- xml )
<string-reader> read-xml ; <string-reader> read-xml ;