Going further towards conformance

db4
Daniel Ehrenberg 2009-01-20 15:37:21 -06:00
parent 8f44f5e4b3
commit 67dd4ca4a4
7 changed files with 115 additions and 82 deletions

View File

@ -3,16 +3,16 @@
USING: kernel sequences unicode.syntax math math.order combinators ;
IN: xml.char-classes
CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_ ;
CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_: ;
: 1.0name-start? ( char -- ? )
dup 1.0name-start*? [ drop t ]
[ HEX: 2BB HEX: 2C1 between? ] if ;
CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387 ;
CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387: ;
CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _ ;
CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _: ;
CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7 ;
CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ;
: name-start? ( 1.0? char -- ? )
swap [ 1.0name-start? ] [ 1.1name-start? ] if ;

View File

@ -17,10 +17,13 @@ C: <name> name
[ [ main>> ] bi@ ?= ] 2tri and and ;
: <simple-name> ( string -- name )
"" swap f <name> ;
: <null-name> ( string -- name )
f swap f <name> ;
: assure-name ( string/name -- name )
dup name? [ <simple-name> ] unless ;
dup name? [ <null-name> ] unless ;
TUPLE: opener name attrs ;
C: <opener> opener
@ -54,6 +57,9 @@ C: <public-id> public-id
TUPLE: doctype-decl < directive name external-id internal-subset ;
C: <doctype-decl> doctype-decl
TUPLE: notation-decl < directive name id ;
C: <notation-decl> notation-decl
TUPLE: instruction text ;
C: <instruction> instruction

View File

@ -25,11 +25,12 @@ T{ capitalized-prolog f 1 6 "XmL" } "<?XmL version='1.0'?><x/>"
xml-error-test
T{ pre/post-content f "x" t } "x<y/>" xml-error-test
T{ versionless-prolog f 1 8 } "<?xml?><x/>" xml-error-test
T{ bad-instruction f 1 11 T{ instruction f "xsl" } }
"<x><?xsl?></x>" xml-error-test
T{ unclosed-quote f 1 13 } "<x value='/>" xml-error-test
T{ bad-name f 1 3 "-" } "<-/>" xml-error-test
T{ quoteless-attr f 1 10 } "<x value=3/>" xml-error-test
T{ attr-w/< f 1 11 } "<x value='<'/>" xml-error-test
T{ text-w/]]> f 1 6 } "<x>]]></x>" xml-error-test
T{ duplicate-attr f 1 21 T{ name { space "" } { main "this" } } V{ "a" "b" } } "<x this='a' this='b'/>" xml-error-test
T{ bad-cdata f 1 3 } "<![CDATA[]]><x/>" xml-error-test
T{ bad-cdata f 1 7 } "<x/><![CDATA[]]>" xml-error-test
T{ pre/post-content f "&" t } "&32;<x/>" xml-error-test

View File

@ -170,18 +170,6 @@ M: versionless-prolog summary ( obj -- str )
"XML prolog lacks a version declaration" print
] with-string-writer ;
TUPLE: bad-instruction < parsing-error instruction ;
: bad-instruction ( instruction -- * )
\ bad-instruction parsing-error swap >>instruction throw ;
M: bad-instruction summary ( obj -- str )
[
dup call-next-method write
"Misplaced processor instruction:" print
instruction>> write-xml-chunk nl
] with-string-writer ;
TUPLE: bad-directive < parsing-error dir ;
: bad-directive ( directive -- * )
@ -286,9 +274,17 @@ TUPLE: duplicate-attr < parsing-error key values ;
M: duplicate-attr summary
call-next-method "\nDuplicate attribute" append ;
TUPLE: bad-cdata < parsing-error ;
: bad-cdata ( -- * )
\ bad-cdata parsing-error throw ;
M: bad-cdata summary
call-next-method "\nCDATA occurs before or after main tag" append ;
UNION: xml-parse-error
multitags notags extra-attrs nonexist-ns bad-decl
not-yes/no unclosed mismatched expected no-entity
bad-prolog versionless-prolog capitalized-prolog bad-instruction
bad-prolog versionless-prolog capitalized-prolog
bad-directive bad-name unclosed-quote quoteless-attr
attr-w/< text-w/]]> duplicate-attr ;

View File

@ -1,12 +1,12 @@
USING: accessors assocs combinators continuations fry generalizations
io.pathnames kernel macros sequences stack-checker tools.test xml
xml.utilities xml.writer ;
xml.utilities xml.writer arrays ;
IN: xml.tests.suite
TUPLE: test id uri sections description type ;
TUPLE: xml-test id uri sections description type ;
: >test ( tag -- test )
test new swap {
: >xml-test ( tag -- test )
xml-test new swap {
[ "TYPE" swap at >>type ]
[ "ID" swap at >>id ]
[ "URI" swap at >>uri ]
@ -15,7 +15,7 @@ TUPLE: test id uri sections description type ;
} cleave ;
: parse-tests ( xml -- tests )
"TEST" tags-named [ >test ] map ;
"TEST" tags-named [ >xml-test ] map ;
: base "resource:basis/xml/tests/xmltest/" ;
@ -32,11 +32,22 @@ MACRO: drop-input ( quot -- newquot )
: well-formed? ( uri -- answer )
[ file>xml ] fails? "not-wf" "valid" ? ;
: run-test ( test -- )
: test-quots ( test -- result quot )
[ type>> '[ _ ] ]
[ '[ _ uri>> base swap append-path well-formed? ] ] bi
unit-test ;
[ '[ _ uri>> base swap append-path well-formed? ] ] bi ;
: run-tests ( -- )
: xml-tests ( -- tests )
base "xmltest.xml" append-path file>xml
parse-tests [ run-test ] each ;
parse-tests [ test-quots 2array ] map ;
: run-xml-tests ( -- )
xml-tests [ unit-test ] assoc-each ;
: works? ( result quot -- ? )
[ first ] [ call ] bi* = ;
: partition-xml-tests ( -- successes failures )
xml-tests [ first2 works? ] partition ;
: failing-valids ( -- tests )
partition-xml-tests nip [ second first ] map [ type>> "valid" = ] filter ;

View File

@ -13,6 +13,8 @@ IN: xml.tokenize
! A stack of hashtables
SYMBOL: ns-stack
SYMBOL: depth
: attrs>ns ( attrs-alist -- hash )
! this should check to make sure URIs are valid
[
@ -50,25 +52,37 @@ SYMBOL: ns-stack
! Parsing names
! version=1.0? is calculated once and passed around for efficiency
: valid-name? ( str -- ? )
[ f ] [
version=1.0? swap {
[ first name-start? ]
[ rest-slice [ name-char? ] with all? ]
} 2&&
] if-empty ;
: assure-name ( str version=1.0? -- str )
over {
[ first name-start? ]
[ rest-slice [ name-char? ] with all? ]
} 2&& [ bad-name ] unless ;
: prefixed-name ( str -- name/f )
":" split dup length 2 = [
[ [ valid-name? ] all? ]
[ first2 f <name> ] bi and
] [ drop f ] if ;
: (parse-name) ( start -- str )
version=1.0?
[ [ get-char name-char? not ] curry take-until append ]
[ assure-name ] bi ;
: interpret-name ( str -- name )
dup prefixed-name [ ] [
dup valid-name?
[ <simple-name> ] [ bad-name ] if
] ?if ;
: parse-name-starting ( start -- name )
(parse-name) get-char CHAR: : =
[ next "" (parse-name) ] [ "" swap ] if f <name> ;
: take-name ( -- string )
version=1.0? '[ _ get-char name-char? not ] take-until ;
: parse-name ( -- name )
"" parse-name-starting ;
take-name interpret-name ;
: parse-name-starting ( string -- name )
take-name append interpret-name ;
: parse-simple-name ( -- name )
take-name <simple-name> ;
! -- Parsing strings
@ -99,11 +113,15 @@ SYMBOL: ns-stack
: assure-no-]]> ( circular -- )
"]]>" sequence= [ text-w/]]> ] when ;
: parse-text ( -- string )
3 f <array> <circular> '[
_ [ push-circular ]
[ nip assure-no-]]> ]
[ drop CHAR: < = ] 2tri
:: parse-text ( -- string )
3 f <array> <circular> :> circ
depth get zero? :> no-text [| char |
char circ push-circular
circ assure-no-]]>
no-text [ char blank? char CHAR: < = or [
char 1string t pre/post-content
] unless ] when
char CHAR: < =
] parse-char ;
! Parsing tags
@ -131,7 +149,7 @@ SYMBOL: ns-stack
[ dup "\t\r\n" member? [ drop CHAR: \s ] when ] map ;
: parse-attr ( -- )
parse-name CHAR: = expect
parse-name pass-blank CHAR: = expect pass-blank
t parse-quote* normalize-quot 2array , ;
: (middle-tag) ( -- )
@ -148,9 +166,13 @@ SYMBOL: ns-stack
[ (middle-tag) ] f make pass-blank
assure-no-duplicates ;
: close ( -- )
pass-blank CHAR: > expect ;
: end-tag ( name attrs-alist -- tag )
tag-ns pass-blank get-char CHAR: / =
[ pop-ns <contained> next ] [ <opener> ] if ;
[ pop-ns <contained> next CHAR: > expect ]
[ depth inc <opener> close ] if ;
: take-comment ( -- comment )
"--" expect-string
@ -159,6 +181,7 @@ SYMBOL: ns-stack
CHAR: > expect ;
: take-cdata ( -- string )
depth get zero? [ bad-cdata ] when
"[CDATA[" expect-string "]]>" take-string ;
: take-word ( -- string )
@ -173,19 +196,17 @@ SYMBOL: ns-stack
: take-attlist-decl ( -- doctype-decl )
take-decl-contents <attlist-decl> ;
: take-notation-decl ( -- notation-decl )
take-decl-contents <notation-decl> ;
: take-until-one-of ( seps -- str sep )
'[ get-char _ member? ] take-until get-char ;
: expect-> ( -- )
pass-blank CHAR: > expect ;
: take-system-id ( -- system-id )
parse-quote <system-id>
expect-> ;
parse-quote <system-id> close ;
: take-public-id ( -- public-id )
parse-quote parse-quote <public-id>
expect-> ;
parse-quote parse-quote <public-id> close ;
DEFER: direct
@ -216,7 +237,7 @@ DEFER: direct
{ CHAR: \s [
pass-blank get-char CHAR: [ = [
next take-internal-subset f swap
expect->
close
] [
" >" take-until-one-of {
{ CHAR: \s [ (take-external-id) ] }
@ -235,21 +256,22 @@ DEFER: direct
} case ;
: associate-entity ( entity-name entity-def -- )
swap extra-entities [ ?set-at ] change ;
swap extra-entities get set-at ;
: take-entity-decl ( -- entity-decl )
pass-blank get-char {
{ CHAR: % [ next pass-blank take-entity-def ] }
[ drop take-entity-def 2dup associate-entity ]
} case
expect-> <entity-decl> ;
close <entity-decl> ;
: take-directive ( -- directive )
take-word {
take-name {
{ "ELEMENT" [ take-element-decl ] }
{ "ATTLIST" [ take-attlist-decl ] }
{ "DOCTYPE" [ take-doctype-decl ] }
{ "ENTITY" [ take-entity-decl ] }
{ "NOTATION" [ take-notation-decl ] }
[ bad-directive ]
} case ;
@ -307,28 +329,27 @@ SYMBOL: string-input?
dup prolog-data set ;
: instruct ( -- instruction )
"" (parse-name) dup "xml" =
[ drop parse-prolog ] [
dup >lower "xml" =
[ capitalized-prolog ]
[ "?>" take-string append <instruction> ] if
] if ;
take-name {
{ [ dup "xml" = ] [ drop parse-prolog ] }
{ [ dup >lower "xml" = ] [ capitalized-prolog ] }
{ [ dup valid-name? not ] [ bad-name ] }
[ "?>" take-string append <instruction> ]
} cond ;
: make-tag ( -- tag )
{
{ [ get-char dup CHAR: ! = ] [ drop next direct ] }
{ [ CHAR: ? = ] [ next instruct ] }
[
start-tag [ dup add-ns pop-ns <closer> ]
start-tag [ dup add-ns pop-ns <closer> depth dec close ]
[ middle-tag end-tag ] if
CHAR: > expect
]
} cond ;
! Autodetecting encodings
: continue-make-tag ( str -- tag )
parse-name-starting middle-tag end-tag CHAR: > expect ;
parse-name-starting middle-tag end-tag ;
: start-utf16le ( -- tag )
utf16le decode-input-if

View File

@ -2,8 +2,8 @@
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays io io.encodings.binary io.files
io.streams.string kernel namespaces sequences xml.state-parser strings
xml.backend xml.data xml.errors xml.tokenize ascii
xml.writer ;
xml.backend xml.data xml.errors xml.tokenize ascii xml.entities
xml.writer assocs ;
IN: xml
! -- Overall parser with data tree
@ -25,11 +25,6 @@ M: prolog process
xml-stack get V{ { f V{ } } } =
[ bad-prolog ] unless drop ;
M: instruction process
xml-stack get length 1 =
[ bad-instruction ] unless
add-child ;
M: directive process
xml-stack get dup length 1 =
swap first second [ tag? ] contains? not and
@ -53,7 +48,9 @@ M: closer process
<tag> add-child ;
: init-xml-stack ( -- )
V{ } clone xml-stack set f push-xml ;
V{ } clone xml-stack set
extra-entities [ H{ } assoc-like ] change
f push-xml ;
: default-prolog ( -- prolog )
"1.0" "UTF-8" f <prolog> ;
@ -150,11 +147,12 @@ TUPLE: pull-xml scope ;
] state-parse ;
: read-xml ( stream -- xml )
#! Produces a tree of XML nodes
(read-xml-chunk) make-xml-doc ;
0 depth
[ (read-xml-chunk) make-xml-doc ] with-variable ;
: read-xml-chunk ( stream -- seq )
(read-xml-chunk) nip ;
1 depth
[ (read-xml-chunk) nip ] with-variable ;
: string>xml ( string -- xml )
<string-reader> read-xml ;