Going further towards conformance

2009-01-20 15:37:21 -06:00 · 2009-01-20 15:37:21 -06:00 · 67dd4ca4a4
parent 8f44f5e4b3
commit 67dd4ca4a4
7 changed files with 115 additions and 82 deletions
--- a/basis/xml/char-classes/char-classes.factor
+++ b/basis/xml/char-classes/char-classes.factor
@ -3,16 +3,16 @@
 USING: kernel sequences unicode.syntax math math.order combinators ;
 IN: xml.char-classes

-CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_ ;
+CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_: ;
 : 1.0name-start? ( char -- ? )
    dup 1.0name-start*? [ drop t ] 
    [ HEX: 2BB HEX: 2C1 between? ] if ;

-CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387 ;
+CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387: ;

-CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _ ;
+CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _: ;

-CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7 ;
+CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ;

 : name-start? ( 1.0? char -- ? )
    swap [ 1.0name-start? ] [ 1.1name-start? ] if ;
--- a/basis/xml/data/data.factor
+++ b/basis/xml/data/data.factor
@ -17,10 +17,13 @@ C: <name> name
    [ [ main>> ] bi@ ?= ] 2tri and and ;

 : <simple-name> ( string -- name )
+    "" swap f <name> ;
+
+: <null-name> ( string -- name )
    f swap f <name> ;

 : assure-name ( string/name -- name )
-    dup name? [ <simple-name> ] unless ;
+    dup name? [ <null-name> ] unless ;

 TUPLE: opener name attrs ;
 C: <opener> opener
@ -54,6 +57,9 @@ C: <public-id> public-id
 TUPLE: doctype-decl < directive name external-id internal-subset ;
 C: <doctype-decl> doctype-decl

+TUPLE: notation-decl < directive name id ;
+C: <notation-decl> notation-decl
+
 TUPLE: instruction text ;
 C: <instruction> instruction

--- a/basis/xml/errors/errors-tests.factor
+++ b/basis/xml/errors/errors-tests.factor
@ -25,11 +25,12 @@ T{ capitalized-prolog f 1 6 "XmL" } "<?XmL version='1.0'?><x/>"
    xml-error-test
 T{ pre/post-content f "x" t } "x<y/>" xml-error-test
 T{ versionless-prolog f 1 8 } "<?xml?><x/>" xml-error-test
-T{ bad-instruction f 1 11 T{ instruction f "xsl" } }
-    "<x><?xsl?></x>" xml-error-test
 T{ unclosed-quote f 1 13 } "<x value='/>" xml-error-test
 T{ bad-name f 1 3 "-" } "<-/>" xml-error-test
 T{ quoteless-attr f 1 10 } "<x value=3/>" xml-error-test
 T{ attr-w/< f 1 11 } "<x value='<'/>" xml-error-test
 T{ text-w/]]> f 1 6 } "<x>]]></x>" xml-error-test
 T{ duplicate-attr f 1 21 T{ name { space "" } { main "this" } } V{ "a" "b" } } "<x this='a' this='b'/>" xml-error-test
+T{ bad-cdata f 1 3 } "<![CDATA[]]><x/>" xml-error-test
+T{ bad-cdata f 1 7 } "<x/><![CDATA[]]>" xml-error-test
+T{ pre/post-content f "&" t } "&32;<x/>" xml-error-test
--- a/basis/xml/errors/errors.factor
+++ b/basis/xml/errors/errors.factor
@ -170,18 +170,6 @@ M: versionless-prolog summary ( obj -- str )
        "XML prolog lacks a version declaration" print
    ] with-string-writer ;

-TUPLE: bad-instruction < parsing-error instruction ;
-
-: bad-instruction ( instruction -- * )
-    \ bad-instruction parsing-error swap >>instruction throw ;
-
-M: bad-instruction summary ( obj -- str )
-    [
-        dup call-next-method write
-        "Misplaced processor instruction:" print
-        instruction>> write-xml-chunk nl
-    ] with-string-writer ;
-
 TUPLE: bad-directive < parsing-error dir ;

 : bad-directive ( directive -- * )
@ -286,9 +274,17 @@ TUPLE: duplicate-attr < parsing-error key values ;
 M: duplicate-attr summary
    call-next-method "\nDuplicate attribute" append ;

+TUPLE: bad-cdata < parsing-error ;
+
+: bad-cdata ( -- * )
+    \ bad-cdata parsing-error throw ;
+
+M: bad-cdata summary
+    call-next-method "\nCDATA occurs before or after main tag" append ;
+
 UNION: xml-parse-error
    multitags notags extra-attrs nonexist-ns bad-decl
    not-yes/no unclosed mismatched expected no-entity
-    bad-prolog versionless-prolog capitalized-prolog bad-instruction
+    bad-prolog versionless-prolog capitalized-prolog
    bad-directive bad-name unclosed-quote quoteless-attr
    attr-w/< text-w/]]> duplicate-attr ;
--- a/basis/xml/tests/xmltest.factor
+++ b/basis/xml/tests/xmltest.factor
@ -1,12 +1,12 @@
 USING: accessors assocs combinators continuations fry generalizations
 io.pathnames kernel macros sequences stack-checker tools.test xml
-xml.utilities xml.writer ;
+xml.utilities xml.writer arrays ;
 IN: xml.tests.suite

-TUPLE: test id uri sections description type ;
+TUPLE: xml-test id uri sections description type ;

-: >test ( tag -- test )
-    test new swap {
+: >xml-test ( tag -- test )
+    xml-test new swap {
        [ "TYPE" swap at >>type ]
        [ "ID" swap at >>id ]
        [ "URI" swap at >>uri ]
@ -15,7 +15,7 @@ TUPLE: test id uri sections description type ;
    } cleave ;

 : parse-tests ( xml -- tests )
-    "TEST" tags-named [ >test ] map ;
+    "TEST" tags-named [ >xml-test ] map ;

 : base "resource:basis/xml/tests/xmltest/" ;

@ -32,11 +32,22 @@ MACRO: drop-input ( quot -- newquot )
 : well-formed? ( uri -- answer )
    [ file>xml ] fails? "not-wf" "valid" ? ;

-: run-test ( test -- )
+: test-quots ( test -- result quot )
    [ type>> '[ _ ] ]
-    [ '[ _ uri>> base swap append-path well-formed? ] ] bi
-    unit-test ;
+    [ '[ _ uri>> base swap append-path well-formed? ] ] bi ;

-: run-tests ( -- )
+: xml-tests ( -- tests )
    base "xmltest.xml" append-path file>xml
-    parse-tests [ run-test ] each ;
+    parse-tests [ test-quots 2array ] map ;
+
+: run-xml-tests ( -- )
+    xml-tests [ unit-test ] assoc-each ;
+
+: works? ( result quot -- ? )
+    [ first ] [ call ] bi* = ;
+
+: partition-xml-tests ( -- successes failures )
+    xml-tests [ first2 works? ] partition ;
+
+: failing-valids ( -- tests )
+    partition-xml-tests nip [ second first ] map [ type>> "valid" = ] filter ;
--- a/basis/xml/tokenize/tokenize.factor
+++ b/basis/xml/tokenize/tokenize.factor
@ -13,6 +13,8 @@ IN: xml.tokenize
 ! A stack of hashtables
 SYMBOL: ns-stack

+SYMBOL: depth
+
 : attrs>ns ( attrs-alist -- hash )
    ! this should check to make sure URIs are valid
    [
@ -50,25 +52,37 @@ SYMBOL: ns-stack

 ! Parsing names

-! version=1.0? is calculated once and passed around for efficiency
+: valid-name? ( str -- ? )
+    [ f ] [
+        version=1.0? swap {
+            [ first name-start? ]
+            [ rest-slice [ name-char? ] with all? ]
+        } 2&&
+    ] if-empty ;

-: assure-name ( str version=1.0? -- str )
-    over {
-        [ first name-start? ]
-        [ rest-slice [ name-char? ] with all? ]
-    } 2&& [ bad-name ] unless ;
+: prefixed-name ( str -- name/f )
+    ":" split dup length 2 = [
+        [ [ valid-name? ] all? ]
+        [ first2 f <name> ] bi and
+    ] [ drop f ] if ;

-: (parse-name) ( start -- str )
-    version=1.0?
-    [ [ get-char name-char? not ] curry take-until append ]
-    [ assure-name ] bi ;
+: interpret-name ( str -- name )
+    dup prefixed-name [ ] [
+        dup valid-name?
+        [ <simple-name> ] [ bad-name ] if
+    ] ?if ;

-: parse-name-starting ( start -- name )
-    (parse-name) get-char CHAR: : =
-    [ next "" (parse-name) ] [ "" swap ] if f <name> ;
+: take-name ( -- string )
+    version=1.0? '[ _ get-char name-char? not ] take-until ;

 : parse-name ( -- name )
-    "" parse-name-starting ;
+    take-name interpret-name ;
+
+: parse-name-starting ( string -- name )
+    take-name append interpret-name ;
+
+: parse-simple-name ( -- name )
+    take-name <simple-name> ;

 !   -- Parsing strings

@ -99,11 +113,15 @@ SYMBOL: ns-stack
 : assure-no-]]> ( circular -- )
    "]]>" sequence= [ text-w/]]> ] when ;

-: parse-text ( -- string )
-    3 f <array> <circular> '[
-        _ [ push-circular ]
-        [ nip assure-no-]]> ]
-        [ drop CHAR: < = ] 2tri
+:: parse-text ( -- string )
+    3 f <array> <circular> :> circ
+    depth get zero? :> no-text [| char |
+        char circ push-circular
+        circ assure-no-]]>
+        no-text [ char blank? char CHAR: < = or [
+            char 1string t pre/post-content
+        ] unless ] when
+        char CHAR: < =
    ] parse-char ;

 ! Parsing tags
@ -131,7 +149,7 @@ SYMBOL: ns-stack
    [ dup "\t\r\n" member? [ drop CHAR: \s ] when ] map ;

 : parse-attr ( -- )
-    parse-name CHAR: = expect
+    parse-name pass-blank CHAR: = expect pass-blank
    t parse-quote* normalize-quot 2array , ;

 : (middle-tag) ( -- )
@ -148,9 +166,13 @@ SYMBOL: ns-stack
    [ (middle-tag) ] f make pass-blank
    assure-no-duplicates ;

+: close ( -- )
+    pass-blank CHAR: > expect ;
+
 : end-tag ( name attrs-alist -- tag )
    tag-ns pass-blank get-char CHAR: / =
-    [ pop-ns <contained> next ] [ <opener> ] if ;
+    [ pop-ns <contained> next CHAR: > expect ]
+    [ depth inc <opener> close ] if ;

 : take-comment ( -- comment )
    "--" expect-string
@ -159,6 +181,7 @@ SYMBOL: ns-stack
    CHAR: > expect ;

 : take-cdata ( -- string )
+    depth get zero? [ bad-cdata ] when
    "[CDATA[" expect-string "]]>" take-string ;

 : take-word ( -- string )
@ -173,19 +196,17 @@ SYMBOL: ns-stack
 : take-attlist-decl ( -- doctype-decl )
    take-decl-contents <attlist-decl> ;

+: take-notation-decl ( -- notation-decl )
+    take-decl-contents <notation-decl> ; 
+
 : take-until-one-of ( seps -- str sep )
    '[ get-char _ member? ] take-until get-char ;

-: expect-> ( -- )
-    pass-blank CHAR: > expect ;
-
 : take-system-id ( -- system-id )
-    parse-quote <system-id>
-    expect-> ;
+    parse-quote <system-id> close ;

 : take-public-id ( -- public-id )
-    parse-quote parse-quote <public-id>
-    expect-> ;
+    parse-quote parse-quote <public-id> close ;

 DEFER: direct

@ -216,7 +237,7 @@ DEFER: direct
        { CHAR: \s [
            pass-blank get-char CHAR: [ = [
                next take-internal-subset f swap
-                expect->
+                close
            ] [
                " >" take-until-one-of {
                    { CHAR: \s [ (take-external-id) ] }
@ -235,21 +256,22 @@ DEFER: direct
    } case ;

 : associate-entity ( entity-name entity-def -- )
-    swap extra-entities [ ?set-at ] change ;
+    swap extra-entities get set-at ;

 : take-entity-decl ( -- entity-decl )
    pass-blank get-char {
        { CHAR: % [ next pass-blank take-entity-def ] }
        [ drop take-entity-def 2dup associate-entity ]
    } case
-    expect-> <entity-decl> ;
+    close <entity-decl> ;

 : take-directive ( -- directive )
-    take-word {
+    take-name {
        { "ELEMENT" [ take-element-decl ] }
        { "ATTLIST" [ take-attlist-decl ] }
        { "DOCTYPE" [ take-doctype-decl ] }
        { "ENTITY" [ take-entity-decl ] }
+        { "NOTATION" [ take-notation-decl ] }
        [ bad-directive ]
    } case ;

@ -307,28 +329,27 @@ SYMBOL: string-input?
    dup prolog-data set ;

 : instruct ( -- instruction )
-    "" (parse-name) dup "xml" =
-    [ drop parse-prolog ] [
-        dup >lower "xml" =
-        [ capitalized-prolog ]
-        [ "?>" take-string append <instruction> ] if
-    ] if ;
+    take-name {
+        { [ dup "xml" = ] [ drop parse-prolog ] }
+        { [ dup >lower "xml" = ] [ capitalized-prolog ] }
+        { [ dup valid-name? not ] [ bad-name ] }
+        [ "?>" take-string append <instruction> ]
+    } cond ;

 : make-tag ( -- tag )
    {
        { [ get-char dup CHAR: ! = ] [ drop next direct ] }
        { [ CHAR: ? = ] [ next instruct ] }
        [
-            start-tag [ dup add-ns pop-ns <closer> ]
+            start-tag [ dup add-ns pop-ns <closer> depth dec close ]
            [ middle-tag end-tag ] if
-            CHAR: > expect
        ]
    } cond ;

 ! Autodetecting encodings

 : continue-make-tag ( str -- tag )
-    parse-name-starting middle-tag end-tag CHAR: > expect ;
+    parse-name-starting middle-tag end-tag ;

 : start-utf16le ( -- tag )
    utf16le decode-input-if
--- a/basis/xml/xml.factor
+++ b/basis/xml/xml.factor
@ -2,8 +2,8 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays io io.encodings.binary io.files
 io.streams.string kernel namespaces sequences xml.state-parser strings
-xml.backend xml.data xml.errors xml.tokenize ascii
-xml.writer ;
+xml.backend xml.data xml.errors xml.tokenize ascii xml.entities
+xml.writer assocs ;
 IN: xml

 !   -- Overall parser with data tree
@ -25,11 +25,6 @@ M: prolog process
    xml-stack get V{ { f V{ } } } =
    [ bad-prolog ] unless drop ;

-M: instruction process
-    xml-stack get length 1 =
-    [ bad-instruction ] unless
-    add-child ;
-
 M: directive process
    xml-stack get dup length 1 =
    swap first second [ tag? ] contains? not and
@ -53,7 +48,9 @@ M: closer process
    <tag> add-child ;

 : init-xml-stack ( -- )
-    V{ } clone xml-stack set f push-xml ;
+    V{ } clone xml-stack set
+    extra-entities [ H{ } assoc-like ] change
+    f push-xml ;

 : default-prolog ( -- prolog )
    "1.0" "UTF-8" f <prolog> ;
@ -150,11 +147,12 @@ TUPLE: pull-xml scope ;
    ] state-parse ;

 : read-xml ( stream -- xml )
-    #! Produces a tree of XML nodes
-    (read-xml-chunk) make-xml-doc ;
+    0 depth
+    [ (read-xml-chunk) make-xml-doc ] with-variable ;

 : read-xml-chunk ( stream -- seq )
-    (read-xml-chunk) nip ;
+    1 depth
+    [ (read-xml-chunk) nip ] with-variable ;

 : string>xml ( string -- xml )
    <string-reader> read-xml ;