xml: more docs and assorted refactoring

2006-12-09 04:11:30 +00:00 · 2006-12-09 04:11:30 +00:00 · 55d4b8d332
parent 5e9fce2437
commit 55d4b8d332
6 changed files with 165 additions and 84 deletions
--- a/libs/xml/parser.factor
+++ b/libs/xml/parser.factor
@ -1,9 +1,11 @@
-! --> Copyright (C) 2005, 2006 Daniel Ehrenberg
+! Copyright (C) 2005, 2006 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
 IN: xml
 USING: errors hashtables io kernel math namespaces prettyprint sequences
    arrays generic strings vectors char-classes ;

+! * Parsing tags
+
 TUPLE: opener name props ;
 TUPLE: closer name ;
 TUPLE: contained name props ;
@ -16,22 +18,6 @@ TUPLE: instruction text ;
    get-char CHAR: / = dup [ next ] when
    parse-name swap ;

-: (parse-quot) ( ch -- )
-    ! The similarities with (parse-text) should be factored out
-    get-char {
-        { [ dup not ]
-          [ "File ended in quote" <xml-string-error> throw ] }
-        { [ 2dup = ]
-          [ 2drop end-record , next ] }
-        { [ dup CHAR: & = ]
-          [ drop parse-entity (parse-quot) ] }
-        { [ CHAR: % = ] [ parse-reference (parse-quot) ] }
-        { [ t ] [ next (parse-quot) ] }
-    } cond ;
-
-: parse-quot ( ch -- array )
-   [ new-record (parse-quot) ] { } make ;
-
 : parse-prop-value ( -- seq )
    get-char dup "'\"" member? [
        next parse-quot
@ -46,7 +32,7 @@ TUPLE: instruction text ;
    swap set ;

 : (middle-tag) ( -- )
-    pass-blank get-char name-start-char?
+    pass-blank version=1.0? get-char name-start-char?
    [ parse-prop (middle-tag) ] when ;

 : middle-tag ( -- hash )
@ -77,7 +63,6 @@ TUPLE: instruction text ;
    "?>" take-string <instruction> ;

 : make-tag ( -- tag/f )
-    CHAR: < expect
    { { [ get-char dup CHAR: ! = ] [ drop next directive ] }
      { [ CHAR: ? = ] [ next instruction ] } 
      { [ t ] [
@ -95,9 +80,12 @@ C: tag ( name props children -- tag )
    [ set-tag-props ] keep
    [ set-delegate ] keep ;

-TUPLE: contained-tag ;
-C: contained-tag ( name props -- contained-tag )
-    [ >r { } <tag> r> set-delegate ] keep ;
+! tag with children=f is contained
+: <contained-tag> ( name props -- tag )
+    f <tag> ;
+
+PREDICATE: tag contained-tag tag-children not ;
+PREDICATE: tag open-tag tag-children ;

 ! A stack of { tag children } pairs
 SYMBOL: xml-stack
@ -273,7 +261,8 @@ M: bad-version error.
    init-xml-stack
    init-ns-stack ;

-UNION: any-tag tag contained-tag ;
+: init-xml-string ( string -- ) ! for debugging
+    <string-reader> init-xml ;

 TUPLE: notags ;
 M: notags error.
@ -284,26 +273,28 @@ M: multitags error.
    drop "XML document contains multiple main tags" print ;

 : make-xml-doc ( seq -- xml-doc )
-    prolog-data get swap dup [ any-tag? ] find
+    prolog-data get swap dup [ tag? ] find
    >r dup -1 = [ <notags> throw ] when
    swap cut 1 tail
-    dup [ any-tag? ] contains? [ <multitags> throw ] when r>
+    dup [ tag? ] contains? [ <multitags> throw ] when r>
    swap <xml-doc> ;

 : (read-xml) ( -- )
    parse-text [ add-child ] each
    get-char [ make-tag process (read-xml) ] when ;

+: (xml-chunk) ( stream -- seq )
+    init-xml parse-prolog (read-xml)
+    xml-stack get
+    dup length 1 = [ <unclosed> throw ] unless
+    first second ;
+
 : read-xml ( stream -- xml-doc )
    #! Produces a tree of XML nodes
-    [
-        init-xml
-        parse-prolog (read-xml)
-        xml-stack get
-        dup length 1 = [ <unclosed> throw ] unless
-        first second
-        make-xml-doc
-    ] with-scope ;
+    [ (xml-chunk) make-xml-doc ] with-scope ;
+
+: xml-chunk ( stream -- seq )
+    [ (xml-chunk) ] with-scope ;

 : string>xml ( string -- xml-doc )
    <string-reader> read-xml ;
--- a/libs/xml/test.factor
+++ b/libs/xml/test.factor
@ -19,7 +19,7 @@ SYMBOL: xml-file
 [ t ] [ xml-file get tag-children second contained-tag? ] unit-test
 [ t ] [ [ "<a></b>" string>xml ] catch xml-parse-error? ] unit-test
 [ "<?xml version=\"1.0\" encoding=\"iso-8859-1\" standalone=\"no\"?><a b=\"c\"/>" ]
-    [ "<a b='c'/>" xml-reprint ] unit-test
+    [ "<a b='c'/>" string>xml xml>string ] unit-test
 [ 32 ] [
    "<math><times><add><number>1</number><number>3</number></add><neg><number>-8</number></neg></times></math>"
    calc-arith
--- a/libs/xml/tokenizer.factor
+++ b/libs/xml/tokenizer.factor
@ -205,46 +205,60 @@ TUPLE: reference name ;
    next unrecord end-record , CHAR: ; take-char
    <reference> , next new-record ;

-: (parse-text) ( -- )
+: (parse-char) ( ch -- )
+    ! The similarities with (parse-text) should be factored out
    get-char {
        { [ dup not ]
-          [ drop 0 end-record* , ] }
-        { [ dup CHAR: < = ] [ drop end-record , ] }
+          [ 2drop 0 end-record* , ] }
+        { [ 2dup = ]
+          [ 2drop end-record , next ] }
        { [ dup CHAR: & = ]
-          [ drop parse-entity (parse-text) ] }
-        { [ CHAR: % = ]
-          [ parse-reference (parse-text) ] }
-        { [ t ] [ next (parse-text) ] }
+          [ drop parse-entity (parse-char) ] }
+        { [ CHAR: % = ] [ parse-reference (parse-char) ] }
+        { [ t ] [ next (parse-char) ] }
    } cond ;

-: parse-text ( -- array )
-   [ new-record (parse-text) ] { } make ;
+: parse-char ( ch -- array )
+    [ new-record (parse-char) ] { } make ;

-!   -- Parsing tags
+: parse-quot ( ch -- array )
+    parse-char get-char
+    [ "XML file ends in a quote" <xml-string-error> throw ] unless ;
+
+: parse-text ( -- array )
+    CHAR: < parse-char ;
+
+!   -- Parsing names

 TUPLE: name space tag url ;
-C: name ( space tag -- name )
-    [ set-name-tag ] keep
-    [ set-name-space ] keep ;

-: get-version ( -- string )
-    prolog-data get prolog-version ;
+: version=1.0? ( -- ? )
+    prolog-data get prolog-version "1.0" = ;

-: name-start-char? ( char -- ? )
-    get-version "1.0" =
-    [ 1.0name-start-char? ] [ 1.1name-start-char? ] if ;
+! version=1.0? is calculated once and passed around for efficiency
+: name-start-char? ( 1.0? char -- ? )
+    swap [ 1.0name-start-char? ] [ 1.1name-start-char? ] if ;

-: name-char? ( char -- ? )
-    get-version "1.0" =
-    [ 1.0name-char? ] [ 1.1name-char? ] if ;
+: name-char? ( 1.0? char -- ? )
+    swap [ 1.0name-char? ] [ 1.1name-char? ] if ;

 : (parse-name) ( -- str )
+    version=1.0? dup
    new-record get-char name-start-char? [
-        [ get-char name-char? not ] skip-until end-record
+        [ dup get-char name-char? not ] skip-until
+        drop end-record
    ] [
        "Malformed name" <xml-string-error> throw
    ] if ;

 : parse-name ( -- name )
    (parse-name) get-char CHAR: : =
-    [ next (parse-name) ] [ "" swap ] if <name> ;
+    [ next (parse-name) ] [ "" swap ] if f <name> ;
+
+: ?= ( object/f object/f -- ? )
+    2dup and [ = ] [ 2drop t ] if ;
+
+: names-match? ( name1 name2 -- ? )
+    [ name-space swap name-space ?= ] 2keep
+    [ name-url swap name-url ?= ] 2keep
+    name-tag swap name-tag ?= and and ;
--- a/libs/xml/utilities.factor
+++ b/libs/xml/utilities.factor
@ -34,7 +34,7 @@ M: process-missing error.
 ! * Common utility functions

 : build-tag* ( items name -- tag )
-    "" swap <name> "" over set-name-url
+    "" swap "" <name>
    swap >r H{ } r> <tag> ;

 : build-tag ( item name -- tag )
@ -47,10 +47,10 @@ M: process-missing error.
    tag-children [ string? ] subset concat ;

 : children-tags ( tag -- sequence )
-    tag-children [ any-tag? ] subset ;
+    tag-children [ tag? ] subset ;

 : first-child-tag ( tag -- tag )
-    tag-children [ any-tag? ] find nip ;
+    tag-children [ tag? ] find nip ;

 ! * Utilities for searching through XML documents
 ! These all work from the outside in, top to bottom.
@ -102,16 +102,16 @@ M: xml-doc (xml-find)
 : xml-find ( tag quot -- tag ) ! quot: tag -- ?
    swap (xml-find) ; inline

-: prop-name ( name-tag tag -- seq/f )
-    #! gets the property with the name-tag string specified
+: prop-name ( name tag -- seq/f )
+    #! gets the property with the first matching name
    tag-props [
-        hash-keys [ name-tag over = ] find
+        hash-keys [ over names-match? ] find
    ] keep hash 2nip ;

 : get-id ( tag id -- elem ) ! elem=tag.getElementById(id)
    swap [
-        dup any-tag? [
-            "id" swap prop-name
+        dup tag? [
+            T{ name f f "id" f } swap prop-name
            [ string? ] subset concat
            over =
        ] [ drop f ] if
--- a/libs/xml/writer.factor
+++ b/libs/xml/writer.factor
@ -43,7 +43,7 @@ M: contained-tag (xml>string)
    tag-props print-props
    "/>" write ;

-M: tag (xml>string)
+M: open-tag (xml>string)
    CHAR: < write1
    dup print-name
    dup tag-props print-props
@ -79,6 +79,6 @@ M: instruction (xml>string)
 : xml>string ( xml-doc -- string )
    [ write-xml ] string-out ;

-: xml-reprint ( string -- string )
-    string>xml xml>string ;
+: xml-reprint ( string -- )
+    string>xml print-xml ;

--- a/libs/xml/xml.facts
+++ b/libs/xml/xml.facts
@ -4,13 +4,13 @@ USING: help kernel xml ;

 HELP: string>xml
 { $values { "string" "a string" } { "xml-doc" "an xml document" } }
-{ $description "converts a string into an " { $snippet "xml-doc" }
+{ $description "converts a string into an " { $link xml-doc }
    " datatype for further processing" }
 { $see-also xml>string xml-reprint } ;

 HELP: xml>string
 { $values { "xml-doc" "an xml document" } { "string" "a string" } }
-{ $description "converts an xml document into a string" }
+{ $description "converts an xml document (" { $link xml-doc } ") into a string" }
 { $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" }
 { $see-also string>xml xml-reprint write-xml } ;

@ -18,14 +18,14 @@ HELP: xml-parse-error
 { $class-description "the exception class that all parsing errors in XML documents are in." } ;

 HELP: xml-reprint
-{ $values { "string" "a string of XML" } { "string" "reprinted XML" } }
-{ $description "parses XML and converts it back into a string, for testing purposes" }
+{ $values { "string" "a string of XML" } }
+{ $description "parses XML and prints it out again, for testing purposes" }
 { $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" }
 { $see-also write-xml xml>string string>xml } ;

 HELP: write-xml
 { $values { "xml-doc" "an XML document" } }
-{ $description "prints the contents of an XML document to stdio" }
+{ $description "prints the contents of an XML document (" { $link xml-doc } ") to stdio" }
 { $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" }
 { $see-also xml>string xml-reprint read-xml } ;

@ -68,12 +68,12 @@ HELP: xml-subset
 HELP: build-tag*
 { $values { "items" "sequence of elements" } { "name" "string" }
    { "tag" "an XML tag" } }
-{ $description "builds a tag with the specified name, in the namespace \"\" and URL \"\" containing the children listed in item" }
+{ $description "builds a " { $link tag } " with the specified name, in the namespace \"\" and URL \"\" containing the children listed in item" }
 { $see-also build-tag build-xml-doc } ;

 HELP: build-tag
 { $values { "item" "an element" } { "name" "string" } { "tag" "XML tag" } }
-{ $description "builds a tag with the specified name containing the single child item" }
+{ $description "builds a " { $link tag } " with the specified name containing the single child item" }
 { $see-also build-tag* build-xml-doc } ;

 HELP: build-xml-doc
@ -82,24 +82,24 @@ HELP: build-xml-doc
 { $see-also build-tag* build-tag } ;

 HELP: tag
-{ $class-description "tuple representing an XML tag, delegating to a name, containing the slots props (a hashtable) and children (a sequence)" }
+{ $class-description "tuple representing an XML tag, delegating to a " { $link name } ", containing the slots props (a hashtable) and children (a sequence)" }
 { $see-also <tag> name contained-tag xml-doc } ;

 HELP: <tag>
 { $values { "name" "an XML tag name" }
    { "props" "a hashtable of XML properties" }
    { "children" "a sequence" } }
-{ $description "constructs an XML tag, with the specified name (not a string) and tag properties specified in props, and children specified" }
+{ $description "constructs an XML " { $link tag } ", with the name (not a string) and tag properties specified in props, and children specified" }
 { $see-also tag <contained-tag> build-tag build-tag* } ;

 HELP: name
 { $class-description "represents an XML name, with the fields space (a string representing the namespace, as written in the document, tag (a string of the actual name of the tag) and url (a string of the URL that the namespace points to)" }
 { $see-also <name> tag } ;

-HELP: <name>
-{ $values { "space" "a string" } { "tag" "a string" }
+HELP: <name> ( space tag url -- name )
+{ $values { "space" "a string" } { "tag" "a string" } { "url" "a string" }
    { "name" "an XML tag name" } }
-{ $description "creates a name tuple with the name-space space and the tag-name tag. The namespace URL must be added later." }
+{ $description "creates a name tuple with the name-space space and the tag-name tag and the tag-url url." }
 { $see-also name <tag> } ;

 HELP: contained-tag
@ -169,6 +169,66 @@ HELP: <instruction> ( text -- instruction )
 { $description "creates an XML parsing instruction, such as <?xsl stylesheet='foo.xml'?>." }
 { $see-also instruction } ;

+HELP: names-match?
+{ $values { "name1" "a name" } { "name2" "a name" } { "?" "t or f" } }
+{ $description "checks to see if the two names match, that is, if all fields are equal, ignoring fields whose value is f in either name." }
+{ $example "T{ name f \"rpc\" \"methodCall\" f } T{ name f f \"methodCall\" \"http://www.xmlrpc.org/\" } names-match? ! => t" }
+{ $see-also name } ;
+
+HELP: xml-chunk
+{ $values { "stream" "an input stream" } { "seq" "a sequence of elements" } }
+{ $description "rather than parse a document, as " { $link read-xml } " does, this word parses and returns a sequence of XML elements (tags, strings, etc). This is useful for fragments of XML which may have more than one main tag." }
+{ $see-also read-xml } ;
+
+HELP: xml-find
+{ $values { "tag" "an XML element or document" } { "quot" "a quotation ( elem -- ? )" } { "tag" "an XML element which satisfies the predicate" } }
+{ $description "finds the first element in the XML document which satisfies the predicate, moving from the outermost element to the innermost, top-down" }
+{ $see-also xml-each xml-map get-id } ;
+
+HELP: get-id
+{ $values { "tag" "an XML tag or document" } { "id" "a string" } { "elem" "an XML element or f" } }
+{ $description "finds the XML tag with the specified id, ignoring the namespace" }
+{ $see-also xml-find } ;
+
+HELP: spot
+{ $var-description "This variable represents the location in the program. It is in the format { char line column line-str } where char is the current character, line is the line number, column is the column number, and line-str is the full contents of the line, as a string. The contents shouldn't be accessed directly but rather with the proxy words get-char set-char get-line etc." } ;
+
+HELP: new-record
+{ $description "puts the parser into recording mode, where all characters encountered are pushed into a string buffer to be extracted later" }
+{ $see-also end-record } ;
+
+HELP: end-record
+{ $values { "string" "a string" } }
+{ $description "takes the parser out of recording mode and outputs the string recorded" }
+{ $see-also new-record } ;
+
+HELP: skip-until
+{ $values { "quot" "a quotation ( -- ? )" } }
+{ $description "executes " { $link next } " until the quotation yields false. Usually, the quotation will call " { $link get-char } " in its test, but not always." }
+{ $see-also take-until } ;
+
+HELP: take-until
+{ $values { "quot" "a quotation ( -- ? )" } { "string" "a string" } }
+{ $description "like " { $link skip-until } " but records what it passes over and outputs the string." }
+{ $see-also skip-until take-char take-string } ;
+
+HELP: take-char
+{ $values { "char" "a character" } { "string" "a string" } }
+{ $description "records the document from the current spot to the first instance of the given character. Outputs the content between those two points." }
+{ $see-also take-until take-string } ;
+
+HELP: take-string
+{ $values { "match" "a string to match" } { "string" "the portion of the XML document" } }
+{ $description "records the document from the current spot to the first instance of the given character. Outputs the content between those two points." }
+{ $notes "match may not contain a newline" } ;
+
+HELP: next
+{ $description "originally written as " { $code "spot inc" } ", code that would no longer run, this word moves the state of the XML parser to the next place in the source file, keeping track of appropriate debugging information." } ;
+
+HELP: process
+{ $values { "object" "an opener, closer, contained or text element" } }
+{ $description { "takes an XML event and, using the XML stack, processes it and adds it to the tree" } } ;
+
 ARTICLE: { "xml" "intro" } "XML"
    "The XML module attempts to implement the XML 1.1 standard, converting strings of text into XML and vice versa. It currently is a work in progress. Together with XML-RPC, this is a component of the F2EE framework."
    $terpri
@ -176,7 +236,8 @@ ARTICLE: { "xml" "intro" } "XML"
    { $subsection { "xml" "basic" } }
    { $subsection { "xml" "classes" } }
    { $subsection { "xml" "construct" } }
-    { $subsection { "xml" "utils" } } ;
+    { $subsection { "xml" "utils" } }
+    { $subsection { "xml" "internal" } } ;

 ARTICLE: { "xml" "basic" } "Basic words for XML processing"
    "These are the most basic words needed for processing an XML document"
@ -185,8 +246,7 @@ ARTICLE: { "xml" "basic" } "Basic words for XML processing"
    { $subsection xml-parse-error }
    { $subsection xml-reprint }
    { $subsection write-xml }
-    { $subsection read-xml }
-    { $subsection init-xml } ;
+    { $subsection read-xml } ;

 ARTICLE: { "xml" "classes" } "XML data classes"
    "Data types that XML documents are made of:"
@ -222,7 +282,23 @@ ARTICLE: { "xml" "utils" } "XML processing utilities"
    { $subsection xml-each }
    { $subsection xml-map }
    { $subsection xml-subset }
+    { $subsection xml-find }
+    { $subsection get-id }
    "Words for simplified generation of XML:"
    { $subsection build-tag* }
    { $subsection build-tag }
-    { $subsection build-xml-doc } ;
+    { $subsection build-xml-doc }
+    "Other relevant words:"
+    { $subsection names-match? } ;
+
+ARTICLE: { "xml" "internal" } "Internals of the XML parser"
+    "The XML parser creates its own parsing framework to process XML documents. The parser operates on streams. Important words involved in processing are:"
+    { $subsection spot }
+    { $subsection new-record }
+    { $subsection end-record }
+    { $subsection skip-until }
+    { $subsection take-until }
+    { $subsection take-char }
+    { $subsection take-string }
+    { $subsection next }
+    { $subsection process } ; ! should I have more? less?