From 725ed93b35a5bfef84c51a46b49931cfd7447cac Mon Sep 17 00:00:00 2001
From: microdan <microdan@gmail.com>
Date: Thu, 14 Dec 2006 23:42:52 +0000
Subject: [PATCH] assorted refactoring and documentation for XML

---
 libs/xml/load.factor      |  3 +-
 libs/xml/parser.factor    | 43 +++++++++++++++++++------
 libs/xml/tokenizer.factor |  1 -
 libs/xml/writer.factor    | 25 ++++++++-------
 libs/xml/xml.facts        | 66 ++++++++++++++++++++++++++++++++++-----
 5 files changed, 108 insertions(+), 30 deletions(-)
diff --git a/libs/xml/load.factor b/libs/xml/load.factor
index b05ee64431..55fedc87a4 100644
--- a/libs/xml/load.factor
+++ b/libs/xml/load.factor
@@ -11,4 +11,5 @@ PROVIDE: libs/xml
 } }
 { +tests+ {
     "test.factor"
-} } ;
+} }
+{ +help+ { "xml" "intro" } } ;
diff --git a/libs/xml/parser.factor b/libs/xml/parser.factor
index f5fb78c925..d676364e77 100644
--- a/libs/xml/parser.factor
+++ b/libs/xml/parser.factor
@@ -1,8 +1,8 @@
 ! Copyright (C) 2005, 2006 Daniel Ehrenberg
 ! See http://factorcode.org/license.txt for BSD license.
-IN: xml
 USING: errors hashtables io kernel math namespaces prettyprint sequences
     arrays generic strings vectors char-classes ;
+IN: xml
 
 ! * Parsing tags
 
@@ -62,7 +62,7 @@ TUPLE: instruction text ;
     ! this should make sure the name doesn't include 'xml'
     "?>" take-string <instruction> ;
 
-: make-tag ( -- tag/f )
+: make-tag ( -- tag )
     { { [ get-char dup CHAR: ! = ] [ drop next directive ] }
       { [ CHAR: ? = ] [ next instruction ] } 
       { [ t ] [
@@ -252,14 +252,15 @@ M: bad-version error.
          dup assure-no-extra prolog-attrs
     ] when ;
 
-: init-xml ( stream -- )
+: basic-init ( stream -- )
     stdio set
     { 0 0 0 "" } clone spot set
     f record set f now-recording? set
     next
-    "1.0" "iso-8859-1" f <prolog> prolog-data set
-    init-xml-stack
-    init-ns-stack ;
+    "1.0" "iso-8859-1" f <prolog> prolog-data set ;
+
+: init-xml ( stream -- )
+    basic-init init-xml-stack init-ns-stack ;
 
 : init-xml-string ( string -- ) ! for debugging
     <string-reader> init-xml ;
@@ -279,15 +280,37 @@ M: multitags error.
     dup [ tag? ] contains? [ <multitags> throw ] when r>
     swap <xml-doc> ;
 
+! * Views of XML
+
+SYMBOL: text-now?
+
+TUPLE: pull-xml scope ;
+C: pull-xml ( stream -- pull-xml )
+    [
+        swap basic-init parse-prolog
+        t text-now? set
+        [ namestack pop swap set-pull-xml-scope ] keep
+    ] with-scope ;
+
+: pull-next ( pull -- xml-elem/f )
+    pull-xml-scope [
+        text-now? get [ parse-text f ] [
+            get-char [ make-tag t ] [ f f ] if
+        ] if text-now? set    
+    ] bind ;
+
+: call-under ( quot object -- quot )
+    swap dup slip ; inline
+
 : sax-loop ( quot -- ) ! quot: xml-elem --
-    parse-text [ swap dup slip ] each
-    get-char [ make-tag swap dup slip sax-loop ]
+    parse-text [ call-under ] each
+    get-char [ make-tag call-under sax-loop ]
     [ drop ] if ; inline
 
 : sax ( stream quot -- ) ! quot: xml-elem --
     swap [
-        init-xml parse-prolog
-        prolog-data get swap dup slip
+        basic-init parse-prolog
+        prolog-data get call-under
         sax-loop
     ] with-scope ; inline
 
diff --git a/libs/xml/tokenizer.factor b/libs/xml/tokenizer.factor
index 44b69d1b88..da5b7ac1f7 100644
--- a/libs/xml/tokenizer.factor
+++ b/libs/xml/tokenizer.factor
@@ -206,7 +206,6 @@ TUPLE: reference name ;
     <reference> , next new-record ;
 
 : (parse-char) ( ch -- )
-    ! The similarities with (parse-text) should be factored out
     get-char {
         { [ dup not ]
           [ 2drop 0 end-record* , ] }
diff --git a/libs/xml/writer.factor b/libs/xml/writer.factor
index f082b2130c..5924d8e164 100644
--- a/libs/xml/writer.factor
+++ b/libs/xml/writer.factor
@@ -32,32 +32,32 @@ UNION: str-elem string entity reference ;
         [ write-str-elem ] each "\"" write
     ] hash-each ;
 
-GENERIC: (xml>string) ( object -- )
+GENERIC: write-item ( object -- )
 
-M: str-elem (xml>string) ! string element
+M: str-elem write-item ! string element
     write-str-elem ;
 
-M: contained-tag (xml>string)
+M: contained-tag write-item
     CHAR: < write1
     dup print-name
     tag-props print-props
     "/>" write ;
 
-M: open-tag (xml>string)
+M: open-tag write-item
     CHAR: < write1
     dup print-name
     dup tag-props print-props
     CHAR: > write1
-    dup tag-children [ (xml>string) ] each
+    dup tag-children [ write-item ] each
     "</" write print-name CHAR: > write1 ;
 
-M: comment (xml>string)
+M: comment write-item
     "<!--" write comment-text write "-->" write ;
 
-M: directive (xml>string)
+M: directive write-item
     "<!" write directive-text write CHAR: > write1 ;
 
-M: instruction (xml>string)
+M: instruction write-item
     "<?" write instruction-text write "?>" write ;
 
 : xml-preamble ( xml -- )
@@ -67,11 +67,14 @@ M: instruction (xml>string)
     prolog-standalone "yes" "no" ? write
     "\"?>" write ;
 
+: write-chunk ( seq -- )
+    [ write-item ] each ;
+
 : write-xml ( xml-doc -- )
     dup xml-doc-prolog xml-preamble
-    dup xml-doc-before [ (xml>string) ] each
-    dup delegate (xml>string)
-    xml-doc-after [ (xml>string) ] each ;
+    dup xml-doc-before write-chunk
+    dup delegate write-item
+    xml-doc-after write-chunk ;
 
 : print-xml ( xml-doc -- )
     write-xml terpri ;
diff --git a/libs/xml/xml.facts b/libs/xml/xml.facts
index a3a04dcf38..afa4c30620 100644
--- a/libs/xml/xml.facts
+++ b/libs/xml/xml.facts
@@ -247,6 +247,45 @@ HELP: contained
 { $class-description "represents a self-closing tag, like <a/>. Contains two slots, name and props, containing, respectively, the name of the tag and its attributes. Usually, the name-url will be f." }
 { $see-also opener closer } ;
 
+HELP: parse-text
+{ $values { "array" "an array of text elements" } }
+{ $description "moves the pointer from the current spot to the beginning of the next tag, parsing the text underneath, returning the text elements it passed. This parses DTD references like %foo; and XML entities like &bar; &#97; and &amp;" }
+{ $see-also parse-name } ;
+
+HELP: parse-name
+{ $values { "name" "an XML name" } }
+{ $description "parses a " { $link name } " from the input stream. Returns a name with only the name-space and name-tag defined, with name-url=f" }
+{ $see-also parse-text } ;
+
+HELP: make-tag
+{ $values { "tag" "an opener, closer or contained" } }
+{ $description "assuming the pointer is just past a <, this word parses until the next > and emits a tuple representing the tag parsed" }
+{ $see-also opener closer contained } ;
+
+HELP: pull-xml
+{ $class-description "represents the state of a pull-parser for XML. Has one slot, scope, which is a namespace which contains all relevant state information." }
+{ $see-also <pull-xml> pull-next } ;
+
+HELP: <pull-xml>
+{ $values { "stream" "an input stream containing XML" } { "pull-xml" "a pull-xml tuple" } }
+{ $description "creates an XML pull-based parser, executing all initial XML commands to set up the parser" }
+{ $see-also pull-xml pull-next } ;
+
+HELP: pull-next
+{ $values { "pull" "an XML pull parser" } { "xml-elem/f" "an XML tag event, sequence of string elements, or f" } }
+{ $description "gets the next xml element from the given XML pull parser. Returns f upon exaustion" }
+{ $see-also pull-xml <pull-xml> } ;
+
+HELP: write-item
+{ $values { "object" "an XML element" } }
+{ $description "writes an XML element to stdio" }
+{ $see-also write-chunk write-xml } ;
+
+HELP: write-chunk
+{ $values { "seq" "an XML document fragment" } }
+{ $description "writes an XML document fragment, ie a sequence of XML elements, to stdio" }
+{ $see-also write-item write-xml } ;
+
 ARTICLE: { "xml" "intro" } "XML"
     "The XML module attempts to implement the XML 1.1 standard, converting strings of text into XML and vice versa. It currently is a work in progress. Together with XML-RPC, this is a component of the F2EE framework."
     $terpri
@@ -260,13 +299,19 @@ ARTICLE: { "xml" "intro" } "XML"
 
 ARTICLE: { "xml" "basic" } "Basic words for XML processing"
     "These are the most basic words needed for processing an XML document"
+    $terpri
+    "Parsing XML:"
     { $subsection string>xml }
-    { $subsection xml>string }
-    { $subsection xml-parse-error }
-    { $subsection xml-reprint }
-    { $subsection write-xml }
     { $subsection read-xml }
-    { $subsection xml-chunk } ;
+    { $subsection xml-chunk }
+    "Printing XML"
+    { $subsection xml>string }
+    { $subsection write-xml }
+    { $subsection write-item }
+    { $subsection write-chunk }
+    "Other"
+    { $subsection xml-parse-error }
+    { $subsection xml-reprint } ;
 
 ARTICLE: { "xml" "classes" } "XML data classes"
     "Data types that XML documents are made of:"
@@ -295,7 +340,7 @@ ARTICLE: { "xml" "construct" } "XML data constructors"
 ARTICLE: { "xml" "utils" } "XML processing utilities"
     "Utilities for processing XML include..."
     $terpri
-    "System for creating words which dispatch on XML tags:"
+    "System sfor creating words which dispatch on XML tags:"
     { $subsection POSTPONE: PROCESS: }
     { $subsection POSTPONE: TAG: }
     "Combinators for traversing XML trees:"
@@ -321,6 +366,9 @@ ARTICLE: { "xml" "internal" } "Internals of the XML parser"
     { $subsection take-char }
     { $subsection take-string }
     { $subsection next }
+    { $subsection parse-text }
+    { $subsection make-tag }
+    { $subsection parse-name }
     { $subsection process } ; ! should I have more? less?
 
 ARTICLE: { "xml" "events" } "Event-based XML parsing"
@@ -328,4 +376,8 @@ ARTICLE: { "xml" "events" } "Event-based XML parsing"
     { $subsection sax }
     { $subsection opener }
     { $subsection closer }
-    { $subsection contained } ;
+    { $subsection contained }
+    "There is also pull-based parsing to augment the push-parsing of SAX. This is probably easier to use and more logical. It uses the same parsing objects as the above style of parsing, except string elements are always in arrays, for example { \"\" }. Relevant pull-parsing words are:"
+    { $subsection <pull-xml> }
+    { $subsection pull-xml }
+    { $subsection pull-next } ;