From 725ed93b35a5bfef84c51a46b49931cfd7447cac Mon Sep 17 00:00:00 2001 From: microdan Date: Thu, 14 Dec 2006 23:42:52 +0000 Subject: [PATCH] assorted refactoring and documentation for XML --- libs/xml/load.factor | 3 +- libs/xml/parser.factor | 43 +++++++++++++++++++------ libs/xml/tokenizer.factor | 1 - libs/xml/writer.factor | 25 ++++++++------- libs/xml/xml.facts | 66 ++++++++++++++++++++++++++++++++++----- 5 files changed, 108 insertions(+), 30 deletions(-) diff --git a/libs/xml/load.factor b/libs/xml/load.factor index b05ee64431..55fedc87a4 100644 --- a/libs/xml/load.factor +++ b/libs/xml/load.factor @@ -11,4 +11,5 @@ PROVIDE: libs/xml } } { +tests+ { "test.factor" -} } ; +} } +{ +help+ { "xml" "intro" } } ; diff --git a/libs/xml/parser.factor b/libs/xml/parser.factor index f5fb78c925..d676364e77 100644 --- a/libs/xml/parser.factor +++ b/libs/xml/parser.factor @@ -1,8 +1,8 @@ ! Copyright (C) 2005, 2006 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -IN: xml USING: errors hashtables io kernel math namespaces prettyprint sequences arrays generic strings vectors char-classes ; +IN: xml ! * Parsing tags @@ -62,7 +62,7 @@ TUPLE: instruction text ; ! this should make sure the name doesn't include 'xml' "?>" take-string ; -: make-tag ( -- tag/f ) +: make-tag ( -- tag ) { { [ get-char dup CHAR: ! = ] [ drop next directive ] } { [ CHAR: ? = ] [ next instruction ] } { [ t ] [ @@ -252,14 +252,15 @@ M: bad-version error. dup assure-no-extra prolog-attrs ] when ; -: init-xml ( stream -- ) +: basic-init ( stream -- ) stdio set { 0 0 0 "" } clone spot set f record set f now-recording? set next - "1.0" "iso-8859-1" f prolog-data set - init-xml-stack - init-ns-stack ; + "1.0" "iso-8859-1" f prolog-data set ; + +: init-xml ( stream -- ) + basic-init init-xml-stack init-ns-stack ; : init-xml-string ( string -- ) ! for debugging init-xml ; @@ -279,15 +280,37 @@ M: multitags error. dup [ tag? ] contains? [ throw ] when r> swap ; +! * Views of XML + +SYMBOL: text-now? + +TUPLE: pull-xml scope ; +C: pull-xml ( stream -- pull-xml ) + [ + swap basic-init parse-prolog + t text-now? set + [ namestack pop swap set-pull-xml-scope ] keep + ] with-scope ; + +: pull-next ( pull -- xml-elem/f ) + pull-xml-scope [ + text-now? get [ parse-text f ] [ + get-char [ make-tag t ] [ f f ] if + ] if text-now? set + ] bind ; + +: call-under ( quot object -- quot ) + swap dup slip ; inline + : sax-loop ( quot -- ) ! quot: xml-elem -- - parse-text [ swap dup slip ] each - get-char [ make-tag swap dup slip sax-loop ] + parse-text [ call-under ] each + get-char [ make-tag call-under sax-loop ] [ drop ] if ; inline : sax ( stream quot -- ) ! quot: xml-elem -- swap [ - init-xml parse-prolog - prolog-data get swap dup slip + basic-init parse-prolog + prolog-data get call-under sax-loop ] with-scope ; inline diff --git a/libs/xml/tokenizer.factor b/libs/xml/tokenizer.factor index 44b69d1b88..da5b7ac1f7 100644 --- a/libs/xml/tokenizer.factor +++ b/libs/xml/tokenizer.factor @@ -206,7 +206,6 @@ TUPLE: reference name ; , next new-record ; : (parse-char) ( ch -- ) - ! The similarities with (parse-text) should be factored out get-char { { [ dup not ] [ 2drop 0 end-record* , ] } diff --git a/libs/xml/writer.factor b/libs/xml/writer.factor index f082b2130c..5924d8e164 100644 --- a/libs/xml/writer.factor +++ b/libs/xml/writer.factor @@ -32,32 +32,32 @@ UNION: str-elem string entity reference ; [ write-str-elem ] each "\"" write ] hash-each ; -GENERIC: (xml>string) ( object -- ) +GENERIC: write-item ( object -- ) -M: str-elem (xml>string) ! string element +M: str-elem write-item ! string element write-str-elem ; -M: contained-tag (xml>string) +M: contained-tag write-item CHAR: < write1 dup print-name tag-props print-props "/>" write ; -M: open-tag (xml>string) +M: open-tag write-item CHAR: < write1 dup print-name dup tag-props print-props CHAR: > write1 - dup tag-children [ (xml>string) ] each + dup tag-children [ write-item ] each " write1 ; -M: comment (xml>string) +M: comment write-item "" write ; -M: directive (xml>string) +M: directive write-item " write1 ; -M: instruction (xml>string) +M: instruction write-item "" write ; : xml-preamble ( xml -- ) @@ -67,11 +67,14 @@ M: instruction (xml>string) prolog-standalone "yes" "no" ? write "\"?>" write ; +: write-chunk ( seq -- ) + [ write-item ] each ; + : write-xml ( xml-doc -- ) dup xml-doc-prolog xml-preamble - dup xml-doc-before [ (xml>string) ] each - dup delegate (xml>string) - xml-doc-after [ (xml>string) ] each ; + dup xml-doc-before write-chunk + dup delegate write-item + xml-doc-after write-chunk ; : print-xml ( xml-doc -- ) write-xml terpri ; diff --git a/libs/xml/xml.facts b/libs/xml/xml.facts index a3a04dcf38..afa4c30620 100644 --- a/libs/xml/xml.facts +++ b/libs/xml/xml.facts @@ -247,6 +247,45 @@ HELP: contained { $class-description "represents a self-closing tag, like . Contains two slots, name and props, containing, respectively, the name of the tag and its attributes. Usually, the name-url will be f." } { $see-also opener closer } ; +HELP: parse-text +{ $values { "array" "an array of text elements" } } +{ $description "moves the pointer from the current spot to the beginning of the next tag, parsing the text underneath, returning the text elements it passed. This parses DTD references like %foo; and XML entities like &bar; a and &" } +{ $see-also parse-name } ; + +HELP: parse-name +{ $values { "name" "an XML name" } } +{ $description "parses a " { $link name } " from the input stream. Returns a name with only the name-space and name-tag defined, with name-url=f" } +{ $see-also parse-text } ; + +HELP: make-tag +{ $values { "tag" "an opener, closer or contained" } } +{ $description "assuming the pointer is just past a <, this word parses until the next > and emits a tuple representing the tag parsed" } +{ $see-also opener closer contained } ; + +HELP: pull-xml +{ $class-description "represents the state of a pull-parser for XML. Has one slot, scope, which is a namespace which contains all relevant state information." } +{ $see-also pull-next } ; + +HELP: +{ $values { "stream" "an input stream containing XML" } { "pull-xml" "a pull-xml tuple" } } +{ $description "creates an XML pull-based parser, executing all initial XML commands to set up the parser" } +{ $see-also pull-xml pull-next } ; + +HELP: pull-next +{ $values { "pull" "an XML pull parser" } { "xml-elem/f" "an XML tag event, sequence of string elements, or f" } } +{ $description "gets the next xml element from the given XML pull parser. Returns f upon exaustion" } +{ $see-also pull-xml } ; + +HELP: write-item +{ $values { "object" "an XML element" } } +{ $description "writes an XML element to stdio" } +{ $see-also write-chunk write-xml } ; + +HELP: write-chunk +{ $values { "seq" "an XML document fragment" } } +{ $description "writes an XML document fragment, ie a sequence of XML elements, to stdio" } +{ $see-also write-item write-xml } ; + ARTICLE: { "xml" "intro" } "XML" "The XML module attempts to implement the XML 1.1 standard, converting strings of text into XML and vice versa. It currently is a work in progress. Together with XML-RPC, this is a component of the F2EE framework." $terpri @@ -260,13 +299,19 @@ ARTICLE: { "xml" "intro" } "XML" ARTICLE: { "xml" "basic" } "Basic words for XML processing" "These are the most basic words needed for processing an XML document" + $terpri + "Parsing XML:" { $subsection string>xml } - { $subsection xml>string } - { $subsection xml-parse-error } - { $subsection xml-reprint } - { $subsection write-xml } { $subsection read-xml } - { $subsection xml-chunk } ; + { $subsection xml-chunk } + "Printing XML" + { $subsection xml>string } + { $subsection write-xml } + { $subsection write-item } + { $subsection write-chunk } + "Other" + { $subsection xml-parse-error } + { $subsection xml-reprint } ; ARTICLE: { "xml" "classes" } "XML data classes" "Data types that XML documents are made of:" @@ -295,7 +340,7 @@ ARTICLE: { "xml" "construct" } "XML data constructors" ARTICLE: { "xml" "utils" } "XML processing utilities" "Utilities for processing XML include..." $terpri - "System for creating words which dispatch on XML tags:" + "System sfor creating words which dispatch on XML tags:" { $subsection POSTPONE: PROCESS: } { $subsection POSTPONE: TAG: } "Combinators for traversing XML trees:" @@ -321,6 +366,9 @@ ARTICLE: { "xml" "internal" } "Internals of the XML parser" { $subsection take-char } { $subsection take-string } { $subsection next } + { $subsection parse-text } + { $subsection make-tag } + { $subsection parse-name } { $subsection process } ; ! should I have more? less? ARTICLE: { "xml" "events" } "Event-based XML parsing" @@ -328,4 +376,8 @@ ARTICLE: { "xml" "events" } "Event-based XML parsing" { $subsection sax } { $subsection opener } { $subsection closer } - { $subsection contained } ; + { $subsection contained } + "There is also pull-based parsing to augment the push-parsing of SAX. This is probably easier to use and more logical. It uses the same parsing objects as the above style of parsing, except string elements are always in arrays, for example { \"\" }. Relevant pull-parsing words are:" + { $subsection } + { $subsection pull-xml } + { $subsection pull-next } ;