XML refactoring, splitting up docs

db4
Daniel Ehrenberg 2009-01-20 23:54:33 -06:00
parent 8536d85b4e
commit c24f4494bc
18 changed files with 653 additions and 707 deletions

View File

@ -0,0 +1,152 @@
USING: help.markup help.syntax sequences strings ;
IN: xml.data
ABOUT: "xml.data"
ARTICLE: "xml.data" "XML data types"
{ $vocab-link "xml.data" } " defines a simple document object model for XML. Everything is simply a tuple and can be manipulated as such."
{ $subsection { "xml.data" "classes" } }
{ $subsection { "xml.data" "constructors" } }
"Simple words for manipulating names:"
{ $subsection names-match? }
{ $subsection assure-name }
"For high-level tools for manipulating XML, see " { $vocab-link "xml.utilities" } ;
ARTICLE: { "xml.data" "classes" } "XML data classes"
"Data types that XML documents are made of:"
{ $subsection name }
{ $subsection tag }
{ $subsection contained-tag }
{ $subsection open-tag }
{ $subsection xml }
{ $subsection prolog }
{ $subsection comment }
{ $subsection instruction }
{ $subsection element-decl }
{ $subsection attlist-decl }
{ $subsection entity-decl }
{ $subsection system-id }
{ $subsection public-id }
{ $subsection doctype-decl }
{ $subsection notation-decl } ;
ARTICLE: { "xml.data" "constructors" } "XML data constructors"
"These data types are constructed with:"
{ $subsection <name> }
{ $subsection <tag> }
{ $subsection <contained-tag> }
{ $subsection <xml> }
{ $subsection <prolog> }
{ $subsection <comment> }
{ $subsection <instruction> }
{ $subsection <simple-name> }
{ $subsection <element-decl> }
{ $subsection <attlist-decl> }
{ $subsection <entity-decl> }
{ $subsection <system-id> }
{ $subsection <public-id> }
{ $subsection <doctype-decl> }
{ $subsection <notation-decl> } ;
HELP: tag
{ $class-description "tuple representing an XML tag, delegating to a " { $link
name } ", containing the slots attrs (an alist of names to strings) and children (a sequence). Tags implement the sequence protocol by acting like a sequence of its chidren, and the assoc protocol by acting like its attributes." }
{ $see-also <tag> name contained-tag xml } ;
HELP: <tag>
{ $values { "name" "an XML tag name" }
{ "attrs" "an alist of names to strings" }
{ "children" sequence }
{ "tag" tag } }
{ $description "constructs an XML " { $link tag } " with the name (not a string) and tag attributes specified in attrs and children specified" }
{ $see-also tag <contained-tag> } ;
HELP: name
{ $class-description "represents an XML name, with the fields space (a string representing the namespace, as written in the document, tag (a string of the actual name of the tag) and url (a string of the URL that the namespace points to)" }
{ $see-also <name> tag } ;
HELP: <name>
{ $values { "space" "a string" } { "main" "a string" } { "url" "a string" }
{ "name" "an XML tag name" } }
{ $description "creates a name tuple with the name-space space and the tag-name tag and the tag-url url." }
{ $see-also name <tag> } ;
HELP: contained-tag
{ $class-description "delegates to tag representing a tag like <a/> with no contents. The tag attributes are accessed with tag-attrs" }
{ $see-also tag <contained-tag> } ;
HELP: <contained-tag>
{ $values { "name" "an XML tag name" }
{ "attrs" "an alist from names to strings" }
{ "tag" tag } }
{ $description "creates an empty tag (like <a/>) with the specified name and tag attributes. This delegates to tag" }
{ $see-also contained-tag <tag> } ;
HELP: xml
{ $class-description "tuple representing an XML document, delegating to the main tag, containing the fields prolog (the header <?xml...?>), before (whatever comes between the prolog and the main tag) and after (whatever comes after the main tag)" }
{ $see-also <xml> tag prolog } ;
HELP: <xml>
{ $values { "prolog" "an XML prolog" } { "before" "a sequence of XML elements" }
{ "body" tag } { "after" "a sequence of XML elements" } { "xml" "an XML document" } }
{ $description "creates an XML document, delegating to the main tag, with the specified prolog, before, and after" }
{ $see-also xml <tag> } ;
HELP: prolog
{ $class-description "represents an XML prolog, with the tuple fields version (containing \"1.0\" or \"1.1\"), encoding (a string representing the encoding type), and standalone (t or f, whether the document is standalone without external entities)" }
{ $see-also <prolog> xml } ;
HELP: <prolog>
{ $values { "version" "a string, 1.0 or 1.1" }
{ "encoding" "a string" } { "standalone" "a boolean" } { "prolog" "an XML prolog" } }
{ $description "creates an XML prolog tuple" }
{ $see-also prolog <xml> } ;
HELP: comment
{ $class-description "represents a comment in XML. Has one slot, text, which contains the string of the comment" }
{ $see-also <comment> } ;
HELP: <comment>
{ $values { "text" "a string" } { "comment" "a comment" } }
{ $description "creates an XML comment tuple" }
{ $see-also comment } ;
HELP: instruction
{ $class-description "represents an XML instruction, such as <?xsl stylesheet='foo.xml'?>. Contains one slot, text, which contains the string between the question marks." }
{ $see-also <instruction> } ;
HELP: <instruction>
{ $values { "text" "a string" } { "instruction" "an XML instruction" } }
{ $description "creates an XML parsing instruction, such as <?xsl stylesheet='foo.xml'?>." }
{ $see-also instruction } ;
HELP: opener
{ $class-description "describes an opening tag, like <a>. Contains two slots, name and attrs containing, respectively, the name of the tag and its attributes. Usually, the name-url will be f." }
{ $see-also closer contained } ;
HELP: closer
{ $class-description "describes a closing tag, like </a>. Contains one slot, name, containing the tag's name. Usually, the name-url will be f." }
{ $see-also opener contained } ;
HELP: contained
{ $class-description "represents a self-closing tag, like <a/>. Contains two slots, name and attrs containing, respectively, the name of the tag and its attributes. Usually, the name-url will be f." }
{ $see-also opener closer } ;
HELP: open-tag
{ $class-description "represents a tag that does have children, ie is not a contained tag" }
{ $notes "the constructor used for this class is simply " { $link <tag> } "." }
{ $see-also tag contained-tag } ;
HELP: names-match?
{ $values { "name1" "a name" } { "name2" "a name" } { "?" "t or f" } }
{ $description "checks to see if the two names match, that is, if all fields are equal, ignoring fields whose value is f in either name." }
{ $example "USING: prettyprint xml.data ;" "T{ name f \"rpc\" \"methodCall\" f } T{ name f f \"methodCall\" \"http://www.xmlrpc.org/\" } names-match? ." "t" }
{ $see-also name } ;
HELP: assure-name
{ $values { "string/name" "a string or a name" } { "name" "a name" } }
{ $description "Converts a string into an XML name, if it is not already a name." } ;
HELP: <simple-name>
{ $values { "string" string } { "name" name } }
{ $description "Converts a string into an XML name with an empty prefix and URL." } ;

View File

@ -0,0 +1,22 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: help.markup help.syntax ;
IN: xml.entities
ABOUT: "xml.entities"
ARTICLE: "xml.entities" "XML entities"
"When XML is parsed, entities like &foo; are replaced with the characters they represent. A few entities like &amp; and &lt; are defined by default, but more are available, and the set of entities can be customized. Below are some words involved in XML entities, defined in the vocabulary 'entities':"
{ $subsection entities }
{ $subsection with-entities }
"For entities used in HTML/XHTML, see " { $vocab-link "xml.entities.html" } ;
HELP: entities
{ $description "a hash table from default XML entity names (like &amp; and &lt;) to the characters they represent. This is automatically included when parsing any XML document." }
{ $see-also with-entities } ;
HELP: with-entities
{ $values { "entities" "a hash table of strings to chars" }
{ "quot" "a quotation ( -- )" } }
{ $description "calls the quotation using the given table of entity values (symbolizing, eg, that &foo; represents CHAR: a) on top of the default XML entities" } ;

View File

@ -0,0 +1,18 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: help.markup help.syntax xml.entities ;
IN: xml.entities.html
ARTICLE: "xml.entities.html" "HTML entities"
{ $vocab-link "xml.entities.html" } " defines words for using entities defined in HTML/XHTML."
{ $subsection html-entities }
{ $subsection with-html-entities } ;
HELP: html-entities
{ $description "a hash table from HTML entity names to their character values" }
{ $see-also entities with-html-entities } ;
HELP: with-html-entities
{ $values { "quot" "a quotation ( -- )" } }
{ $description "calls the given quotation using HTML entity values" }
{ $see-also html-entities with-entities } ;

View File

@ -7,8 +7,10 @@ IN: xml.entities.html
VALUE: html-entities
: read-entities-file ( file -- table )
f swap binary <file-reader>
[ 2drop extra-entities get ] sax ;
H{ } clone [ extra-entities [
binary <file-reader>
[ drop ] sax
] with-variable ] keep ;
: get-html ( -- table )
{ "lat1" "special" "symbol" } [

View File

@ -0,0 +1,68 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: help.markup help.syntax ;
IN: xml.errors
HELP: multitags
{ $class-description "XML parsing error describing the case where there is more than one main tag in a document. Contains no slots" } ;
HELP: notags
{ $class-description "XML parsing error describing the case where an XML document contains no main tag, or any tags at all" } ;
HELP: extra-attrs
{ $class-description "XML parsing error describing the case where the XML prolog (<?xml ...?>) contains attributes other than the three allowed ones, standalone, version and encoding. Contains one slot, attrs, which is a hashtable of all the extra attributes' names. Delegates to " { $link parsing-error } "." } ;
HELP: nonexist-ns
{ $class-description "XML parsing error describing the case where a namespace doesn't exist but it is used in a tag. Contains one slot, name, which contains the name of the undeclared namespace, and delegates to " { $link parsing-error } "." } ;
HELP: not-yes/no
{ $class-description "XML parsing error used to describe the case where standalone is set in the XML prolog to something other than 'yes' or 'no'. Delegates to " { $link parsing-error } " and contains one slot, text, which contains offending value." } ;
HELP: unclosed
{ $class-description "XML parsing error used to describe the case where the XML document contains classes which are not closed by the end of the document. Contains one slot, tags, a sequence of names." } ;
HELP: mismatched
{ $class-description "XML parsing error describing mismatched tags, eg <a></c>. Contains two slots: open is the name of the opening tag and close is the name of the closing tag. Delegates to " { $link parsing-error } " showing the location of the closing tag" } ;
HELP: expected
{ $class-description "XML parsing error describing when an expected token was not present. Delegates to " { $link parsing-error } ". Contains two slots, should-be, which has the expected string, and was, which has the actual string." } ;
HELP: no-entity
{ $class-description "XML parsing error describing the use of an undefined entity in a case where standalone is marked yes. Delegates to " { $link parsing-error } ". Contains one slot, thing, containing a string representing the entity." } ;
HELP: pre/post-content
{ $class-description "describes the error where a non-whitespace string is used before or after the main tag in an XML document. Contains two slots: string contains the offending string, and pre? is t if it occured before the main tag and f if it occured after" } ;
HELP: unclosed-quote
{ $class-description "describes the error where a quotation for an attribute value is opened but not closed before the end of the document." } ;
HELP: bad-name
{ $class-description "describes the error where a name is used, for example in an XML tag or attribute key, which is invalid." } ;
HELP: quoteless-attr
{ $class-description "describes the error where an attribute of an XML tag is missing quotes around a value." } ;
HELP: xml-parse-error
{ $class-description "the exception class that all parsing errors in XML documents are in." } ;
ARTICLE: "xml.errors" "XML parsing errors"
{ $vocab-link "xml.errors" } " provides a rich and highly inspectable set of parsing errors. All XML errors are described by the union class " { $link xml-parse-error } " but there are many classes contained in that:"
{ $subsection multitags }
{ $subsection notags }
{ $subsection extra-attrs }
{ $subsection nonexist-ns }
{ $subsection not-yes/no }
{ $subsection unclosed }
{ $subsection mismatched }
{ $subsection expected }
{ $subsection no-entity }
{ $subsection pre/post-content }
{ $subsection unclosed-quote }
{ $subsection bad-name }
{ $subsection quoteless-attr }
"Additionally, most of these errors are a kind of " { $link parsing-error } " which provides more information"
$nl
"Note that, in parsing an XML document, only the first error is reported." ;
ABOUT: "xml.errors"

View File

@ -1,10 +1,61 @@
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: xml.data xml.writer kernel generic io prettyprint math
debugger sequences xml.state-parser accessors summary
debugger sequences xml.state accessors summary
namespaces io.streams.string xml.backend ;
IN: xml.errors
TUPLE: parsing-error line column ;
: parsing-error ( class -- obj )
new
get-line >>line
get-column >>column ;
M: parsing-error summary ( obj -- str )
[
"Parsing error" print
"Line: " write dup line>> .
"Column: " write column>> .
] with-string-writer ;
TUPLE: expected < parsing-error should-be was ;
: expected ( should-be was -- * )
\ expected parsing-error
swap >>was
swap >>should-be throw ;
M: expected summary ( obj -- str )
[
dup call-next-method write
"Token expected: " write dup should-be>> print
"Token present: " write was>> print
] with-string-writer ;
TUPLE: unexpected-end < parsing-error ;
: unexpected-end ( -- * ) \ unexpected-end parsing-error throw ;
M: unexpected-end summary ( obj -- str )
[
call-next-method write
"File unexpectedly ended." print
] with-string-writer ;
TUPLE: missing-close < parsing-error ;
: missing-close ( -- * ) \ missing-close parsing-error throw ;
M: missing-close summary ( obj -- str )
[
call-next-method write
"Missing closing token." print
] with-string-writer ;
TUPLE: disallowed-char < parsing-error char ;
: disallowed-char ( char -- * )
\ disallowed-char parsing-error swap >>char throw ;
M: disallowed-char summary
[ call-next-method ]
[ char>> "Disallowed character in XML document: " swap suffix ] bi
append ;
ERROR: multitags ;
M: multitags summary ( obj -- str )
@ -255,16 +306,6 @@ M: text-w/]]> summary
call-next-method
"Text node contains ']]>'" append ;
TUPLE: disallowed-char < parsing-error char ;
: disallowed-char ( char -- * )
\ disallowed-char parsing-error swap >>char throw ;
M: disallowed-char summary
[ call-next-method ]
[ char>> "Disallowed character in XML document: " swap suffix ] bi
append ;
TUPLE: duplicate-attr < parsing-error key values ;
: duplicate-attr ( key values -- * )
@ -282,6 +323,15 @@ TUPLE: bad-cdata < parsing-error ;
M: bad-cdata summary
call-next-method "\nCDATA occurs before or after main tag" append ;
TUPLE: not-enough-characters < parsing-error ;
: not-enough-characters ( -- * )
\ not-enough-characters parsing-error throw ;
M: not-enough-characters summary ( obj -- str )
[
call-next-method write
"Not enough characters" print
] with-string-writer ;
UNION: xml-parse-error
multitags notags extra-attrs nonexist-ns bad-decl
not-yes/no unclosed mismatched expected no-entity

View File

@ -1 +0,0 @@
Daniel Ehrenberg

View File

@ -1,72 +0,0 @@
USING: help.markup help.syntax ;
IN: xml.state-parser
ABOUT: { "state-parser" "main" }
ARTICLE: { "state-parser" "main" } "State-based parsing"
{ $vocab-link "xml.state-parser" } " defines a state-based parsing mechanism."
{ $subsection spot }
{ $subsection skip-until }
{ $subsection take-until }
{ $subsection take-char }
{ $subsection take-string }
{ $subsection next }
{ $subsection state-parse }
{ $subsection get-char }
{ $subsection take-rest }
{ $subsection string-parse }
{ $subsection expect }
{ $subsection expect-string }
{ $subsection parsing-error } ;
HELP: get-char
{ $values { "char" "the current character" } }
{ $description "Accesses the current character of the stream that is being parsed" } ;
HELP: take-rest
{ $values { "string" "the rest of the parser input" } }
{ $description "Exausts the stream of the parser input and returns a string representing the rest of the input" } ;
HELP: string-parse
{ $values { "input" "a string" } { "quot" "a quotation ( -- )" } }
{ $description "Calls the given quotation using the given string as parser input" }
{ $see-also state-parse } ;
HELP: expect
{ $values { "ch" "a number representing a character" } }
{ $description "Asserts that the current character is the given ch, and moves to the next spot" }
{ $see-also expect-string } ;
HELP: expect-string
{ $values { "string" "a string" } }
{ $description "Asserts that the current parsing spot is followed by the given string, and skips the parser past that string" }
{ $see-also expect } ;
HELP: spot
{ $var-description "This variable represents the location in the program. It is a tuple T{ spot f char column line next } where char is the current character, line is the line number, column is the column number, and line-str is the full contents of the line, as a string. The contents shouldn't be accessed directly but rather with the proxy words get-char set-char get-line etc." } ;
HELP: skip-until
{ $values { "quot" "a quotation ( -- ? )" } }
{ $description "executes " { $link next } " until the quotation yields false. Usually, the quotation will call " { $link get-char } " in its test, but not always." }
{ $see-also take-until } ;
HELP: take-until
{ $values { "quot" "a quotation ( -- ? )" } { "string" "a string" } }
{ $description "like " { $link skip-until } " but records what it passes over and outputs the string." }
{ $see-also skip-until take-char take-string } ;
HELP: take-char
{ $values { "ch" "a character" } { "string" "a string" } }
{ $description "records the document from the current spot to the first instance of the given character. Outputs the content between those two points." }
{ $see-also take-until take-string } ;
HELP: take-string
{ $values { "match" "a string to match" } { "string" "the portion of the XML document" } }
{ $description "records the document from the current spot to the first instance of the given character. Outputs the content between those two points." }
{ $notes "match may not contain a newline" } ;
HELP: next
{ $description "originally written as " { $code "spot inc" } ", code that would no longer run, this word moves the state of the XML parser to the next place in the source file, keeping track of appropriate debugging information." } ;
HELP: parsing-error
{ $class-description "class from which parsing errors inherit, containing information about which line and column the error occured on, and what the line was. Contains three slots, line, an integer, column, another integer, and line-str, a string" } ;

View File

@ -1,169 +0,0 @@
! Copyright (C) 2005, 2008 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: io io.streams.string kernel math namespaces sequences
strings circular prettyprint debugger ascii sbufs fry summary
accessors xml.data xml.char-classes xml.errors ;
IN: xml.state-parser
! * Basic underlying words
! Code stored in stdio
! Spot is composite so it won't be lost in sub-scopes
TUPLE: spot char line column next check ;
C: <spot> spot
: get-char ( -- char ) spot get char>> ;
: set-char ( char -- ) spot get swap >>char drop ;
: get-line ( -- line ) spot get line>> ;
: set-line ( line -- ) spot get swap >>line drop ;
: get-column ( -- column ) spot get column>> ;
: set-column ( column -- ) spot get swap >>column drop ;
: get-next ( -- char ) spot get next>> ;
: set-next ( char -- ) spot get swap >>next drop ;
: get-check ( -- ? ) spot get check>> ;
: check ( -- ) spot get t >>check drop ;
! * Errors
TUPLE: parsing-error line column ;
: parsing-error ( class -- obj )
new
get-line >>line
get-column >>column ;
M: parsing-error summary ( obj -- str )
[
"Parsing error" print
"Line: " write dup line>> .
"Column: " write column>> .
] with-string-writer ;
TUPLE: expected < parsing-error should-be was ;
: expected ( should-be was -- * )
\ expected parsing-error
swap >>was
swap >>should-be throw ;
M: expected summary ( obj -- str )
[
dup call-next-method write
"Token expected: " write dup should-be>> print
"Token present: " write was>> print
] with-string-writer ;
TUPLE: unexpected-end < parsing-error ;
: unexpected-end ( -- * ) \ unexpected-end parsing-error throw ;
M: unexpected-end summary ( obj -- str )
[
call-next-method write
"File unexpectedly ended." print
] with-string-writer ;
TUPLE: missing-close < parsing-error ;
: missing-close ( -- * ) \ missing-close parsing-error throw ;
M: missing-close summary ( obj -- str )
[
call-next-method write
"Missing closing token." print
] with-string-writer ;
SYMBOL: prolog-data
: version=1.0? ( -- ? )
prolog-data get [ version>> "1.0" = ] [ t ] if* ;
: assure-good-char ( ch -- ch )
[
version=1.0? over text? not get-check and
[ disallowed-char ] when
] [ f ] if* ;
! * Basic utility words
: record ( char -- )
CHAR: \n =
[ 0 get-line 1+ set-line ] [ get-column 1+ ] if
set-column ;
! (next) normalizes \r\n and \r
: (next) ( -- char )
get-next read1
2dup swap CHAR: \r = [
CHAR: \n =
[ nip read1 ] [ nip CHAR: \n swap ] if
] [ drop ] if
set-next dup set-char assure-good-char ;
: next ( -- )
#! Increment spot.
get-char [ unexpected-end ] unless (next) record ;
: next* ( -- )
get-char [ (next) record ] when ;
: skip-until ( quot: ( -- ? ) -- )
get-char [
[ call ] keep swap [ drop ] [
next skip-until
] if
] [ drop ] if ; inline recursive
: take-until ( quot -- string )
#! Take the substring of a string starting at spot
#! from code until the quotation given is true and
#! advance spot to after the substring.
10 <sbuf> [
'[ @ [ t ] [ get-char _ push f ] if ] skip-until
] keep >string ; inline
: take-rest ( -- string )
[ f ] take-until ;
: take-char ( ch -- string )
[ dup get-char = ] take-until nip ;
TUPLE: not-enough-characters < parsing-error ;
: not-enough-characters ( -- * )
\ not-enough-characters parsing-error throw ;
M: not-enough-characters summary ( obj -- str )
[
call-next-method write
"Not enough characters" print
] with-string-writer ;
: take ( n -- string )
[ 1- ] [ <sbuf> ] bi [
'[ drop get-char [ next _ push f ] [ t ] if* ] contains? drop
] keep get-char [ over push ] when* >string ;
: pass-blank ( -- )
#! Advance code past any whitespace, including newlines
[ get-char blank? not ] skip-until ;
: string-matches? ( string circular -- ? )
get-char over push-circular
sequence= ;
: take-string ( match -- string )
dup length <circular-string>
[ 2dup string-matches? ] take-until nip
dup length rot length 1- - head
get-char [ missing-close ] unless next ;
: expect ( ch -- )
get-char 2dup = [ 2drop ] [
[ 1string ] bi@ expected
] if next ;
: expect-string ( string -- )
dup [ get-char next ] replicate 2dup =
[ 2drop ] [ expected ] if ;
: init-parser ( -- )
0 1 0 f f <spot> spot set
read1 set-next next ;
: state-parse ( stream quot -- )
! with-input-stream implicitly creates a new scope which we use
swap [ init-parser call ] with-input-stream ; inline
: string-parse ( input quot -- )
[ <string-reader> ] dip state-parse ; inline

View File

@ -1 +0,0 @@
State-machined based text parsing framework

View File

@ -0,0 +1,19 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: accessors kernel namespaces ;
IN: xml.state
TUPLE: spot char line column next check ;
C: <spot> spot
: get-char ( -- char ) spot get char>> ;
: set-char ( char -- ) spot get swap >>char drop ;
: get-line ( -- line ) spot get line>> ;
: set-line ( line -- ) spot get swap >>line drop ;
: get-column ( -- column ) spot get column>> ;
: set-column ( column -- ) spot get swap >>column drop ;
: get-next ( -- char ) spot get next>> ;
: set-next ( char -- ) spot get swap >>next drop ;
: get-check ( -- ? ) spot get check>> ;
: check ( -- ) spot get t >>check drop ;

View File

@ -1,4 +1,11 @@
USING: tools.test xml.state-parser kernel io strings ascii ;
USING: tools.test xml.tokenize xml.state io.streams.string kernel io strings ascii ;
IN: xml.test.state
: string-parse ( str quot -- )
[ <string-reader> ] dip state-parse ;
: take-rest ( -- string )
[ f ] take-until ;
[ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test
[ 2 4 ] [ "12\n123" [ take-rest drop get-line get-column ] string-parse ] unit-test

View File

@ -3,7 +3,7 @@
IN: xml.tests
USING: kernel xml tools.test io namespaces make sequences
xml.errors xml.entities.html parser strings xml.data io.files
xml.writer xml.utilities xml.state-parser continuations assocs
xml.writer xml.utilities continuations assocs
sequences.deep accessors io.streams.string ;
! This is insufficient

View File

@ -3,11 +3,93 @@
USING: accessors arrays ascii assocs combinators locals
combinators.short-circuit fry io.encodings io.encodings.iana
io.encodings.string io.encodings.utf16 io.encodings.utf8 kernel make
math math.parser namespaces sequences sets splitting xml.state-parser
math math.parser namespaces sequences sets splitting xml.state
strings xml.char-classes xml.data xml.entities xml.errors hashtables
circular ;
circular io sbufs ;
IN: xml.tokenize
! Originally from state-parser
SYMBOL: prolog-data
: version=1.0? ( -- ? )
prolog-data get [ version>> "1.0" = ] [ t ] if* ;
: assure-good-char ( ch -- ch )
[
version=1.0? over text? not get-check and
[ disallowed-char ] when
] [ f ] if* ;
! * Basic utility words
: record ( char -- )
CHAR: \n =
[ 0 get-line 1+ set-line ] [ get-column 1+ ] if
set-column ;
! (next) normalizes \r\n and \r
: (next) ( -- char )
get-next read1
2dup swap CHAR: \r = [
CHAR: \n =
[ nip read1 ] [ nip CHAR: \n swap ] if
] [ drop ] if
set-next dup set-char assure-good-char ;
: next ( -- )
#! Increment spot.
get-char [ unexpected-end ] unless (next) record ;
: skip-until ( quot: ( -- ? ) -- )
get-char [
[ call ] keep swap [ drop ] [
next skip-until
] if
] [ drop ] if ; inline recursive
: take-until ( quot -- string )
#! Take the substring of a string starting at spot
#! from code until the quotation given is true and
#! advance spot to after the substring.
10 <sbuf> [
'[ @ [ t ] [ get-char _ push f ] if ] skip-until
] keep >string ; inline
: take-char ( ch -- string )
[ dup get-char = ] take-until nip ;
: pass-blank ( -- )
#! Advance code past any whitespace, including newlines
[ get-char blank? not ] skip-until ;
: string-matches? ( string circular -- ? )
get-char over push-circular
sequence= ;
: take-string ( match -- string )
dup length <circular-string>
[ 2dup string-matches? ] take-until nip
dup length rot length 1- - head
get-char [ missing-close ] unless next ;
: expect ( ch -- )
get-char 2dup = [ 2drop ] [
[ 1string ] bi@ expected
] if next ;
: expect-string ( string -- )
dup [ get-char next ] replicate 2dup =
[ 2drop ] [ expected ] if ;
: init-parser ( -- )
0 1 0 f f <spot> spot set
read1 set-next next ;
: state-parse ( stream quot -- )
! with-input-stream implicitly creates a new scope which we use
swap [ init-parser call ] with-input-stream ; inline
! XML namespace processing: ns = namespace
! A stack of hashtables
@ -81,9 +163,6 @@ SYMBOL: depth
: parse-name-starting ( string -- name )
take-name append interpret-name ;
: parse-simple-name ( -- name )
take-name <simple-name> ;
! -- Parsing strings
: parse-named-entity ( string -- )
@ -98,12 +177,20 @@ SYMBOL: depth
"x" ?head 16 10 ? base> ,
] [ parse-named-entity ] if ;
SYMBOL: pe-table
SYMBOL: in-dtd?
: parse-pe ( -- )
next CHAR: ; take-char dup next
pe-table get at [ % ] [ no-entity ] ?if ;
:: (parse-char) ( quot: ( ch -- ? ) -- )
get-char :> char
{
{ [ char not ] [ ] }
{ [ char quot call ] [ next ] }
{ [ char CHAR: & = ] [ parse-entity quot (parse-char) ] }
{ [ in-dtd? get char CHAR: % = and ] [ parse-pe quot (parse-char) ] }
[ char , next quot (parse-char) ]
} cond ; inline recursive
@ -131,11 +218,14 @@ SYMBOL: depth
get-char CHAR: / = dup [ next ] when
parse-name swap ;
: normalize-quote ( str -- str )
[ dup "\t\r\n" member? [ drop CHAR: \s ] when ] map ;
: (parse-quote) ( <-disallowed? ch -- string )
swap '[
dup _ = [ drop t ]
[ CHAR: < = _ and [ attr-w/< ] [ f ] if ] if
] parse-char get-char
] parse-char normalize-quote get-char
[ unclosed-quote ] unless ; inline
: parse-quote* ( <-disallowed? -- seq )
@ -145,12 +235,9 @@ SYMBOL: depth
: parse-quote ( -- seq )
f parse-quote* ;
: normalize-quot ( str -- str )
[ dup "\t\r\n" member? [ drop CHAR: \s ] when ] map ;
: parse-attr ( -- )
parse-name pass-blank CHAR: = expect pass-blank
t parse-quote* normalize-quot 2array , ;
t parse-quote* 2array , ;
: (middle-tag) ( -- )
pass-blank version=1.0? get-char name-start?
@ -193,7 +280,7 @@ SYMBOL: depth
: take-element-decl ( -- element-decl )
take-decl-contents <element-decl> ;
: take-attlist-decl ( -- doctype-decl )
: take-attlist-decl ( -- attlist-decl )
take-decl-contents <attlist-decl> ;
: take-notation-decl ( -- notation-decl )
@ -217,7 +304,11 @@ DEFER: direct
} case ;
: take-internal-subset ( -- seq )
[ (take-internal-subset) ] { } make ;
[
H{ } pe-table set
t in-dtd? set
(take-internal-subset)
] { } make ;
: (take-external-id) ( token -- external-id )
pass-blank {
@ -232,36 +323,33 @@ DEFER: direct
: only-blanks ( str -- )
[ blank? ] all? [ bad-decl ] unless ;
: nontrivial-doctype ( -- external-id internal-subset )
pass-blank get-char CHAR: [ = [
next take-internal-subset f swap close
] [
" >" take-until-one-of {
{ CHAR: \s [ (take-external-id) ] }
{ CHAR: > [ only-blanks f ] }
} case f
] if ;
: take-doctype-decl ( -- doctype-decl )
pass-blank " >" take-until-one-of {
{ CHAR: \s [
pass-blank get-char CHAR: [ = [
next take-internal-subset f swap
close
] [
" >" take-until-one-of {
{ CHAR: \s [ (take-external-id) ] }
{ CHAR: > [ only-blanks f ] }
} case f
] if
] }
{ CHAR: \s [ nontrivial-doctype ] }
{ CHAR: > [ f f ] }
} case <doctype-decl> ;
: take-entity-def ( -- entity-name entity-def )
: take-entity-def ( var -- entity-name entity-def )
take-word pass-blank get-char {
{ CHAR: ' [ parse-quote ] }
{ CHAR: " [ parse-quote ] }
[ drop take-external-id ]
} case ;
: associate-entity ( entity-name entity-def -- )
swap extra-entities get set-at ;
} case [ spin [ ?set-at ] change ] 2keep ;
: take-entity-decl ( -- entity-decl )
pass-blank get-char {
{ CHAR: % [ next pass-blank take-entity-def ] }
[ drop take-entity-def 2dup associate-entity ]
{ CHAR: % [ next pass-blank pe-table take-entity-def ] }
[ drop extra-entities take-entity-def ]
} case
close <entity-decl> ;
@ -282,13 +370,6 @@ DEFER: direct
[ drop take-directive ]
} case ;
: yes/no>bool ( string -- t/f )
{
{ "yes" [ t ] }
{ "no" [ f ] }
[ not-yes/no ]
} case ;
: assure-no-extra ( seq -- )
[ first ] map {
T{ name f "" "version" f }
@ -307,6 +388,13 @@ DEFER: direct
: prolog-encoding ( alist -- encoding )
T{ name f "" "encoding" f } swap at "UTF-8" or ;
: yes/no>bool ( string -- t/f )
{
{ "yes" [ t ] }
{ "no" [ f ] }
[ not-yes/no ]
} case ;
: prolog-standalone ( alist -- version )
T{ name f "" "standalone" f } swap at
[ yes/no>bool ] [ f ] if* ;

View File

@ -0,0 +1,101 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: help.markup help.syntax xml.data sequences strings ;
IN: xml.utilities
ABOUT: "xml.utilities"
ARTICLE: "xml.utilities" "Utilities for processing XML"
"Utilities for processing XML include..."
$nl
"System sfor creating words which dispatch on XML tags:"
{ $subsection POSTPONE: PROCESS: }
{ $subsection POSTPONE: TAG: }
"Getting parts of an XML document or tag:"
$nl
"Note: the difference between deep-tag-named and tag-named is that the former searches recursively among all children and children of children of the tag, while the latter only looks at the direct children, and is therefore more efficient."
{ $subsection tag-named }
{ $subsection tags-named }
{ $subsection deep-tag-named }
{ $subsection deep-tags-named }
{ $subsection get-id }
"Words for simplified generation of XML:"
{ $subsection build-tag* }
{ $subsection build-tag }
{ $subsection build-xml }
"Other relevant words:"
{ $subsection children>string }
{ $subsection children-tags }
{ $subsection first-child-tag }
{ $subsection assert-tag } ;
HELP: deep-tag-named
{ $values { "tag" "an XML tag or document" } { "name/string" "an XML name or string representing a name" } { "matching-tag" tag } }
{ $description "finds an XML tag with a matching name, recursively searching children and children of children" }
{ $see-also tags-named tag-named deep-tags-named } ;
HELP: deep-tags-named
{ $values { "tag" "an XML tag or document" } { "name/string" "an XML name or string representing a name" } { "tags-seq" "a sequence of tags" } }
{ $description "returns a sequence of all tags of a matching name, recursively searching children and children of children" }
{ $see-also tag-named deep-tag-named tags-named } ;
HELP: children>string
{ $values { "tag" "an XML tag or document" } { "string" "a string" } }
{ $description "concatenates the children of the tag, ignoring everything that's not a string" } ;
HELP: children-tags
{ $values { "tag" "an XML tag or document" } { "sequence" sequence } }
{ $description "gets the children of the tag that are themselves tags" }
{ $see-also first-child-tag } ;
HELP: first-child-tag
{ $values { "tag" "an XML tag or document" } { "tag" tag } }
{ $description "returns the first child of the given tag that is a tag" }
{ $see-also children-tags } ;
HELP: tag-named
{ $values { "tag" "an XML tag or document" }
{ "name/string" "an XML name or string representing the name" }
{ "matching-tag" tag } }
{ $description "finds the first tag with matching name which is the direct child of the given tag" }
{ $see-also deep-tags-named deep-tag-named tags-named } ;
HELP: tags-named
{ $values { "tag" "an XML tag or document" }
{ "name/string" "an XML name or string representing the name" }
{ "tags-seq" "a sequence of tags" } }
{ $description "finds all tags with matching name that are the direct children of the given tag" }
{ $see-also deep-tag-named deep-tags-named tag-named } ;
HELP: get-id
{ $values { "tag" "an XML tag or document" } { "id" "a string" } { "elem" "an XML element or f" } }
{ $description "finds the XML tag with the specified id, ignoring the namespace" } ;
HELP: PROCESS:
{ $syntax "PROCESS: word" }
{ $values { "word" "a new word to define" } }
{ $description "creates a new word to process XML tags" }
{ $see-also POSTPONE: TAG: } ;
HELP: TAG:
{ $syntax "TAG: tag word definition... ;" }
{ $values { "tag" "an xml tag name" } { "word" "an XML process" } }
{ $description "defines what a process should do when it encounters a specific tag" }
{ $examples { $code "PROCESS: x ( tag -- )\nTAG: a x drop \"hi\" write ;" } }
{ $see-also POSTPONE: PROCESS: } ;
HELP: build-tag*
{ $values { "items" "sequence of elements" } { "name" "string" }
{ "tag" tag } }
{ $description "builds a " { $link tag } " with the specified name, in the namespace \"\" and URL \"\" containing the children listed in item" }
{ $see-also build-tag build-xml } ;
HELP: build-tag
{ $values { "item" "an element" } { "name" string } { "tag" tag } }
{ $description "builds a " { $link tag } " with the specified name containing the single child item" }
{ $see-also build-tag* build-xml } ;
HELP: build-xml
{ $values { "tag" tag } { "xml" "an XML document" } }
{ $description "builds an XML document out of a tag" }
{ $see-also build-tag* build-tag } ;

View File

@ -0,0 +1,62 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: help.syntax help.markup io strings ;
IN: xml.writer
ABOUT: "xml.writer"
ARTICLE: "xml.writer" "Writing XML"
"These words are used in implementing prettyprint"
{ $subsection write-xml-chunk }
"These words are used to print XML normally"
{ $subsection xml>string }
{ $subsection write-xml }
{ $subsection print-xml }
"These words are used to prettyprint XML"
{ $subsection pprint-xml>string }
{ $subsection pprint-xml>string-but }
{ $subsection pprint-xml }
{ $subsection pprint-xml-but } ;
HELP: write-xml-chunk
{ $values { "object" "an XML element" } }
{ $description "writes an XML element to " { $link output-stream } "." }
{ $see-also write-xml-chunk write-xml } ;
HELP: xml>string
{ $values { "xml" "an xml document" } { "string" "a string" } }
{ $description "converts an XML document into a string" }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: pprint-xml>string
{ $values { "xml" "an xml document" } { "string" "a string" } }
{ $description "converts an XML document into a string in a prettyprinted form." }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: write-xml
{ $values { "xml" "an XML document" } }
{ $description "prints the contents of an XML document to " { $link output-stream } "." }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: print-xml
{ $values { "xml" "an XML document" } }
{ $description "prints the contents of an XML document to " { $link output-stream } ", followed by a newline" }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: pprint-xml
{ $values { "xml" "an XML document" } }
{ $description "prints the contents of an XML document to " { $link output-stream } " in a prettyprinted form." }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: pprint-xml-but
{ $values { "xml" "an XML document" } { "sensitive-tags" "a sequence of names" } }
{ $description "Prettyprints an XML document, leaving the whitespace of the tags with names in sensitive-tags intact." }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: pprint-xml>string-but
{ $values { "xml" "an XML document" } { "sensitive-tags" "a sequence of names" } { "string" string } }
{ $description "Prettyprints an XML document, returning the result as a string and leaving the whitespace of the tags with names in sensitive-tags intact." }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
{ xml>string print-xml write-xml pprint-xml pprint-xml>string pprint-xml>string-but pprint-xml-but } related-words

View File

@ -1,8 +1,6 @@
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: help.markup help.syntax kernel xml.data xml.errors
xml.writer state-parser xml.tokenize xml.utilities xml.entities
strings sequences io xml.entities.html ;
USING: help.markup help.syntax xml.data io ;
IN: xml
HELP: string>xml
@ -13,7 +11,7 @@ HELP: string>xml
HELP: read-xml
{ $values { "stream" "a stream that supports readln" }
{ "xml" "an XML document" } }
{ $description "exausts the given stream, reading an XML document from it" } ;
{ $description "exausts the given stream, reading an XML document from it. A binary stream, one without encoding, should be used as input, and the encoding is automatically detected." } ;
HELP: file>xml
{ $values { "filename" "a string representing a filename" }
@ -22,170 +20,10 @@ HELP: file>xml
{ string>xml read-xml file>xml } related-words
HELP: xml>string
{ $values { "xml" "an xml document" } { "string" "a string" } }
{ $description "converts an xml document (" { $link xml } ") into a string" }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: pprint-xml>string
{ $values { "xml" "an xml document" } { "string" "a string" } }
{ $description "converts an xml document (" { $link xml } ") into a string in a prettyprinted form." }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: xml-parse-error
{ $class-description "the exception class that all parsing errors in XML documents are in." } ;
HELP: xml-reprint
{ $values { "string" "a string of XML" } }
{ $description "parses XML and prints it out again, for testing purposes" }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: write-xml
{ $values { "xml" "an XML document" } }
{ $description "prints the contents of an XML document (" { $link xml } ") to " { $link output-stream } "." }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: print-xml
{ $values { "xml" "an XML document" } }
{ $description "prints the contents of an XML document (" { $link xml } ") to " { $link output-stream } ", followed by a newline" }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: pprint-xml
{ $values { "xml" "an XML document" } }
{ $description "prints the contents of an XML document (" { $link xml } ") to " { $link output-stream } " in a prettyprinted form." }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: pprint-xml-but
{ $values { "xml" "an XML document" } { "sensitive-tags" "a sequence of names" } }
{ $description "Prettyprints an XML document, leaving the whitespace of the tags with names in sensitive-tags intact." }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
HELP: pprint-xml>string-but
{ $values { "xml" "an XML document" } { "sensitive-tags" "a sequence of names" } { "string" string } }
{ $description "Prettyprints an XML document, returning the result as a string and leaving the whitespace of the tags with names in sensitive-tags intact." }
{ $notes "does not preserve what type of quotes were used or what data was omitted from version declaration" } ;
{ xml>string print-xml write-xml pprint-xml xml-reprint pprint-xml>string pprint-xml>string-but pprint-xml-but } related-words
HELP: PROCESS:
{ $syntax "PROCESS: word" }
{ $values { "word" "a new word to define" } }
{ $description "creates a new word to process XML tags" }
{ $see-also POSTPONE: TAG: } ;
HELP: TAG:
{ $syntax "TAG: tag word definition... ;" }
{ $values { "tag" "an xml tag name" } { "word" "an XML process" } }
{ $description "defines what a process should do when it encounters a specific tag" }
{ $examples { $code "PROCESS: x ( tag -- )\nTAG: a x drop \"hi\" write ;" } }
{ $see-also POSTPONE: PROCESS: } ;
HELP: build-tag*
{ $values { "items" "sequence of elements" } { "name" "string" }
{ "tag" tag } }
{ $description "builds a " { $link tag } " with the specified name, in the namespace \"\" and URL \"\" containing the children listed in item" }
{ $see-also build-tag build-xml } ;
HELP: build-tag
{ $values { "item" "an element" } { "name" string } { "tag" tag } }
{ $description "builds a " { $link tag } " with the specified name containing the single child item" }
{ $see-also build-tag* build-xml } ;
HELP: build-xml
{ $values { "tag" tag } { "xml" "an XML document" } }
{ $description "builds an XML document out of a tag" }
{ $see-also build-tag* build-tag } ;
HELP: tag
{ $class-description "tuple representing an XML tag, delegating to a " { $link
name } ", containing the slots attrs (an alist of names to strings) and children (a sequence). Tags implement the sequence protocol by acting like a sequence of its chidren, and the assoc protocol by acting like its attributes." }
{ $see-also <tag> name contained-tag xml } ;
HELP: <tag>
{ $values { "name" "an XML tag name" }
{ "attrs" "an alist of names to strings" }
{ "children" sequence }
{ "tag" tag } }
{ $description "constructs an XML " { $link tag } " with the name (not a string) and tag attributes specified in attrs and children specified" }
{ $see-also tag <contained-tag> build-tag build-tag* } ;
HELP: name
{ $class-description "represents an XML name, with the fields space (a string representing the namespace, as written in the document, tag (a string of the actual name of the tag) and url (a string of the URL that the namespace points to)" }
{ $see-also <name> tag } ;
HELP: <name>
{ $values { "space" "a string" } { "main" "a string" } { "url" "a string" }
{ "name" "an XML tag name" } }
{ $description "creates a name tuple with the name-space space and the tag-name tag and the tag-url url." }
{ $see-also name <tag> } ;
HELP: contained-tag
{ $class-description "delegates to tag representing a tag like <a/> with no contents. The tag attributes are accessed with tag-attrs" }
{ $see-also tag <contained-tag> } ;
HELP: <contained-tag>
{ $values { "name" "an XML tag name" }
{ "attrs" "an alist from names to strings" }
{ "tag" tag } }
{ $description "creates an empty tag (like <a/>) with the specified name and tag attributes. This delegates to tag" }
{ $see-also contained-tag <tag> } ;
HELP: xml
{ $class-description "tuple representing an XML document, delegating to the main tag, containing the fields prolog (the header <?xml...?>), before (whatever comes between the prolog and the main tag) and after (whatever comes after the main tag)" }
{ $see-also <xml> tag prolog } ;
HELP: <xml>
{ $values { "prolog" "an XML prolog" } { "before" "a sequence of XML elements" }
{ "body" tag } { "after" "a sequence of XML elements" } { "xml" "an XML document" } }
{ $description "creates an XML document, delegating to the main tag, with the specified prolog, before, and after" }
{ $see-also xml <tag> } ;
HELP: prolog
{ $class-description "represents an XML prolog, with the tuple fields version (containing \"1.0\" or \"1.1\"), encoding (a string representing the encoding type), and standalone (t or f, whether the document is standalone without external entities)" }
{ $see-also <prolog> xml } ;
HELP: <prolog>
{ $values { "version" "a string, 1.0 or 1.1" }
{ "encoding" "a string" } { "standalone" "a boolean" } { "prolog" "an XML prolog" } }
{ $description "creates an XML prolog tuple" }
{ $see-also prolog <xml> } ;
HELP: comment
{ $class-description "represents a comment in XML. Has one slot, text, which contains the string of the comment" }
{ $see-also <comment> } ;
HELP: <comment>
{ $values { "text" "a string" } { "comment" "a comment" } }
{ $description "creates an XML comment tuple" }
{ $see-also comment } ;
HELP: instruction
{ $class-description "represents an XML instruction, such as <?xsl stylesheet='foo.xml'?>. Contains one slot, text, which contains the string between the question marks." }
{ $see-also <instruction> } ;
HELP: <instruction>
{ $values { "text" "a string" } { "instruction" "an XML instruction" } }
{ $description "creates an XML parsing instruction, such as <?xsl stylesheet='foo.xml'?>." }
{ $see-also instruction } ;
HELP: names-match?
{ $values { "name1" "a name" } { "name2" "a name" } { "?" "t or f" } }
{ $description "checks to see if the two names match, that is, if all fields are equal, ignoring fields whose value is f in either name." }
{ $example "USING: prettyprint xml.data ;" "T{ name f \"rpc\" \"methodCall\" f } T{ name f f \"methodCall\" \"http://www.xmlrpc.org/\" } names-match? ." "t" }
{ $see-also name } ;
HELP: read-xml-chunk
{ $values { "stream" "an input stream" } { "seq" "a sequence of elements" } }
{ $description "rather than parse a document, as " { $link read-xml } " does, this word parses and returns a sequence of XML elements (tags, strings, etc), ie a document fragment. This is useful for pieces of XML which may have more than one main tag." }
{ $see-also write-xml-chunk read-xml } ;
HELP: get-id
{ $values { "tag" "an XML tag or document" } { "id" "a string" } { "elem" "an XML element or f" } }
{ $description "finds the XML tag with the specified id, ignoring the namespace" }
{ $see-also } ;
HELP: process
{ $values { "object" "an opener, closer, contained or text element" } }
{ $description "takes an XML event and, using the XML stack, processes it and adds it to the tree" } ;
{ $see-also read-xml } ;
HELP: sax
{ $values { "stream" "an input stream" } { "quot" "a quotation ( xml-elem -- )" } }
@ -193,33 +31,6 @@ HELP: sax
{ $notes "It is important to note that this is not SAX, merely an event-based XML view" }
{ $see-also read-xml } ;
HELP: opener
{ $class-description "describes an opening tag, like <a>. Contains two slots, name and attrs containing, respectively, the name of the tag and its attributes. Usually, the name-url will be f." }
{ $see-also closer contained } ;
HELP: closer
{ $class-description "describes a closing tag, like </a>. Contains one slot, name, containing the tag's name. Usually, the name-url will be f." }
{ $see-also opener contained } ;
HELP: contained
{ $class-description "represents a self-closing tag, like <a/>. Contains two slots, name and attrs containing, respectively, the name of the tag and its attributes. Usually, the name-url will be f." }
{ $see-also opener closer } ;
HELP: parse-text
{ $values { "string" "a string" } }
{ $description "moves the pointer from the current spot to the beginning of the next tag, parsing the text underneath, returning the text element it passed. This parses XML entities like &bar; &#97; and &amp;" }
{ $see-also parse-name } ;
HELP: parse-name
{ $values { "name" "an XML name" } }
{ $description "parses a " { $link name } " from the input stream. Returns a name with only the name-space and name-tag defined, with name-url=f" }
{ $see-also parse-text } ;
HELP: make-tag
{ $values { "tag" "an opener, closer or contained" } }
{ $description "assuming the pointer is just past a <, this word parses until the next > and emits a tuple representing the tag parsed" }
{ $see-also opener closer contained } ;
HELP: pull-xml
{ $class-description "represents the state of a pull-parser for XML. Has one slot, scope, which is a namespace which contains all relevant state information." }
{ $see-also <pull-xml> pull-event pull-elem } ;
@ -239,116 +50,6 @@ HELP: pull-event
{ $description "gets the next XML event from the given XML pull parser. Returns f upon exhaustion." }
{ $see-also pull-xml <pull-xml> pull-elem } ;
HELP: write-xml-chunk
{ $values { "object" "an XML element" } }
{ $description "writes an XML element to " { $link output-stream } "." }
{ $see-also write-xml-chunk write-xml } ;
HELP: deep-tag-named
{ $values { "tag" "an XML tag or document" } { "name/string" "an XML name or string representing a name" } { "matching-tag" tag } }
{ $description "finds an XML tag with a matching name, recursively searching children and children of children" }
{ $see-also tags-named tag-named deep-tags-named } ;
HELP: deep-tags-named
{ $values { "tag" "an XML tag or document" } { "name/string" "an XML name or string representing a name" } { "tags-seq" "a sequence of tags" } }
{ $description "returns a sequence of all tags of a matching name, recursively searching children and children of children" }
{ $see-also tag-named deep-tag-named tags-named } ;
HELP: children>string
{ $values { "tag" "an XML tag or document" } { "string" "a string" } }
{ $description "concatenates the children of the tag, ignoring everything that's not a string" } ;
HELP: children-tags
{ $values { "tag" "an XML tag or document" } { "sequence" sequence } }
{ $description "gets the children of the tag that are themselves tags" }
{ $see-also first-child-tag } ;
HELP: first-child-tag
{ $values { "tag" "an XML tag or document" } { "tag" tag } }
{ $description "returns the first child of the given tag that is a tag" }
{ $see-also children-tags } ;
HELP: multitags
{ $class-description "XML parsing error describing the case where there is more than one main tag in a document. Contains no slots" } ;
HELP: notags
{ $class-description "XML parsing error describing the case where an XML document contains no main tag, or any tags at all" } ;
HELP: extra-attrs
{ $class-description "XML parsing error describing the case where the XML prolog (<?xml ...?>) contains attributes other than the three allowed ones, standalone, version and encoding. Contains one slot, attrs, which is a hashtable of all the extra attributes' names. Delegates to " { $link parsing-error } "." } ;
HELP: nonexist-ns
{ $class-description "XML parsing error describing the case where a namespace doesn't exist but it is used in a tag. Contains one slot, name, which contains the name of the undeclared namespace, and delegates to " { $link parsing-error } "." } ;
HELP: not-yes/no
{ $class-description "XML parsing error used to describe the case where standalone is set in the XML prolog to something other than 'yes' or 'no'. Delegates to " { $link parsing-error } " and contains one slot, text, which contains offending value." } ;
HELP: unclosed
{ $class-description "XML parsing error used to describe the case where the XML document contains classes which are not closed by the end of the document. Contains one slot, tags, a sequence of names." } ;
HELP: mismatched
{ $class-description "XML parsing error describing mismatched tags, eg <a></c>. Contains two slots: open is the name of the opening tag and close is the name of the closing tag. Delegates to " { $link parsing-error } " showing the location of the closing tag" } ;
HELP: expected
{ $class-description "XML parsing error describing when an expected token was not present. Delegates to " { $link parsing-error } ". Contains two slots, should-be, which has the expected string, and was, which has the actual string." } ;
HELP: no-entity
{ $class-description "XML parsing error describing the use of an undefined entity in a case where standalone is marked yes. Delegates to " { $link parsing-error } ". Contains one slot, thing, containing a string representing the entity." } ;
HELP: open-tag
{ $class-description "represents a tag that does have children, ie is not a contained tag" }
{ $notes "the constructor used for this class is simply " { $link <tag> } "." }
{ $see-also tag contained-tag } ;
HELP: tag-named
{ $values { "tag" "an XML tag or document" }
{ "name/string" "an XML name or string representing the name" }
{ "matching-tag" tag } }
{ $description "finds the first tag with matching name which is the direct child of the given tag" }
{ $see-also deep-tags-named deep-tag-named tags-named } ;
HELP: tags-named
{ $values { "tag" "an XML tag or document" }
{ "name/string" "an XML name or string representing the name" }
{ "tags-seq" "a sequence of tags" } }
{ $description "finds all tags with matching name that are the direct children of the given tag" }
{ $see-also deep-tag-named deep-tags-named tag-named } ;
HELP: state-parse
{ $values { "stream" "an input stream" } { "quot" "a quotation ( -- )" } }
{ $description "takes a stream and runs an imperative parser on it, allowing words like " { $link next } " to be used within the context of the stream." } ;
HELP: pre/post-content
{ $class-description "describes the error where a non-whitespace string is used before or after the main tag in an XML document. Contains two slots: string contains the offending string, and pre? is t if it occured before the main tag and f if it occured after" } ;
HELP: unclosed-quote
{ $class-description "describes the error where a quotation for an attribute value is opened but not closed before the end of the document." } ;
HELP: bad-name
{ $class-description "describes the error where a name is used, for example in an XML tag or attribute key, which is invalid." } ;
HELP: quoteless-attr
{ $class-description "describes the error where an attribute of an XML tag is missing quotes around a value." } ;
HELP: entities
{ $description "a hash table from default XML entity names (like &amp; and &lt;) to the characters they represent. This is automatically included when parsing any XML document." }
{ $see-also html-entities } ;
HELP: html-entities
{ $description "a hash table from HTML entity names to their character values" }
{ $see-also entities with-html-entities } ;
HELP: with-entities
{ $values { "entities" "a hash table of strings to chars" }
{ "quot" "a quotation ( -- )" } }
{ $description "calls the quotation using the given table of entity values (symbolizing, eg, that &foo; represents CHAR: a) on top of the default XML entities" }
{ $see-also with-html-entities } ;
HELP: with-html-entities
{ $values { "quot" "a quotation ( -- )" } }
{ $description "calls the given quotation using HTML entity values" }
{ $see-also html-entities with-entities } ;
ARTICLE: { "xml" "reading" } "Reading XML"
"The following words are used to read something into an XML document"
{ $subsection string>xml }
@ -357,77 +58,8 @@ ARTICLE: { "xml" "reading" } "Reading XML"
{ $subsection string>xml-chunk }
{ $subsection file>xml } ;
ARTICLE: { "xml" "writing" } "Writing XML"
"These words are used in implementing prettyprint"
{ $subsection write-xml-chunk }
"These words are used to print XML normally"
{ $subsection xml>string }
{ $subsection write-xml }
{ $subsection print-xml }
"These words are used to prettyprint XML"
{ $subsection pprint-xml>string }
{ $subsection pprint-xml>string-but }
{ $subsection pprint-xml }
{ $subsection pprint-xml-but }
"This word reads and writes XML"
{ $subsection xml-reprint } ;
ARTICLE: { "xml" "classes" } "XML data classes"
"Data types that XML documents are made of:"
{ $subsection name }
{ $subsection tag }
{ $subsection contained-tag }
{ $subsection open-tag }
{ $subsection xml }
{ $subsection prolog }
{ $subsection comment }
{ $subsection instruction } ;
ARTICLE: { "xml" "construct" } "XML data constructors"
"These data types are constructed with:"
{ $subsection <name> }
{ $subsection <tag> }
{ $subsection <contained-tag> }
{ $subsection <xml> }
{ $subsection <prolog> }
{ $subsection <comment> }
{ $subsection <instruction> } ;
ARTICLE: { "xml" "utils" } "XML processing utilities"
"Utilities for processing XML include..."
$nl
"System sfor creating words which dispatch on XML tags:"
{ $subsection POSTPONE: PROCESS: }
{ $subsection POSTPONE: TAG: }
"Getting parts of an XML document or tag:"
$nl
"Note: the difference between deep-tag-named and tag-named is that the former searches recursively among all children and children of children of the tag, while the latter only looks at the direct children, and is therefore more efficient."
{ $subsection tag-named }
{ $subsection tags-named }
{ $subsection deep-tag-named }
{ $subsection deep-tags-named }
{ $subsection get-id }
"Words for simplified generation of XML:"
{ $subsection build-tag* }
{ $subsection build-tag }
{ $subsection build-xml }
"Other relevant words:"
{ $subsection children>string }
{ $subsection children-tags }
{ $subsection first-child-tag }
{ $subsection names-match? }
{ $subsection assert-tag } ;
ARTICLE: { "xml" "internal" } "Internals of the XML parser"
"The XML parser creates its own parsing framework to process XML documents. The parser operates on streams. Important words involved in processing are:"
{ $subsection parse-text }
{ $subsection make-tag }
{ $subsection parse-name }
{ $subsection process }
"The XML parser is implemented using the libs/state-parser module. For more information, see " { $link { "state-parser" "main" } } ;
ARTICLE: { "xml" "events" } "Event-based XML parsing"
"In addition to DOM-style parsing based around " { $link read-xml } ", the XML module also provides SAX-style event-based parsing. This uses much of the same data structures as normal XML, with the exception of the classes " { $link xml } " and " { $link tag } " and as such, the articles " { $link { "xml" "classes" } } " and " { $link { "xml" "construct" } } " may be useful in learning how to process documents in this way. Other useful words are:"
"In addition to DOM-style parsing based around " { $link read-xml } ", the XML module also provides SAX-style event-based parsing. This uses much of the same data structures as normal XML, with the exception of the classes " { $link xml } " and " { $link tag } " and as such, the article " { $vocab-link "xml.data" } " may be useful in learning how to process documents in this way. Other useful words are:"
{ $subsection sax }
{ $subsection opener }
{ $subsection closer }
@ -438,43 +70,15 @@ ARTICLE: { "xml" "events" } "Event-based XML parsing"
{ $subsection pull-event }
{ $subsection pull-elem } ;
ARTICLE: { "xml" "errors" } "XML parsing errors"
"The XML module provides a rich and highly inspectable set of parsing errors. All XML errors are described by the union class " { $link xml-parse-error } " but there are many classes contained in that:"
{ $subsection multitags }
{ $subsection notags }
{ $subsection extra-attrs }
{ $subsection nonexist-ns }
{ $subsection not-yes/no }
{ $subsection unclosed }
{ $subsection mismatched }
{ $subsection expected }
{ $subsection no-entity }
{ $subsection pre/post-content }
{ $subsection unclosed-quote }
{ $subsection bad-name }
{ $subsection quoteless-attr }
"Additionally, most of these errors delegate to " { $link parsing-error } " in order to provide more information"
$nl
"Note that, in parsing an XML document, only the first error is reported." ;
ARTICLE: { "xml" "entities" } "XML entities"
"When XML is parsed, entities like &foo; are replaced with the characters they represent. A few entities like &amp; and &lt; are defined by default, but more are available, and the set of entities can be customized. Below are some words involved in XML entities, defined in the vocabulary 'entities':"
{ $subsection entities }
{ $subsection html-entities }
{ $subsection with-entities }
{ $subsection with-html-entities } ;
ARTICLE: "xml" "XML parser"
"The " { $vocab-link "xml" } " vocabulary implements the XML 1.0 and 1.1 standards, converting strings of text into XML and vice versa."
{ $subsection { "xml" "reading" } }
{ $subsection { "xml" "writing" } }
{ $subsection { "xml" "classes" } }
{ $subsection { "xml" "construct" } }
{ $subsection { "xml" "utils" } }
{ $subsection { "xml" "internal" } }
{ $subsection { "xml" "events" } }
{ $subsection { "xml" "errors" } }
{ $subsection { "xml" "entities" } } ;
{ $vocab-subsection "Utilities for processing XML" "xml.utilities" }
{ $vocab-subsection "Writing XML" "xml.writer" }
{ $vocab-subsection "XML parsing errors" "xml.errors" }
{ $vocab-subsection "XML entities" "xml.entities" }
{ $vocab-subsection "XML data types" "xml.data" } ;
IN: xml

View File

@ -1,9 +1,9 @@
! Copyright (C) 2005, 2006 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays io io.encodings.binary io.files
io.streams.string kernel namespaces sequences xml.state-parser strings
io.streams.string kernel namespaces sequences strings
xml.backend xml.data xml.errors xml.tokenize ascii xml.entities
xml.writer assocs ;
xml.writer xml.state assocs ;
IN: xml
! -- Overall parser with data tree
@ -163,7 +163,3 @@ TUPLE: pull-xml scope ;
: file>xml ( filename -- xml )
binary <file-reader> read-xml ;
: xml-reprint ( string -- )
string>xml print-xml ;