Merge branch 'master' of git://factorcode.org/git/factor
commit
79a5b3491a
|
@ -14,3 +14,8 @@ bar
|
|||
|
||||
[ "hello\nworld" ] [ <" hello
|
||||
world"> ] unit-test
|
||||
|
||||
[ "hello" "world" ] [ <" hello"> <" world"> ] unit-test
|
||||
|
||||
[ "\nhi" ] [ <"
|
||||
hi"> ] unit-test
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
! Copyright (C) 2007 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: namespaces make parser lexer kernel sequences words
|
||||
quotations math accessors ;
|
||||
quotations math accessors locals ;
|
||||
IN: multiline
|
||||
|
||||
<PRIVATE
|
||||
|
@ -26,20 +26,27 @@ PRIVATE>
|
|||
(( -- string )) define-inline ; parsing
|
||||
|
||||
<PRIVATE
|
||||
: (parse-multiline-string) ( start-index end-text -- end-index )
|
||||
lexer get line-text>> [
|
||||
2dup start
|
||||
[ rot dupd [ swap subseq % ] 2dip length + ] [
|
||||
rot tail % "\n" % 0
|
||||
lexer get next-line swap (parse-multiline-string)
|
||||
|
||||
:: (parse-multiline-string) ( i end -- j )
|
||||
lexer get line-text>> :> text
|
||||
text [
|
||||
end text i start* [| j |
|
||||
i j text subseq % j end length +
|
||||
] [
|
||||
text i short tail % CHAR: \n ,
|
||||
lexer get next-line
|
||||
0 end (parse-multiline-string)
|
||||
] if*
|
||||
] [ nip unexpected-eof ] if* ;
|
||||
] [ end unexpected-eof ] if ;
|
||||
|
||||
PRIVATE>
|
||||
|
||||
: parse-multiline-string ( end-text -- str )
|
||||
[
|
||||
lexer get [ swap (parse-multiline-string) ] change-column drop
|
||||
] "" make rest ;
|
||||
lexer get
|
||||
[ 1+ swap (parse-multiline-string) ]
|
||||
change-column drop
|
||||
] "" make ;
|
||||
|
||||
: <"
|
||||
"\">" parse-multiline-string parsed ; parsing
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
Daniel Ehrenberg
|
|
@ -1,72 +0,0 @@
|
|||
USING: help.markup help.syntax ;
|
||||
IN: state-parser
|
||||
|
||||
ABOUT: { "state-parser" "main" }
|
||||
|
||||
ARTICLE: { "state-parser" "main" } "State-based parsing"
|
||||
"This module defines a state-based parsing mechanism. It was originally created for libs/xml, but is also used in libs/csv and can be easily used in new libraries or applications."
|
||||
{ $subsection spot }
|
||||
{ $subsection skip-until }
|
||||
{ $subsection take-until }
|
||||
{ $subsection take-char }
|
||||
{ $subsection take-string }
|
||||
{ $subsection next }
|
||||
{ $subsection state-parse }
|
||||
{ $subsection get-char }
|
||||
{ $subsection take-rest }
|
||||
{ $subsection string-parse }
|
||||
{ $subsection expect }
|
||||
{ $subsection expect-string }
|
||||
{ $subsection parsing-error } ;
|
||||
|
||||
HELP: get-char
|
||||
{ $values { "char" "the current character" } }
|
||||
{ $description "Accesses the current character of the stream that is being parsed" } ;
|
||||
|
||||
HELP: take-rest
|
||||
{ $values { "string" "the rest of the parser input" } }
|
||||
{ $description "Exhausts the stream of the parser input and returns a string representing the rest of the input" } ;
|
||||
|
||||
HELP: string-parse
|
||||
{ $values { "input" "a string" } { "quot" "a quotation ( -- )" } }
|
||||
{ $description "Calls the given quotation using the given string as parser input" }
|
||||
{ $see-also state-parse } ;
|
||||
|
||||
HELP: expect
|
||||
{ $values { "ch" "a number representing a character" } }
|
||||
{ $description "Asserts that the current character is the given ch, and moves to the next spot" }
|
||||
{ $see-also expect-string } ;
|
||||
|
||||
HELP: expect-string
|
||||
{ $values { "string" "a string" } }
|
||||
{ $description "Asserts that the current parsing spot is followed by the given string, and skips the parser past that string" }
|
||||
{ $see-also expect } ;
|
||||
|
||||
HELP: spot
|
||||
{ $var-description "This variable represents the location in the program. It is a tuple T{ spot f char line column next } where char is the current character, line is the line number, column is the column number, and next is the character that follows the current one in the input. The contents shouldn't be accessed directly but rather with the proxy words get-char set-char get-line etc." } ;
|
||||
|
||||
HELP: skip-until
|
||||
{ $values { "quot" "a quotation ( -- ? )" } }
|
||||
{ $description "executes " { $link next } " until the quotation yields false. Usually, the quotation will call " { $link get-char } " in its test, but not always." }
|
||||
{ $see-also take-until } ;
|
||||
|
||||
HELP: take-until
|
||||
{ $values { "quot" "a quotation ( -- ? )" } { "string" "a string" } }
|
||||
{ $description "like " { $link skip-until } " but records what it passes over and outputs the string." }
|
||||
{ $see-also skip-until take-char take-string } ;
|
||||
|
||||
HELP: take-char
|
||||
{ $values { "ch" "a character" } { "string" "a string" } }
|
||||
{ $description "records the document from the current spot to the first instance of the given character. Outputs the content between those two points." }
|
||||
{ $see-also take-until take-string } ;
|
||||
|
||||
HELP: take-string
|
||||
{ $values { "match" "a string to match" } { "string" "the portion of the XML document" } }
|
||||
{ $description "records the document from the current spot to the first instance of the given string. Outputs the content between those two points." }
|
||||
{ $notes "match may not contain a newline" } ;
|
||||
|
||||
HELP: next
|
||||
{ $description "originally written as " { $code "spot inc" } ", code that would no longer run, this word moves the state of the XML parser to the next place in the source file, keeping track of appropriate debugging information." } ;
|
||||
|
||||
HELP: parsing-error
|
||||
{ $class-description "class from which parsing errors inherit, containing information about which line and column the error occurred on. Contains two slots: line, an integer, and column, another integer" } ;
|
|
@ -1,158 +0,0 @@
|
|||
! Copyright (C) 2005, 2006 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: io io.streams.string kernel math namespaces sequences
|
||||
strings circular prettyprint debugger ascii sbufs fry summary
|
||||
accessors ;
|
||||
IN: state-parser
|
||||
|
||||
! * Basic underlying words
|
||||
! Code stored in stdio
|
||||
! Spot is composite so it won't be lost in sub-scopes
|
||||
TUPLE: spot char line column next ;
|
||||
|
||||
C: <spot> spot
|
||||
|
||||
: get-char ( -- char ) spot get char>> ;
|
||||
: set-char ( char -- ) spot get swap >>char drop ;
|
||||
: get-line ( -- line ) spot get line>> ;
|
||||
: set-line ( line -- ) spot get swap >>line drop ;
|
||||
: get-column ( -- column ) spot get column>> ;
|
||||
: set-column ( column -- ) spot get swap >>column drop ;
|
||||
: get-next ( -- char ) spot get next>> ;
|
||||
: set-next ( char -- ) spot get swap >>next drop ;
|
||||
|
||||
! * Errors
|
||||
TUPLE: parsing-error line column ;
|
||||
|
||||
: parsing-error ( class -- obj )
|
||||
new
|
||||
get-line >>line
|
||||
get-column >>column ;
|
||||
M: parsing-error summary ( obj -- str )
|
||||
[
|
||||
"Parsing error" print
|
||||
"Line: " write dup line>> .
|
||||
"Column: " write column>> .
|
||||
] with-string-writer ;
|
||||
|
||||
TUPLE: expected < parsing-error should-be was ;
|
||||
: expected ( should-be was -- * )
|
||||
\ expected parsing-error
|
||||
swap >>was
|
||||
swap >>should-be throw ;
|
||||
M: expected summary ( obj -- str )
|
||||
[
|
||||
dup call-next-method write
|
||||
"Token expected: " write dup should-be>> print
|
||||
"Token present: " write was>> print
|
||||
] with-string-writer ;
|
||||
|
||||
TUPLE: unexpected-end < parsing-error ;
|
||||
: unexpected-end ( -- * ) \ unexpected-end parsing-error throw ;
|
||||
M: unexpected-end summary ( obj -- str )
|
||||
[
|
||||
call-next-method write
|
||||
"File unexpectedly ended." print
|
||||
] with-string-writer ;
|
||||
|
||||
TUPLE: missing-close < parsing-error ;
|
||||
: missing-close ( -- * ) \ missing-close parsing-error throw ;
|
||||
M: missing-close summary ( obj -- str )
|
||||
[
|
||||
call-next-method write
|
||||
"Missing closing token." print
|
||||
] with-string-writer ;
|
||||
|
||||
SYMBOL: prolog-data
|
||||
|
||||
! * Basic utility words
|
||||
|
||||
: record ( char -- )
|
||||
CHAR: \n =
|
||||
[ 0 get-line 1+ set-line ] [ get-column 1+ ] if
|
||||
set-column ;
|
||||
|
||||
! (next) normalizes \r\n and \r
|
||||
: (next) ( -- char )
|
||||
get-next read1
|
||||
2dup swap CHAR: \r = [
|
||||
CHAR: \n =
|
||||
[ nip read1 ] [ nip CHAR: \n swap ] if
|
||||
] [ drop ] if
|
||||
set-next dup set-char ;
|
||||
|
||||
: next ( -- )
|
||||
#! Increment spot.
|
||||
get-char [ unexpected-end ] unless (next) record ;
|
||||
|
||||
: next* ( -- )
|
||||
get-char [ (next) record ] when ;
|
||||
|
||||
: skip-until ( quot: ( -- ? ) -- )
|
||||
get-char [
|
||||
[ call ] keep swap [ drop ] [
|
||||
next skip-until
|
||||
] if
|
||||
] [ drop ] if ; inline recursive
|
||||
|
||||
: take-until ( quot -- string )
|
||||
#! Take the substring of a string starting at spot
|
||||
#! from code until the quotation given is true and
|
||||
#! advance spot to after the substring.
|
||||
10 <sbuf> [
|
||||
'[ @ [ t ] [ get-char _ push f ] if ] skip-until
|
||||
] keep >string ; inline
|
||||
|
||||
: take-rest ( -- string )
|
||||
[ f ] take-until ;
|
||||
|
||||
: take-char ( ch -- string )
|
||||
[ dup get-char = ] take-until nip ;
|
||||
|
||||
TUPLE: not-enough-characters < parsing-error ;
|
||||
: not-enough-characters ( -- * )
|
||||
\ not-enough-characters parsing-error throw ;
|
||||
M: not-enough-characters summary ( obj -- str )
|
||||
[
|
||||
call-next-method write
|
||||
"Not enough characters" print
|
||||
] with-string-writer ;
|
||||
|
||||
: take ( n -- string )
|
||||
[ 1- ] [ <sbuf> ] bi [
|
||||
'[ drop get-char [ next _ push f ] [ t ] if* ] contains? drop
|
||||
] keep get-char [ over push ] when* >string ;
|
||||
|
||||
: pass-blank ( -- )
|
||||
#! Advance code past any whitespace, including newlines
|
||||
[ get-char blank? not ] skip-until ;
|
||||
|
||||
: string-matches? ( string circular -- ? )
|
||||
get-char over push-circular
|
||||
sequence= ;
|
||||
|
||||
: take-string ( match -- string )
|
||||
dup length <circular-string>
|
||||
[ 2dup string-matches? ] take-until nip
|
||||
dup length rot length 1- - head
|
||||
get-char [ missing-close ] unless next ;
|
||||
|
||||
: expect ( ch -- )
|
||||
get-char 2dup = [ 2drop ] [
|
||||
[ 1string ] bi@ expected
|
||||
] if next ;
|
||||
|
||||
: expect-string ( string -- )
|
||||
dup [ get-char next ] replicate 2dup =
|
||||
[ 2drop ] [ expected ] if ;
|
||||
|
||||
: init-parser ( -- )
|
||||
0 1 0 f <spot> spot set
|
||||
read1 set-next next ;
|
||||
|
||||
: state-parse ( stream quot -- )
|
||||
! with-input-stream implicitly creates a new scope which we use
|
||||
swap [ init-parser call ] with-input-stream ; inline
|
||||
|
||||
: string-parse ( input quot -- )
|
||||
[ <string-reader> ] dip state-parse ; inline
|
|
@ -1 +0,0 @@
|
|||
State-machine based text parsing framework
|
|
@ -65,7 +65,7 @@ IN: validators
|
|||
v-regexp ;
|
||||
|
||||
: v-url ( str -- str )
|
||||
"URL" R' (ftp|http|https)://\S+' v-regexp ;
|
||||
"URL" R' (?:ftp|http|https)://\S+' v-regexp ;
|
||||
|
||||
: v-captcha ( str -- str )
|
||||
dup empty? [ "must remain blank" throw ] unless ;
|
||||
|
|
|
@ -1,21 +1,33 @@
|
|||
! Copyright (C) 2005, 2007 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: kernel sequences unicode.syntax math math.order ;
|
||||
USING: kernel sequences unicode.syntax math math.order combinators ;
|
||||
IN: xml.char-classes
|
||||
|
||||
CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_ ;
|
||||
CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_: ;
|
||||
: 1.0name-start? ( char -- ? )
|
||||
dup 1.0name-start*? [ drop t ]
|
||||
[ HEX: 2BB HEX: 2C1 between? ] if ;
|
||||
|
||||
CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387 ;
|
||||
CATEGORY: 1.0name-char Ll Lu Lo Lt Nl Mc Me Mn Lm Nd _-.\u000387: ;
|
||||
|
||||
CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _ ;
|
||||
CATEGORY: 1.1name-start Ll Lu Lo Lm Ln Nl _: ;
|
||||
|
||||
CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7 ;
|
||||
CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ;
|
||||
|
||||
: name-start? ( 1.0? char -- ? )
|
||||
swap [ 1.0name-start? ] [ 1.1name-start? ] if ;
|
||||
|
||||
: name-char? ( 1.0? char -- ? )
|
||||
swap [ 1.0name-char? ] [ 1.1name-char? ] if ;
|
||||
|
||||
: text? ( 1.0? char -- ? )
    ! Tests whether char is a legal character in an XML document.
    ! 1.0? is true when checking against XML 1.0 and false for XML 1.1.
    ! 1.0:
    ! #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    ! 1.1:
    ! [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    !
    ! Below #x20 the two versions differ: 1.0 only admits tab, CR and LF,
    ! while 1.1 admits everything except #x0. The version flag is consumed
    ! by the first two clauses, so the remaining clauses see only char.
    {
        { [ dup HEX: 20 < ] [ swap [ "\t\r\n" member? ] [ zero? not ] if ] }
        { [ nip dup HEX: D800 < ] [ drop t ] }
        { [ dup HEX: E000 < ] [ drop f ] }
        [ { HEX: FFFE HEX: FFFF } member? not ]
    } cond ;
|
||||
|
|
|
@ -0,0 +1,152 @@
|
|||
USING: help.markup help.syntax sequences strings ;
|
||||
IN: xml.data
|
||||
|
||||
ABOUT: "xml.data"
|
||||
|
||||
ARTICLE: "xml.data" "XML data types"
|
||||
{ $vocab-link "xml.data" } " defines a simple document object model for XML. Everything is simply a tuple and can be manipulated as such."
|
||||
{ $subsection { "xml.data" "classes" } }
|
||||
{ $subsection { "xml.data" "constructors" } }
|
||||
"Simple words for manipulating names:"
|
||||
{ $subsection names-match? }
|
||||
{ $subsection assure-name }
|
||||
"For high-level tools for manipulating XML, see " { $vocab-link "xml.utilities" } ;
|
||||
|
||||
ARTICLE: { "xml.data" "classes" } "XML data classes"
|
||||
"Data types that XML documents are made of:"
|
||||
{ $subsection name }
|
||||
{ $subsection tag }
|
||||
{ $subsection contained-tag }
|
||||
{ $subsection open-tag }
|
||||
{ $subsection xml }
|
||||
{ $subsection prolog }
|
||||
{ $subsection comment }
|
||||
{ $subsection instruction }
|
||||
{ $subsection element-decl }
|
||||
{ $subsection attlist-decl }
|
||||
{ $subsection entity-decl }
|
||||
{ $subsection system-id }
|
||||
{ $subsection public-id }
|
||||
{ $subsection doctype-decl }
|
||||
{ $subsection notation-decl } ;
|
||||
|
||||
ARTICLE: { "xml.data" "constructors" } "XML data constructors"
|
||||
"These data types are constructed with:"
|
||||
{ $subsection <name> }
|
||||
{ $subsection <tag> }
|
||||
{ $subsection <contained-tag> }
|
||||
{ $subsection <xml> }
|
||||
{ $subsection <prolog> }
|
||||
{ $subsection <comment> }
|
||||
{ $subsection <instruction> }
|
||||
{ $subsection <simple-name> }
|
||||
{ $subsection <element-decl> }
|
||||
{ $subsection <attlist-decl> }
|
||||
{ $subsection <entity-decl> }
|
||||
{ $subsection <system-id> }
|
||||
{ $subsection <public-id> }
|
||||
{ $subsection <doctype-decl> }
|
||||
{ $subsection <notation-decl> } ;
|
||||
|
||||
HELP: tag
|
||||
{ $class-description "tuple representing an XML tag, delegating to a " { $link
|
||||
name } ", containing the slots attrs (an alist of names to strings) and children (a sequence). Tags implement the sequence protocol by acting like a sequence of its children, and the assoc protocol by acting like its attributes." }
|
||||
{ $see-also <tag> name contained-tag xml } ;
|
||||
|
||||
HELP: <tag>
|
||||
{ $values { "name" "an XML tag name" }
|
||||
{ "attrs" "an alist of names to strings" }
|
||||
{ "children" sequence }
|
||||
{ "tag" tag } }
|
||||
{ $description "constructs an XML " { $link tag } " with the name (not a string) and tag attributes specified in attrs and children specified" }
|
||||
{ $see-also tag <contained-tag> } ;
|
||||
|
||||
HELP: name
|
||||
{ $class-description "represents an XML name, with the fields space (a string representing the namespace, as written in the document), main (a string of the actual name of the tag) and url (a string of the URL that the namespace points to)" }
|
||||
{ $see-also <name> tag } ;
|
||||
|
||||
HELP: <name>
|
||||
{ $values { "space" "a string" } { "main" "a string" } { "url" "a string" }
|
||||
{ "name" "an XML tag name" } }
|
||||
{ $description "creates a name tuple with the namespace space, the tag name main and the namespace URL url." }
|
||||
{ $see-also name <tag> } ;
|
||||
|
||||
HELP: contained-tag
|
||||
{ $class-description "delegates to tag representing a tag like <a/> with no contents. The tag attributes are accessed with tag-attrs" }
|
||||
{ $see-also tag <contained-tag> } ;
|
||||
|
||||
HELP: <contained-tag>
|
||||
{ $values { "name" "an XML tag name" }
|
||||
{ "attrs" "an alist from names to strings" }
|
||||
{ "tag" tag } }
|
||||
{ $description "creates an empty tag (like <a/>) with the specified name and tag attributes. This delegates to tag" }
|
||||
{ $see-also contained-tag <tag> } ;
|
||||
|
||||
HELP: xml
|
||||
{ $class-description "tuple representing an XML document, delegating to the main tag, containing the fields prolog (the header <?xml...?>), before (whatever comes between the prolog and the main tag) and after (whatever comes after the main tag)" }
|
||||
{ $see-also <xml> tag prolog } ;
|
||||
|
||||
HELP: <xml>
|
||||
{ $values { "prolog" "an XML prolog" } { "before" "a sequence of XML elements" }
|
||||
{ "body" tag } { "after" "a sequence of XML elements" } { "xml" "an XML document" } }
|
||||
{ $description "creates an XML document, delegating to the main tag, with the specified prolog, before, and after" }
|
||||
{ $see-also xml <tag> } ;
|
||||
|
||||
HELP: prolog
|
||||
{ $class-description "represents an XML prolog, with the tuple fields version (containing \"1.0\" or \"1.1\"), encoding (a string representing the encoding type), and standalone (t or f, whether the document is standalone without external entities)" }
|
||||
{ $see-also <prolog> xml } ;
|
||||
|
||||
HELP: <prolog>
|
||||
{ $values { "version" "a string, 1.0 or 1.1" }
|
||||
{ "encoding" "a string" } { "standalone" "a boolean" } { "prolog" "an XML prolog" } }
|
||||
{ $description "creates an XML prolog tuple" }
|
||||
{ $see-also prolog <xml> } ;
|
||||
|
||||
HELP: comment
|
||||
{ $class-description "represents a comment in XML. Has one slot, text, which contains the string of the comment" }
|
||||
{ $see-also <comment> } ;
|
||||
|
||||
HELP: <comment>
|
||||
{ $values { "text" "a string" } { "comment" "a comment" } }
|
||||
{ $description "creates an XML comment tuple" }
|
||||
{ $see-also comment } ;
|
||||
|
||||
HELP: instruction
|
||||
{ $class-description "represents an XML instruction, such as <?xsl stylesheet='foo.xml'?>. Contains one slot, text, which contains the string between the question marks." }
|
||||
{ $see-also <instruction> } ;
|
||||
|
||||
HELP: <instruction>
|
||||
{ $values { "text" "a string" } { "instruction" "an XML instruction" } }
|
||||
{ $description "creates an XML parsing instruction, such as <?xsl stylesheet='foo.xml'?>." }
|
||||
{ $see-also instruction } ;
|
||||
|
||||
HELP: opener
|
||||
{ $class-description "describes an opening tag, like <a>. Contains two slots, name and attrs containing, respectively, the name of the tag and its attributes. Usually, the name-url will be f." }
|
||||
{ $see-also closer contained } ;
|
||||
|
||||
HELP: closer
|
||||
{ $class-description "describes a closing tag, like </a>. Contains one slot, name, containing the tag's name. Usually, the name-url will be f." }
|
||||
{ $see-also opener contained } ;
|
||||
|
||||
HELP: contained
|
||||
{ $class-description "represents a self-closing tag, like <a/>. Contains two slots, name and attrs containing, respectively, the name of the tag and its attributes. Usually, the name-url will be f." }
|
||||
{ $see-also opener closer } ;
|
||||
|
||||
HELP: open-tag
|
||||
{ $class-description "represents a tag that does have children, ie is not a contained tag" }
|
||||
{ $notes "the constructor used for this class is simply " { $link <tag> } "." }
|
||||
{ $see-also tag contained-tag } ;
|
||||
|
||||
HELP: names-match?
|
||||
{ $values { "name1" "a name" } { "name2" "a name" } { "?" "t or f" } }
|
||||
{ $description "checks to see if the two names match, that is, if all fields are equal, ignoring fields whose value is f in either name." }
|
||||
{ $example "USING: prettyprint xml.data ;" "T{ name f \"rpc\" \"methodCall\" f } T{ name f f \"methodCall\" \"http://www.xmlrpc.org/\" } names-match? ." "t" }
|
||||
{ $see-also name } ;
|
||||
|
||||
HELP: assure-name
|
||||
{ $values { "string/name" "a string or a name" } { "name" "a name" } }
|
||||
{ $description "Converts a string into an XML name, if it is not already a name." } ;
|
||||
|
||||
HELP: <simple-name>
|
||||
{ $values { "string" string } { "name" name } }
|
||||
{ $description "Converts a string into an XML name with an empty prefix and URL." } ;
|
|
@ -17,10 +17,13 @@ C: <name> name
|
|||
[ [ main>> ] bi@ ?= ] 2tri and and ;
|
||||
|
||||
: <simple-name> ( string -- name )
|
||||
"" swap f <name> ;
|
||||
|
||||
: <null-name> ( string -- name )
|
||||
f swap f <name> ;
|
||||
|
||||
: assure-name ( string/name -- name )
|
||||
dup name? [ <simple-name> ] unless ;
|
||||
dup name? [ <null-name> ] unless ;
|
||||
|
||||
TUPLE: opener name attrs ;
|
||||
C: <opener> opener
|
||||
|
@ -54,6 +57,9 @@ C: <public-id> public-id
|
|||
TUPLE: doctype-decl < directive name external-id internal-subset ;
|
||||
C: <doctype-decl> doctype-decl
|
||||
|
||||
TUPLE: notation-decl < directive name id ;
|
||||
C: <notation-decl> notation-decl
|
||||
|
||||
TUPLE: instruction text ;
|
||||
C: <instruction> instruction
|
||||
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
! Copyright (C) 2005, 2009 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: help.markup help.syntax ;
|
||||
IN: xml.entities
|
||||
|
||||
ABOUT: "xml.entities"
|
||||
|
||||
ARTICLE: "xml.entities" "XML entities"
|
||||
"When XML is parsed, entities like &foo; are replaced with the characters they represent. A few entities like &amp; and &lt; are defined by default, but more are available, and the set of entities can be customized. Below are some words involved in XML entities, defined in the vocabulary 'xml.entities':"
|
||||
{ $subsection entities }
|
||||
{ $subsection with-entities }
|
||||
"For entities used in HTML/XHTML, see " { $vocab-link "xml.entities.html" } ;
|
||||
|
||||
HELP: entities
|
||||
{ $description "a hash table from default XML entity names (like &amp; and &lt;) to the characters they represent. This is automatically included when parsing any XML document." }
|
||||
{ $see-also with-entities } ;
|
||||
|
||||
HELP: with-entities
|
||||
{ $values { "entities" "a hash table of strings to chars" }
|
||||
{ "quot" "a quotation ( -- )" } }
|
||||
{ $description "calls the quotation using the given table of entity values (symbolizing, eg, that &foo; represents CHAR: a) on top of the default XML entities" } ;
|
||||
|
|
@ -0,0 +1,18 @@
|
|||
! Copyright (C) 2005, 2009 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: help.markup help.syntax xml.entities ;
|
||||
IN: xml.entities.html
|
||||
|
||||
ARTICLE: "xml.entities.html" "HTML entities"
|
||||
{ $vocab-link "xml.entities.html" } " defines words for using entities defined in HTML/XHTML."
|
||||
{ $subsection html-entities }
|
||||
{ $subsection with-html-entities } ;
|
||||
|
||||
HELP: html-entities
|
||||
{ $description "a hash table from HTML entity names to their character values" }
|
||||
{ $see-also entities with-html-entities } ;
|
||||
|
||||
HELP: with-html-entities
|
||||
{ $values { "quot" "a quotation ( -- )" } }
|
||||
{ $description "calls the given quotation using HTML entity values" }
|
||||
{ $see-also html-entities with-entities } ;
|
|
@ -7,8 +7,10 @@ IN: xml.entities.html
|
|||
VALUE: html-entities
|
||||
|
||||
: read-entities-file ( file -- table )
|
||||
f swap binary <file-reader>
|
||||
[ 2drop extra-entities get ] sax ;
|
||||
H{ } clone [ extra-entities [
|
||||
binary <file-reader>
|
||||
[ drop ] sax
|
||||
] with-variable ] keep ;
|
||||
|
||||
: get-html ( -- table )
|
||||
{ "lat1" "special" "symbol" } [
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
! Copyright (C) 2005, 2009 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: help.markup help.syntax ;
|
||||
IN: xml.errors
|
||||
|
||||
HELP: multitags
|
||||
{ $class-description "XML parsing error describing the case where there is more than one main tag in a document. Contains no slots" } ;
|
||||
|
||||
HELP: notags
|
||||
{ $class-description "XML parsing error describing the case where an XML document contains no main tag, or any tags at all" } ;
|
||||
|
||||
HELP: extra-attrs
|
||||
{ $class-description "XML parsing error describing the case where the XML prolog (<?xml ...?>) contains attributes other than the three allowed ones, standalone, version and encoding. Contains one slot, attrs, which is a hashtable of all the extra attributes' names. Delegates to " { $link parsing-error } "." } ;
|
||||
|
||||
HELP: nonexist-ns
|
||||
{ $class-description "XML parsing error describing the case where a namespace doesn't exist but it is used in a tag. Contains one slot, name, which contains the name of the undeclared namespace, and delegates to " { $link parsing-error } "." } ;
|
||||
|
||||
HELP: not-yes/no
|
||||
{ $class-description "XML parsing error used to describe the case where standalone is set in the XML prolog to something other than 'yes' or 'no'. Delegates to " { $link parsing-error } " and contains one slot, text, which contains offending value." } ;
|
||||
|
||||
HELP: unclosed
|
||||
{ $class-description "XML parsing error used to describe the case where the XML document contains tags which are not closed by the end of the document. Contains one slot, tags, a sequence of names." } ;
|
||||
|
||||
HELP: mismatched
|
||||
{ $class-description "XML parsing error describing mismatched tags, eg <a></c>. Contains two slots: open is the name of the opening tag and close is the name of the closing tag. Delegates to " { $link parsing-error } " showing the location of the closing tag" } ;
|
||||
|
||||
HELP: expected
|
||||
{ $class-description "XML parsing error describing when an expected token was not present. Delegates to " { $link parsing-error } ". Contains two slots, should-be, which has the expected string, and was, which has the actual string." } ;
|
||||
|
||||
HELP: no-entity
|
||||
{ $class-description "XML parsing error describing the use of an undefined entity in a case where standalone is marked yes. Delegates to " { $link parsing-error } ". Contains one slot, thing, containing a string representing the entity." } ;
|
||||
|
||||
|
||||
HELP: pre/post-content
|
||||
{ $class-description "describes the error where a non-whitespace string is used before or after the main tag in an XML document. Contains two slots: string contains the offending string, and pre? is t if it occurred before the main tag and f if it occurred after" } ;
|
||||
|
||||
HELP: unclosed-quote
|
||||
{ $class-description "describes the error where a quotation for an attribute value is opened but not closed before the end of the document." } ;
|
||||
|
||||
HELP: bad-name
|
||||
{ $class-description "describes the error where a name is used, for example in an XML tag or attribute key, which is invalid." } ;
|
||||
|
||||
HELP: quoteless-attr
|
||||
{ $class-description "describes the error where an attribute of an XML tag is missing quotes around a value." } ;
|
||||
|
||||
HELP: xml-parse-error
|
||||
{ $class-description "the exception class that all parsing errors in XML documents are in." } ;
|
||||
|
||||
ARTICLE: "xml.errors" "XML parsing errors"
|
||||
{ $vocab-link "xml.errors" } " provides a rich and highly inspectable set of parsing errors. All XML errors are described by the union class " { $link xml-parse-error } " but there are many classes contained in that:"
|
||||
{ $subsection multitags }
|
||||
{ $subsection notags }
|
||||
{ $subsection extra-attrs }
|
||||
{ $subsection nonexist-ns }
|
||||
{ $subsection not-yes/no }
|
||||
{ $subsection unclosed }
|
||||
{ $subsection mismatched }
|
||||
{ $subsection expected }
|
||||
{ $subsection no-entity }
|
||||
{ $subsection pre/post-content }
|
||||
{ $subsection unclosed-quote }
|
||||
{ $subsection bad-name }
|
||||
{ $subsection quoteless-attr }
|
||||
"Additionally, most of these errors are a kind of " { $link parsing-error } " which provides more information"
|
||||
$nl
|
||||
"Note that, in parsing an XML document, only the first error is reported." ;
|
||||
|
||||
ABOUT: "xml.errors"
|
|
@ -1,5 +1,5 @@
|
|||
USING: continuations xml xml.errors tools.test kernel arrays
|
||||
xml.data state-parser quotations fry ;
|
||||
xml.data quotations fry ;
|
||||
IN: xml.errors.tests
|
||||
|
||||
: xml-error-test ( expected-error xml-string -- )
|
||||
|
@ -25,8 +25,12 @@ T{ capitalized-prolog f 1 6 "XmL" } "<?XmL version='1.0'?><x/>"
|
|||
xml-error-test
|
||||
T{ pre/post-content f "x" t } "x<y/>" xml-error-test
|
||||
T{ versionless-prolog f 1 8 } "<?xml?><x/>" xml-error-test
|
||||
T{ bad-instruction f 1 11 T{ instruction f "xsl" } }
|
||||
"<x><?xsl?></x>" xml-error-test
|
||||
T{ unclosed-quote f 1 13 } "<x value='/>" xml-error-test
|
||||
T{ bad-name f 1 3 "-" } "<-/>" xml-error-test
|
||||
T{ quoteless-attr f 1 10 } "<x value=3/>" xml-error-test
|
||||
T{ quoteless-attr f 1 10 } "<x value=3/>" xml-error-test
|
||||
T{ attr-w/< f 1 11 } "<x value='<'/>" xml-error-test
|
||||
T{ text-w/]]> f 1 6 } "<x>]]></x>" xml-error-test
|
||||
T{ duplicate-attr f 1 21 T{ name { space "" } { main "this" } } V{ "a" "b" } } "<x this='a' this='b'/>" xml-error-test
|
||||
T{ bad-cdata f 1 3 } "<![CDATA[]]><x/>" xml-error-test
|
||||
T{ bad-cdata f 1 7 } "<x/><![CDATA[]]>" xml-error-test
|
||||
T{ pre/post-content f "&" t } "&32;<x/>" xml-error-test
|
||||
|
|
|
@ -1,10 +1,61 @@
|
|||
! Copyright (C) 2005, 2006 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: xml.data xml.writer kernel generic io prettyprint math
|
||||
debugger sequences state-parser accessors summary
|
||||
debugger sequences xml.state accessors summary
|
||||
namespaces io.streams.string xml.backend ;
|
||||
IN: xml.errors
|
||||
|
||||
TUPLE: parsing-error line column ;
|
||||
|
||||
: parsing-error ( class -- obj )
|
||||
new
|
||||
get-line >>line
|
||||
get-column >>column ;
|
||||
M: parsing-error summary ( obj -- str )
|
||||
[
|
||||
"Parsing error" print
|
||||
"Line: " write dup line>> .
|
||||
"Column: " write column>> .
|
||||
] with-string-writer ;
|
||||
|
||||
TUPLE: expected < parsing-error should-be was ;
|
||||
: expected ( should-be was -- * )
|
||||
\ expected parsing-error
|
||||
swap >>was
|
||||
swap >>should-be throw ;
|
||||
M: expected summary ( obj -- str )
|
||||
[
|
||||
dup call-next-method write
|
||||
"Token expected: " write dup should-be>> print
|
||||
"Token present: " write was>> print
|
||||
] with-string-writer ;
|
||||
|
||||
TUPLE: unexpected-end < parsing-error ;
|
||||
: unexpected-end ( -- * ) \ unexpected-end parsing-error throw ;
|
||||
M: unexpected-end summary ( obj -- str )
|
||||
[
|
||||
call-next-method write
|
||||
"File unexpectedly ended." print
|
||||
] with-string-writer ;
|
||||
|
||||
TUPLE: missing-close < parsing-error ;
|
||||
: missing-close ( -- * ) \ missing-close parsing-error throw ;
|
||||
M: missing-close summary ( obj -- str )
|
||||
[
|
||||
call-next-method write
|
||||
"Missing closing token." print
|
||||
] with-string-writer ;
|
||||
|
||||
TUPLE: disallowed-char < parsing-error char ;
|
||||
|
||||
: disallowed-char ( char -- * )
|
||||
\ disallowed-char parsing-error swap >>char throw ;
|
||||
|
||||
M: disallowed-char summary
|
||||
[ call-next-method ]
|
||||
[ char>> "Disallowed character in XML document: " swap suffix ] bi
|
||||
append ;
|
||||
|
||||
ERROR: multitags ;
|
||||
|
||||
M: multitags summary ( obj -- str )
|
||||
|
@ -170,18 +221,6 @@ M: versionless-prolog summary ( obj -- str )
|
|||
"XML prolog lacks a version declaration" print
|
||||
] with-string-writer ;
|
||||
|
||||
TUPLE: bad-instruction < parsing-error instruction ;
|
||||
|
||||
: bad-instruction ( instruction -- * )
|
||||
\ bad-instruction parsing-error swap >>instruction throw ;
|
||||
|
||||
M: bad-instruction summary ( obj -- str )
|
||||
[
|
||||
dup call-next-method write
|
||||
"Misplaced processor instruction:" print
|
||||
instruction>> write-xml-chunk nl
|
||||
] with-string-writer ;
|
||||
|
||||
TUPLE: bad-directive < parsing-error dir ;
|
||||
|
||||
: bad-directive ( directive -- * )
|
||||
|
@ -194,13 +233,13 @@ M: bad-directive summary ( obj -- str )
|
|||
dir>> write
|
||||
] with-string-writer ;
|
||||
|
||||
TUPLE: bad-doctype-decl < parsing-error ;
|
||||
TUPLE: bad-decl < parsing-error ;
|
||||
|
||||
: bad-doctype-decl ( -- * )
|
||||
\ bad-doctype-decl parsing-error throw ;
|
||||
: bad-decl ( -- * )
|
||||
\ bad-decl parsing-error throw ;
|
||||
|
||||
M: bad-doctype-decl summary ( obj -- str )
|
||||
call-next-method "\nBad DOCTYPE" append ;
|
||||
M: bad-decl summary ( obj -- str )
|
||||
call-next-method "\nExtra content in directive" append ;
|
||||
|
||||
TUPLE: bad-external-id < parsing-error ;
|
||||
|
||||
|
@ -249,7 +288,53 @@ TUPLE: quoteless-attr < parsing-error ;
|
|||
M: quoteless-attr summary
|
||||
call-next-method "Attribute lacks quotes around value\n" append ;
|
||||
|
||||
UNION: xml-parse-error multitags notags extra-attrs nonexist-ns
|
||||
not-yes/no unclosed mismatched expected no-entity
|
||||
bad-prolog versionless-prolog capitalized-prolog bad-instruction
|
||||
bad-directive bad-name unclosed-quote quoteless-attr ;
|
||||
! Parse error with no slots beyond those of parsing-error.
TUPLE: attr-w/< < parsing-error ;
|
||||
|
||||
! Builds an attr-w/< error through parsing-error and throws it.
! NOTE(review): the stack effect declares a `value` input, but the tuple
! has no slot for it and the body never stores it -- confirm this is intended.
: attr-w/< ( value -- * )
|
||||
\ attr-w/< parsing-error throw ;
|
||||
|
||||
! Inherited summary with a fixed message appended.
M: attr-w/< summary
|
||||
call-next-method
|
||||
"Attribute value contains literal <" append ;
|
||||
|
||||
! Parse error with no slots beyond those of parsing-error.
TUPLE: text-w/]]> < parsing-error ;
|
||||
|
||||
! Builds a text-w/]]> error through parsing-error and throws it.
! NOTE(review): the stack effect declares a `text` input, but the tuple
! has no slot for it and the body never stores it -- confirm this is intended.
: text-w/]]> ( text -- * )
|
||||
\ text-w/]]> parsing-error throw ;
|
||||
|
||||
! Inherited summary with a fixed message appended.
M: text-w/]]> summary
|
||||
call-next-method
|
||||
"Text node contains ']]>'" append ;
|
||||
|
||||
! Parse error that adds key and values slots to parsing-error.
TUPLE: duplicate-attr < parsing-error key values ;
|
||||
|
||||
! Builds a duplicate-attr error through parsing-error, stores the inputs
! in the values and key slots, and throws it.
: duplicate-attr ( key values -- * )
|
||||
\ duplicate-attr parsing-error
|
||||
swap >>values swap >>key throw ;
|
||||
|
||||
! Inherited summary with a fixed message appended; the key and values
! slots are not included in the text.
M: duplicate-attr summary
|
||||
call-next-method "\nDuplicate attribute" append ;
|
||||
|
||||
! Parse error with no slots beyond those of parsing-error.
TUPLE: bad-cdata < parsing-error ;
|
||||
|
||||
! Builds a bad-cdata error through parsing-error and throws it.
: bad-cdata ( -- * )
|
||||
\ bad-cdata parsing-error throw ;
|
||||
|
||||
! Inherited summary with a fixed message appended.
M: bad-cdata summary
|
||||
call-next-method "\nCDATA occurs before or after main tag" append ;
|
||||
|
||||
! Parse error with no slots beyond those of parsing-error.
TUPLE: not-enough-characters < parsing-error ;
|
||||
! Builds a not-enough-characters error through parsing-error and throws it.
: not-enough-characters ( -- * )
|
||||
\ not-enough-characters parsing-error throw ;
|
||||
! Inherited summary followed by a "Not enough characters" line.
M: not-enough-characters summary ( obj -- str )
|
||||
[
|
||||
call-next-method write
|
||||
"Not enough characters" print
|
||||
] with-string-writer ;
|
||||
|
||||
! Union class whose members are the parse error classes listed below.
! NOTE(review): unexpected-end, missing-close, disallowed-char, bad-cdata
! and not-enough-characters are defined in this file but are not members
! of this union -- confirm whether that is intentional.
UNION: xml-parse-error
|
||||
multitags notags extra-attrs nonexist-ns bad-decl
|
||||
not-yes/no unclosed mismatched expected no-entity
|
||||
bad-prolog versionless-prolog capitalized-prolog
|
||||
bad-directive bad-name unclosed-quote quoteless-attr
|
||||
attr-w/< text-w/]]> duplicate-attr ;
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
Daniel Ehrenberg
|
|
@ -0,0 +1,4 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: tools.test xml.interpolate ;
|
||||
IN: xml.interpolate.tests
|
|
@ -0,0 +1,4 @@
|
|||
! Copyright (C) 2009 Daniel Ehrenberg.
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: ;
|
||||
IN: xml.interpolate
|
|
@ -0,0 +1,19 @@
|
|||
! Copyright (C) 2005, 2009 Daniel Ehrenberg
|
||||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: accessors kernel namespaces ;
|
||||
IN: xml.state
|
||||
|
||||
! Tuple with slots char, line, column, next and check. The accessor words
! in this vocabulary read and write it through the `spot` variable.
! Presumably this is the XML reader's current-position state -- confirm
! against the words that use it.
TUPLE: spot char line column next check ;
|
||||
|
||||
! Defines <spot> as the constructor word for spot.
C: <spot> spot
|
||||
|
||||
! Reads the char slot of the spot stored in the `spot` variable.
: get-char ( -- char ) spot get char>> ;
|
||||
! Stores a new value in the char slot of the spot stored in the `spot` variable.
: set-char ( char -- ) spot get swap >>char drop ;
|
||||
! Reads the line slot of the spot stored in the `spot` variable.
: get-line ( -- line ) spot get line>> ;
|
||||
! Stores a new value in the line slot of the spot stored in the `spot` variable.
: set-line ( line -- ) spot get swap >>line drop ;
|
||||
! Reads the column slot of the spot stored in the `spot` variable.
: get-column ( -- column ) spot get column>> ;
|
||||
! Stores a new value in the column slot of the spot stored in the `spot` variable.
: set-column ( column -- ) spot get swap >>column drop ;
|
||||
! Reads the next slot of the spot stored in the `spot` variable.
: get-next ( -- char ) spot get next>> ;
|
||||
! Stores a new value in the next slot of the spot stored in the `spot` variable.
: set-next ( char -- ) spot get swap >>next drop ;
|
||||
! Reads the check slot of the spot stored in the `spot` variable.
: get-check ( -- ? ) spot get check>> ;
|
||||
! Sets the check slot of the spot stored in the `spot` variable to t.
: check ( -- ) spot get t >>check drop ;
|
|
@ -25,9 +25,9 @@
|
|||
<directoryTitle xsi:type="xsd:string"></directoryTitle>
|
||||
<hostName xsi:type="xsd:string"></hostName>
|
||||
<relatedInformationPresent xsi:type="xsd:boolean">true</relatedInformationPresent>
|
||||
<snippet xsi:type="xsd:string">The O$-1òùReilly <b>Factor</b> with Bill OòùReilly on FOXNews.com. Bill OòùReilly hosts The <br> OòùReilly <b>Factor</b>, the most-watched program on cable news.</snippet>
|
||||
<snippet xsi:type="xsd:string">The O$-1òùReilly <b>Factor</b> with Bill OòùReilly on FOXNews.com. Bill OòùReilly hosts The <br> OòùReilly <b>Factor</b>, the most-watched program on cable news.</snippet>
|
||||
<summary xsi:type="xsd:string"></summary>
|
||||
<title xsi:type="xsd:string">Bill O$-1òùReilly | The OòùReilly <b>Factor</b> - FOXNews.com</title>
|
||||
<title xsi:type="xsd:string">Bill O$-1òùReilly | The OòùReilly <b>Factor</b> - FOXNews.com</title>
|
||||
</item>
|
||||
<item xsi:type="ns1:ResultElement">
|
||||
<URL xsi:type="xsd:string">http://www.factor.ca/</URL>
|
||||
|
|
|
@ -1,4 +1,11 @@
|
|||
USING: tools.test state-parser kernel io strings ascii ;
|
||||
USING: tools.test xml.tokenize xml.state io.streams.string kernel io strings ascii ;
|
||||
IN: xml.test.state
|
||||
|
||||
! Wraps the string in a string reader and hands the reader and the
! quotation to state-parse.
! NOTE(review): the declared effect has no outputs, yet the unit tests in
! this file expect values left behind by the quotation -- confirm the
! declaration.
: string-parse ( str quot -- )
|
||||
[ <string-reader> ] dip state-parse ;
|
||||
|
||||
! Calls take-until with a predicate that always yields f; presumably this
! consumes all remaining input -- confirm against take-until.
: take-rest ( -- string )
|
||||
[ f ] take-until ;
|
||||
|
||||
[ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test
|
||||
[ 2 4 ] [ "12\n123" [ take-rest drop get-line get-column ] string-parse ] unit-test
|
|
@ -3,7 +3,7 @@
|
|||
IN: xml.tests
|
||||
USING: kernel xml tools.test io namespaces make sequences
|
||||
xml.errors xml.entities.html parser strings xml.data io.files
|
||||
xml.writer xml.utilities state-parser continuations assocs
|
||||
xml.writer xml.utilities continuations assocs
|
||||
sequences.deep accessors io.streams.string ;
|
||||
|
||||
! This is insufficient
|
||||
|
@ -64,4 +64,5 @@ SYMBOL: xml-file
|
|||
[ t ] [ "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' >" dup string>xml-chunk [ write-xml-chunk ] with-string-writer = ] unit-test
|
||||
[ "foo" ] [ "<!ENTITY bar 'foo'><x>&bar;</x>" string>xml children>string ] unit-test
|
||||
[ V{ "hello" } ] [ "hello" string>xml-chunk ] unit-test
|
||||
[ 958 ] [ [ "ξ" string>xml-chunk ] with-html-entities first first ] unit-test
|
||||
[ 958 ] [ [ "ξ" string>xml-chunk ] with-html-entities first first ] unit-test
|
||||
[ "x" "<" ] [ "<x value='<'/>" string>xml [ name>> main>> ] [ "value" swap at ] bi ] unit-test
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
USING: accessors assocs combinators continuations fry generalizations
|
||||
io.pathnames kernel macros sequences stack-checker tools.test xml
|
||||
xml.utilities xml.writer arrays ;
|
||||
IN: xml.tests.suite
|
||||
|
||||
! One test-suite entry; >xml-test fills the slots from a TEST tag.
TUPLE: xml-test id uri sections description type ;
|
||||
|
||||
! Creates an xml-test and fills it from the tag: the TYPE, ID, URI and
! SECTIONS attributes go into the matching slots, and the tag's children
! are rendered with xml-chunk>string into the description slot.
: >xml-test ( tag -- test )
|
||||
xml-test new swap {
|
||||
[ "TYPE" swap at >>type ]
|
||||
[ "ID" swap at >>id ]
|
||||
[ "URI" swap at >>uri ]
|
||||
[ "SECTIONS" swap at >>sections ]
|
||||
[ children>> xml-chunk>string >>description ]
|
||||
} cleave ;
|
||||
|
||||
! Converts every tag named "TEST" found by tags-named into an xml-test.
: parse-tests ( xml -- tests )
|
||||
"TEST" tags-named [ >xml-test ] map ;
|
||||
|
||||
! Directory path that the test suite's relative URIs and "xmltest.xml"
! are appended to.
: base ( -- path ) "resource:basis/xml/tests/xmltest/" ;
|
||||
|
||||
! Expands to code that calls the quotation and then drops as many values
! as its inferred number of outputs.
MACRO: drop-output ( quot -- newquot )
|
||||
dup infer out>> '[ @ _ ndrop ] ;
|
||||
|
||||
! Expands to code that drops as many values as the quotation's inferred
! number of inputs; the quotation itself is not called.
MACRO: drop-input ( quot -- newquot )
|
||||
infer in>> '[ _ ndrop ] ;
|
||||
|
||||
! Runs the quotation under `recover`. The try branch discards the
! quotation's outputs and pushes f; the recovery branch drops the error,
! drops the quotation's inputs and pushes t. The result is t when the
! quotation throws.
: fails? ( quot -- ? )
|
||||
[ '[ _ drop-output f ] ]
|
||||
[ '[ drop _ drop-input t ] ] bi recover ; inline
|
||||
|
||||
! Returns "not-wf" when file>xml throws on the given path, "valid" otherwise.
: well-formed? ( uri -- answer )
|
||||
[ file>xml ] fails? "not-wf" "valid" ? ;
|
||||
|
||||
! Builds the two quotations for one test: `result` pushes the test's type
! slot, and `quot` appends the test's uri to `base` and runs well-formed?
! on it. The test object itself is the first element of `quot`.
: test-quots ( test -- result quot )
|
||||
[ type>> '[ _ ] ]
|
||||
[ '[ _ uri>> base swap append-path well-formed? ] ] bi ;
|
||||
|
||||
! Reads "xmltest.xml" under `base`, parses its tests, and maps each one to
! a two-element array holding the quotations from test-quots.
: xml-tests ( -- tests )
|
||||
base "xmltest.xml" append-path file>xml
|
||||
parse-tests [ test-quots 2array ] map ;
|
||||
|
||||
! Passes each pair from xml-tests to unit-test, iterating the sequence of
! pairs with assoc-each.
: run-xml-tests ( -- )
|
||||
xml-tests [ unit-test ] assoc-each ;
|
||||
|
||||
! Compares the first element of `result` with the value produced by
! calling `quot`.
: works? ( result quot -- ? )
|
||||
[ first ] [ call ] bi* = ;
|
||||
|
||||
! Splits the pairs from xml-tests by whether works? holds for each pair.
: partition-xml-tests ( -- successes failures )
|
||||
xml-tests [ first2 works? ] partition ;
|
||||
|
||||
! From the failing pairs, takes the first element of each pair's second
! quotation and keeps those whose type slot is "valid". That first element
! is presumably the xml-test embedded by test-quots -- confirm.
: failing-valids ( -- tests )
|
||||
partition-xml-tests nip [ second first ] map [ type>> "valid" = ] filter ;
|
|
@ -0,0 +1,44 @@
|
|||
<HTML>
|
||||
<TITLE>Canonical XML</TITLE>
|
||||
<BODY>
|
||||
<H1>Canonical XML</H1>
|
||||
<P>
|
||||
This document defines a subset of XML called canonical XML.
|
||||
The intended use of canonical XML is in testing XML processors,
|
||||
as a representation of the result of parsing an XML document.
|
||||
<P>
|
||||
Every well-formed XML document has a unique structurally equivalent
|
||||
canonical XML document. Two structurally equivalent XML
|
||||
documents have a byte-for-byte identical canonical XML document.
|
||||
Canonicalizing an XML document requires only information that an XML
|
||||
processor is required to make available to an application.
|
||||
<P>
|
||||
A canonical XML document conforms to the following grammar:
|
||||
<PRE>
|
||||
CanonXML ::= Pi* element Pi*
|
||||
element ::= Stag (Datachar | Pi | element)* Etag
|
||||
Stag ::= '<' Name Atts '>'
|
||||
Etag ::= '</' Name '>'
|
||||
Pi ::= '<?' Name ' ' (((Char - S) Char*)? - (Char* '?>' Char*)) '?>'
|
||||
Atts ::= (' ' Name '=' '"' Datachar* '"')*
|
||||
Datachar ::= '&amp;' | '&lt;' | '&gt;' | '&quot;'
|
||||
| '&#9;'| '&#10;'| '&#13;'
|
||||
| (Char - ('&' | '<' | '>' | '"' | #x9 | #xA | #xD))
|
||||
Name ::= (see XML spec)
|
||||
Char ::= (see XML spec)
|
||||
S ::= (see XML spec)
|
||||
</PRE>
|
||||
<P>
|
||||
Attributes are in lexicographical order (in Unicode bit order).
|
||||
<P>
|
||||
A canonical XML document is encoded in UTF-8.
|
||||
<P>
|
||||
Ignorable white space is considered significant and is treated equivalently
|
||||
to data.
|
||||
<P>
|
||||
<ADDRESS>
|
||||
<A HREF="mailto:jjc@jclark.com">James Clark</A>
|
||||
</ADDRESS>
|
||||
|
||||
</BODY>
|
||||
</HTML>
|
|
@ -0,0 +1,2 @@
|
|||
<!ENTITY % e "(#PCDATA">
|
||||
<!ELEMENT doc %e;)>
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "002.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1,2 @@
|
|||
<!ENTITY % e ">">
|
||||
<!ELEMENT doc (#PCDATA) %e;
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "005.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1,2 @@
|
|||
<!ENTITY % e "(#PCDATA)>">
|
||||
<!ELEMENT doc %e;
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "006.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1,3 @@
|
|||
<!ENTITY % e "INCLUDE[">
|
||||
<!ELEMENT doc (#PCDATA)>
|
||||
<![ %e; <!ATTLIST doc a1 CDATA "v1"> ]]>
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "022.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc a1="v1"></doc>
|
|
@ -0,0 +1 @@
|
|||
&e;
|
|
@ -0,0 +1,4 @@
|
|||
<!DOCTYPE doc [
|
||||
<!ENTITY e SYSTEM "001.ent">
|
||||
]>
|
||||
<doc>&e;</doc>
|
|
@ -0,0 +1,3 @@
|
|||
<?xml version="1.0" standalone="yes"?>
|
||||
data
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
<!DOCTYPE doc [
|
||||
<!ELEMENT doc (#PCDATA)>
|
||||
<!ENTITY e SYSTEM "002.ent">
|
||||
]>
|
||||
<doc>&e;</doc>
|
|
@ -0,0 +1,2 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?><?xml version="1.0" encoding="UTF-8"?>
|
||||
data
|
|
@ -0,0 +1,5 @@
|
|||
<!DOCTYPE doc [
|
||||
<!ELEMENT doc (#PCDATA)>
|
||||
<!ENTITY e SYSTEM "003.ent">
|
||||
]>
|
||||
<doc>&e;</doc>
|
|
@ -0,0 +1,3 @@
|
|||
<![ INCLUDE [
|
||||
<!ELEMENT doc (#PCDATA)>
|
||||
]>
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "001.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1,6 @@
|
|||
<!DOCTYPE doc [
|
||||
<!ELEMENT doc (#PCDATA)>
|
||||
<!ENTITY % e "<?xml version='1.0' encoding='UTF-8'?>">
|
||||
%e;
|
||||
]>
|
||||
<doc></doc>
|
|
@ -0,0 +1,2 @@
|
|||
<!ELEMENT doc (#PCDATA)>
|
||||
<![ IGNORE [
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "003.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1,2 @@
|
|||
<!ELEMENT doc (#PCDATA)>
|
||||
<![ INCLUDE [
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "004.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1,2 @@
|
|||
<!ELEMENT doc (#PCDATA)>
|
||||
%e;
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "005.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1,3 @@
|
|||
<![INCLUDE
|
||||
<!ELEMENT doc (#PCDATA)>
|
||||
]]>
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "006.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1,3 @@
|
|||
<!DOCTYPE doc [
|
||||
<!ELEMENT doc (#PCDATA)>
|
||||
]>
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "007.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1,2 @@
|
|||
<!ELEMENT doc ANY>
|
||||
<!ENTITY e "100%">
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "008.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1,3 @@
|
|||
<!ELEMENT doc EMPTY>
|
||||
<!ENTITY % e "<!--">
|
||||
%e; -->
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "009.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1,2 @@
|
|||
<!ENTITY % e "<!ELEMENT ">
|
||||
%e; doc (#PCDATA)>
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "010.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1,3 @@
|
|||
<!ENTITY % e1 "<!ELEMENT ">
|
||||
<!ENTITY % e2 ">">
|
||||
%e1; doc (#PCDATA) %e2;
|
|
@ -0,0 +1,2 @@
|
|||
<!DOCTYPE doc SYSTEM "011.ent">
|
||||
<doc></doc>
|
|
@ -0,0 +1,5 @@
|
|||
<doc>
|
||||
<doc
|
||||
?
|
||||
<a</a>
|
||||
</doc>
|
|
@ -0,0 +1,4 @@
|
|||
<doc>
|
||||
<.doc></.doc>
|
||||
</doc>
|
||||
|
|
@ -0,0 +1 @@
|
|||
<doc><? ?></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc><?target some data></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc><?target some data?</doc>
|
|
@ -0,0 +1 @@
|
|||
<doc><!-- a comment -- another --></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc>& no refc</doc>
|
|
@ -0,0 +1 @@
|
|||
<doc>&.entity;</doc>
|
|
@ -0,0 +1 @@
|
|||
<doc>&#RE;</doc>
|
|
@ -0,0 +1 @@
|
|||
<doc>A & B</doc>
|
|
@ -0,0 +1 @@
|
|||
<doc a1></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc a1=v1></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc a1="v1'></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc a1="<foo>"></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc a1=></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc a1="v1" "v2"></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc><![CDATA[</doc>
|
|
@ -0,0 +1 @@
|
|||
<doc><![CDATA [ stuff]]></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc></>
|
|
@ -0,0 +1 @@
|
|||
<doc a1="A & B"></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc a1="a&b"></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc a1="{:"></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc 12="34"></doc>
|
|
@ -0,0 +1,3 @@
|
|||
<doc>
|
||||
<123></123>
|
||||
</doc>
|
|
@ -0,0 +1 @@
|
|||
<doc>]]></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc>]]]></doc>
|
|
@ -0,0 +1,3 @@
|
|||
<doc>
|
||||
<!-- abc
|
||||
</doc>
|
|
@ -0,0 +1,4 @@
|
|||
<doc>
|
||||
<?a pi that is not closed
|
||||
</doc>
|
||||
|
|
@ -0,0 +1 @@
|
|||
<doc>abc]]]>def</doc>
|
|
@ -0,0 +1 @@
|
|||
<doc>A form feed () is not legal in data</doc>
|
|
@ -0,0 +1 @@
|
|||
<doc><?pi a form feed () is not allowed in a pi?></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc><!-- a form feed () is not allowed in a comment --></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc>abcdef</doc>
|
|
@ -0,0 +1 @@
|
|||
<doc>A form-feed is not white space or a name character</doc>
|
|
@ -0,0 +1 @@
|
|||
<doc>1 < 2 but not in XML</doc>
|
|
@ -0,0 +1,2 @@
|
|||
<doc></doc>
|
||||
Illegal data
|
|
@ -0,0 +1,2 @@
|
|||
<doc></doc>
|
||||
 
|
|
@ -0,0 +1 @@
|
|||
<doc x="foo" y="bar" x="baz"></doc>
|
|
@ -0,0 +1 @@
|
|||
<doc><a></aa></doc>
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue