Code cleanup in XML

db4
Daniel Ehrenberg 2009-01-29 16:57:13 -06:00
parent dd15816bad
commit af2706b75c
7 changed files with 75 additions and 71 deletions

View File

@ -2,14 +2,15 @@
! See http://factorcode.org/license.txt for BSD license.
USING: kernel namespaces xml.name io.encodings.utf8 xml.elements
io.encodings.utf16 xml.tokenize xml.state math ascii sequences
io.encodings.string io.encodings combinators ;
io.encodings.string io.encodings combinators accessors
xml.data io.encodings.iana ;
IN: xml.autoencoding
: continue-make-tag ( str -- tag )
parse-name-starting middle-tag end-tag ;
: start-utf16le ( -- tag )
utf16le decode-input-if
utf16le decode-input
"?\0" expect
check instruct ;
@ -17,20 +18,36 @@ IN: xml.autoencoding
-6 shift 3 bitand 2 = ;
: start<name ( ch -- tag )
! This is unfortunate, and exists for the corner case
! that the first letter of the document is < and second is
! not ASCII
ascii?
[ utf8 decode-input-if next make-tag ] [
[ utf8 decode-input next make-tag ] [
next
[ get-next 10xxxxxx? not ] take-until
get-char suffix utf8 decode
utf8 decode-input-if next
utf8 decode-input next
continue-make-tag
] if ;
: prolog-encoding ( prolog -- )
encoding>> dup "UTF-16" =
[ drop ] [ name>encoding [ decode-input ] when* ] if ;
: instruct-encoding ( instruct/prolog -- )
dup prolog?
[ prolog-encoding ]
[ drop utf8 decode-input ] if ;
: something ( -- )
check utf8 decode-input next next ;
: start< ( -- tag )
! What if first letter of processing instruction is non-ASCII?
get-next {
{ 0 [ next next start-utf16le ] }
{ CHAR: ? [ check next next instruct ] } ! XML prolog parsing sets the encoding
{ CHAR: ! [ check utf8 decode-input next next direct ] }
{ CHAR: ? [ something instruct dup instruct-encoding ] }
{ CHAR: ! [ something direct ] }
[ check start<name ]
} case ;
@ -39,7 +56,7 @@ IN: xml.autoencoding
"<" expect check make-tag ;
: decode-expecting ( encoding string -- tag )
[ decode-input-if next ] [ expect ] bi* check make-tag ;
[ decode-input next ] [ expect ] bi* check make-tag ;
: start-utf16be ( -- tag )
utf16be "<" decode-expecting ;
@ -57,8 +74,6 @@ IN: xml.autoencoding
{ HEX: EF [ skip-utf8-bom ] }
{ HEX: FF [ skip-utf16le-bom ] }
{ HEX: FE [ skip-utf16be-bom ] }
{ f [ "" ] }
[ drop utf8 decode-input-if f ]
! Same problem as with <e`>, in the case of XML chunks?
} case check ;
[ drop utf8 decode-input check f ]
} case ;

View File

@ -26,7 +26,7 @@ CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ;
! 1.1:
! [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
{
{ [ dup HEX: 20 < ] [ "\t\r\n" member? and ] }
{ [ dup HEX: 20 < ] [ swap [ "\t\r\n" member? ] [ zero? not ] if ] }
{ [ nip dup HEX: D800 < ] [ drop t ] }
{ [ dup HEX: E000 < ] [ drop f ] }
[ { HEX: FFFE HEX: FFFF } member? not ]

View File

@ -29,7 +29,7 @@ IN: xml.elements
parse-name swap ;
: (middle-tag) ( -- )
pass-blank version=1.0? get-char name-start?
pass-blank version-1.0? get-char name-start?
[ parse-attr (middle-tag) ] when ;
: assure-no-duplicates ( attrs-alist -- attrs-alist )
@ -66,7 +66,8 @@ IN: xml.elements
: prolog-version ( alist -- version )
T{ name { space "" } { main "version" } } swap at
[ good-version ] [ versionless-prolog ] if* ;
[ good-version ] [ versionless-prolog ] if*
dup set-version ;
: prolog-encoding ( alist -- encoding )
T{ name { space "" } { main "encoding" } } swap at
@ -89,16 +90,9 @@ IN: xml.elements
[ prolog-standalone ]
tri <prolog> ;
SYMBOL: string-input?
: decode-input-if ( encoding -- )
string-input? get [ drop ] [ decode-input ] if ;
: parse-prolog ( -- prolog )
pass-blank middle-tag "?>" expect
dup assure-no-extra prolog-attrs
dup encoding>> dup "UTF-16" =
[ drop ] [ name>encoding [ decode-input-if ] when* ] if
dup prolog-data set ;
dup assure-no-extra prolog-attrs ;
: instruct ( -- instruction )
take-name {

View File

@ -3,7 +3,7 @@
USING: accessors kernel namespaces io ;
IN: xml.state
TUPLE: spot char line column next check ;
TUPLE: spot char line column next check version-1.0? ;
C: <spot> spot
@ -17,11 +17,12 @@ C: <spot> spot
: set-next ( char -- ) spot get swap >>next drop ;
: get-check ( -- ? ) spot get check>> ;
: check ( -- ) spot get t >>check drop ;
: version-1.0? ( -- ? ) spot get version-1.0?>> ;
: set-version ( string -- )
spot get swap "1.0" = >>version-1.0? drop ;
SYMBOL: xml-stack
SYMBOL: prolog-data
SYMBOL: depth
SYMBOL: interpolating?

View File

@ -51,14 +51,18 @@ SYMBOL: xml-file
[ "foo" ] [ "<boo><![CDATA[foo]]></boo>" string>xml children>string ] unit-test
[ "<!-- B+, B, or B--->" string>xml ] must-fail
[ ] [ "<?xml version='1.0'?><!-- declarations for <head> & <body> --><foo/>" string>xml drop ] unit-test
[ T{ element-decl f "br" "EMPTY" } ] [ "<!ELEMENT br EMPTY>" string>dtd directives>> first ] unit-test
[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "<!ELEMENT p (#PCDATA|emph)*>" string>dtd directives>> first ] unit-test
[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "<!ELEMENT %name.para; %content.para;>" string>dtd directives>> first ] unit-test
[ T{ element-decl f "container" "ANY" } ] [ "<!ELEMENT container ANY>" string>dtd directives>> first ] unit-test
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo>" string>xml-chunk first ] unit-test
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo >" string>xml-chunk first ] unit-test
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM 'blah.dtd'>" string>xml-chunk first ] unit-test
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM \"blah.dtd\" >" string>xml-chunk first ] unit-test
: first-thing ( seq -- elt )
[ "" = not ] filter first ;
[ T{ element-decl f "br" "EMPTY" } ] [ "<!ELEMENT br EMPTY>" string>dtd directives>> first-thing ] unit-test
[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "<!ELEMENT p (#PCDATA|emph)*>" string>dtd directives>> first-thing ] unit-test
[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "<!ELEMENT %name.para; %content.para;>" string>dtd directives>> first-thing ] unit-test
[ T{ element-decl f "container" "ANY" } ] [ "<!ELEMENT container ANY>" string>dtd directives>> first-thing ] unit-test
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo>" string>xml-chunk first-thing ] unit-test
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo >" string>xml-chunk first-thing ] unit-test
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM 'blah.dtd'>" string>xml-chunk first-thing ] unit-test
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM \"blah.dtd\" >" string>xml-chunk first-thing ] unit-test
[ 958 ] [ [ "&xi;" string>xml-chunk ] with-html-entities first first ] unit-test
[ "x" "<" ] [ "<x value='&lt;'/>" string>xml [ name>> main>> ] [ "value" attr ] bi ] unit-test
[ "foo" ] [ "<!DOCTYPE foo [<!ENTITY bar 'foo'>]><x>&bar;</x>" string>xml children>string ] unit-test

View File

@ -6,12 +6,9 @@ circular xml.entities assocs make splitting math.parser
locals combinators arrays ;
IN: xml.tokenize
: version=1.0? ( -- ? )
prolog-data get [ version>> "1.0" = ] [ t ] if* ;
: assure-good-char ( ch -- ch )
[
version=1.0? over text? not get-check and
version-1.0? over text? not get-check and
[ disallowed-char ] when
] [ f ] if* ;
@ -36,7 +33,7 @@ IN: xml.tokenize
get-char [ unexpected-end ] unless (next) record ;
: init-parser ( -- )
0 1 0 f f <spot> spot set
0 1 0 f f t <spot> spot set
read1 set-next next ;
: with-state ( stream quot -- )

View File

@ -22,7 +22,7 @@ GENERIC: process ( object -- )
M: object process add-child ;
M: prolog process
xml-stack get V{ { f V{ } } } =
xml-stack get { V{ { f V{ "" } } } V{ { f V{ } } } } member?
[ bad-prolog ] unless drop ;
M: directive process
@ -49,17 +49,14 @@ M: closer process
: init-xml-stack ( -- )
V{ } clone xml-stack set
extra-entities [ H{ } assoc-like ] change
f push-xml ;
: default-prolog ( -- prolog )
"1.0" "UTF-8" f <prolog> ;
: reset-prolog ( -- )
default-prolog prolog-data set ;
: init-xml ( -- )
reset-prolog init-xml-stack init-ns-stack ;
init-ns-stack
extra-entities [ H{ } assoc-like ] change ;
: assert-blanks ( seq pre? -- )
swap [ string? ] filter
@ -80,7 +77,11 @@ M: closer process
! this does *not* affect the contents of the stack
[ notags ] unless* ;
: make-xml-doc ( prolog seq -- xml-doc )
: get-prolog ( seq -- prolog )
first dup prolog? [ drop default-prolog ] unless ;
: make-xml-doc ( seq -- xml-doc )
[ get-prolog ] keep
dup [ tag? ] find
[ assure-tags cut rest no-pre/post no-post-tags ] dip
swap <xml> ;
@ -95,8 +96,7 @@ TUPLE: pull-xml scope ;
: <pull-xml> ( -- pull-xml )
[
input-stream [ ] change ! bring var in this scope
init-parser reset-prolog init-ns-stack
text-now? on
init-xml text-now? on
] H{ } make-assoc
pull-xml boa ;
! pull-xml needs to call start-document somewhere
@ -135,50 +135,43 @@ PRIVATE>
get-char [ make-tag call-under xml-loop ]
[ drop ] if ; inline recursive
: read-seq ( stream quot n -- seq )
rot [
depth set
init-xml init-xml-stack
call
[ process ] xml-loop
done? [ unclosed ] unless
xml-stack get first second
] with-state ; inline
PRIVATE>
: each-element ( stream quot: ( xml-elem -- ) -- )
swap [
reset-prolog init-ns-stack
init-xml
start-document [ call-under ] when*
xml-loop
] with-state ; inline
: (read-xml) ( -- )
start-document [ process ] when*
[ process ] xml-loop ; inline
: (read-xml-chunk) ( stream -- prolog seq )
[
init-xml (read-xml)
done? [ unclosed ] unless
xml-stack get first second
prolog-data get swap
] with-state ;
: read-xml ( stream -- xml )
0 depth
[ (read-xml-chunk) make-xml-doc ] with-variable ;
[ start-document [ process ] when* ]
0 read-seq make-xml-doc ;
: read-xml-chunk ( stream -- seq )
1 depth
[ (read-xml-chunk) nip ] with-variable
<xml-chunk> ;
[ check ] 1 read-seq <xml-chunk> ;
: string>xml ( string -- xml )
t string-input?
[ <string-reader> read-xml ] with-variable ;
<string-reader> [ check ] 0 read-seq make-xml-doc ;
: string>xml-chunk ( string -- xml )
t string-input?
[ <string-reader> read-xml-chunk ] with-variable ;
<string-reader> read-xml-chunk ;
: file>xml ( filename -- xml )
binary <file-reader> read-xml ;
: read-dtd ( stream -- dtd )
[
reset-prolog
H{ } clone extra-entities set
take-internal-subset
] with-state ;