Code cleanup in XML
parent
dd15816bad
commit
af2706b75c
|
@ -2,14 +2,15 @@
|
|||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: kernel namespaces xml.name io.encodings.utf8 xml.elements
|
||||
io.encodings.utf16 xml.tokenize xml.state math ascii sequences
|
||||
io.encodings.string io.encodings combinators ;
|
||||
io.encodings.string io.encodings combinators accessors
|
||||
xml.data io.encodings.iana ;
|
||||
IN: xml.autoencoding
|
||||
|
||||
: continue-make-tag ( str -- tag )
|
||||
parse-name-starting middle-tag end-tag ;
|
||||
|
||||
: start-utf16le ( -- tag )
|
||||
utf16le decode-input-if
|
||||
utf16le decode-input
|
||||
"?\0" expect
|
||||
check instruct ;
|
||||
|
||||
|
@ -17,20 +18,36 @@ IN: xml.autoencoding
|
|||
-6 shift 3 bitand 2 = ;
|
||||
|
||||
: start<name ( ch -- tag )
|
||||
! This is unfortunate, and exists for the corner case
|
||||
! that the first letter of the document is < and second is
|
||||
! not ASCII
|
||||
ascii?
|
||||
[ utf8 decode-input-if next make-tag ] [
|
||||
[ utf8 decode-input next make-tag ] [
|
||||
next
|
||||
[ get-next 10xxxxxx? not ] take-until
|
||||
get-char suffix utf8 decode
|
||||
utf8 decode-input-if next
|
||||
utf8 decode-input next
|
||||
continue-make-tag
|
||||
] if ;
|
||||
|
||||
|
||||
: prolog-encoding ( prolog -- )
|
||||
encoding>> dup "UTF-16" =
|
||||
[ drop ] [ name>encoding [ decode-input ] when* ] if ;
|
||||
|
||||
: instruct-encoding ( instruct/prolog -- )
|
||||
dup prolog?
|
||||
[ prolog-encoding ]
|
||||
[ drop utf8 decode-input ] if ;
|
||||
|
||||
: something ( -- )
|
||||
check utf8 decode-input next next ;
|
||||
|
||||
: start< ( -- tag )
|
||||
! What if first letter of processing instruction is non-ASCII?
|
||||
get-next {
|
||||
{ 0 [ next next start-utf16le ] }
|
||||
{ CHAR: ? [ check next next instruct ] } ! XML prolog parsing sets the encoding
|
||||
{ CHAR: ! [ check utf8 decode-input next next direct ] }
|
||||
{ CHAR: ? [ something instruct dup instruct-encoding ] }
|
||||
{ CHAR: ! [ something direct ] }
|
||||
[ check start<name ]
|
||||
} case ;
|
||||
|
||||
|
@ -39,7 +56,7 @@ IN: xml.autoencoding
|
|||
"<" expect check make-tag ;
|
||||
|
||||
: decode-expecting ( encoding string -- tag )
|
||||
[ decode-input-if next ] [ expect ] bi* check make-tag ;
|
||||
[ decode-input next ] [ expect ] bi* check make-tag ;
|
||||
|
||||
: start-utf16be ( -- tag )
|
||||
utf16be "<" decode-expecting ;
|
||||
|
@ -57,8 +74,6 @@ IN: xml.autoencoding
|
|||
{ HEX: EF [ skip-utf8-bom ] }
|
||||
{ HEX: FF [ skip-utf16le-bom ] }
|
||||
{ HEX: FE [ skip-utf16be-bom ] }
|
||||
{ f [ "" ] }
|
||||
[ drop utf8 decode-input-if f ]
|
||||
! Same problem as with <e`>, in the case of XML chunks?
|
||||
} case check ;
|
||||
[ drop utf8 decode-input check f ]
|
||||
} case ;
|
||||
|
||||
|
|
|
@ -26,7 +26,7 @@ CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ;
|
|||
! 1.1:
|
||||
! [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
|
||||
{
|
||||
{ [ dup HEX: 20 < ] [ "\t\r\n" member? and ] }
|
||||
{ [ dup HEX: 20 < ] [ swap [ "\t\r\n" member? ] [ zero? not ] if ] }
|
||||
{ [ nip dup HEX: D800 < ] [ drop t ] }
|
||||
{ [ dup HEX: E000 < ] [ drop f ] }
|
||||
[ { HEX: FFFE HEX: FFFF } member? not ]
|
||||
|
|
|
@ -29,7 +29,7 @@ IN: xml.elements
|
|||
parse-name swap ;
|
||||
|
||||
: (middle-tag) ( -- )
|
||||
pass-blank version=1.0? get-char name-start?
|
||||
pass-blank version-1.0? get-char name-start?
|
||||
[ parse-attr (middle-tag) ] when ;
|
||||
|
||||
: assure-no-duplicates ( attrs-alist -- attrs-alist )
|
||||
|
@ -66,7 +66,8 @@ IN: xml.elements
|
|||
|
||||
: prolog-version ( alist -- version )
|
||||
T{ name { space "" } { main "version" } } swap at
|
||||
[ good-version ] [ versionless-prolog ] if* ;
|
||||
[ good-version ] [ versionless-prolog ] if*
|
||||
dup set-version ;
|
||||
|
||||
: prolog-encoding ( alist -- encoding )
|
||||
T{ name { space "" } { main "encoding" } } swap at
|
||||
|
@ -89,16 +90,9 @@ IN: xml.elements
|
|||
[ prolog-standalone ]
|
||||
tri <prolog> ;
|
||||
|
||||
SYMBOL: string-input?
|
||||
: decode-input-if ( encoding -- )
|
||||
string-input? get [ drop ] [ decode-input ] if ;
|
||||
|
||||
: parse-prolog ( -- prolog )
|
||||
pass-blank middle-tag "?>" expect
|
||||
dup assure-no-extra prolog-attrs
|
||||
dup encoding>> dup "UTF-16" =
|
||||
[ drop ] [ name>encoding [ decode-input-if ] when* ] if
|
||||
dup prolog-data set ;
|
||||
dup assure-no-extra prolog-attrs ;
|
||||
|
||||
: instruct ( -- instruction )
|
||||
take-name {
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
USING: accessors kernel namespaces io ;
|
||||
IN: xml.state
|
||||
|
||||
TUPLE: spot char line column next check ;
|
||||
TUPLE: spot char line column next check version-1.0? ;
|
||||
|
||||
C: <spot> spot
|
||||
|
||||
|
@ -17,11 +17,12 @@ C: <spot> spot
|
|||
: set-next ( char -- ) spot get swap >>next drop ;
|
||||
: get-check ( -- ? ) spot get check>> ;
|
||||
: check ( -- ) spot get t >>check drop ;
|
||||
: version-1.0? ( -- ? ) spot get version-1.0?>> ;
|
||||
: set-version ( string -- )
|
||||
spot get swap "1.0" = >>version-1.0? drop ;
|
||||
|
||||
SYMBOL: xml-stack
|
||||
|
||||
SYMBOL: prolog-data
|
||||
|
||||
SYMBOL: depth
|
||||
|
||||
SYMBOL: interpolating?
|
||||
|
|
|
@ -51,14 +51,18 @@ SYMBOL: xml-file
|
|||
[ "foo" ] [ "<boo><![CDATA[foo]]></boo>" string>xml children>string ] unit-test
|
||||
[ "<!-- B+, B, or B--->" string>xml ] must-fail
|
||||
[ ] [ "<?xml version='1.0'?><!-- declarations for <head> & <body> --><foo/>" string>xml drop ] unit-test
|
||||
[ T{ element-decl f "br" "EMPTY" } ] [ "<!ELEMENT br EMPTY>" string>dtd directives>> first ] unit-test
|
||||
[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "<!ELEMENT p (#PCDATA|emph)*>" string>dtd directives>> first ] unit-test
|
||||
[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "<!ELEMENT %name.para; %content.para;>" string>dtd directives>> first ] unit-test
|
||||
[ T{ element-decl f "container" "ANY" } ] [ "<!ELEMENT container ANY>" string>dtd directives>> first ] unit-test
|
||||
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo>" string>xml-chunk first ] unit-test
|
||||
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo >" string>xml-chunk first ] unit-test
|
||||
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM 'blah.dtd'>" string>xml-chunk first ] unit-test
|
||||
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM \"blah.dtd\" >" string>xml-chunk first ] unit-test
|
||||
|
||||
: first-thing ( seq -- elt )
|
||||
[ "" = not ] filter first ;
|
||||
|
||||
[ T{ element-decl f "br" "EMPTY" } ] [ "<!ELEMENT br EMPTY>" string>dtd directives>> first-thing ] unit-test
|
||||
[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "<!ELEMENT p (#PCDATA|emph)*>" string>dtd directives>> first-thing ] unit-test
|
||||
[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "<!ELEMENT %name.para; %content.para;>" string>dtd directives>> first-thing ] unit-test
|
||||
[ T{ element-decl f "container" "ANY" } ] [ "<!ELEMENT container ANY>" string>dtd directives>> first-thing ] unit-test
|
||||
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo>" string>xml-chunk first-thing ] unit-test
|
||||
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo >" string>xml-chunk first-thing ] unit-test
|
||||
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM 'blah.dtd'>" string>xml-chunk first-thing ] unit-test
|
||||
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM \"blah.dtd\" >" string>xml-chunk first-thing ] unit-test
|
||||
[ 958 ] [ [ "ξ" string>xml-chunk ] with-html-entities first first ] unit-test
|
||||
[ "x" "<" ] [ "<x value='<'/>" string>xml [ name>> main>> ] [ "value" attr ] bi ] unit-test
|
||||
[ "foo" ] [ "<!DOCTYPE foo [<!ENTITY bar 'foo'>]><x>&bar;</x>" string>xml children>string ] unit-test
|
||||
|
|
|
@ -6,12 +6,9 @@ circular xml.entities assocs make splitting math.parser
|
|||
locals combinators arrays ;
|
||||
IN: xml.tokenize
|
||||
|
||||
: version=1.0? ( -- ? )
|
||||
prolog-data get [ version>> "1.0" = ] [ t ] if* ;
|
||||
|
||||
: assure-good-char ( ch -- ch )
|
||||
[
|
||||
version=1.0? over text? not get-check and
|
||||
version-1.0? over text? not get-check and
|
||||
[ disallowed-char ] when
|
||||
] [ f ] if* ;
|
||||
|
||||
|
@ -36,7 +33,7 @@ IN: xml.tokenize
|
|||
get-char [ unexpected-end ] unless (next) record ;
|
||||
|
||||
: init-parser ( -- )
|
||||
0 1 0 f f <spot> spot set
|
||||
0 1 0 f f t <spot> spot set
|
||||
read1 set-next next ;
|
||||
|
||||
: with-state ( stream quot -- )
|
||||
|
|
|
@ -22,7 +22,7 @@ GENERIC: process ( object -- )
|
|||
M: object process add-child ;
|
||||
|
||||
M: prolog process
|
||||
xml-stack get V{ { f V{ } } } =
|
||||
xml-stack get { V{ { f V{ "" } } } V{ { f V{ } } } } member?
|
||||
[ bad-prolog ] unless drop ;
|
||||
|
||||
M: directive process
|
||||
|
@ -49,17 +49,14 @@ M: closer process
|
|||
|
||||
: init-xml-stack ( -- )
|
||||
V{ } clone xml-stack set
|
||||
extra-entities [ H{ } assoc-like ] change
|
||||
f push-xml ;
|
||||
|
||||
: default-prolog ( -- prolog )
|
||||
"1.0" "UTF-8" f <prolog> ;
|
||||
|
||||
: reset-prolog ( -- )
|
||||
default-prolog prolog-data set ;
|
||||
|
||||
: init-xml ( -- )
|
||||
reset-prolog init-xml-stack init-ns-stack ;
|
||||
init-ns-stack
|
||||
extra-entities [ H{ } assoc-like ] change ;
|
||||
|
||||
: assert-blanks ( seq pre? -- )
|
||||
swap [ string? ] filter
|
||||
|
@ -80,7 +77,11 @@ M: closer process
|
|||
! this does *not* affect the contents of the stack
|
||||
[ notags ] unless* ;
|
||||
|
||||
: make-xml-doc ( prolog seq -- xml-doc )
|
||||
: get-prolog ( seq -- prolog )
|
||||
first dup prolog? [ drop default-prolog ] unless ;
|
||||
|
||||
: make-xml-doc ( seq -- xml-doc )
|
||||
[ get-prolog ] keep
|
||||
dup [ tag? ] find
|
||||
[ assure-tags cut rest no-pre/post no-post-tags ] dip
|
||||
swap <xml> ;
|
||||
|
@ -95,8 +96,7 @@ TUPLE: pull-xml scope ;
|
|||
: <pull-xml> ( -- pull-xml )
|
||||
[
|
||||
input-stream [ ] change ! bring var in this scope
|
||||
init-parser reset-prolog init-ns-stack
|
||||
text-now? on
|
||||
init-xml text-now? on
|
||||
] H{ } make-assoc
|
||||
pull-xml boa ;
|
||||
! pull-xml needs to call start-document somewhere
|
||||
|
@ -135,50 +135,43 @@ PRIVATE>
|
|||
get-char [ make-tag call-under xml-loop ]
|
||||
[ drop ] if ; inline recursive
|
||||
|
||||
: read-seq ( stream quot n -- seq )
|
||||
rot [
|
||||
depth set
|
||||
init-xml init-xml-stack
|
||||
call
|
||||
[ process ] xml-loop
|
||||
done? [ unclosed ] unless
|
||||
xml-stack get first second
|
||||
] with-state ; inline
|
||||
|
||||
PRIVATE>
|
||||
|
||||
: each-element ( stream quot: ( xml-elem -- ) -- )
|
||||
swap [
|
||||
reset-prolog init-ns-stack
|
||||
init-xml
|
||||
start-document [ call-under ] when*
|
||||
xml-loop
|
||||
] with-state ; inline
|
||||
|
||||
: (read-xml) ( -- )
|
||||
start-document [ process ] when*
|
||||
[ process ] xml-loop ; inline
|
||||
|
||||
: (read-xml-chunk) ( stream -- prolog seq )
|
||||
[
|
||||
init-xml (read-xml)
|
||||
done? [ unclosed ] unless
|
||||
xml-stack get first second
|
||||
prolog-data get swap
|
||||
] with-state ;
|
||||
|
||||
: read-xml ( stream -- xml )
|
||||
0 depth
|
||||
[ (read-xml-chunk) make-xml-doc ] with-variable ;
|
||||
[ start-document [ process ] when* ]
|
||||
0 read-seq make-xml-doc ;
|
||||
|
||||
: read-xml-chunk ( stream -- seq )
|
||||
1 depth
|
||||
[ (read-xml-chunk) nip ] with-variable
|
||||
<xml-chunk> ;
|
||||
[ check ] 1 read-seq <xml-chunk> ;
|
||||
|
||||
: string>xml ( string -- xml )
|
||||
t string-input?
|
||||
[ <string-reader> read-xml ] with-variable ;
|
||||
<string-reader> [ check ] 0 read-seq make-xml-doc ;
|
||||
|
||||
: string>xml-chunk ( string -- xml )
|
||||
t string-input?
|
||||
[ <string-reader> read-xml-chunk ] with-variable ;
|
||||
<string-reader> read-xml-chunk ;
|
||||
|
||||
: file>xml ( filename -- xml )
|
||||
binary <file-reader> read-xml ;
|
||||
|
||||
: read-dtd ( stream -- dtd )
|
||||
[
|
||||
reset-prolog
|
||||
H{ } clone extra-entities set
|
||||
take-internal-subset
|
||||
] with-state ;
|
||||
|
|
Loading…
Reference in New Issue