Code cleanup in XML
parent
dd15816bad
commit
af2706b75c
|
@ -2,14 +2,15 @@
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: kernel namespaces xml.name io.encodings.utf8 xml.elements
|
USING: kernel namespaces xml.name io.encodings.utf8 xml.elements
|
||||||
io.encodings.utf16 xml.tokenize xml.state math ascii sequences
|
io.encodings.utf16 xml.tokenize xml.state math ascii sequences
|
||||||
io.encodings.string io.encodings combinators ;
|
io.encodings.string io.encodings combinators accessors
|
||||||
|
xml.data io.encodings.iana ;
|
||||||
IN: xml.autoencoding
|
IN: xml.autoencoding
|
||||||
|
|
||||||
: continue-make-tag ( str -- tag )
|
: continue-make-tag ( str -- tag )
|
||||||
parse-name-starting middle-tag end-tag ;
|
parse-name-starting middle-tag end-tag ;
|
||||||
|
|
||||||
: start-utf16le ( -- tag )
|
: start-utf16le ( -- tag )
|
||||||
utf16le decode-input-if
|
utf16le decode-input
|
||||||
"?\0" expect
|
"?\0" expect
|
||||||
check instruct ;
|
check instruct ;
|
||||||
|
|
||||||
|
@ -17,20 +18,36 @@ IN: xml.autoencoding
|
||||||
-6 shift 3 bitand 2 = ;
|
-6 shift 3 bitand 2 = ;
|
||||||
|
|
||||||
: start<name ( ch -- tag )
|
: start<name ( ch -- tag )
|
||||||
|
! This is unfortunate, and exists for the corner case
|
||||||
|
! that the first letter of the document is < and second is
|
||||||
|
! not ASCII
|
||||||
ascii?
|
ascii?
|
||||||
[ utf8 decode-input-if next make-tag ] [
|
[ utf8 decode-input next make-tag ] [
|
||||||
next
|
next
|
||||||
[ get-next 10xxxxxx? not ] take-until
|
[ get-next 10xxxxxx? not ] take-until
|
||||||
get-char suffix utf8 decode
|
get-char suffix utf8 decode
|
||||||
utf8 decode-input-if next
|
utf8 decode-input next
|
||||||
continue-make-tag
|
continue-make-tag
|
||||||
] if ;
|
] if ;
|
||||||
|
|
||||||
|
: prolog-encoding ( prolog -- )
|
||||||
|
encoding>> dup "UTF-16" =
|
||||||
|
[ drop ] [ name>encoding [ decode-input ] when* ] if ;
|
||||||
|
|
||||||
|
: instruct-encoding ( instruct/prolog -- )
|
||||||
|
dup prolog?
|
||||||
|
[ prolog-encoding ]
|
||||||
|
[ drop utf8 decode-input ] if ;
|
||||||
|
|
||||||
|
: something ( -- )
|
||||||
|
check utf8 decode-input next next ;
|
||||||
|
|
||||||
: start< ( -- tag )
|
: start< ( -- tag )
|
||||||
|
! What if first letter of processing instruction is non-ASCII?
|
||||||
get-next {
|
get-next {
|
||||||
{ 0 [ next next start-utf16le ] }
|
{ 0 [ next next start-utf16le ] }
|
||||||
{ CHAR: ? [ check next next instruct ] } ! XML prolog parsing sets the encoding
|
{ CHAR: ? [ something instruct dup instruct-encoding ] }
|
||||||
{ CHAR: ! [ check utf8 decode-input next next direct ] }
|
{ CHAR: ! [ something direct ] }
|
||||||
[ check start<name ]
|
[ check start<name ]
|
||||||
} case ;
|
} case ;
|
||||||
|
|
||||||
|
@ -39,7 +56,7 @@ IN: xml.autoencoding
|
||||||
"<" expect check make-tag ;
|
"<" expect check make-tag ;
|
||||||
|
|
||||||
: decode-expecting ( encoding string -- tag )
|
: decode-expecting ( encoding string -- tag )
|
||||||
[ decode-input-if next ] [ expect ] bi* check make-tag ;
|
[ decode-input next ] [ expect ] bi* check make-tag ;
|
||||||
|
|
||||||
: start-utf16be ( -- tag )
|
: start-utf16be ( -- tag )
|
||||||
utf16be "<" decode-expecting ;
|
utf16be "<" decode-expecting ;
|
||||||
|
@ -57,8 +74,6 @@ IN: xml.autoencoding
|
||||||
{ HEX: EF [ skip-utf8-bom ] }
|
{ HEX: EF [ skip-utf8-bom ] }
|
||||||
{ HEX: FF [ skip-utf16le-bom ] }
|
{ HEX: FF [ skip-utf16le-bom ] }
|
||||||
{ HEX: FE [ skip-utf16be-bom ] }
|
{ HEX: FE [ skip-utf16be-bom ] }
|
||||||
{ f [ "" ] }
|
[ drop utf8 decode-input check f ]
|
||||||
[ drop utf8 decode-input-if f ]
|
} case ;
|
||||||
! Same problem as with <e`>, in the case of XML chunks?
|
|
||||||
} case check ;
|
|
||||||
|
|
||||||
|
|
|
@ -26,7 +26,7 @@ CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ;
|
||||||
! 1.1:
|
! 1.1:
|
||||||
! [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
|
! [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
|
||||||
{
|
{
|
||||||
{ [ dup HEX: 20 < ] [ "\t\r\n" member? and ] }
|
{ [ dup HEX: 20 < ] [ swap [ "\t\r\n" member? ] [ zero? not ] if ] }
|
||||||
{ [ nip dup HEX: D800 < ] [ drop t ] }
|
{ [ nip dup HEX: D800 < ] [ drop t ] }
|
||||||
{ [ dup HEX: E000 < ] [ drop f ] }
|
{ [ dup HEX: E000 < ] [ drop f ] }
|
||||||
[ { HEX: FFFE HEX: FFFF } member? not ]
|
[ { HEX: FFFE HEX: FFFF } member? not ]
|
||||||
|
|
|
@ -29,7 +29,7 @@ IN: xml.elements
|
||||||
parse-name swap ;
|
parse-name swap ;
|
||||||
|
|
||||||
: (middle-tag) ( -- )
|
: (middle-tag) ( -- )
|
||||||
pass-blank version=1.0? get-char name-start?
|
pass-blank version-1.0? get-char name-start?
|
||||||
[ parse-attr (middle-tag) ] when ;
|
[ parse-attr (middle-tag) ] when ;
|
||||||
|
|
||||||
: assure-no-duplicates ( attrs-alist -- attrs-alist )
|
: assure-no-duplicates ( attrs-alist -- attrs-alist )
|
||||||
|
@ -66,7 +66,8 @@ IN: xml.elements
|
||||||
|
|
||||||
: prolog-version ( alist -- version )
|
: prolog-version ( alist -- version )
|
||||||
T{ name { space "" } { main "version" } } swap at
|
T{ name { space "" } { main "version" } } swap at
|
||||||
[ good-version ] [ versionless-prolog ] if* ;
|
[ good-version ] [ versionless-prolog ] if*
|
||||||
|
dup set-version ;
|
||||||
|
|
||||||
: prolog-encoding ( alist -- encoding )
|
: prolog-encoding ( alist -- encoding )
|
||||||
T{ name { space "" } { main "encoding" } } swap at
|
T{ name { space "" } { main "encoding" } } swap at
|
||||||
|
@ -89,16 +90,9 @@ IN: xml.elements
|
||||||
[ prolog-standalone ]
|
[ prolog-standalone ]
|
||||||
tri <prolog> ;
|
tri <prolog> ;
|
||||||
|
|
||||||
SYMBOL: string-input?
|
|
||||||
: decode-input-if ( encoding -- )
|
|
||||||
string-input? get [ drop ] [ decode-input ] if ;
|
|
||||||
|
|
||||||
: parse-prolog ( -- prolog )
|
: parse-prolog ( -- prolog )
|
||||||
pass-blank middle-tag "?>" expect
|
pass-blank middle-tag "?>" expect
|
||||||
dup assure-no-extra prolog-attrs
|
dup assure-no-extra prolog-attrs ;
|
||||||
dup encoding>> dup "UTF-16" =
|
|
||||||
[ drop ] [ name>encoding [ decode-input-if ] when* ] if
|
|
||||||
dup prolog-data set ;
|
|
||||||
|
|
||||||
: instruct ( -- instruction )
|
: instruct ( -- instruction )
|
||||||
take-name {
|
take-name {
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
USING: accessors kernel namespaces io ;
|
USING: accessors kernel namespaces io ;
|
||||||
IN: xml.state
|
IN: xml.state
|
||||||
|
|
||||||
TUPLE: spot char line column next check ;
|
TUPLE: spot char line column next check version-1.0? ;
|
||||||
|
|
||||||
C: <spot> spot
|
C: <spot> spot
|
||||||
|
|
||||||
|
@ -17,11 +17,12 @@ C: <spot> spot
|
||||||
: set-next ( char -- ) spot get swap >>next drop ;
|
: set-next ( char -- ) spot get swap >>next drop ;
|
||||||
: get-check ( -- ? ) spot get check>> ;
|
: get-check ( -- ? ) spot get check>> ;
|
||||||
: check ( -- ) spot get t >>check drop ;
|
: check ( -- ) spot get t >>check drop ;
|
||||||
|
: version-1.0? ( -- ? ) spot get version-1.0?>> ;
|
||||||
|
: set-version ( string -- )
|
||||||
|
spot get swap "1.0" = >>version-1.0? drop ;
|
||||||
|
|
||||||
SYMBOL: xml-stack
|
SYMBOL: xml-stack
|
||||||
|
|
||||||
SYMBOL: prolog-data
|
|
||||||
|
|
||||||
SYMBOL: depth
|
SYMBOL: depth
|
||||||
|
|
||||||
SYMBOL: interpolating?
|
SYMBOL: interpolating?
|
||||||
|
|
|
@ -51,14 +51,18 @@ SYMBOL: xml-file
|
||||||
[ "foo" ] [ "<boo><![CDATA[foo]]></boo>" string>xml children>string ] unit-test
|
[ "foo" ] [ "<boo><![CDATA[foo]]></boo>" string>xml children>string ] unit-test
|
||||||
[ "<!-- B+, B, or B--->" string>xml ] must-fail
|
[ "<!-- B+, B, or B--->" string>xml ] must-fail
|
||||||
[ ] [ "<?xml version='1.0'?><!-- declarations for <head> & <body> --><foo/>" string>xml drop ] unit-test
|
[ ] [ "<?xml version='1.0'?><!-- declarations for <head> & <body> --><foo/>" string>xml drop ] unit-test
|
||||||
[ T{ element-decl f "br" "EMPTY" } ] [ "<!ELEMENT br EMPTY>" string>dtd directives>> first ] unit-test
|
|
||||||
[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "<!ELEMENT p (#PCDATA|emph)*>" string>dtd directives>> first ] unit-test
|
: first-thing ( seq -- elt )
|
||||||
[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "<!ELEMENT %name.para; %content.para;>" string>dtd directives>> first ] unit-test
|
[ "" = not ] filter first ;
|
||||||
[ T{ element-decl f "container" "ANY" } ] [ "<!ELEMENT container ANY>" string>dtd directives>> first ] unit-test
|
|
||||||
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo>" string>xml-chunk first ] unit-test
|
[ T{ element-decl f "br" "EMPTY" } ] [ "<!ELEMENT br EMPTY>" string>dtd directives>> first-thing ] unit-test
|
||||||
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo >" string>xml-chunk first ] unit-test
|
[ T{ element-decl f "p" "(#PCDATA|emph)*" } ] [ "<!ELEMENT p (#PCDATA|emph)*>" string>dtd directives>> first-thing ] unit-test
|
||||||
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM 'blah.dtd'>" string>xml-chunk first ] unit-test
|
[ T{ element-decl f "%name.para;" "%content.para;" } ] [ "<!ELEMENT %name.para; %content.para;>" string>dtd directives>> first-thing ] unit-test
|
||||||
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM \"blah.dtd\" >" string>xml-chunk first ] unit-test
|
[ T{ element-decl f "container" "ANY" } ] [ "<!ELEMENT container ANY>" string>dtd directives>> first-thing ] unit-test
|
||||||
|
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo>" string>xml-chunk first-thing ] unit-test
|
||||||
|
[ T{ doctype-decl f "foo" } ] [ "<!DOCTYPE foo >" string>xml-chunk first-thing ] unit-test
|
||||||
|
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM 'blah.dtd'>" string>xml-chunk first-thing ] unit-test
|
||||||
|
[ T{ doctype-decl f "foo" T{ system-id f "blah.dtd" } } ] [ "<!DOCTYPE foo SYSTEM \"blah.dtd\" >" string>xml-chunk first-thing ] unit-test
|
||||||
[ 958 ] [ [ "ξ" string>xml-chunk ] with-html-entities first first ] unit-test
|
[ 958 ] [ [ "ξ" string>xml-chunk ] with-html-entities first first ] unit-test
|
||||||
[ "x" "<" ] [ "<x value='<'/>" string>xml [ name>> main>> ] [ "value" attr ] bi ] unit-test
|
[ "x" "<" ] [ "<x value='<'/>" string>xml [ name>> main>> ] [ "value" attr ] bi ] unit-test
|
||||||
[ "foo" ] [ "<!DOCTYPE foo [<!ENTITY bar 'foo'>]><x>&bar;</x>" string>xml children>string ] unit-test
|
[ "foo" ] [ "<!DOCTYPE foo [<!ENTITY bar 'foo'>]><x>&bar;</x>" string>xml children>string ] unit-test
|
||||||
|
|
|
@ -6,12 +6,9 @@ circular xml.entities assocs make splitting math.parser
|
||||||
locals combinators arrays ;
|
locals combinators arrays ;
|
||||||
IN: xml.tokenize
|
IN: xml.tokenize
|
||||||
|
|
||||||
: version=1.0? ( -- ? )
|
|
||||||
prolog-data get [ version>> "1.0" = ] [ t ] if* ;
|
|
||||||
|
|
||||||
: assure-good-char ( ch -- ch )
|
: assure-good-char ( ch -- ch )
|
||||||
[
|
[
|
||||||
version=1.0? over text? not get-check and
|
version-1.0? over text? not get-check and
|
||||||
[ disallowed-char ] when
|
[ disallowed-char ] when
|
||||||
] [ f ] if* ;
|
] [ f ] if* ;
|
||||||
|
|
||||||
|
@ -36,7 +33,7 @@ IN: xml.tokenize
|
||||||
get-char [ unexpected-end ] unless (next) record ;
|
get-char [ unexpected-end ] unless (next) record ;
|
||||||
|
|
||||||
: init-parser ( -- )
|
: init-parser ( -- )
|
||||||
0 1 0 f f <spot> spot set
|
0 1 0 f f t <spot> spot set
|
||||||
read1 set-next next ;
|
read1 set-next next ;
|
||||||
|
|
||||||
: with-state ( stream quot -- )
|
: with-state ( stream quot -- )
|
||||||
|
|
|
@ -22,7 +22,7 @@ GENERIC: process ( object -- )
|
||||||
M: object process add-child ;
|
M: object process add-child ;
|
||||||
|
|
||||||
M: prolog process
|
M: prolog process
|
||||||
xml-stack get V{ { f V{ } } } =
|
xml-stack get { V{ { f V{ "" } } } V{ { f V{ } } } } member?
|
||||||
[ bad-prolog ] unless drop ;
|
[ bad-prolog ] unless drop ;
|
||||||
|
|
||||||
M: directive process
|
M: directive process
|
||||||
|
@ -49,17 +49,14 @@ M: closer process
|
||||||
|
|
||||||
: init-xml-stack ( -- )
|
: init-xml-stack ( -- )
|
||||||
V{ } clone xml-stack set
|
V{ } clone xml-stack set
|
||||||
extra-entities [ H{ } assoc-like ] change
|
|
||||||
f push-xml ;
|
f push-xml ;
|
||||||
|
|
||||||
: default-prolog ( -- prolog )
|
: default-prolog ( -- prolog )
|
||||||
"1.0" "UTF-8" f <prolog> ;
|
"1.0" "UTF-8" f <prolog> ;
|
||||||
|
|
||||||
: reset-prolog ( -- )
|
|
||||||
default-prolog prolog-data set ;
|
|
||||||
|
|
||||||
: init-xml ( -- )
|
: init-xml ( -- )
|
||||||
reset-prolog init-xml-stack init-ns-stack ;
|
init-ns-stack
|
||||||
|
extra-entities [ H{ } assoc-like ] change ;
|
||||||
|
|
||||||
: assert-blanks ( seq pre? -- )
|
: assert-blanks ( seq pre? -- )
|
||||||
swap [ string? ] filter
|
swap [ string? ] filter
|
||||||
|
@ -80,7 +77,11 @@ M: closer process
|
||||||
! this does *not* affect the contents of the stack
|
! this does *not* affect the contents of the stack
|
||||||
[ notags ] unless* ;
|
[ notags ] unless* ;
|
||||||
|
|
||||||
: make-xml-doc ( prolog seq -- xml-doc )
|
: get-prolog ( seq -- prolog )
|
||||||
|
first dup prolog? [ drop default-prolog ] unless ;
|
||||||
|
|
||||||
|
: make-xml-doc ( seq -- xml-doc )
|
||||||
|
[ get-prolog ] keep
|
||||||
dup [ tag? ] find
|
dup [ tag? ] find
|
||||||
[ assure-tags cut rest no-pre/post no-post-tags ] dip
|
[ assure-tags cut rest no-pre/post no-post-tags ] dip
|
||||||
swap <xml> ;
|
swap <xml> ;
|
||||||
|
@ -95,8 +96,7 @@ TUPLE: pull-xml scope ;
|
||||||
: <pull-xml> ( -- pull-xml )
|
: <pull-xml> ( -- pull-xml )
|
||||||
[
|
[
|
||||||
input-stream [ ] change ! bring var in this scope
|
input-stream [ ] change ! bring var in this scope
|
||||||
init-parser reset-prolog init-ns-stack
|
init-xml text-now? on
|
||||||
text-now? on
|
|
||||||
] H{ } make-assoc
|
] H{ } make-assoc
|
||||||
pull-xml boa ;
|
pull-xml boa ;
|
||||||
! pull-xml needs to call start-document somewhere
|
! pull-xml needs to call start-document somewhere
|
||||||
|
@ -135,50 +135,43 @@ PRIVATE>
|
||||||
get-char [ make-tag call-under xml-loop ]
|
get-char [ make-tag call-under xml-loop ]
|
||||||
[ drop ] if ; inline recursive
|
[ drop ] if ; inline recursive
|
||||||
|
|
||||||
|
: read-seq ( stream quot n -- seq )
|
||||||
|
rot [
|
||||||
|
depth set
|
||||||
|
init-xml init-xml-stack
|
||||||
|
call
|
||||||
|
[ process ] xml-loop
|
||||||
|
done? [ unclosed ] unless
|
||||||
|
xml-stack get first second
|
||||||
|
] with-state ; inline
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
: each-element ( stream quot: ( xml-elem -- ) -- )
|
: each-element ( stream quot: ( xml-elem -- ) -- )
|
||||||
swap [
|
swap [
|
||||||
reset-prolog init-ns-stack
|
init-xml
|
||||||
start-document [ call-under ] when*
|
start-document [ call-under ] when*
|
||||||
xml-loop
|
xml-loop
|
||||||
] with-state ; inline
|
] with-state ; inline
|
||||||
|
|
||||||
: (read-xml) ( -- )
|
|
||||||
start-document [ process ] when*
|
|
||||||
[ process ] xml-loop ; inline
|
|
||||||
|
|
||||||
: (read-xml-chunk) ( stream -- prolog seq )
|
|
||||||
[
|
|
||||||
init-xml (read-xml)
|
|
||||||
done? [ unclosed ] unless
|
|
||||||
xml-stack get first second
|
|
||||||
prolog-data get swap
|
|
||||||
] with-state ;
|
|
||||||
|
|
||||||
: read-xml ( stream -- xml )
|
: read-xml ( stream -- xml )
|
||||||
0 depth
|
[ start-document [ process ] when* ]
|
||||||
[ (read-xml-chunk) make-xml-doc ] with-variable ;
|
0 read-seq make-xml-doc ;
|
||||||
|
|
||||||
: read-xml-chunk ( stream -- seq )
|
: read-xml-chunk ( stream -- seq )
|
||||||
1 depth
|
[ check ] 1 read-seq <xml-chunk> ;
|
||||||
[ (read-xml-chunk) nip ] with-variable
|
|
||||||
<xml-chunk> ;
|
|
||||||
|
|
||||||
: string>xml ( string -- xml )
|
: string>xml ( string -- xml )
|
||||||
t string-input?
|
<string-reader> [ check ] 0 read-seq make-xml-doc ;
|
||||||
[ <string-reader> read-xml ] with-variable ;
|
|
||||||
|
|
||||||
: string>xml-chunk ( string -- xml )
|
: string>xml-chunk ( string -- xml )
|
||||||
t string-input?
|
<string-reader> read-xml-chunk ;
|
||||||
[ <string-reader> read-xml-chunk ] with-variable ;
|
|
||||||
|
|
||||||
: file>xml ( filename -- xml )
|
: file>xml ( filename -- xml )
|
||||||
binary <file-reader> read-xml ;
|
binary <file-reader> read-xml ;
|
||||||
|
|
||||||
: read-dtd ( stream -- dtd )
|
: read-dtd ( stream -- dtd )
|
||||||
[
|
[
|
||||||
reset-prolog
|
|
||||||
H{ } clone extra-entities set
|
H{ } clone extra-entities set
|
||||||
take-internal-subset
|
take-internal-subset
|
||||||
] with-state ;
|
] with-state ;
|
||||||
|
|
Loading…
Reference in New Issue