Making XML slightly faster
parent
eccabfea12
commit
b7975de956
|
@ -6,11 +6,14 @@ io.encodings.string io.encodings combinators accessors
|
|||
xml.data io.encodings.iana ;
|
||||
IN: xml.autoencoding
|
||||
|
||||
: decode-stream ( encoding -- )
|
||||
spot get [ swap re-decode ] change-stream drop ;
|
||||
|
||||
: continue-make-tag ( str -- tag )
|
||||
parse-name-starting middle-tag end-tag ;
|
||||
|
||||
: start-utf16le ( -- tag )
|
||||
utf16le decode-input
|
||||
utf16le decode-stream
|
||||
"?\0" expect
|
||||
check instruct ;
|
||||
|
||||
|
@ -22,25 +25,25 @@ IN: xml.autoencoding
|
|||
! that the first letter of the document is < and second is
|
||||
! not ASCII
|
||||
ascii?
|
||||
[ utf8 decode-input next make-tag ] [
|
||||
[ utf8 decode-stream next make-tag ] [
|
||||
next
|
||||
[ get-next 10xxxxxx? not ] take-until
|
||||
get-char suffix utf8 decode
|
||||
utf8 decode-input next
|
||||
utf8 decode-stream next
|
||||
continue-make-tag
|
||||
] if ;
|
||||
|
||||
: prolog-encoding ( prolog -- )
|
||||
encoding>> dup "UTF-16" =
|
||||
[ drop ] [ name>encoding [ decode-input ] when* ] if ;
|
||||
[ drop ] [ name>encoding [ decode-stream ] when* ] if ;
|
||||
|
||||
: instruct-encoding ( instruct/prolog -- )
|
||||
dup prolog?
|
||||
[ prolog-encoding ]
|
||||
[ drop utf8 decode-input ] if ;
|
||||
[ drop utf8 decode-stream ] if ;
|
||||
|
||||
: go-utf8 ( -- )
|
||||
check utf8 decode-input next next ;
|
||||
check utf8 decode-stream next next ;
|
||||
|
||||
: start< ( -- tag )
|
||||
! What if first letter of processing instruction is non-ASCII?
|
||||
|
@ -52,11 +55,11 @@ IN: xml.autoencoding
|
|||
} case ;
|
||||
|
||||
: skip-utf8-bom ( -- tag )
|
||||
"\u0000bb\u0000bf" expect utf8 decode-input
|
||||
"\u0000bb\u0000bf" expect utf8 decode-stream
|
||||
"<" expect check make-tag ;
|
||||
|
||||
: decode-expecting ( encoding string -- tag )
|
||||
[ decode-input next ] [ expect ] bi* check make-tag ;
|
||||
[ decode-stream next ] [ expect ] bi* check make-tag ;
|
||||
|
||||
: start-utf16be ( -- tag )
|
||||
utf16be "<" decode-expecting ;
|
||||
|
@ -74,6 +77,6 @@ IN: xml.autoencoding
|
|||
{ HEX: EF [ skip-utf8-bom ] }
|
||||
{ HEX: FF [ skip-utf16le-bom ] }
|
||||
{ HEX: FE [ skip-utf16be-bom ] }
|
||||
[ drop utf8 decode-input check f ]
|
||||
[ drop utf8 decode-stream check f ]
|
||||
} case ;
|
||||
|
||||
|
|
|
@ -4,8 +4,7 @@ USING: accessors kernel namespaces io math ;
|
|||
IN: xml.state
|
||||
|
||||
TUPLE: spot
|
||||
char { line fixnum } { column fixnum }
|
||||
next check version-1.0? ;
|
||||
char line column next check version-1.0? stream ;
|
||||
|
||||
C: <spot> spot
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
! See http://factorcode.org/license.txt for BSD license.
|
||||
USING: namespaces xml.state kernel sequences accessors
|
||||
xml.char-classes xml.errors math io sbufs fry strings ascii
|
||||
circular xml.entities assocs make splitting math.parser
|
||||
circular xml.entities assocs splitting math.parser
|
||||
locals combinators arrays hints ;
|
||||
IN: xml.tokenize
|
||||
|
||||
|
@ -31,11 +31,11 @@ HINTS: record { spot fixnum } ;
|
|||
|
||||
:: (next) ( spot -- spot char )
|
||||
spot next>> :> old-next
|
||||
read1 :> new-next
|
||||
spot stream>> stream-read1 :> new-next
|
||||
old-next CHAR: \r = [
|
||||
spot CHAR: \n >>char
|
||||
new-next CHAR: \n =
|
||||
[ read1 >>next ]
|
||||
[ spot stream>> stream-read1 >>next ]
|
||||
[ new-next >>next ] if
|
||||
] [ spot old-next >>char new-next >>next ] if
|
||||
spot next>> ; inline
|
||||
|
@ -50,7 +50,9 @@ HINTS: next* { spot } ;
|
|||
spot get next* ;
|
||||
|
||||
: init-parser ( -- )
|
||||
0 1 0 0 f t <spot> spot set
|
||||
0 1 0 0 f t f <spot>
|
||||
input-stream get >>stream
|
||||
spot set
|
||||
read1 set-next next ;
|
||||
|
||||
: with-state ( stream quot -- )
|
||||
|
@ -116,18 +118,28 @@ HINTS: next* { spot } ;
|
|||
take-; dup pe-table get at
|
||||
[ swap push-all ] [ no-entity ] ?if ;
|
||||
|
||||
:: (parse-char) ( quot: ( ch -- ? ) accum -- )
|
||||
get-char :> char
|
||||
:: (parse-char) ( quot: ( ch -- ? ) accum spot -- )
|
||||
spot char>> :> char
|
||||
{
|
||||
{ [ char not ] [ ] }
|
||||
{ [ char quot call ] [ next ] }
|
||||
{ [ char CHAR: & = ] [ accum parse-entity quot accum (parse-char) ] }
|
||||
{ [ in-dtd? get char CHAR: % = and ] [ accum parse-pe quot accum (parse-char) ] }
|
||||
[ char accum push next quot accum (parse-char) ]
|
||||
{ [ char quot call ] [ spot next* ] }
|
||||
{ [ char CHAR: & = ] [
|
||||
accum parse-entity
|
||||
quot accum spot (parse-char)
|
||||
] }
|
||||
{ [ in-dtd? get char CHAR: % = and ] [
|
||||
accum parse-pe
|
||||
quot accum spot (parse-char)
|
||||
] }
|
||||
[
|
||||
char accum push
|
||||
spot next*
|
||||
quot accum spot (parse-char)
|
||||
]
|
||||
} cond ; inline recursive
|
||||
|
||||
: parse-char ( quot: ( ch -- ? ) -- seq )
|
||||
1024 <sbuf> [ (parse-char) ] keep >string ; inline
|
||||
1024 <sbuf> [ spot get (parse-char) ] keep >string ; inline
|
||||
|
||||
: assure-no-]]> ( circular -- )
|
||||
"]]>" sequence= [ text-w/]]> ] when ;
|
||||
|
|
Loading…
Reference in New Issue