Making XML slightly faster

db4
Daniel Ehrenberg 2009-01-29 22:17:55 -06:00
parent eccabfea12
commit b7975de956
3 changed files with 36 additions and 22 deletions

View File

@ -6,11 +6,14 @@ io.encodings.string io.encodings combinators accessors
xml.data io.encodings.iana ;
IN: xml.autoencoding
: decode-stream ( encoding -- )
spot get [ swap re-decode ] change-stream drop ;
: continue-make-tag ( str -- tag )
parse-name-starting middle-tag end-tag ;
: start-utf16le ( -- tag )
utf16le decode-input
utf16le decode-stream
"?\0" expect
check instruct ;
@ -22,25 +25,25 @@ IN: xml.autoencoding
! that the first letter of the document is < and second is
! not ASCII
ascii?
[ utf8 decode-input next make-tag ] [
[ utf8 decode-stream next make-tag ] [
next
[ get-next 10xxxxxx? not ] take-until
get-char suffix utf8 decode
utf8 decode-input next
utf8 decode-stream next
continue-make-tag
] if ;
: prolog-encoding ( prolog -- )
encoding>> dup "UTF-16" =
[ drop ] [ name>encoding [ decode-input ] when* ] if ;
[ drop ] [ name>encoding [ decode-stream ] when* ] if ;
: instruct-encoding ( instruct/prolog -- )
dup prolog?
[ prolog-encoding ]
[ drop utf8 decode-input ] if ;
[ drop utf8 decode-stream ] if ;
: go-utf8 ( -- )
check utf8 decode-input next next ;
check utf8 decode-stream next next ;
: start< ( -- tag )
! What if first letter of processing instruction is non-ASCII?
@ -52,11 +55,11 @@ IN: xml.autoencoding
} case ;
: skip-utf8-bom ( -- tag )
"\u0000bb\u0000bf" expect utf8 decode-input
"\u0000bb\u0000bf" expect utf8 decode-stream
"<" expect check make-tag ;
: decode-expecting ( encoding string -- tag )
[ decode-input next ] [ expect ] bi* check make-tag ;
[ decode-stream next ] [ expect ] bi* check make-tag ;
: start-utf16be ( -- tag )
utf16be "<" decode-expecting ;
@ -74,6 +77,6 @@ IN: xml.autoencoding
{ HEX: EF [ skip-utf8-bom ] }
{ HEX: FF [ skip-utf16le-bom ] }
{ HEX: FE [ skip-utf16be-bom ] }
[ drop utf8 decode-input check f ]
[ drop utf8 decode-stream check f ]
} case ;

View File

@ -4,8 +4,7 @@ USING: accessors kernel namespaces io math ;
IN: xml.state
TUPLE: spot
char { line fixnum } { column fixnum }
next check version-1.0? ;
char line column next check version-1.0? stream ;
C: <spot> spot

View File

@ -2,7 +2,7 @@
! See http://factorcode.org/license.txt for BSD license.
USING: namespaces xml.state kernel sequences accessors
xml.char-classes xml.errors math io sbufs fry strings ascii
circular xml.entities assocs make splitting math.parser
circular xml.entities assocs splitting math.parser
locals combinators arrays hints ;
IN: xml.tokenize
@ -31,11 +31,11 @@ HINTS: record { spot fixnum } ;
:: (next) ( spot -- spot char )
spot next>> :> old-next
read1 :> new-next
spot stream>> stream-read1 :> new-next
old-next CHAR: \r = [
spot CHAR: \n >>char
new-next CHAR: \n =
[ read1 >>next ]
[ spot stream>> stream-read1 >>next ]
[ new-next >>next ] if
] [ spot old-next >>char new-next >>next ] if
spot next>> ; inline
@ -50,7 +50,9 @@ HINTS: next* { spot } ;
spot get next* ;
: init-parser ( -- )
0 1 0 0 f t <spot> spot set
0 1 0 0 f t f <spot>
input-stream get >>stream
spot set
read1 set-next next ;
: with-state ( stream quot -- )
@ -116,18 +118,28 @@ HINTS: next* { spot } ;
take-; dup pe-table get at
[ swap push-all ] [ no-entity ] ?if ;
:: (parse-char) ( quot: ( ch -- ? ) accum -- )
get-char :> char
:: (parse-char) ( quot: ( ch -- ? ) accum spot -- )
spot char>> :> char
{
{ [ char not ] [ ] }
{ [ char quot call ] [ next ] }
{ [ char CHAR: & = ] [ accum parse-entity quot accum (parse-char) ] }
{ [ in-dtd? get char CHAR: % = and ] [ accum parse-pe quot accum (parse-char) ] }
[ char accum push next quot accum (parse-char) ]
{ [ char quot call ] [ spot next* ] }
{ [ char CHAR: & = ] [
accum parse-entity
quot accum spot (parse-char)
] }
{ [ in-dtd? get char CHAR: % = and ] [
accum parse-pe
quot accum spot (parse-char)
] }
[
char accum push
spot next*
quot accum spot (parse-char)
]
} cond ; inline recursive
: parse-char ( quot: ( ch -- ? ) -- seq )
1024 <sbuf> [ (parse-char) ] keep >string ; inline
1024 <sbuf> [ spot get (parse-char) ] keep >string ; inline
: assure-no-]]> ( circular -- )
"]]>" sequence= [ text-w/]]> ] when ;