Making XML slightly faster

db4
Daniel Ehrenberg 2009-01-29 22:17:55 -06:00
parent eccabfea12
commit b7975de956
3 changed files with 36 additions and 22 deletions

View File

@ -6,11 +6,14 @@ io.encodings.string io.encodings combinators accessors
xml.data io.encodings.iana ; xml.data io.encodings.iana ;
IN: xml.autoencoding IN: xml.autoencoding
: decode-stream ( encoding -- )
spot get [ swap re-decode ] change-stream drop ;
: continue-make-tag ( str -- tag ) : continue-make-tag ( str -- tag )
parse-name-starting middle-tag end-tag ; parse-name-starting middle-tag end-tag ;
: start-utf16le ( -- tag ) : start-utf16le ( -- tag )
utf16le decode-input utf16le decode-stream
"?\0" expect "?\0" expect
check instruct ; check instruct ;
@ -22,25 +25,25 @@ IN: xml.autoencoding
! that the first letter of the document is < and second is ! that the first letter of the document is < and second is
! not ASCII ! not ASCII
ascii? ascii?
[ utf8 decode-input next make-tag ] [ [ utf8 decode-stream next make-tag ] [
next next
[ get-next 10xxxxxx? not ] take-until [ get-next 10xxxxxx? not ] take-until
get-char suffix utf8 decode get-char suffix utf8 decode
utf8 decode-input next utf8 decode-stream next
continue-make-tag continue-make-tag
] if ; ] if ;
: prolog-encoding ( prolog -- ) : prolog-encoding ( prolog -- )
encoding>> dup "UTF-16" = encoding>> dup "UTF-16" =
[ drop ] [ name>encoding [ decode-input ] when* ] if ; [ drop ] [ name>encoding [ decode-stream ] when* ] if ;
: instruct-encoding ( instruct/prolog -- ) : instruct-encoding ( instruct/prolog -- )
dup prolog? dup prolog?
[ prolog-encoding ] [ prolog-encoding ]
[ drop utf8 decode-input ] if ; [ drop utf8 decode-stream ] if ;
: go-utf8 ( -- ) : go-utf8 ( -- )
check utf8 decode-input next next ; check utf8 decode-stream next next ;
: start< ( -- tag ) : start< ( -- tag )
! What if first letter of processing instruction is non-ASCII? ! What if first letter of processing instruction is non-ASCII?
@ -52,11 +55,11 @@ IN: xml.autoencoding
} case ; } case ;
: skip-utf8-bom ( -- tag ) : skip-utf8-bom ( -- tag )
"\u0000bb\u0000bf" expect utf8 decode-input "\u0000bb\u0000bf" expect utf8 decode-stream
"<" expect check make-tag ; "<" expect check make-tag ;
: decode-expecting ( encoding string -- tag ) : decode-expecting ( encoding string -- tag )
[ decode-input next ] [ expect ] bi* check make-tag ; [ decode-stream next ] [ expect ] bi* check make-tag ;
: start-utf16be ( -- tag ) : start-utf16be ( -- tag )
utf16be "<" decode-expecting ; utf16be "<" decode-expecting ;
@ -74,6 +77,6 @@ IN: xml.autoencoding
{ HEX: EF [ skip-utf8-bom ] } { HEX: EF [ skip-utf8-bom ] }
{ HEX: FF [ skip-utf16le-bom ] } { HEX: FF [ skip-utf16le-bom ] }
{ HEX: FE [ skip-utf16be-bom ] } { HEX: FE [ skip-utf16be-bom ] }
[ drop utf8 decode-input check f ] [ drop utf8 decode-stream check f ]
} case ; } case ;

View File

@ -4,8 +4,7 @@ USING: accessors kernel namespaces io math ;
IN: xml.state IN: xml.state
TUPLE: spot TUPLE: spot
char { line fixnum } { column fixnum } char line column next check version-1.0? stream ;
next check version-1.0? ;
C: <spot> spot C: <spot> spot

View File

@ -2,7 +2,7 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: namespaces xml.state kernel sequences accessors USING: namespaces xml.state kernel sequences accessors
xml.char-classes xml.errors math io sbufs fry strings ascii xml.char-classes xml.errors math io sbufs fry strings ascii
circular xml.entities assocs make splitting math.parser circular xml.entities assocs splitting math.parser
locals combinators arrays hints ; locals combinators arrays hints ;
IN: xml.tokenize IN: xml.tokenize
@ -31,11 +31,11 @@ HINTS: record { spot fixnum } ;
:: (next) ( spot -- spot char ) :: (next) ( spot -- spot char )
spot next>> :> old-next spot next>> :> old-next
read1 :> new-next spot stream>> stream-read1 :> new-next
old-next CHAR: \r = [ old-next CHAR: \r = [
spot CHAR: \n >>char spot CHAR: \n >>char
new-next CHAR: \n = new-next CHAR: \n =
[ read1 >>next ] [ spot stream>> stream-read1 >>next ]
[ new-next >>next ] if [ new-next >>next ] if
] [ spot old-next >>char new-next >>next ] if ] [ spot old-next >>char new-next >>next ] if
spot next>> ; inline spot next>> ; inline
@ -50,7 +50,9 @@ HINTS: next* { spot } ;
spot get next* ; spot get next* ;
: init-parser ( -- ) : init-parser ( -- )
0 1 0 0 f t <spot> spot set 0 1 0 0 f t f <spot>
input-stream get >>stream
spot set
read1 set-next next ; read1 set-next next ;
: with-state ( stream quot -- ) : with-state ( stream quot -- )
@ -116,18 +118,28 @@ HINTS: next* { spot } ;
take-; dup pe-table get at take-; dup pe-table get at
[ swap push-all ] [ no-entity ] ?if ; [ swap push-all ] [ no-entity ] ?if ;
:: (parse-char) ( quot: ( ch -- ? ) accum -- ) :: (parse-char) ( quot: ( ch -- ? ) accum spot -- )
get-char :> char spot char>> :> char
{ {
{ [ char not ] [ ] } { [ char not ] [ ] }
{ [ char quot call ] [ next ] } { [ char quot call ] [ spot next* ] }
{ [ char CHAR: & = ] [ accum parse-entity quot accum (parse-char) ] } { [ char CHAR: & = ] [
{ [ in-dtd? get char CHAR: % = and ] [ accum parse-pe quot accum (parse-char) ] } accum parse-entity
[ char accum push next quot accum (parse-char) ] quot accum spot (parse-char)
] }
{ [ in-dtd? get char CHAR: % = and ] [
accum parse-pe
quot accum spot (parse-char)
] }
[
char accum push
spot next*
quot accum spot (parse-char)
]
} cond ; inline recursive } cond ; inline recursive
: parse-char ( quot: ( ch -- ? ) -- seq ) : parse-char ( quot: ( ch -- ? ) -- seq )
1024 <sbuf> [ (parse-char) ] keep >string ; inline 1024 <sbuf> [ spot get (parse-char) ] keep >string ; inline
: assure-no-]]> ( circular -- ) : assure-no-]]> ( circular -- )
"]]>" sequence= [ text-w/]]> ] when ; "]]>" sequence= [ text-w/]]> ] when ;