Making XML slightly faster

2009-01-29 22:17:55 -06:00 · 2009-01-29 22:17:55 -06:00 · b7975de956
parent eccabfea12
commit b7975de956
3 changed files with 36 additions and 22 deletions
--- a/basis/xml/autoencoding/autoencoding.factor
+++ b/basis/xml/autoencoding/autoencoding.factor
@ -6,11 +6,14 @@ io.encodings.string io.encodings combinators accessors
 xml.data io.encodings.iana ;
 IN: xml.autoencoding

+: decode-stream ( encoding -- )
+    spot get [ swap re-decode ] change-stream drop ;
+
 : continue-make-tag ( str -- tag )
    parse-name-starting middle-tag end-tag ;

 : start-utf16le ( -- tag )
-    utf16le decode-input
+    utf16le decode-stream
    "?\0" expect
    check instruct ;

@ -22,25 +25,25 @@ IN: xml.autoencoding
    ! that the first letter of the document is < and second is
    ! not ASCII
    ascii?
-    [ utf8 decode-input next make-tag ] [
+    [ utf8 decode-stream next make-tag ] [
        next
        [ get-next 10xxxxxx? not ] take-until
        get-char suffix utf8 decode
-        utf8 decode-input next
+        utf8 decode-stream next
        continue-make-tag
    ] if ;

 : prolog-encoding ( prolog -- )
    encoding>> dup "UTF-16" =
-    [ drop ] [ name>encoding [ decode-input ] when* ] if ;
+    [ drop ] [ name>encoding [ decode-stream ] when* ] if ;

 : instruct-encoding ( instruct/prolog -- )
    dup prolog?
    [ prolog-encoding ]
-    [ drop utf8 decode-input ] if ;
+    [ drop utf8 decode-stream ] if ;

 : go-utf8 ( -- )
-    check utf8 decode-input next next ;
+    check utf8 decode-stream next next ;

 : start< ( -- tag )
    ! What if first letter of processing instruction is non-ASCII?
@ -52,11 +55,11 @@ IN: xml.autoencoding
    } case ;

 : skip-utf8-bom ( -- tag )
-    "\u0000bb\u0000bf" expect utf8 decode-input
+    "\u0000bb\u0000bf" expect utf8 decode-stream
    "<" expect check make-tag ;

 : decode-expecting ( encoding string -- tag )
-    [ decode-input next ] [ expect ] bi* check make-tag ;
+    [ decode-stream next ] [ expect ] bi* check make-tag ;

 : start-utf16be ( -- tag )
    utf16be "<" decode-expecting ;
@ -74,6 +77,6 @@ IN: xml.autoencoding
        { HEX: EF [ skip-utf8-bom ] }
        { HEX: FF [ skip-utf16le-bom ] }
        { HEX: FE [ skip-utf16be-bom ] }
-        [ drop utf8 decode-input check f ]
+        [ drop utf8 decode-stream check f ]
    } case ;

--- a/basis/xml/state/state.factor
+++ b/basis/xml/state/state.factor
@ -4,8 +4,7 @@ USING: accessors kernel namespaces io math ;
 IN: xml.state

 TUPLE: spot
-    char { line fixnum } { column fixnum }
-    next check version-1.0? ;
+    char line column next check version-1.0? stream ;

 C: <spot> spot

--- a/basis/xml/tokenize/tokenize.factor
+++ b/basis/xml/tokenize/tokenize.factor
@ -2,7 +2,7 @@
 ! See http://factorcode.org/license.txt for BSD license.
 USING: namespaces xml.state kernel sequences accessors
 xml.char-classes xml.errors math io sbufs fry strings ascii
-circular xml.entities assocs make splitting math.parser
+circular xml.entities assocs splitting math.parser
 locals combinators arrays hints ;
 IN: xml.tokenize

@ -31,11 +31,11 @@ HINTS: record { spot fixnum } ;

 :: (next) ( spot -- spot char )
    spot next>> :> old-next
-    read1 :> new-next
+    spot stream>> stream-read1 :> new-next
    old-next CHAR: \r = [
        spot CHAR: \n >>char
        new-next CHAR: \n =
-        [ read1 >>next ]
+        [ spot stream>> stream-read1 >>next ]
        [ new-next >>next ] if
    ] [ spot old-next >>char new-next >>next ] if
    spot next>> ; inline
@ -50,7 +50,9 @@ HINTS: next* { spot } ;
    spot get next* ;

 : init-parser ( -- )
-    0 1 0 0 f t <spot> spot set
+    0 1 0 0 f t f <spot>
+        input-stream get >>stream
+    spot set
    read1 set-next next ;

 : with-state ( stream quot -- )
@ -116,18 +118,28 @@ HINTS: next* { spot } ;
    take-; dup pe-table get at
    [ swap push-all ] [ no-entity ] ?if ;

-:: (parse-char) ( quot: ( ch -- ? ) accum -- )
-    get-char :> char
+:: (parse-char) ( quot: ( ch -- ? ) accum spot -- )
+    spot char>> :> char
    {
        { [ char not ] [ ] }
-        { [ char quot call ] [ next ] }
-        { [ char CHAR: & = ] [ accum parse-entity quot accum (parse-char) ] }
-        { [ in-dtd? get char CHAR: % = and ] [ accum parse-pe quot accum (parse-char) ] }
-        [ char accum push next quot accum (parse-char) ]
+        { [ char quot call ] [ spot next* ] }
+        { [ char CHAR: & = ] [
+            accum parse-entity
+            quot accum spot (parse-char)
+        ] }
+        { [ in-dtd? get char CHAR: % = and ] [
+            accum parse-pe
+            quot accum spot (parse-char)
+        ] }
+        [
+            char accum push
+            spot next*
+            quot accum spot (parse-char)
+        ]
    } cond ; inline recursive

 : parse-char ( quot: ( ch -- ? ) -- seq )
-    1024 <sbuf> [ (parse-char) ] keep >string ; inline
+    1024 <sbuf> [ spot get (parse-char) ] keep >string ; inline

 : assure-no-]]> ( circular -- )
    "]]>" sequence= [ text-w/]]> ] when ;