From 81e3bef5078ca670a6439831b7e013965689f5ab Mon Sep 17 00:00:00 2001 From: John Benediktsson Date: Fri, 30 Sep 2011 12:47:38 -0700 Subject: [PATCH] xml: 25% (or more) faster. Main performance improvements from: - improving text? check performance - fewer "spot get char>>" in skip-until - better string matching (don't use circular-string sequence=) --- basis/xml/autoencoding/autoencoding.factor | 4 +- basis/xml/char-classes/char-classes.factor | 29 ++++++-- basis/xml/elements/elements.factor | 22 +++--- basis/xml/name/name.factor | 4 +- basis/xml/state/state.factor | 28 +++----- basis/xml/tests/state-parser-tests.factor | 4 +- basis/xml/tokenize/tokenize.factor | 82 ++++++++++++---------- basis/xml/xml.factor | 6 +- 8 files changed, 95 insertions(+), 84 deletions(-) diff --git a/basis/xml/autoencoding/autoencoding.factor b/basis/xml/autoencoding/autoencoding.factor index 63482ff706..1057463d4f 100644 --- a/basis/xml/autoencoding/autoencoding.factor +++ b/basis/xml/autoencoding/autoencoding.factor @@ -19,7 +19,7 @@ IN: xml.autoencoding : 10xxxxxx? ( ch -- ? ) -6 shift 3 bitand 2 = ; - + : start + +: text? ( 1.0? char -- ? ) + swap [ 1.0-text? ] [ 1.1-text? ] if ; HINTS: text? { object fixnum } ; diff --git a/basis/xml/elements/elements.factor b/basis/xml/elements/elements.factor index 1e59c19909..8d7f963625 100644 --- a/basis/xml/elements/elements.factor +++ b/basis/xml/elements/elements.factor @@ -9,7 +9,7 @@ IN: xml.elements : take-interpolated ( quot -- interpolated ) interpolating? get [ - drop get-char CHAR: > = + drop get-char CHAR: > eq? [ next f ] [ "->" take-string [ blank? ] trim ] if @@ -20,13 +20,13 @@ IN: xml.elements : parse-attr ( -- ) parse-name pass-blank "=" expect pass-blank - get-char CHAR: < = + get-char CHAR: < eq? [ "<-" expect interpolate-quote ] [ t parse-quote* ] if 2array , ; : start-tag ( -- name ? ) #! Outputs the name and whether this is a closing tag - get-char CHAR: / = dup [ next ] when + get-char CHAR: / eq? dup [ next ] when parse-name swap ; : (middle-tag) ( -- ) @@ -41,10 +41,10 @@ IN: xml.elements : middle-tag ( -- attrs-alist ) ! f make will make a vector if it has any elements [ (middle-tag) ] f make pass-blank - assure-no-duplicates ; + dup length 1 > [ assure-no-duplicates ] when ; : end-tag ( name attrs-alist -- tag ) - tag-ns pass-blank get-char CHAR: / = + tag-ns pass-blank get-char CHAR: / eq? [ pop-ns next ">" expect ] [ depth inc close ] if ; @@ -136,7 +136,7 @@ DEFER: make-tag ! Is this unavoidable? [ take-external-id ] [ f ] if ; : take-internal ( -- dtd/f ) - get-char CHAR: [ = + get-char CHAR: [ eq? [ next take-internal-subset ] [ f ] if ; : take-doctype-decl ( -- doctype-decl ) @@ -169,9 +169,9 @@ DEFER: make-tag ! Is this unavoidable? [ "-" bad-name ] take-interpolated ; : make-tag ( -- tag ) - { - { [ get-char dup CHAR: ! = ] [ drop next direct ] } - { [ dup CHAR: ? = ] [ drop next instruct ] } - { [ dup CHAR: - = ] [ drop next interpolate-tag ] } + get-char { + { CHAR: ! [ next direct ] } + { CHAR: ? [ next instruct ] } + { CHAR: - [ next interpolate-tag ] } [ drop normal-tag ] - } cond ; + } case ; diff --git a/basis/xml/name/name.factor b/basis/xml/name/name.factor index 1907a83a83..7f79622392 100644 --- a/basis/xml/name/name.factor +++ b/basis/xml/name/name.factor @@ -66,7 +66,7 @@ SYMBOL: ns-stack ] ?if ; : take-name ( -- string ) - version-1.0? '[ _ get-char name-char? not ] take-until ; + version-1.0? '[ _ swap name-char? not ] take-until ; : parse-name ( -- name ) take-name interpret-name ; @@ -88,7 +88,7 @@ SYMBOL: ns-stack } case ; : take-word ( -- string ) - [ get-char blank? ] take-until ; + [ blank? ] take-until ; : take-external-id ( -- external-id ) take-word (take-external-id) ; diff --git a/basis/xml/state/state.factor b/basis/xml/state/state.factor index cf103f141b..03f8c8f3fb 100644 --- a/basis/xml/state/state.factor +++ b/basis/xml/state/state.factor @@ -3,33 +3,23 @@ USING: accessors kernel namespaces io math ; IN: xml.state -TUPLE: spot - char line column next check version-1.0? stream ; +TUPLE: spot char line column next check version-1.0? stream ; C: spot -: get-char ( -- char ) spot get char>> ; -: set-char ( char -- ) spot get swap >>char drop ; -: get-line ( -- line ) spot get line>> ; -: set-line ( line -- ) spot get swap >>line drop ; -: get-column ( -- column ) spot get column>> ; -: set-column ( column -- ) spot get swap >>column drop ; -: get-next ( -- char ) spot get next>> ; -: set-next ( char -- ) spot get swap >>next drop ; -: get-check ( -- ? ) spot get check>> ; -: check ( -- ) spot get t >>check drop ; -: version-1.0? ( -- ? ) spot get version-1.0?>> ; +: get-char ( -- char ) spot get char>> ; inline +: get-line ( -- line ) spot get line>> ; inline +: get-column ( -- column ) spot get column>> ; inline +: get-next ( -- char ) spot get next>> ; inline +: get-check ( -- ? ) spot get check>> ; inline +: check ( -- ) spot get t >>check drop ; inline +: version-1.0? ( -- ? ) spot get version-1.0?>> ; inline : set-version ( string -- ) - spot get swap "1.0" = >>version-1.0? drop ; + spot get swap "1.0" = >>version-1.0? drop ; inline SYMBOL: xml-stack - SYMBOL: depth - SYMBOL: interpolating? - SYMBOL: in-dtd? - SYMBOL: pe-table - SYMBOL: extra-entities diff --git a/basis/xml/tests/state-parser-tests.factor b/basis/xml/tests/state-parser-tests.factor index 5e214dc4a3..550378fea0 100644 --- a/basis/xml/tests/state-parser-tests.factor +++ b/basis/xml/tests/state-parser-tests.factor @@ -5,14 +5,14 @@ IN: xml.test.state [ ] dip with-state ; inline : take-rest ( -- string ) - [ f ] take-until ; + [ drop f ] take-until ; : take-char ( char -- string ) 1string take-to ; [ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test [ 2 3 ] [ "12\n123" [ take-rest drop get-line get-column ] string-parse ] unit-test -[ "hi" " how are you?" ] [ "hi how are you?" [ [ get-char blank? ] take-until take-rest ] string-parse ] unit-test +[ "hi" " how are you?" ] [ "hi how are you?" [ [ blank? ] take-until take-rest ] string-parse ] unit-test [ "foo" ";bar" ] [ "foo;bar" [ CHAR: ; take-char take-rest ] string-parse ] unit-test [ "foo " " bar" ] [ "foo and bar" [ "and" take-string take-rest ] string-parse ] unit-test [ "baz" ] [ " \n\t baz" [ pass-blank take-rest ] string-parse ] unit-test diff --git a/basis/xml/tokenize/tokenize.factor b/basis/xml/tokenize/tokenize.factor index 8978c660f4..f1f8bc83fb 100644 --- a/basis/xml/tokenize/tokenize.factor +++ b/basis/xml/tokenize/tokenize.factor @@ -2,7 +2,7 @@ ! See http://factorcode.org/license.txt for BSD license. USING: namespaces xml.state kernel sequences accessors xml.char-classes xml.errors math io sbufs fry strings ascii -circular xml.entities assocs splitting math.parser +xml.entities assocs splitting math.parser locals combinators arrays hints ; IN: xml.tokenize @@ -10,19 +10,20 @@ IN: xml.tokenize : assure-good-char ( spot ch -- ) [ - swap + over [ version-1.0?>> over text? not ] - [ check>> ] bi and [ - spot get [ 1 + ] change-column drop + [ check>> ] bi and + [ + [ [ 1 + ] change-column drop ] dip disallowed-char - ] [ drop ] if + ] [ 2drop ] if ] [ drop ] if* ; HINTS: assure-good-char { spot fixnum } ; : record ( spot char -- spot ) over char>> [ - CHAR: \n = + CHAR: \n eq? [ [ 1 + ] change-line -1 ] [ dup column>> 1 + ] if >>column ] [ drop ] if ; @@ -32,9 +33,9 @@ HINTS: record { spot fixnum } ; :: (next) ( spot -- spot char ) spot next>> :> old-next spot stream>> stream-read1 :> new-next - old-next CHAR: \r = [ + old-next CHAR: \r eq? [ spot CHAR: \n >>char - new-next CHAR: \n = + new-next CHAR: \n eq? [ spot stream>> stream-read1 >>next ] [ new-next >>next ] if ] [ spot old-next >>char new-next >>next ] if @@ -52,46 +53,46 @@ HINTS: next* { spot } ; : init-parser ( -- ) 0 1 0 0 f t f input-stream get >>stream - spot set - read1 set-next next ; + read1 >>next + spot set next ; : with-state ( stream quot -- ) ! with-input-stream implicitly creates a new scope which we use swap [ init-parser call ] with-input-stream ; inline -:: (skip-until) ( ... quot: ( ... -- ... ? ) spot -- ... ) +:: (skip-until) ( ... quot: ( ... char -- ... ? ) spot -- ... ) spot char>> [ quot call [ spot next* quot spot (skip-until) ] unless - ] when ; inline recursive + ] when* ; inline recursive -: skip-until ( ... quot: ( ... -- ... ? ) -- ... ) +: skip-until ( ... quot: ( ... char -- ... ? ) -- ... ) spot get (skip-until) ; inline -: take-until ( quot -- string ) +: take-until ( ... quot: ( ... char -- ... ? ) -- ... string ) #! Take the substring of a string starting at spot #! from code until the quotation given is true and #! advance spot to after the substring. - 10 [ - spot get swap - '[ @ [ t ] [ _ char>> _ push f ] if ] skip-until - ] keep >string ; inline + 10 [ + '[ _ keep over [ drop ] [ _ push ] if ] skip-until + ] keep >string ; inline : take-to ( seq -- string ) - spot get swap '[ _ char>> _ member? ] take-until ; + '[ _ member? ] take-until ; : pass-blank ( -- ) #! Advance code past any whitespace, including newlines - spot get '[ _ char>> blank? not ] skip-until ; + [ blank? not ] skip-until ; -: string-matches? ( string circular spot -- ? ) - char>> over circular-push sequence= ; +: string-matcher ( str -- quot: ( pos char -- pos ? ) ) + dup length 1 - '[ + over _ nth eq? [ 1 + ] [ drop 0 ] if dup _ > + ] ; inline : take-string ( match -- string ) - dup length - spot get '[ 2dup _ string-matches? ] take-until nip - dup length rot length 1 - - head + [ 0 swap string-matcher take-until nip ] keep + dupd [ length ] bi@ 1 - - head get-char [ missing-close ] unless next ; : expect ( string -- ) @@ -123,11 +124,11 @@ HINTS: next* { spot } ; { { [ char not ] [ ] } { [ char quot call ] [ spot next* ] } - { [ char CHAR: & = ] [ + { [ char CHAR: & eq? ] [ accum parse-entity quot accum spot (parse-char) ] } - { [ in-dtd? get char CHAR: % = and ] [ + { [ char CHAR: % eq? in-dtd? get and ] [ accum parse-pe quot accum spot (parse-char) ] } @@ -141,18 +142,21 @@ HINTS: next* { spot } ; : parse-char ( quot: ( ch -- ? ) -- seq ) 1024 [ spot get (parse-char) ] keep >string ; inline -: assure-no-]]> ( circular -- ) - "]]>" sequence= [ text-w/]]> ] when ; +: assure-no-]]> ( pos char -- pos' ) + over "]]>" nth eq? [ 1 + ] [ drop 0 ] if + dup 2 > [ text-w/]]> ] when ; :: parse-text ( -- string ) - 3 f :> circ - depth get zero? :> no-text [| char | - char circ circular-push - circ assure-no-]]> - no-text [ char blank? char CHAR: < = or [ - char 1string t pre/post-content - ] unless ] when - char CHAR: < = + 0 :> pos! + depth get zero? :> no-text + [| char | + pos char assure-no-]]> pos! + no-text [ + char blank? char CHAR: < eq? or [ + char 1string t pre/post-content + ] unless + ] when + char CHAR: < eq? ] parse-char ; : close ( -- ) @@ -163,8 +167,8 @@ HINTS: next* { spot } ; : (parse-quote) ( <-disallowed? ch -- string ) swap '[ - dup _ = [ drop t ] - [ CHAR: < = _ and [ attr-w/< ] [ f ] if ] if + dup _ eq? [ drop t ] + [ CHAR: < eq? _ and [ attr-w/< ] [ f ] if ] if ] parse-char normalize-quote get-char [ unclosed-quote ] unless ; inline diff --git a/basis/xml/xml.factor b/basis/xml/xml.factor index ac6fbfcddc..7667ff3b21 100644 --- a/basis/xml/xml.factor +++ b/basis/xml/xml.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2005, 2009 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays io io.encodings.binary io.files -io.streams.string kernel namespaces sequences strings +io.streams.string kernel math namespaces sequences strings io.encodings.utf8 xml.data xml.errors xml.elements ascii xml.entities xml.state xml.autoencoding assocs xml.tokenize combinators.short-circuit xml.name splitting @@ -147,8 +147,8 @@ PRIVATE> swap [ call ] keep ; inline : xml-loop ( quot: ( xml-elem -- ) -- ) - parse-text call-under - get-char [ make-tag call-under xml-loop ] + parse-text call-under get-char + [ make-tag call-under xml-loop ] [ drop ] if ; inline recursive : read-seq ( stream quot n -- seq )