xml: 25% (or more) faster.

Main performance improvements from:
- improving text? check performance
- fewer "spot get char>>" in skip-until
- better string matching (don't use circular-string sequence=)
db4
John Benediktsson 2011-09-30 12:47:38 -07:00
parent 7bdf805a0f
commit 81e3bef507
8 changed files with 95 additions and 84 deletions

View File

@ -27,7 +27,7 @@ IN: xml.autoencoding
ascii? ascii?
[ utf8 decode-stream next make-tag ] [ [ utf8 decode-stream next make-tag ] [
next next
[ get-next 10xxxxxx? not ] take-until [ drop get-next 10xxxxxx? not ] take-until
get-char suffix utf8 decode get-char suffix utf8 decode
utf8 decode-stream next utf8 decode-stream next
continue-make-tag continue-make-tag

View File

@ -28,16 +28,33 @@ CATEGORY: 1.1name-char
: name-char? ( 1.0? char -- ? ) : name-char? ( 1.0? char -- ? )
swap [ 1.0name-char? ] [ 1.1name-char? ] if ; swap [ 1.0name-char? ] [ 1.1name-char? ] if ;
: text? ( 1.0? char -- ? ) HINTS: name-start? { object fixnum } ;
HINTS: name-char? { object fixnum } ;
<PRIVATE
: 1.0-text? ( char -- ? )
! 1.0: ! 1.0:
! #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] ! #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
{
[ HEX: 20 HEX: D7FF between? ]
[ "\t\r\n" member? ]
[ HEX: E000 HEX: FFFD between? ]
[ HEX: 10000 HEX: 10FFFFF between? ]
} 1|| ; inline
: 1.1-text? ( char -- ? )
! 1.1: ! 1.1:
! [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] ! [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
{ {
{ [ dup HEX: 20 < ] [ swap [ "\t\r\n" member? ] [ zero? not ] if ] } [ HEX: 1 HEX: D7FF between? ]
{ [ nip dup HEX: D800 < ] [ drop t ] } [ HEX: E000 HEX: FFFD between? ]
{ [ dup HEX: E000 < ] [ drop f ] } [ HEX: 10000 HEX: 10FFFF between? ]
[ { HEX: FFFE HEX: FFFF } member? not ] } 1|| ; inline
} cond ;
PRIVATE>
: text? ( 1.0? char -- ? )
swap [ 1.0-text? ] [ 1.1-text? ] if ;
HINTS: text? { object fixnum } ; HINTS: text? { object fixnum } ;

View File

@ -9,7 +9,7 @@ IN: xml.elements
: take-interpolated ( quot -- interpolated ) : take-interpolated ( quot -- interpolated )
interpolating? get [ interpolating? get [
drop get-char CHAR: > = drop get-char CHAR: > eq?
[ next f ] [ next f ]
[ "->" take-string [ blank? ] trim ] [ "->" take-string [ blank? ] trim ]
if <interpolated> if <interpolated>
@ -20,13 +20,13 @@ IN: xml.elements
: parse-attr ( -- ) : parse-attr ( -- )
parse-name pass-blank "=" expect pass-blank parse-name pass-blank "=" expect pass-blank
get-char CHAR: < = get-char CHAR: < eq?
[ "<-" expect interpolate-quote ] [ "<-" expect interpolate-quote ]
[ t parse-quote* ] if 2array , ; [ t parse-quote* ] if 2array , ;
: start-tag ( -- name ? ) : start-tag ( -- name ? )
#! Outputs the name and whether this is a closing tag #! Outputs the name and whether this is a closing tag
get-char CHAR: / = dup [ next ] when get-char CHAR: / eq? dup [ next ] when
parse-name swap ; parse-name swap ;
: (middle-tag) ( -- ) : (middle-tag) ( -- )
@ -41,10 +41,10 @@ IN: xml.elements
: middle-tag ( -- attrs-alist ) : middle-tag ( -- attrs-alist )
! f make will make a vector if it has any elements ! f make will make a vector if it has any elements
[ (middle-tag) ] f make pass-blank [ (middle-tag) ] f make pass-blank
assure-no-duplicates ; dup length 1 > [ assure-no-duplicates ] when ;
: end-tag ( name attrs-alist -- tag ) : end-tag ( name attrs-alist -- tag )
tag-ns pass-blank get-char CHAR: / = tag-ns pass-blank get-char CHAR: / eq?
[ pop-ns <contained> next ">" expect ] [ pop-ns <contained> next ">" expect ]
[ depth inc <opener> close ] if ; [ depth inc <opener> close ] if ;
@ -136,7 +136,7 @@ DEFER: make-tag ! Is this unavoidable?
[ take-external-id ] [ f ] if ; [ take-external-id ] [ f ] if ;
: take-internal ( -- dtd/f ) : take-internal ( -- dtd/f )
get-char CHAR: [ = get-char CHAR: [ eq?
[ next take-internal-subset ] [ f ] if ; [ next take-internal-subset ] [ f ] if ;
: take-doctype-decl ( -- doctype-decl ) : take-doctype-decl ( -- doctype-decl )
@ -169,9 +169,9 @@ DEFER: make-tag ! Is this unavoidable?
[ "-" bad-name ] take-interpolated ; [ "-" bad-name ] take-interpolated ;
: make-tag ( -- tag ) : make-tag ( -- tag )
{ get-char {
{ [ get-char dup CHAR: ! = ] [ drop next direct ] } { CHAR: ! [ next direct ] }
{ [ dup CHAR: ? = ] [ drop next instruct ] } { CHAR: ? [ next instruct ] }
{ [ dup CHAR: - = ] [ drop next interpolate-tag ] } { CHAR: - [ next interpolate-tag ] }
[ drop normal-tag ] [ drop normal-tag ]
} cond ; } case ;

View File

@ -66,7 +66,7 @@ SYMBOL: ns-stack
] ?if ; ] ?if ;
: take-name ( -- string ) : take-name ( -- string )
version-1.0? '[ _ get-char name-char? not ] take-until ; version-1.0? '[ _ swap name-char? not ] take-until ;
: parse-name ( -- name ) : parse-name ( -- name )
take-name interpret-name ; take-name interpret-name ;
@ -88,7 +88,7 @@ SYMBOL: ns-stack
} case ; } case ;
: take-word ( -- string ) : take-word ( -- string )
[ get-char blank? ] take-until ; [ blank? ] take-until ;
: take-external-id ( -- external-id ) : take-external-id ( -- external-id )
take-word (take-external-id) ; take-word (take-external-id) ;

View File

@ -3,33 +3,23 @@
USING: accessors kernel namespaces io math ; USING: accessors kernel namespaces io math ;
IN: xml.state IN: xml.state
TUPLE: spot TUPLE: spot char line column next check version-1.0? stream ;
char line column next check version-1.0? stream ;
C: <spot> spot C: <spot> spot
: get-char ( -- char ) spot get char>> ; : get-char ( -- char ) spot get char>> ; inline
: set-char ( char -- ) spot get swap >>char drop ; : get-line ( -- line ) spot get line>> ; inline
: get-line ( -- line ) spot get line>> ; : get-column ( -- column ) spot get column>> ; inline
: set-line ( line -- ) spot get swap >>line drop ; : get-next ( -- char ) spot get next>> ; inline
: get-column ( -- column ) spot get column>> ; : get-check ( -- ? ) spot get check>> ; inline
: set-column ( column -- ) spot get swap >>column drop ; : check ( -- ) spot get t >>check drop ; inline
: get-next ( -- char ) spot get next>> ; : version-1.0? ( -- ? ) spot get version-1.0?>> ; inline
: set-next ( char -- ) spot get swap >>next drop ;
: get-check ( -- ? ) spot get check>> ;
: check ( -- ) spot get t >>check drop ;
: version-1.0? ( -- ? ) spot get version-1.0?>> ;
: set-version ( string -- ) : set-version ( string -- )
spot get swap "1.0" = >>version-1.0? drop ; spot get swap "1.0" = >>version-1.0? drop ; inline
SYMBOL: xml-stack SYMBOL: xml-stack
SYMBOL: depth SYMBOL: depth
SYMBOL: interpolating? SYMBOL: interpolating?
SYMBOL: in-dtd? SYMBOL: in-dtd?
SYMBOL: pe-table SYMBOL: pe-table
SYMBOL: extra-entities SYMBOL: extra-entities

View File

@ -5,14 +5,14 @@ IN: xml.test.state
[ <string-reader> ] dip with-state ; inline [ <string-reader> ] dip with-state ; inline
: take-rest ( -- string ) : take-rest ( -- string )
[ f ] take-until ; [ drop f ] take-until ;
: take-char ( char -- string ) : take-char ( char -- string )
1string take-to ; 1string take-to ;
[ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test [ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test
[ 2 3 ] [ "12\n123" [ take-rest drop get-line get-column ] string-parse ] unit-test [ 2 3 ] [ "12\n123" [ take-rest drop get-line get-column ] string-parse ] unit-test
[ "hi" " how are you?" ] [ "hi how are you?" [ [ get-char blank? ] take-until take-rest ] string-parse ] unit-test [ "hi" " how are you?" ] [ "hi how are you?" [ [ blank? ] take-until take-rest ] string-parse ] unit-test
[ "foo" ";bar" ] [ "foo;bar" [ CHAR: ; take-char take-rest ] string-parse ] unit-test [ "foo" ";bar" ] [ "foo;bar" [ CHAR: ; take-char take-rest ] string-parse ] unit-test
[ "foo " " bar" ] [ "foo and bar" [ "and" take-string take-rest ] string-parse ] unit-test [ "foo " " bar" ] [ "foo and bar" [ "and" take-string take-rest ] string-parse ] unit-test
[ "baz" ] [ " \n\t baz" [ pass-blank take-rest ] string-parse ] unit-test [ "baz" ] [ " \n\t baz" [ pass-blank take-rest ] string-parse ] unit-test

View File

@ -2,7 +2,7 @@
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: namespaces xml.state kernel sequences accessors USING: namespaces xml.state kernel sequences accessors
xml.char-classes xml.errors math io sbufs fry strings ascii xml.char-classes xml.errors math io sbufs fry strings ascii
circular xml.entities assocs splitting math.parser xml.entities assocs splitting math.parser
locals combinators arrays hints ; locals combinators arrays hints ;
IN: xml.tokenize IN: xml.tokenize
@ -10,19 +10,20 @@ IN: xml.tokenize
: assure-good-char ( spot ch -- ) : assure-good-char ( spot ch -- )
[ [
swap over
[ version-1.0?>> over text? not ] [ version-1.0?>> over text? not ]
[ check>> ] bi and [ [ check>> ] bi and
spot get [ 1 + ] change-column drop [
[ [ 1 + ] change-column drop ] dip
disallowed-char disallowed-char
] [ drop ] if ] [ 2drop ] if
] [ drop ] if* ; ] [ drop ] if* ;
HINTS: assure-good-char { spot fixnum } ; HINTS: assure-good-char { spot fixnum } ;
: record ( spot char -- spot ) : record ( spot char -- spot )
over char>> [ over char>> [
CHAR: \n = CHAR: \n eq?
[ [ 1 + ] change-line -1 ] [ dup column>> 1 + ] if [ [ 1 + ] change-line -1 ] [ dup column>> 1 + ] if
>>column >>column
] [ drop ] if ; ] [ drop ] if ;
@ -32,9 +33,9 @@ HINTS: record { spot fixnum } ;
:: (next) ( spot -- spot char ) :: (next) ( spot -- spot char )
spot next>> :> old-next spot next>> :> old-next
spot stream>> stream-read1 :> new-next spot stream>> stream-read1 :> new-next
old-next CHAR: \r = [ old-next CHAR: \r eq? [
spot CHAR: \n >>char spot CHAR: \n >>char
new-next CHAR: \n = new-next CHAR: \n eq?
[ spot stream>> stream-read1 >>next ] [ spot stream>> stream-read1 >>next ]
[ new-next >>next ] if [ new-next >>next ] if
] [ spot old-next >>char new-next >>next ] if ] [ spot old-next >>char new-next >>next ] if
@ -52,46 +53,46 @@ HINTS: next* { spot } ;
: init-parser ( -- ) : init-parser ( -- )
0 1 0 0 f t f <spot> 0 1 0 0 f t f <spot>
input-stream get >>stream input-stream get >>stream
spot set read1 >>next
read1 set-next next ; spot set next ;
: with-state ( stream quot -- ) : with-state ( stream quot -- )
! with-input-stream implicitly creates a new scope which we use ! with-input-stream implicitly creates a new scope which we use
swap [ init-parser call ] with-input-stream ; inline swap [ init-parser call ] with-input-stream ; inline
:: (skip-until) ( ... quot: ( ... -- ... ? ) spot -- ... ) :: (skip-until) ( ... quot: ( ... char -- ... ? ) spot -- ... )
spot char>> [ spot char>> [
quot call [ quot call [
spot next* quot spot (skip-until) spot next* quot spot (skip-until)
] unless ] unless
] when ; inline recursive ] when* ; inline recursive
: skip-until ( ... quot: ( ... -- ... ? ) -- ... ) : skip-until ( ... quot: ( ... char -- ... ? ) -- ... )
spot get (skip-until) ; inline spot get (skip-until) ; inline
: take-until ( quot -- string ) : take-until ( ... quot: ( ... char -- ... ? ) -- ... string )
#! Take the substring of a string starting at spot #! Take the substring of a string starting at spot
#! from code until the quotation given is true and #! from code until the quotation given is true and
#! advance spot to after the substring. #! advance spot to after the substring.
10 <sbuf> [ 10 <sbuf> [
spot get swap '[ _ keep over [ drop ] [ _ push ] if ] skip-until
'[ @ [ t ] [ _ char>> _ push f ] if ] skip-until
] keep >string ; inline ] keep >string ; inline
: take-to ( seq -- string ) : take-to ( seq -- string )
spot get swap '[ _ char>> _ member? ] take-until ; '[ _ member? ] take-until ;
: pass-blank ( -- ) : pass-blank ( -- )
#! Advance code past any whitespace, including newlines #! Advance code past any whitespace, including newlines
spot get '[ _ char>> blank? not ] skip-until ; [ blank? not ] skip-until ;
: string-matches? ( string circular spot -- ? ) : string-matcher ( str -- quot: ( pos char -- pos ? ) )
char>> over circular-push sequence= ; dup length 1 - '[
over _ nth eq? [ 1 + ] [ drop 0 ] if dup _ >
] ; inline
: take-string ( match -- string ) : take-string ( match -- string )
dup length <circular-string> [ 0 swap string-matcher take-until nip ] keep
spot get '[ 2dup _ string-matches? ] take-until nip dupd [ length ] bi@ 1 - - head
dup length rot length 1 - - head
get-char [ missing-close ] unless next ; get-char [ missing-close ] unless next ;
: expect ( string -- ) : expect ( string -- )
@ -123,11 +124,11 @@ HINTS: next* { spot } ;
{ {
{ [ char not ] [ ] } { [ char not ] [ ] }
{ [ char quot call ] [ spot next* ] } { [ char quot call ] [ spot next* ] }
{ [ char CHAR: & = ] [ { [ char CHAR: & eq? ] [
accum parse-entity accum parse-entity
quot accum spot (parse-char) quot accum spot (parse-char)
] } ] }
{ [ in-dtd? get char CHAR: % = and ] [ { [ char CHAR: % eq? in-dtd? get and ] [
accum parse-pe accum parse-pe
quot accum spot (parse-char) quot accum spot (parse-char)
] } ] }
@ -141,18 +142,21 @@ HINTS: next* { spot } ;
: parse-char ( quot: ( ch -- ? ) -- seq ) : parse-char ( quot: ( ch -- ? ) -- seq )
1024 <sbuf> [ spot get (parse-char) ] keep >string ; inline 1024 <sbuf> [ spot get (parse-char) ] keep >string ; inline
: assure-no-]]> ( circular -- ) : assure-no-]]> ( pos char -- pos' )
"]]>" sequence= [ text-w/]]> ] when ; over "]]>" nth eq? [ 1 + ] [ drop 0 ] if
dup 2 > [ text-w/]]> ] when ;
:: parse-text ( -- string ) :: parse-text ( -- string )
3 f <array> <circular> :> circ 0 :> pos!
depth get zero? :> no-text [| char | depth get zero? :> no-text
char circ circular-push [| char |
circ assure-no-]]> pos char assure-no-]]> pos!
no-text [ char blank? char CHAR: < = or [ no-text [
char blank? char CHAR: < eq? or [
char 1string t pre/post-content char 1string t pre/post-content
] unless ] when ] unless
char CHAR: < = ] when
char CHAR: < eq?
] parse-char ; ] parse-char ;
: close ( -- ) : close ( -- )
@ -163,8 +167,8 @@ HINTS: next* { spot } ;
: (parse-quote) ( <-disallowed? ch -- string ) : (parse-quote) ( <-disallowed? ch -- string )
swap '[ swap '[
dup _ = [ drop t ] dup _ eq? [ drop t ]
[ CHAR: < = _ and [ attr-w/< ] [ f ] if ] if [ CHAR: < eq? _ and [ attr-w/< ] [ f ] if ] if
] parse-char normalize-quote get-char ] parse-char normalize-quote get-char
[ unclosed-quote ] unless ; inline [ unclosed-quote ] unless ; inline

View File

@ -1,7 +1,7 @@
! Copyright (C) 2005, 2009 Daniel Ehrenberg ! Copyright (C) 2005, 2009 Daniel Ehrenberg
! See http://factorcode.org/license.txt for BSD license. ! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays io io.encodings.binary io.files USING: accessors arrays io io.encodings.binary io.files
io.streams.string kernel namespaces sequences strings io.streams.string kernel math namespaces sequences strings
io.encodings.utf8 xml.data xml.errors xml.elements ascii io.encodings.utf8 xml.data xml.errors xml.elements ascii
xml.entities xml.state xml.autoencoding assocs xml.tokenize xml.entities xml.state xml.autoencoding assocs xml.tokenize
combinators.short-circuit xml.name splitting combinators.short-circuit xml.name splitting
@ -147,8 +147,8 @@ PRIVATE>
swap [ call ] keep ; inline swap [ call ] keep ; inline
: xml-loop ( quot: ( xml-elem -- ) -- ) : xml-loop ( quot: ( xml-elem -- ) -- )
parse-text call-under parse-text call-under get-char
get-char [ make-tag call-under xml-loop ] [ make-tag call-under xml-loop ]
[ drop ] if ; inline recursive [ drop ] if ; inline recursive
: read-seq ( stream quot n -- seq ) : read-seq ( stream quot n -- seq )