xml: 25% (or more) faster.
Main performance improvements from: - improving text? check performance - fewer "spot get char>>" in skip-until - better string matching (don't use circular-string sequence=)db4
							parent
							
								
									7bdf805a0f
								
							
						
					
					
						commit
						81e3bef507
					
				| 
						 | 
				
			
			@ -27,7 +27,7 @@ IN: xml.autoencoding
 | 
			
		|||
    ascii?
 | 
			
		||||
    [ utf8 decode-stream next make-tag ] [
 | 
			
		||||
        next
 | 
			
		||||
        [ get-next 10xxxxxx? not ] take-until
 | 
			
		||||
        [ drop get-next 10xxxxxx? not ] take-until
 | 
			
		||||
        get-char suffix utf8 decode
 | 
			
		||||
        utf8 decode-stream next
 | 
			
		||||
        continue-make-tag
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -28,16 +28,33 @@ CATEGORY: 1.1name-char
 | 
			
		|||
: name-char? ( 1.0? char -- ? )
 | 
			
		||||
    swap [ 1.0name-char? ] [ 1.1name-char? ] if ;
 | 
			
		||||
 | 
			
		||||
: text? ( 1.0? char -- ? )
 | 
			
		||||
HINTS: name-start? { object fixnum } ;
 | 
			
		||||
HINTS: name-char? { object fixnum } ;
 | 
			
		||||
 | 
			
		||||
<PRIVATE
 | 
			
		||||
 | 
			
		||||
: 1.0-text? ( char -- ? )
 | 
			
		||||
    ! 1.0:
 | 
			
		||||
    ! #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 | 
			
		||||
    {
 | 
			
		||||
        [ HEX: 20 HEX: D7FF between? ]
 | 
			
		||||
        [ "\t\r\n" member? ]
 | 
			
		||||
        [ HEX: E000 HEX: FFFD between? ]
 | 
			
		||||
        [ HEX: 10000 HEX: 10FFFFF between? ]
 | 
			
		||||
    } 1|| ; inline
 | 
			
		||||
 | 
			
		||||
: 1.1-text? ( char -- ? )
 | 
			
		||||
    ! 1.1:
 | 
			
		||||
    ! [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 | 
			
		||||
    {
 | 
			
		||||
        { [ dup HEX: 20 < ] [ swap [ "\t\r\n" member? ] [ zero? not ] if ] }
 | 
			
		||||
        { [ nip dup HEX: D800 < ] [ drop t ] }
 | 
			
		||||
        { [ dup HEX: E000 < ] [ drop f ] }
 | 
			
		||||
        [ { HEX: FFFE HEX: FFFF } member? not ]
 | 
			
		||||
    } cond ;
 | 
			
		||||
        [ HEX: 1 HEX: D7FF between? ]
 | 
			
		||||
        [ HEX: E000 HEX: FFFD between? ]
 | 
			
		||||
        [ HEX: 10000 HEX: 10FFFF between? ]
 | 
			
		||||
    } 1|| ; inline
 | 
			
		||||
 | 
			
		||||
PRIVATE>
 | 
			
		||||
 | 
			
		||||
: text? ( 1.0? char -- ? )
 | 
			
		||||
    swap [ 1.0-text? ] [ 1.1-text? ] if ;
 | 
			
		||||
 | 
			
		||||
HINTS: text? { object fixnum } ;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -9,7 +9,7 @@ IN: xml.elements
 | 
			
		|||
 | 
			
		||||
: take-interpolated ( quot -- interpolated )
 | 
			
		||||
    interpolating? get [
 | 
			
		||||
        drop get-char CHAR: > =
 | 
			
		||||
        drop get-char CHAR: > eq?
 | 
			
		||||
        [ next f ]
 | 
			
		||||
        [ "->" take-string [ blank? ] trim ]
 | 
			
		||||
        if <interpolated>
 | 
			
		||||
| 
						 | 
				
			
			@ -20,13 +20,13 @@ IN: xml.elements
 | 
			
		|||
 | 
			
		||||
: parse-attr ( -- )
 | 
			
		||||
    parse-name pass-blank "=" expect pass-blank
 | 
			
		||||
    get-char CHAR: < =
 | 
			
		||||
    get-char CHAR: < eq?
 | 
			
		||||
    [ "<-" expect interpolate-quote ]
 | 
			
		||||
    [ t parse-quote* ] if 2array , ;
 | 
			
		||||
 | 
			
		||||
: start-tag ( -- name ? )
 | 
			
		||||
    #! Outputs the name and whether this is a closing tag
 | 
			
		||||
    get-char CHAR: / = dup [ next ] when
 | 
			
		||||
    get-char CHAR: / eq? dup [ next ] when
 | 
			
		||||
    parse-name swap ;
 | 
			
		||||
 | 
			
		||||
: (middle-tag) ( -- )
 | 
			
		||||
| 
						 | 
				
			
			@ -41,10 +41,10 @@ IN: xml.elements
 | 
			
		|||
: middle-tag ( -- attrs-alist )
 | 
			
		||||
    ! f make will make a vector if it has any elements
 | 
			
		||||
    [ (middle-tag) ] f make pass-blank
 | 
			
		||||
    assure-no-duplicates ;
 | 
			
		||||
    dup length 1 > [ assure-no-duplicates ] when ;
 | 
			
		||||
 | 
			
		||||
: end-tag ( name attrs-alist -- tag )
 | 
			
		||||
    tag-ns pass-blank get-char CHAR: / =
 | 
			
		||||
    tag-ns pass-blank get-char CHAR: / eq?
 | 
			
		||||
    [ pop-ns <contained> next ">" expect ]
 | 
			
		||||
    [ depth inc <opener> close ] if ;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -136,7 +136,7 @@ DEFER: make-tag ! Is this unavoidable?
 | 
			
		|||
    [ take-external-id ] [ f ] if ;
 | 
			
		||||
 | 
			
		||||
: take-internal ( -- dtd/f )
 | 
			
		||||
    get-char CHAR: [ =
 | 
			
		||||
    get-char CHAR: [ eq?
 | 
			
		||||
    [ next take-internal-subset ] [ f ] if ;
 | 
			
		||||
 | 
			
		||||
: take-doctype-decl ( -- doctype-decl )
 | 
			
		||||
| 
						 | 
				
			
			@ -169,9 +169,9 @@ DEFER: make-tag ! Is this unavoidable?
 | 
			
		|||
    [ "-" bad-name ] take-interpolated ;
 | 
			
		||||
 | 
			
		||||
: make-tag ( -- tag )
 | 
			
		||||
    {
 | 
			
		||||
        { [ get-char dup CHAR: ! = ] [ drop next direct ] }
 | 
			
		||||
        { [ dup CHAR: ? = ] [ drop next instruct ] }
 | 
			
		||||
        { [ dup CHAR: - = ] [ drop next interpolate-tag ] }
 | 
			
		||||
    get-char {
 | 
			
		||||
        { CHAR: ! [ next direct ] }
 | 
			
		||||
        { CHAR: ? [ next instruct ] }
 | 
			
		||||
        { CHAR: - [ next interpolate-tag ] }
 | 
			
		||||
        [ drop normal-tag ]
 | 
			
		||||
    } cond ;
 | 
			
		||||
    } case ;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -66,7 +66,7 @@ SYMBOL: ns-stack
 | 
			
		|||
    ] ?if ;
 | 
			
		||||
 | 
			
		||||
: take-name ( -- string )
 | 
			
		||||
    version-1.0? '[ _ get-char name-char? not ] take-until ;
 | 
			
		||||
    version-1.0? '[ _ swap name-char? not ] take-until ;
 | 
			
		||||
 | 
			
		||||
: parse-name ( -- name )
 | 
			
		||||
    take-name interpret-name ;
 | 
			
		||||
| 
						 | 
				
			
			@ -88,7 +88,7 @@ SYMBOL: ns-stack
 | 
			
		|||
    } case ;
 | 
			
		||||
 | 
			
		||||
: take-word ( -- string )
 | 
			
		||||
    [ get-char blank? ] take-until ;
 | 
			
		||||
    [ blank? ] take-until ;
 | 
			
		||||
 | 
			
		||||
: take-external-id ( -- external-id )
 | 
			
		||||
    take-word (take-external-id) ;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3,33 +3,23 @@
 | 
			
		|||
USING: accessors kernel namespaces io math ;
 | 
			
		||||
IN: xml.state
 | 
			
		||||
 | 
			
		||||
TUPLE: spot
 | 
			
		||||
    char line column next check version-1.0? stream ;
 | 
			
		||||
TUPLE: spot char line column next check version-1.0? stream ;
 | 
			
		||||
 | 
			
		||||
C: <spot> spot
 | 
			
		||||
 | 
			
		||||
: get-char ( -- char ) spot get char>> ;
 | 
			
		||||
: set-char ( char -- ) spot get swap >>char drop ;
 | 
			
		||||
: get-line ( -- line ) spot get line>> ;
 | 
			
		||||
: set-line ( line -- ) spot get swap >>line drop ;
 | 
			
		||||
: get-column ( -- column ) spot get column>> ;
 | 
			
		||||
: set-column ( column -- ) spot get swap >>column drop ;
 | 
			
		||||
: get-next ( -- char ) spot get next>> ;
 | 
			
		||||
: set-next ( char -- ) spot get swap >>next drop ;
 | 
			
		||||
: get-check ( -- ? ) spot get check>> ;
 | 
			
		||||
: check ( -- ) spot get t >>check drop ;
 | 
			
		||||
: version-1.0? ( -- ? ) spot get version-1.0?>> ;
 | 
			
		||||
: get-char ( -- char ) spot get char>> ; inline
 | 
			
		||||
: get-line ( -- line ) spot get line>> ; inline
 | 
			
		||||
: get-column ( -- column ) spot get column>> ; inline
 | 
			
		||||
: get-next ( -- char ) spot get next>> ; inline
 | 
			
		||||
: get-check ( -- ? ) spot get check>> ; inline
 | 
			
		||||
: check ( -- ) spot get t >>check drop ; inline
 | 
			
		||||
: version-1.0? ( -- ? ) spot get version-1.0?>> ; inline
 | 
			
		||||
: set-version ( string -- )
 | 
			
		||||
    spot get swap "1.0" = >>version-1.0? drop ;
 | 
			
		||||
    spot get swap "1.0" = >>version-1.0? drop ; inline
 | 
			
		||||
 | 
			
		||||
SYMBOL: xml-stack
 | 
			
		||||
 | 
			
		||||
SYMBOL: depth
 | 
			
		||||
 | 
			
		||||
SYMBOL: interpolating?
 | 
			
		||||
 | 
			
		||||
SYMBOL: in-dtd?
 | 
			
		||||
 | 
			
		||||
SYMBOL: pe-table
 | 
			
		||||
 | 
			
		||||
SYMBOL: extra-entities
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -5,14 +5,14 @@ IN: xml.test.state
 | 
			
		|||
    [ <string-reader> ] dip with-state ; inline
 | 
			
		||||
 | 
			
		||||
: take-rest ( -- string )
 | 
			
		||||
    [ f ] take-until ;
 | 
			
		||||
    [ drop f ] take-until ;
 | 
			
		||||
 | 
			
		||||
: take-char ( char -- string )
 | 
			
		||||
    1string take-to ;
 | 
			
		||||
 | 
			
		||||
[ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test
 | 
			
		||||
[ 2 3 ] [ "12\n123" [ take-rest drop get-line get-column ] string-parse ] unit-test
 | 
			
		||||
[ "hi" " how are you?" ] [ "hi how are you?" [ [ get-char blank? ] take-until take-rest ] string-parse ] unit-test
 | 
			
		||||
[ "hi" " how are you?" ] [ "hi how are you?" [ [ blank? ] take-until take-rest ] string-parse ] unit-test
 | 
			
		||||
[ "foo" ";bar" ] [ "foo;bar" [ CHAR: ; take-char take-rest ] string-parse ] unit-test
 | 
			
		||||
[ "foo " " bar" ] [ "foo and bar" [ "and" take-string take-rest ] string-parse ] unit-test
 | 
			
		||||
[ "baz" ] [ " \n\t baz" [ pass-blank take-rest ] string-parse ] unit-test
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -2,7 +2,7 @@
 | 
			
		|||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: namespaces xml.state kernel sequences accessors
 | 
			
		||||
xml.char-classes xml.errors math io sbufs fry strings ascii
 | 
			
		||||
circular xml.entities assocs splitting math.parser
 | 
			
		||||
xml.entities assocs splitting math.parser
 | 
			
		||||
locals combinators arrays hints ;
 | 
			
		||||
IN: xml.tokenize
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -10,19 +10,20 @@ IN: xml.tokenize
 | 
			
		|||
 | 
			
		||||
: assure-good-char ( spot ch -- )
 | 
			
		||||
    [
 | 
			
		||||
        swap
 | 
			
		||||
        over
 | 
			
		||||
        [ version-1.0?>> over text? not ]
 | 
			
		||||
        [ check>> ] bi and [
 | 
			
		||||
            spot get [ 1 + ] change-column drop
 | 
			
		||||
        [ check>> ] bi and
 | 
			
		||||
        [
 | 
			
		||||
            [ [ 1 + ] change-column drop ] dip
 | 
			
		||||
            disallowed-char
 | 
			
		||||
        ] [ drop ] if
 | 
			
		||||
        ] [ 2drop ] if
 | 
			
		||||
    ] [ drop ] if* ;
 | 
			
		||||
 | 
			
		||||
HINTS: assure-good-char { spot fixnum } ;
 | 
			
		||||
 | 
			
		||||
: record ( spot char -- spot )
 | 
			
		||||
    over char>> [
 | 
			
		||||
        CHAR: \n =
 | 
			
		||||
        CHAR: \n eq?
 | 
			
		||||
        [ [ 1 + ] change-line -1 ] [ dup column>> 1 + ] if
 | 
			
		||||
        >>column
 | 
			
		||||
    ] [ drop ] if ;
 | 
			
		||||
| 
						 | 
				
			
			@ -32,9 +33,9 @@ HINTS: record { spot fixnum } ;
 | 
			
		|||
:: (next) ( spot -- spot char )
 | 
			
		||||
    spot next>> :> old-next
 | 
			
		||||
    spot stream>> stream-read1 :> new-next
 | 
			
		||||
    old-next CHAR: \r = [
 | 
			
		||||
    old-next CHAR: \r eq? [
 | 
			
		||||
        spot CHAR: \n >>char
 | 
			
		||||
        new-next CHAR: \n =
 | 
			
		||||
        new-next CHAR: \n eq?
 | 
			
		||||
        [ spot stream>> stream-read1 >>next ]
 | 
			
		||||
        [ new-next >>next ] if
 | 
			
		||||
    ] [ spot old-next >>char new-next >>next ] if
 | 
			
		||||
| 
						 | 
				
			
			@ -52,46 +53,46 @@ HINTS: next* { spot } ;
 | 
			
		|||
: init-parser ( -- )
 | 
			
		||||
    0 1 0 0 f t f <spot>
 | 
			
		||||
        input-stream get >>stream
 | 
			
		||||
    spot set
 | 
			
		||||
    read1 set-next next ;
 | 
			
		||||
        read1 >>next
 | 
			
		||||
    spot set next ;
 | 
			
		||||
 | 
			
		||||
: with-state ( stream quot -- )
 | 
			
		||||
    ! with-input-stream implicitly creates a new scope which we use
 | 
			
		||||
    swap [ init-parser call ] with-input-stream ; inline
 | 
			
		||||
 | 
			
		||||
:: (skip-until) ( ... quot: ( ... -- ... ? ) spot -- ... )
 | 
			
		||||
:: (skip-until) ( ... quot: ( ... char -- ... ? ) spot -- ... )
 | 
			
		||||
    spot char>> [
 | 
			
		||||
        quot call [
 | 
			
		||||
            spot next* quot spot (skip-until)
 | 
			
		||||
        ] unless
 | 
			
		||||
    ] when ; inline recursive
 | 
			
		||||
    ] when* ; inline recursive
 | 
			
		||||
 | 
			
		||||
: skip-until ( ... quot: ( ... -- ... ? ) -- ... )
 | 
			
		||||
: skip-until ( ... quot: ( ... char -- ... ? ) -- ... )
 | 
			
		||||
    spot get (skip-until) ; inline
 | 
			
		||||
 | 
			
		||||
: take-until ( quot -- string )
 | 
			
		||||
: take-until ( ... quot: ( ... char -- ... ? ) -- ... string )
 | 
			
		||||
    #! Take the substring of a string starting at spot
 | 
			
		||||
    #! from code until the quotation given is true and
 | 
			
		||||
    #! advance spot to after the substring.
 | 
			
		||||
   10 <sbuf> [
 | 
			
		||||
        spot get swap
 | 
			
		||||
        '[ @ [ t ] [ _ char>> _ push f ] if ] skip-until
 | 
			
		||||
       '[ _ keep over [ drop ] [ _ push ] if ] skip-until
 | 
			
		||||
   ] keep >string ; inline
 | 
			
		||||
 | 
			
		||||
: take-to ( seq -- string )
 | 
			
		||||
    spot get swap '[ _ char>> _ member? ] take-until ;
 | 
			
		||||
    '[ _ member? ] take-until ;
 | 
			
		||||
 | 
			
		||||
: pass-blank ( -- )
 | 
			
		||||
    #! Advance code past any whitespace, including newlines
 | 
			
		||||
    spot get '[ _ char>> blank? not ] skip-until ;
 | 
			
		||||
    [ blank? not ] skip-until ;
 | 
			
		||||
 | 
			
		||||
: string-matches? ( string circular spot -- ? )
 | 
			
		||||
    char>> over circular-push sequence= ;
 | 
			
		||||
: string-matcher ( str -- quot: ( pos char -- pos ? ) )
 | 
			
		||||
    dup length 1 - '[
 | 
			
		||||
        over _ nth eq? [ 1 + ] [ drop 0 ] if dup _ >
 | 
			
		||||
    ] ; inline
 | 
			
		||||
 | 
			
		||||
: take-string ( match -- string )
 | 
			
		||||
    dup length <circular-string>
 | 
			
		||||
    spot get '[ 2dup _ string-matches? ] take-until nip
 | 
			
		||||
    dup length rot length 1 - - head
 | 
			
		||||
    [ 0 swap string-matcher take-until nip ] keep
 | 
			
		||||
    dupd [ length ] bi@ 1 - - head
 | 
			
		||||
    get-char [ missing-close ] unless next ;
 | 
			
		||||
 | 
			
		||||
: expect ( string -- )
 | 
			
		||||
| 
						 | 
				
			
			@ -123,11 +124,11 @@ HINTS: next* { spot } ;
 | 
			
		|||
    {
 | 
			
		||||
        { [ char not ] [ ] }
 | 
			
		||||
        { [ char quot call ] [ spot next* ] }
 | 
			
		||||
        { [ char CHAR: & = ] [
 | 
			
		||||
        { [ char CHAR: & eq? ] [
 | 
			
		||||
            accum parse-entity
 | 
			
		||||
            quot accum spot (parse-char)
 | 
			
		||||
        ] }
 | 
			
		||||
        { [ in-dtd? get char CHAR: % = and ] [
 | 
			
		||||
        { [ char CHAR: % eq? in-dtd? get and ] [
 | 
			
		||||
            accum parse-pe
 | 
			
		||||
            quot accum spot (parse-char)
 | 
			
		||||
        ] }
 | 
			
		||||
| 
						 | 
				
			
			@ -141,18 +142,21 @@ HINTS: next* { spot } ;
 | 
			
		|||
: parse-char ( quot: ( ch -- ? ) -- seq )
 | 
			
		||||
    1024 <sbuf> [ spot get (parse-char) ] keep >string ; inline
 | 
			
		||||
 | 
			
		||||
: assure-no-]]> ( circular -- )
 | 
			
		||||
    "]]>" sequence= [ text-w/]]> ] when ;
 | 
			
		||||
: assure-no-]]> ( pos char -- pos' )
 | 
			
		||||
    over "]]>" nth eq? [ 1 + ] [ drop 0 ] if
 | 
			
		||||
    dup 2 > [ text-w/]]> ] when ;
 | 
			
		||||
 | 
			
		||||
:: parse-text ( -- string )
 | 
			
		||||
    3 f <array> <circular> :> circ
 | 
			
		||||
    depth get zero? :> no-text [| char |
 | 
			
		||||
        char circ circular-push
 | 
			
		||||
        circ assure-no-]]>
 | 
			
		||||
        no-text [ char blank? char CHAR: < = or [
 | 
			
		||||
    0 :> pos!
 | 
			
		||||
    depth get zero? :> no-text
 | 
			
		||||
    [| char |
 | 
			
		||||
        pos char assure-no-]]> pos!
 | 
			
		||||
        no-text [
 | 
			
		||||
            char blank? char CHAR: < eq? or [
 | 
			
		||||
                char 1string t pre/post-content
 | 
			
		||||
        ] unless ] when
 | 
			
		||||
        char CHAR: < =
 | 
			
		||||
            ] unless
 | 
			
		||||
        ] when
 | 
			
		||||
        char CHAR: < eq?
 | 
			
		||||
    ] parse-char ;
 | 
			
		||||
 | 
			
		||||
: close ( -- )
 | 
			
		||||
| 
						 | 
				
			
			@ -163,8 +167,8 @@ HINTS: next* { spot } ;
 | 
			
		|||
 | 
			
		||||
: (parse-quote) ( <-disallowed? ch -- string )
 | 
			
		||||
    swap '[
 | 
			
		||||
        dup _ = [ drop t ]
 | 
			
		||||
        [ CHAR: < = _ and [ attr-w/< ] [ f ] if ] if
 | 
			
		||||
        dup _ eq? [ drop t ]
 | 
			
		||||
        [ CHAR: < eq? _ and [ attr-w/< ] [ f ] if ] if
 | 
			
		||||
    ] parse-char normalize-quote get-char
 | 
			
		||||
    [ unclosed-quote ] unless ; inline
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,7 +1,7 @@
 | 
			
		|||
! Copyright (C) 2005, 2009 Daniel Ehrenberg
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors arrays io io.encodings.binary io.files
 | 
			
		||||
io.streams.string kernel namespaces sequences strings
 | 
			
		||||
io.streams.string kernel math namespaces sequences strings
 | 
			
		||||
io.encodings.utf8 xml.data xml.errors xml.elements ascii
 | 
			
		||||
xml.entities xml.state xml.autoencoding assocs xml.tokenize
 | 
			
		||||
combinators.short-circuit xml.name splitting
 | 
			
		||||
| 
						 | 
				
			
			@ -147,8 +147,8 @@ PRIVATE>
 | 
			
		|||
    swap [ call ] keep ; inline
 | 
			
		||||
 | 
			
		||||
: xml-loop ( quot: ( xml-elem -- ) -- )
 | 
			
		||||
    parse-text call-under
 | 
			
		||||
    get-char [ make-tag call-under xml-loop ]
 | 
			
		||||
    parse-text call-under get-char
 | 
			
		||||
    [ make-tag call-under xml-loop ]
 | 
			
		||||
    [ drop ] if ; inline recursive
 | 
			
		||||
 | 
			
		||||
: read-seq ( stream quot n -- seq )
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue