Making XML 30% faster
parent
afacfc1a17
commit
eccabfea12
|
@ -1,6 +1,7 @@
|
||||||
! Copyright (C) 2005, 2007 Daniel Ehrenberg
|
! Copyright (C) 2005, 2009 Daniel Ehrenberg
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: kernel sequences unicode.syntax math math.order combinators ;
|
USING: kernel sequences unicode.syntax math math.order combinators
|
||||||
|
hints ;
|
||||||
IN: xml.char-classes
|
IN: xml.char-classes
|
||||||
|
|
||||||
CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_: ;
|
CATEGORY: 1.0name-start* Ll Lu Lo Lt Nl \u000559\u0006E5\u0006E6_: ;
|
||||||
|
@ -31,3 +32,5 @@ CATEGORY: 1.1name-char Ll Lu Lo Lm Ln Nl Mc Mn Nd Pc Cf _-.\u0000b7: ;
|
||||||
{ [ dup HEX: E000 < ] [ drop f ] }
|
{ [ dup HEX: E000 < ] [ drop f ] }
|
||||||
[ { HEX: FFFE HEX: FFFF } member? not ]
|
[ { HEX: FFFE HEX: FFFF } member? not ]
|
||||||
} cond ;
|
} cond ;
|
||||||
|
|
||||||
|
HINTS: text? { object fixnum } ;
|
||||||
|
|
|
@ -6,11 +6,11 @@ IN: xml.errors.tests
|
||||||
'[ _ string>xml ] swap '[ _ = ] must-fail-with ;
|
'[ _ string>xml ] swap '[ _ = ] must-fail-with ;
|
||||||
|
|
||||||
T{ no-entity f 1 10 "nbsp" } "<x>&nbsp;</x>" xml-error-test
|
T{ no-entity f 1 10 "nbsp" } "<x>&nbsp;</x>" xml-error-test
|
||||||
T{ mismatched f 1 8 T{ name f "" "x" "" } T{ name f "" "y" "" } }
|
T{ mismatched f 1 7 T{ name f "" "x" "" } T{ name f "" "y" "" } }
|
||||||
"<x></y>" xml-error-test
|
"<x></y>" xml-error-test
|
||||||
T{ unclosed f 1 4 V{ T{ name f "" "x" "" } } } "<x>" xml-error-test
|
T{ unclosed f 1 3 V{ T{ name f "" "x" "" } } } "<x>" xml-error-test
|
||||||
T{ nonexist-ns f 1 5 "x" } "<x:y/>" xml-error-test
|
T{ nonexist-ns f 1 5 "x" } "<x:y/>" xml-error-test
|
||||||
T{ unopened f 1 5 } "</x>" xml-error-test
|
T{ unopened f 1 4 } "</x>" xml-error-test
|
||||||
T{ not-yes/no f 1 41 "maybe" }
|
T{ not-yes/no f 1 41 "maybe" }
|
||||||
"<?xml version='1.0' standalone='maybe'?><x/>" xml-error-test
|
"<?xml version='1.0' standalone='maybe'?><x/>" xml-error-test
|
||||||
T{ extra-attrs f 1 32 V{ T{ name f "" "foo" f } }
|
T{ extra-attrs f 1 32 V{ T{ name f "" "foo" f } }
|
||||||
|
@ -19,13 +19,13 @@ T{ bad-version f 1 28 "5 million" }
|
||||||
"<?xml version='5 million'?><x/>" xml-error-test
|
"<?xml version='5 million'?><x/>" xml-error-test
|
||||||
T{ notags f } "" xml-error-test
|
T{ notags f } "" xml-error-test
|
||||||
T{ multitags } "<x/><y/>" xml-error-test
|
T{ multitags } "<x/><y/>" xml-error-test
|
||||||
T{ bad-prolog f 1 26 T{ prolog f "1.0" "UTF-8" f } }
|
T{ bad-prolog f 1 25 T{ prolog f "1.0" "UTF-8" f } }
|
||||||
"<x/><?xml version='1.0'?>" xml-error-test
|
"<x/><?xml version='1.0'?>" xml-error-test
|
||||||
T{ capitalized-prolog f 1 6 "XmL" } "<?XmL version='1.0'?><x/>"
|
T{ capitalized-prolog f 1 6 "XmL" } "<?XmL version='1.0'?><x/>"
|
||||||
xml-error-test
|
xml-error-test
|
||||||
T{ pre/post-content f "x" t } "x<y/>" xml-error-test
|
T{ pre/post-content f "x" t } "x<y/>" xml-error-test
|
||||||
T{ versionless-prolog f 1 8 } "<?xml?><x/>" xml-error-test
|
T{ versionless-prolog f 1 8 } "<?xml?><x/>" xml-error-test
|
||||||
T{ unclosed-quote f 1 13 } "<x value='/>" xml-error-test
|
T{ unclosed-quote f 1 12 } "<x value='/>" xml-error-test
|
||||||
T{ bad-name f 1 3 "-" } "<-/>" xml-error-test
|
T{ bad-name f 1 3 "-" } "<-/>" xml-error-test
|
||||||
T{ quoteless-attr f 1 12 } "<x value=<->/>" xml-error-test
|
T{ quoteless-attr f 1 12 } "<x value=<->/>" xml-error-test
|
||||||
T{ quoteless-attr f 1 10 } "<x value=3/>" xml-error-test
|
T{ quoteless-attr f 1 10 } "<x value=3/>" xml-error-test
|
||||||
|
@ -37,6 +37,6 @@ T{ bad-cdata f 1 7 } "<x/><![CDATA[]]>" xml-error-test
|
||||||
T{ pre/post-content f "&" t } " <x/>" xml-error-test
|
T{ pre/post-content f "&" t } " <x/>" xml-error-test
|
||||||
T{ bad-doctype f 1 17 "a" } "<!DOCTYPE foo [ a ]><x/>" xml-error-test
|
T{ bad-doctype f 1 17 "a" } "<!DOCTYPE foo [ a ]><x/>" xml-error-test
|
||||||
T{ bad-doctype f 1 22 T{ opener { name T{ name f "" "foo" "" } } { attrs T{ attrs } } } } "<!DOCTYPE foo [ <foo> ]><x/>" xml-error-test
|
T{ bad-doctype f 1 22 T{ opener { name T{ name f "" "foo" "" } } { attrs T{ attrs } } } } "<!DOCTYPE foo [ <foo> ]><x/>" xml-error-test
|
||||||
T{ disallowed-char f 1 3 1 } "<x>\u000001</x>" xml-error-test
|
T{ disallowed-char f 1 4 1 } "<x>\u000001</x>" xml-error-test
|
||||||
T{ missing-close f 1 9 } "<!-- foo" xml-error-test
|
T{ missing-close f 1 8 } "<!-- foo" xml-error-test
|
||||||
T{ misplaced-directive f 1 9 "ENTITY" } "<!ENTITY foo 'bar'><x/>" xml-error-test
|
T{ misplaced-directive f 1 9 "ENTITY" } "<!ENTITY foo 'bar'><x/>" xml-error-test
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
! Copyright (C) 2005, 2009 Daniel Ehrenberg
|
! Copyright (C) 2005, 2009 Daniel Ehrenberg
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors kernel namespaces io ;
|
USING: accessors kernel namespaces io math ;
|
||||||
IN: xml.state
|
IN: xml.state
|
||||||
|
|
||||||
TUPLE: spot char line column next check version-1.0? ;
|
TUPLE: spot
|
||||||
|
char { line fixnum } { column fixnum }
|
||||||
|
next check version-1.0? ;
|
||||||
|
|
||||||
C: <spot> spot
|
C: <spot> spot
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ IN: xml.test.state
|
||||||
1string take-to ;
|
1string take-to ;
|
||||||
|
|
||||||
[ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test
|
[ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test
|
||||||
[ 2 4 ] [ "12\n123" [ take-rest drop get-line get-column ] string-parse ] unit-test
|
[ 2 3 ] [ "12\n123" [ take-rest drop get-line get-column ] string-parse ] unit-test
|
||||||
[ "hi" " how are you?" ] [ "hi how are you?" [ [ get-char blank? ] take-until take-rest ] string-parse ] unit-test
|
[ "hi" " how are you?" ] [ "hi how are you?" [ [ get-char blank? ] take-until take-rest ] string-parse ] unit-test
|
||||||
[ "foo" ";bar" ] [ "foo;bar" [ CHAR: ; take-char take-rest ] string-parse ] unit-test
|
[ "foo" ";bar" ] [ "foo;bar" [ CHAR: ; take-char take-rest ] string-parse ] unit-test
|
||||||
[ "foo " " bar" ] [ "foo and bar" [ "and" take-string take-rest ] string-parse ] unit-test
|
[ "foo " " bar" ] [ "foo and bar" [ "and" take-string take-rest ] string-parse ] unit-test
|
||||||
|
|
|
@ -3,111 +3,131 @@
|
||||||
USING: namespaces xml.state kernel sequences accessors
|
USING: namespaces xml.state kernel sequences accessors
|
||||||
xml.char-classes xml.errors math io sbufs fry strings ascii
|
xml.char-classes xml.errors math io sbufs fry strings ascii
|
||||||
circular xml.entities assocs make splitting math.parser
|
circular xml.entities assocs make splitting math.parser
|
||||||
locals combinators arrays ;
|
locals combinators arrays hints ;
|
||||||
IN: xml.tokenize
|
IN: xml.tokenize
|
||||||
|
|
||||||
: assure-good-char ( ch -- ch )
|
|
||||||
[
|
|
||||||
version-1.0? over text? not get-check and
|
|
||||||
[ disallowed-char ] when
|
|
||||||
] [ f ] if* ;
|
|
||||||
|
|
||||||
! * Basic utility words
|
! * Basic utility words
|
||||||
|
|
||||||
: record ( char -- )
|
: assure-good-char ( spot ch -- )
|
||||||
CHAR: \n =
|
[
|
||||||
[ 0 get-line 1+ set-line ] [ get-column 1+ ] if
|
swap
|
||||||
set-column ;
|
[ version-1.0?>> over text? not ]
|
||||||
|
[ check>> ] bi and [
|
||||||
|
spot get [ 1+ ] change-column drop
|
||||||
|
disallowed-char
|
||||||
|
] [ drop ] if
|
||||||
|
] [ drop ] if* ;
|
||||||
|
|
||||||
! (next) normalizes \r\n and \r
|
HINTS: assure-good-char { spot fixnum } ;
|
||||||
: (next) ( -- char )
|
|
||||||
get-next read1
|
: record ( spot char -- spot )
|
||||||
2dup swap CHAR: \r = [
|
over char>> [
|
||||||
CHAR: \n =
|
CHAR: \n =
|
||||||
[ nip read1 ] [ nip CHAR: \n swap ] if
|
[ [ 1+ ] change-line -1 ] [ dup column>> 1+ ] if
|
||||||
] [ drop ] if
|
>>column
|
||||||
set-next dup set-char assure-good-char ;
|
] [ drop ] if ;
|
||||||
|
|
||||||
|
HINTS: record { spot fixnum } ;
|
||||||
|
|
||||||
|
:: (next) ( spot -- spot char )
|
||||||
|
spot next>> :> old-next
|
||||||
|
read1 :> new-next
|
||||||
|
old-next CHAR: \r = [
|
||||||
|
spot CHAR: \n >>char
|
||||||
|
new-next CHAR: \n =
|
||||||
|
[ read1 >>next ]
|
||||||
|
[ new-next >>next ] if
|
||||||
|
] [ spot old-next >>char new-next >>next ] if
|
||||||
|
spot next>> ; inline
|
||||||
|
|
||||||
|
: next* ( spot -- )
|
||||||
|
dup char>> [ unexpected-end ] unless
|
||||||
|
(next) [ record ] keep assure-good-char ;
|
||||||
|
|
||||||
|
HINTS: next* { spot } ;
|
||||||
|
|
||||||
: next ( -- )
|
: next ( -- )
|
||||||
#! Increment spot.
|
spot get next* ;
|
||||||
get-char [ unexpected-end ] unless (next) record ;
|
|
||||||
|
|
||||||
: init-parser ( -- )
|
: init-parser ( -- )
|
||||||
0 1 0 f f t <spot> spot set
|
0 1 0 0 f t <spot> spot set
|
||||||
read1 set-next next ;
|
read1 set-next next ;
|
||||||
|
|
||||||
: with-state ( stream quot -- )
|
: with-state ( stream quot -- )
|
||||||
! with-input-stream implicitly creates a new scope which we use
|
! with-input-stream implicitly creates a new scope which we use
|
||||||
swap [ init-parser call ] with-input-stream ; inline
|
swap [ init-parser call ] with-input-stream ; inline
|
||||||
|
|
||||||
|
:: (skip-until) ( quot: ( -- ? ) spot -- )
|
||||||
|
spot char>> [
|
||||||
|
quot call [
|
||||||
|
spot next* quot spot (skip-until)
|
||||||
|
] unless
|
||||||
|
] when ; inline recursive
|
||||||
|
|
||||||
: skip-until ( quot: ( -- ? ) -- )
|
: skip-until ( quot: ( -- ? ) -- )
|
||||||
get-char [
|
spot get (skip-until) ; inline
|
||||||
[ call ] keep swap [ drop ] [
|
|
||||||
next skip-until
|
|
||||||
] if
|
|
||||||
] [ drop ] if ; inline recursive
|
|
||||||
|
|
||||||
: take-until ( quot -- string )
|
: take-until ( quot -- string )
|
||||||
#! Take the substring of a string starting at spot
|
#! Take the substring of a string starting at spot
|
||||||
#! from code until the quotation given is true and
|
#! from code until the quotation given is true and
|
||||||
#! advance spot to after the substring.
|
#! advance spot to after the substring.
|
||||||
10 <sbuf> [
|
10 <sbuf> [
|
||||||
'[ @ [ t ] [ get-char _ push f ] if ] skip-until
|
spot get swap
|
||||||
|
'[ @ [ t ] [ _ char>> _ push f ] if ] skip-until
|
||||||
] keep >string ; inline
|
] keep >string ; inline
|
||||||
|
|
||||||
: take-to ( seq -- string )
|
: take-to ( seq -- string )
|
||||||
'[ get-char _ member? ] take-until ;
|
spot get swap '[ _ char>> _ member? ] take-until ;
|
||||||
|
|
||||||
: pass-blank ( -- )
|
: pass-blank ( -- )
|
||||||
#! Advance code past any whitespace, including newlines
|
#! Advance code past any whitespace, including newlines
|
||||||
[ get-char blank? not ] skip-until ;
|
spot get '[ _ char>> blank? not ] skip-until ;
|
||||||
|
|
||||||
: string-matches? ( string circular -- ? )
|
: string-matches? ( string circular spot -- ? )
|
||||||
get-char over push-circular
|
char>> over push-circular sequence= ;
|
||||||
sequence= ;
|
|
||||||
|
|
||||||
: take-string ( match -- string )
|
: take-string ( match -- string )
|
||||||
dup length <circular-string>
|
dup length <circular-string>
|
||||||
[ 2dup string-matches? ] take-until nip
|
spot get '[ 2dup _ string-matches? ] take-until nip
|
||||||
dup length rot length 1- - head
|
dup length rot length 1- - head
|
||||||
get-char [ missing-close ] unless next ;
|
get-char [ missing-close ] unless next ;
|
||||||
|
|
||||||
: expect ( string -- )
|
: expect ( string -- )
|
||||||
dup [ get-char next ] replicate 2dup =
|
dup spot get '[ _ [ char>> ] keep next* ] replicate
|
||||||
[ 2drop ] [ expected ] if ;
|
2dup = [ 2drop ] [ expected ] if ;
|
||||||
|
|
||||||
! Suddenly XML-specific
|
! Suddenly XML-specific
|
||||||
|
|
||||||
: parse-named-entity ( string -- )
|
: parse-named-entity ( accum string -- )
|
||||||
dup entities at [ , ] [
|
dup entities at [ swap push ] [
|
||||||
dup extra-entities get at
|
dup extra-entities get at
|
||||||
[ % ] [ no-entity ] ?if
|
[ swap push-all ] [ no-entity ] ?if
|
||||||
] ?if ;
|
] ?if ;
|
||||||
|
|
||||||
: take-; ( -- string )
|
: take-; ( -- string )
|
||||||
next ";" take-to next ;
|
next ";" take-to next ;
|
||||||
|
|
||||||
: parse-entity ( -- )
|
: parse-entity ( accum -- )
|
||||||
take-; "#" ?head [
|
take-; "#" ?head [
|
||||||
"x" ?head 16 10 ? base> ,
|
"x" ?head 16 10 ? base> swap push
|
||||||
] [ parse-named-entity ] if ;
|
] [ parse-named-entity ] if ;
|
||||||
|
|
||||||
: parse-pe ( -- )
|
: parse-pe ( accum -- )
|
||||||
take-; dup pe-table get at
|
take-; dup pe-table get at
|
||||||
[ % ] [ no-entity ] ?if ;
|
[ swap push-all ] [ no-entity ] ?if ;
|
||||||
|
|
||||||
:: (parse-char) ( quot: ( ch -- ? ) -- )
|
:: (parse-char) ( quot: ( ch -- ? ) accum -- )
|
||||||
get-char :> char
|
get-char :> char
|
||||||
{
|
{
|
||||||
{ [ char not ] [ ] }
|
{ [ char not ] [ ] }
|
||||||
{ [ char quot call ] [ next ] }
|
{ [ char quot call ] [ next ] }
|
||||||
{ [ char CHAR: & = ] [ parse-entity quot (parse-char) ] }
|
{ [ char CHAR: & = ] [ accum parse-entity quot accum (parse-char) ] }
|
||||||
{ [ in-dtd? get char CHAR: % = and ] [ parse-pe quot (parse-char) ] }
|
{ [ in-dtd? get char CHAR: % = and ] [ accum parse-pe quot accum (parse-char) ] }
|
||||||
[ char , next quot (parse-char) ]
|
[ char accum push next quot accum (parse-char) ]
|
||||||
} cond ; inline recursive
|
} cond ; inline recursive
|
||||||
|
|
||||||
: parse-char ( quot: ( ch -- ? ) -- seq )
|
: parse-char ( quot: ( ch -- ? ) -- seq )
|
||||||
[ (parse-char) ] "" make ; inline
|
1024 <sbuf> [ (parse-char) ] keep >string ; inline
|
||||||
|
|
||||||
: assure-no-]]> ( circular -- )
|
: assure-no-]]> ( circular -- )
|
||||||
"]]>" sequence= [ text-w/]]> ] when ;
|
"]]>" sequence= [ text-w/]]> ] when ;
|
||||||
|
|
Loading…
Reference in New Issue