2009-01-21 19:16:51 -05:00
|
|
|
! Copyright (C) 2005, 2009 Daniel Ehrenberg
|
2007-09-20 18:09:08 -04:00
|
|
|
! See http://factorcode.org/license.txt for BSD license.
|
2009-01-21 19:16:51 -05:00
|
|
|
USING: namespaces xml.state kernel sequences accessors
|
|
|
|
xml.char-classes xml.errors math io sbufs fry strings ascii
|
2009-01-29 23:17:55 -05:00
|
|
|
circular xml.entities assocs splitting math.parser
|
2009-01-29 22:41:08 -05:00
|
|
|
locals combinators arrays hints ;
|
2007-09-20 18:09:08 -04:00
|
|
|
IN: xml.tokenize
|
|
|
|
|
2009-01-29 22:41:08 -05:00
|
|
|
! * Basic utility words
|
|
|
|
|
|
|
|
! Validate ch against the character rules selected by spot.
! Nothing happens when ch is f. Otherwise ch is tested with text?
! using the spot's version-1.0? flag; if it fails that test and the
! spot's check slot is true, the column of the spot stored in the
! spot variable is bumped by one and disallowed-char is called on ch.
: assure-good-char ( spot ch -- )
    dup [
        ! spot ch -- spot ch bad?
        2dup [ version-1.0?>> ] dip text? not
        ! spot ch bad? -- ch report?
        rot check>> and
        [ spot get [ 1 + ] change-column drop disallowed-char ]
        [ drop ] if
    ] [ 2drop ] if ;

HINTS: assure-good-char { spot fixnum } ;
|
|
|
|
|
|
|
|
! Update the line/column bookkeeping of spot for char and return spot.
! When the spot's char slot is f nothing is changed. A newline
! increments line and sets column to -1; any other char increments
! column by one.
: record ( spot char -- spot )
    over char>> [
        CHAR: \n = [
            [ 1 + ] change-line -1 >>column
        ] [
            [ 1 + ] change-column
        ] if
    ] [ drop ] if ;

HINTS: record { spot fixnum } ;
|
2009-01-21 00:54:33 -05:00
|
|
|
|
2009-01-29 22:41:08 -05:00
|
|
|
! Advance spot by one character of its stream.
! The previous lookahead (next slot) becomes the current char and a
! freshly read character becomes the new lookahead. A lookahead of CR
! is stored as LF; if the character read right after it is LF, that
! LF is skipped by reading once more.
! Outputs the spot and the new value of its next slot.
:: (next) ( spot -- spot char )
    spot next>> :> pending
    spot stream>> stream-read1 :> lookahead
    pending CHAR: \r = [
        spot CHAR: \n >>char
        lookahead CHAR: \n =
        [ spot stream>> stream-read1 ] [ lookahead ] if
        >>next
    ] [ spot pending >>char lookahead >>next ] if
    dup next>> ; inline
|
2009-01-21 00:54:33 -05:00
|
|
|
|
2009-01-29 22:41:08 -05:00
|
|
|
! Step spot forward by one character.
! Calls unexpected-end first when the spot's char slot is f. Then
! advances with (next), and passes the value (next) returns to both
! record (position bookkeeping) and assure-good-char (validation).
: next* ( spot -- )
    dup char>> [ unexpected-end ] unless
    (next) dup [ record ] dip assure-good-char ;

HINTS: next* { spot } ;
|
2009-01-21 00:54:33 -05:00
|
|
|
|
|
|
|
! Step the spot stored in the spot variable forward by one character.
: next ( -- ) spot get next* ;
|
2009-01-21 00:54:33 -05:00
|
|
|
|
2009-01-21 19:16:51 -05:00
|
|
|
! Set up fresh tokenizer state for the current input stream.
! Builds a spot with <spot> from fixed initial slot values, points its
! stream slot at the current input-stream and stores it in the spot
! variable. Then primes the reader: one character is read with read1
! and handed to set-next, and next is called once.
: init-parser ( -- )
    0 1 0 0 f t f <spot> input-stream get >>stream spot set
    read1 set-next
    next ;
|
|
|
|
|
|
|
|
! Run quot with stream as the input stream, after init-parser has set
! up tokenizer state for it.
: with-state ( stream quot -- )
    ! with-input-stream implicitly creates a new scope which we use
    '[ init-parser @ ] with-input-stream ; inline
|
|
|
|
|
2010-03-09 03:56:07 -05:00
|
|
|
! Advance spot until quot yields true or the spot's char slot is f.
! quot is called before each step, so the character at which it
! becomes true is left as the current character.
:: (skip-until) ( ... quot: ( ... -- ... ? ) spot -- ... )
    spot char>> [
        quot call
        [ spot next* quot spot (skip-until) ] unless
    ] when ; inline recursive
|
|
|
|
|
2010-03-09 03:56:07 -05:00
|
|
|
! (skip-until) applied to the spot stored in the spot variable.
: skip-until ( ... quot: ( ... -- ... ? ) -- ... )
    spot get
    (skip-until) ; inline
|
2009-01-21 00:54:33 -05:00
|
|
|
|
|
|
|
! Collect characters from the current position into a string until
! quot yields true or input runs out. Each character for which quot
! is false is appended and then skipped; the character at which quot
! becomes true is neither appended nor consumed.
: take-until ( quot -- string )
    10 <sbuf> [
        spot get swap
        ! Leave quot's result as the stop flag; on f, record the char.
        '[ @ dup [ _ char>> _ push ] unless ] skip-until
    ] keep >string ; inline
|
|
|
|
|
2009-01-25 22:06:45 -05:00
|
|
|
! Collect characters up to, but not including, the first one that is
! a member of seq.
: take-to ( seq -- string )
    spot get swap
    '[ _ char>> _ member? ] take-until ;
|
2009-01-21 00:54:33 -05:00
|
|
|
|
|
|
|
! Advance past any whitespace, including newlines, stopping at the
! first character for which blank? is false.
: pass-blank ( -- )
    spot get
    '[ _ char>> blank? not ] skip-until ;
|
2009-01-21 00:54:33 -05:00
|
|
|
|
2009-01-29 22:41:08 -05:00
|
|
|
! Push the spot's current char into circular, then report whether
! circular now holds the same elements as string.
:: string-matches? ( string circular spot -- ? )
    spot char>> circular circular-push
    string circular sequence= ;
|
2009-01-21 00:54:33 -05:00
|
|
|
|
|
|
|
! Read up to and including the terminator match, returning the text
! that precedes it.
! A circular buffer as long as match tracks the most recent characters;
! scanning stops when it equals match, i.e. on the last character of
! match. By then all but that last character of match have been
! collected, so they are trimmed from the end of the result, and the
! final next steps over the last character.
! Calls missing-close when input ends before match is found.
: take-string ( match -- string )
    dup length <circular-string>
    spot get '[ 2dup _ string-matches? ] take-until nip
    ! Check for end of input before trimming: if input ended early the
    ! collected text can be shorter than the part of match to strip,
    ! and head would fail with an unrelated bounds error instead of
    ! reporting the missing terminator.
    get-char [ missing-close ] unless
    dup length rot length 1 - - head
    next ;
|
|
|
|
|
2009-01-23 16:29:28 -05:00
|
|
|
! Consume as many characters as string is long and require them to
! equal string; on a mismatch, expected is called with the wanted
! string and the text actually read.
:: expect ( string -- )
    string length
    spot get '[ _ [ char>> ] keep next* ] "" replicate-as :> actual
    string actual = [ string actual expected ] unless ;
|
2009-01-21 00:54:33 -05:00
|
|
|
|
2009-01-25 22:06:45 -05:00
|
|
|
! Suddenly XML-specific
|
|
|
|
|
2009-01-29 22:41:08 -05:00
|
|
|
! Append the expansion of the entity named string to accum.
! The entities table is tried first and its value is pushed as a
! single element; otherwise the table held in the extra-entities
! variable is tried and its value is appended with push-all. A name
! found in neither is passed to no-entity.
:: parse-named-entity ( accum string -- )
    string entities at [ accum push ] [
        string extra-entities get at
        [ accum push-all ] [ string no-entity ] if*
    ] if* ;
|
|
|
|
|
2009-01-25 22:06:45 -05:00
|
|
|
! Skip the current character, collect everything up to the next ";",
! then skip the ";" itself.
: take-; ( -- string )
    next
    ";" take-to
    next ;
|
|
|
|
|
2009-01-29 22:41:08 -05:00
|
|
|
! Read an entity reference body with take-; and append its expansion
! to accum. A body starting with "#" is a numeric reference, parsed
! with base> in base 16 when an "x" follows the "#" and base 10
! otherwise; anything else goes to parse-named-entity.
:: parse-entity ( accum -- )
    take-; "#" ?head [
        "x" ?head 16 10 ? base> accum push
    ] [
        accum swap parse-named-entity
    ] if ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2009-01-29 22:41:08 -05:00
|
|
|
! Read a reference body with take-; and look it up in the table held
! in the pe-table variable. A found value is appended to accum with
! push-all; an unknown name is passed to no-entity.
:: parse-pe ( accum -- )
    take-; :> name
    name pe-table get at
    [ accum push-all ] [ name no-entity ] if* ;
|
2009-01-21 00:54:33 -05:00
|
|
|
|
2009-01-29 23:17:55 -05:00
|
|
|
! Accumulate characters from spot into accum until quot accepts one.
! Cases are tried in order for the current character:
!   - f (no character): stop.
!   - quot yields true: step past the character and stop; it is not
!     added to accum.
!   - "&": hand accum to parse-entity, then continue.
!   - "%" while the in-dtd? variable is true: hand accum to parse-pe,
!     then continue.
!   - anything else: push it onto accum, step forward, continue.
:: (parse-char) ( quot: ( ch -- ? ) accum spot -- )
    spot char>> :> ch
    {
        { [ ch not ] [ ] }
        { [ ch quot call ] [ spot next* ] }
        { [ ch CHAR: & = ] [
            accum parse-entity
            quot accum spot (parse-char)
        ] }
        { [ in-dtd? get ch CHAR: % = and ] [
            accum parse-pe
            quot accum spot (parse-char)
        ] }
        [
            ch accum push
            spot next*
            quot accum spot (parse-char)
        ]
    } cond ; inline recursive
|
|
|
|
|
|
|
|
! Run (parse-char) on the spot stored in the spot variable with a
! fresh 1024-element sbuf as accumulator, and return the accumulated
! text as a string.
: parse-char ( quot: ( ch -- ? ) -- seq )
    1024 <sbuf>
    [ spot get (parse-char) ] keep
    >string ; inline
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2009-01-19 23:25:15 -05:00
|
|
|
! Call text-w/]]> when circular holds exactly the elements of "]]>".
: assure-no-]]> ( circular -- )
    "]]>" sequence=
    [ text-w/]]> ] when ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2009-01-20 16:37:21 -05:00
|
|
|
! Parse text up to the next "<" via parse-char and return it.
! A 3-element circular buffer of the most recent characters is checked
! with assure-no-]]> after every character. When the depth variable is
! zero on entry, every character that is neither blank nor "<" is
! passed to pre/post-content as a one-character string.
:: parse-text ( -- string )
    3 f <array> <circular> :> recent
    depth get zero? :> depth-zero?
    [| ch |
        ch recent circular-push
        recent assure-no-]]>
        depth-zero? [
            ch blank? ch CHAR: < = or
            [ ch 1string t pre/post-content ] unless
        ] when
        ch CHAR: < =
    ] parse-char ;
|
|
|
|
|
2009-01-21 19:16:51 -05:00
|
|
|
! Skip any blanks, then require ">" at the current position.
: close ( -- )
    pass-blank
    ">" expect ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2009-01-21 00:54:33 -05:00
|
|
|
! Return a copy of str in which every tab, carriage return and
! newline is replaced by a space; other characters are unchanged.
: normalize-quote ( str -- str )
    [ dup "\t\r\n" member? CHAR: \s rot ? ] map ;
|
|
|
|
|
2009-01-19 23:25:15 -05:00
|
|
|
! Parse a quoted value terminated by the quote character ch and return
! it after normalize-quote. While scanning, a "<" calls attr-w/< when
! <-disallowed? is true. After the value has been read, unclosed-quote
! is called if there is no current character.
: (parse-quote) ( <-disallowed? ch -- string )
    swap
    '[
        dup _ =
        [ drop t ]
        [ CHAR: < = _ and [ attr-w/< ] [ f ] if ] if
    ] parse-char
    normalize-quote
    get-char [ unclosed-quote ] unless ; inline
|
2009-01-15 23:20:24 -05:00
|
|
|
|
2009-01-19 23:25:15 -05:00
|
|
|
! Skip blanks and parse a quoted value. If the current character is
! ' or ", step past it and read the value terminated by that same
! character; otherwise call quoteless-attr.
: parse-quote* ( <-disallowed? -- seq )
    pass-blank
    get-char dup "'\"" member?
    [ next (parse-quote) ]
    [ quoteless-attr ] if ; inline
|
|
|
|
|
|
|
|
! parse-quote* with the <-disallowed? flag set to f.
: parse-quote ( -- seq ) f parse-quote* ;
|
|
|
|
|