| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  | ! Copyright (C) 2005, 2009 Daniel Ehrenberg | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | ! See http://factorcode.org/license.txt for BSD license. | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  | USING: namespaces xml.state kernel sequences accessors | 
					
						
							|  |  |  | xml.char-classes xml.errors math io sbufs fry strings ascii | 
					
						
							| 
									
										
										
										
											2011-09-30 15:47:38 -04:00
										 |  |  | xml.entities assocs splitting math.parser | 
					
						
							| 
									
										
										
										
											2012-07-12 22:06:37 -04:00
										 |  |  | locals combinators combinators.short-circuit arrays hints ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | IN: xml.tokenize | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-29 22:41:08 -05:00
										 |  |  | ! * Basic utility words | 
					
						
							|  |  |  | 
 | 
					
						
							:: assure-good-char ( spot ch -- )
							    ! Hands ch to disallowed-char when spot has checking
							    ! switched on (check>>) and ch fails text? for the
							    ! version-1.0?>> flag of spot. Does nothing when ch is f.
							    ch [
							        spot version-1.0?>> ch text? not
							        spot check>> and [
							            ! Bump the column before reporting the character.
							            spot [ 1 + ] change-column drop
							            ch disallowed-char
							        ] when
							    ] when ;

							HINTS: assure-good-char { spot fixnum } ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							:: record ( spot char -- spot )
							    ! Updates the line and column counters of spot for char.
							    ! Nothing is touched once spot has no current character
							    ! (char>> is f).
							    spot char>> [
							        char CHAR: \n eq? [
							            ! Newline: move to the next line, column becomes -1.
							            spot [ 1 + ] change-line -1 >>column drop
							        ] [
							            spot [ 1 + ] change-column drop
							        ] if
							    ] when
							    spot ;

							HINTS: record { spot fixnum } ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-29 22:41:08 -05:00
										 :: (next) ( spot -- spot char )
										     ! Moves the lookahead (next>>) into char>> and reads one more
										     ! character from the stream of spot into next>>. A "\r" in
										     ! the lookahead is stored as "\n", and a "\n" directly after
										     ! it is dropped. The char left on the stack is the new next>>.
										     spot next>> :> pending
										     spot stream>> stream-read1 :> fresh
										     pending CHAR: \r eq? [
										         spot CHAR: \n >>char
										         ! Swallow the "\n" of a "\r\n" pair by reading once more.
										         fresh CHAR: \n eq?
										         [ spot stream>> stream-read1 ] [ fresh ] if >>next
										     ] [
										         spot pending >>char fresh >>next
										     ] if
										     dup next>> ; inline
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-29 22:41:08 -05:00
										 : next* ( spot -- )
										     ! Advances spot by one character. Calls unexpected-end first
										     ! when spot has no current character, then passes the value
										     ! returned by (next) to record and to assure-good-char.
										     dup char>> [ unexpected-end ] unless
										     (next) dup [ record ] dip assure-good-char ;

										 HINTS: next* { spot } ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 |  |  | 
 | 
					
						
							: next ( -- )
							    ! Advances the spot stored in the spot variable by one
							    ! character.
							    spot get next* ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 : init-parser ( -- )
										     ! Builds a fresh spot that reads from the current
										     ! input-stream, loads its lookahead with the first character,
										     ! stores it in the spot variable and advances once.
										     ! NOTE(review): the meaning of the seven <spot> arguments is
										     ! fixed by the constructor, which is defined outside this
										     ! file -- confirm there.
										     0 1 0 0 f t f <spot>
										     input-stream get >>stream
										     read1 >>next
										     spot set
										     next ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 |  |  | 
 | 
					
						
							: with-state ( stream quot -- )
							    ! Runs quot with stream as the input stream, after
							    ! init-parser has set up the parser state.
							    ! with-input-stream implicitly creates a new scope, and the
							    ! spot variable set by init-parser lives in that scope.
							    '[ init-parser @ ] with-input-stream ; inline
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-09-30 15:47:38 -04:00
										 :: (skip-until) ( ... quot: ( ... char -- ... ? ) spot -- ... )
										     ! Advances spot until quot answers true for the current
										     ! character, or until spot has no current character. The
										     ! character that satisfies quot is not consumed.
										     [
										         spot char>>
										         [ quot call [ f ] [ spot next* t ] if ]
										         [ f ] if*
										     ] loop ; inline
 | 
					
						
							| 
									
										
										
										
											2009-01-29 22:41:08 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-09-30 15:47:38 -04:00
										 : skip-until ( ... quot: ( ... char -- ... ? ) -- ... )
										     ! Runs (skip-until) with quot on the spot stored in the spot
										     ! variable.
										     spot get (skip-until) ; inline
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-09-30 15:47:38 -04:00
										 :: take-until ( ... quot: ( ... char -- ... ? ) -- ... string )
										     ! Collects characters from the current spot until quot
										     ! answers true or no character is left. The character that
										     ! stopped the scan is neither collected nor consumed.
										     ! Returns the collected characters as a string.
										     10 <sbuf> :> buf
										     [ quot keep over [ drop ] [ buf push ] if ] skip-until
										     buf "" like ; inline
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-25 22:06:45 -05:00
										 : take-to ( seq -- string )
										     ! Collects characters up to, but not including, the first
										     ! one that is a member of seq.
										     [ member? ] curry take-until ; inline
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 |  |  | 
 | 
					
						
							: pass-blank ( -- )
							    ! Skips whitespace (as judged by blank?), newlines included,
							    ! and stops on the first non-blank character.
							    [ blank? not ] skip-until ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-10-16 16:01:58 -04:00
										 :: next-matching ( pos ch str -- pos' )
										     ! One step of matching str character by character: when ch
										     ! is the element of str at index pos the answer is pos + 1,
										     ! otherwise matching restarts and the answer is 0.
										     ch pos str nth eq? [ pos 1 + ] [ 0 ] if ; inline
 | 
					
						
							| 
									
										
										
										
											2011-10-16 16:01:58 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-09-30 15:47:38 -04:00
										 :: string-matcher ( str -- quot: ( pos char -- pos ? ) )
										     ! Builds a quotation that feeds each character to
										     ! next-matching against str and answers true once the
										     ! running position is greater than the last index of str.
										     str length 1 - :> last-index
										     [ str next-matching dup last-index > ] ; inline
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-02-07 00:12:57 -05:00
										 :: (take-string) ( match spot -- sbuf matched? )
										     ! Copies characters from spot into a new sbuf until the
										     ! buffer ends with match (matched? is t) or spot has no
										     ! current character left (matched? is f). The returned sbuf
										     ! still contains match at its end when matched? is t.
										     10 <sbuf> :> buf
										     f [
										         ! Discard the flag left by the previous round.
										         drop
										         spot char>> [
										             buf push
										             spot next*
										             buf match tail? dup not
										         ] [ f f ] if*
										     ] loop
										     buf swap ; inline
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 :: take-string ( match -- string )
										     ! Reads from the current spot up to and including match and
										     ! returns what came before match as a string. Calls
										     ! missing-close when match was not found.
										     match spot get (take-string) [ missing-close ] unless :> buf
										     ! Cut the trailing match off the buffer.
										     buf length match length - buf shorten
										     buf "" like ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-23 16:29:28 -05:00
										 :: expect ( string -- )
										     ! Reads as many characters from the current spot as string
										     ! has, and calls expected with string and the characters
										     ! actually read when the two differ.
										     spot get :> here
										     string length [ here char>> here next* ] "" replicate-as :> seen
										     string seen = [ string seen expected ] unless ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-25 22:06:45 -05:00
										 |  |  | ! Suddenly XML-specific | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-29 22:41:08 -05:00
										 :: parse-named-entity ( accum string -- )
										     ! Looks string up in entities first; a hit is pushed onto
										     ! accum as a single element. Otherwise it is looked up in
										     ! the extra-entities variable and a hit is appended with
										     ! push-all. When neither has it, no-entity is called with
										     ! string.
										     string entities at [ accum push ] [
										         string extra-entities get at
										         [ accum push-all ] [ string no-entity ] if*
										     ] if* ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-25 22:06:45 -05:00
										 : take-; ( -- string )
										     ! Steps over the current character, collects everything up
										     ! to the next ";" and then steps over that ";" as well.
										     next ";" take-to next ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-29 22:41:08 -05:00
										 :: parse-entity ( accum -- )
										     ! Reads one entity reference with take-; and appends its
										     ! expansion to accum.
										     !   "#" digits  - decimal character reference
										     !   "#x" digits - hexadecimal character reference
										     !   other names - handled by parse-named-entity
										     ! base> answers f for malformed digits and can answer a ratio
										     ! or float for text such as "1/2" or "1.5". Those values, and
										     ! negative numbers, must not be pushed onto accum, so they are
										     ! reported through no-entity with the reference text instead.
										     ! No upper bound on the code point is checked here.
										     take-; :> name
										     name "#" ?head [
										         "x" ?head 16 10 ? base>
										         dup { [ integer? ] [ 0 >= ] } 1&&
										         [ accum push ] [ drop name no-entity ] if
										     ] [ accum swap parse-named-entity ] if ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-29 22:41:08 -05:00
										 :: parse-pe ( accum -- )
										     ! Reads a reference name with take-; and appends its value
										     ! from the pe-table variable to accum. Calls no-entity with
										     ! the name when the table has no entry for it.
										     take-; :> name
										     name pe-table get at
										     [ accum push-all ] [ name no-entity ] if* ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-29 23:17:55 -05:00
										 :: (parse-char) ( quot: ( ch -- ? ) accum spot -- )
										     ! Appends characters from spot to accum until quot answers
										     ! true or spot has no current character. "&" hands over to
										     ! parse-entity; "%" hands over to parse-pe, but only while
										     ! the in-dtd? variable is set. The character accepted by
										     ! quot is consumed but not appended.
										     spot char>> :> ch
										     {
										         { [ ch not ] [ ] }
										         { [ ch quot call ] [ spot next* ] }
										         { [ ch CHAR: & eq? ] [
										             accum parse-entity
										             quot accum spot (parse-char)
										         ] }
										         { [ ch CHAR: % eq? in-dtd? get and ] [
										             accum parse-pe
										             quot accum spot (parse-char)
										         ] }
										         [
										             ch accum push
										             spot next*
										             quot accum spot (parse-char)
										         ]
										     } cond ; inline recursive
 | 
					
						
							|  |  |  | 
 | 
					
						
							:: parse-char ( quot: ( ch -- ? ) -- seq )
							    ! Runs (parse-char) on the current spot with a fresh buffer
							    ! and returns the buffer contents as a string.
							    512 <sbuf> :> accum
							    quot accum spot get (parse-char)
							    accum "" like ; inline
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-09-30 15:47:38 -04:00
										 : assure-no-]]> ( pos char -- pos' )
										     ! Advances the running match of "]]>" by char and calls
										     ! text-w/]]> once all three characters have been matched.
										     "]]>" next-matching dup 3 >= [ text-w/]]> ] when ; inline
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-20 16:37:21 -05:00
										 :: parse-text ( -- string )
										     ! Reads character data up to the next "<". Each character
										     ! goes through assure-no-]]>. While the depth variable is
										     ! zero, any character that is neither blank nor "<" is passed
										     ! to pre/post-content.
										     depth get zero? :> outside?
										     0 :> matched!
										     [| ch |
										         matched ch assure-no-]]> matched!
										         outside? ch blank? not and ch CHAR: < eq? not and
										         [ ch 1string t pre/post-content ] when
										         ch CHAR: < eq?
										     ] parse-char ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-21 19:16:51 -05:00
										 : close ( -- )
										     ! Skips any whitespace and then requires a ">" via expect.
										     pass-blank ">" expect ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 : normalize-quote ( str -- str )
										     ! Replaces every tab, carriage return and newline in str
										     ! with a space. str is modified in place (map!).
										     [ dup "\t\r\n" member? CHAR: \s rot ? ] map! ;
 | 
					
						
							| 
									
										
										
										
											2009-01-21 00:54:33 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-19 23:25:15 -05:00
										 :: (parse-quote) ( <-disallowed? ch -- string )
										     ! Reads a quoted value up to the closing quote character ch
										     ! and returns it after normalize-quote. When <-disallowed?
										     ! is true a "<" inside the value calls attr-w/<. Calls
										     ! unclosed-quote when get-char answers f afterwards.
										     [| c |
										         c ch eq? [ t ] [
										             c CHAR: < eq? <-disallowed? and
										             [ attr-w/< ] [ f ] if
										         ] if
										     ] parse-char normalize-quote
										     get-char [ unclosed-quote ] unless ; inline
 | 
					
						
							| 
									
										
										
										
											2009-01-15 23:20:24 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-19 23:25:15 -05:00
										 :: parse-quote* ( <-disallowed? -- seq )
										     ! Skips whitespace and reads a value quoted with ' or ".
										     ! Calls quoteless-attr when the next character is neither.
										     pass-blank
										     get-char :> opener
										     opener "'\"" member?
										     [ next <-disallowed? opener (parse-quote) ]
										     [ quoteless-attr ] if ; inline
 | 
					
						
							|  |  |  | 
 | 
					
						
							: parse-quote ( -- seq )
							    ! parse-quote* with the "<" check switched off.
							    f parse-quote* ;
 |