| 
									
										
										
										
											2008-09-25 03:02:47 -04:00
										 |  |  | ! Copyright (C) 2008 Doug Coleman. | 
					
						
							|  |  |  | ! See http://factorcode.org/license.txt for BSD license. | 
					
						
							| 
									
										
										
										
											2009-11-05 18:12:10 -05:00
										 |  |  | USING: accessors arrays hashtables sequences.parser | 
					
						
							| 
									
										
										
										
											2009-05-20 16:50:01 -04:00
										 |  |  | html.parser.utils kernel namespaces sequences math | 
					
						
							| 
									
										
										
										
											2009-03-31 17:05:11 -04:00
										 |  |  | unicode.case unicode.categories combinators.short-circuit | 
					
						
							| 
									
										
										
										
											2009-04-01 12:48:44 -04:00
										 |  |  | quoting fry ;
 | 
					
						
							| 
									
										
										
										
											2007-12-04 15:14:33 -05:00
										 |  |  | IN: html.parser | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-17 11:38:34 -04:00
										 |  |  | TUPLE: tag name attributes text closing? ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-17 11:38:34 -04:00
										 |  |  | SINGLETON: text | 
					
						
							|  |  |  | SINGLETON: dtd | 
					
						
							|  |  |  | SINGLETON: comment | 
					
						
							| 
									
										
										
										
											2009-04-01 13:48:51 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | <PRIVATE
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | SYMBOL: tagstack | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : push-tag ( tag -- )
 | 
					
						
							|  |  |  |     tagstack get push ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | : closing-tag? ( string -- ? )
 | 
					
						
							| 
									
										
										
										
											2008-08-17 11:38:34 -04:00
										 |  |  |     [ f ] | 
					
						
							| 
									
										
										
										
											2009-05-25 17:38:33 -04:00
										 |  |  |     [ { [ first CHAR: / = ] [ last CHAR: / = ] } 1|| ] if-empty ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : <tag> ( name attributes closing? -- tag )
 | 
					
						
							| 
									
										
										
										
											2008-08-14 00:09:43 -04:00
										 |  |  |     tag new
 | 
					
						
							|  |  |  |         swap >>closing? | 
					
						
							|  |  |  |         swap >>attributes | 
					
						
							|  |  |  |         swap >>name ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-08-17 11:38:34 -04:00
										 |  |  | : make-tag ( string attribs -- tag )
 | 
					
						
							| 
									
										
										
										
											2008-11-29 13:18:28 -05:00
										 |  |  |     [ [ closing-tag? ] keep "/" trim1 ] dip rot <tag> ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-01 12:43:30 -04:00
										 |  |  | : new-tag ( text name -- tag )
 | 
					
						
							| 
									
										
										
										
											2008-08-17 11:38:34 -04:00
										 |  |  |     tag new
 | 
					
						
							| 
									
										
										
										
											2009-03-31 17:05:11 -04:00
										 |  |  |         swap >>name | 
					
						
							|  |  |  |         swap >>text ; inline
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : (read-quote) ( sequence-parser ch -- string )
 | 
					
						
							| 
									
										
										
										
											2009-04-01 13:52:02 -04:00
										 |  |  |     '[ [ current _ = ] take-until ] [ advance drop ] bi ;
 | 
					
						
							| 
									
										
										
										
											2009-04-01 12:48:44 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : read-single-quote ( sequence-parser -- string )
 | 
					
						
							| 
									
										
										
										
											2009-04-01 12:48:44 -04:00
										 |  |  |     CHAR: ' (read-quote) ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : read-double-quote ( sequence-parser -- string )
 | 
					
						
							| 
									
										
										
										
											2009-04-01 12:48:44 -04:00
										 |  |  |     CHAR: " (read-quote) ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : read-quote ( sequence-parser -- string )
 | 
					
						
							| 
									
										
										
										
											2009-03-31 17:05:11 -04:00
										 |  |  |     dup get+increment CHAR: ' =
 | 
					
						
							|  |  |  |     [ read-single-quote ] [ read-double-quote ] if ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : read-key ( sequence-parser -- string )
 | 
					
						
							| 
									
										
										
										
											2009-03-31 17:05:11 -04:00
										 |  |  |     skip-whitespace | 
					
						
							| 
									
										
										
										
											2009-04-01 04:13:38 -04:00
										 |  |  |     [ current { [ CHAR: = = ] [ blank? ] } 1|| ] take-until ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : read-token ( sequence-parser -- string )
 | 
					
						
							| 
									
										
										
										
											2009-04-01 04:13:38 -04:00
										 |  |  |     [ current blank? ] take-until ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : read-value ( sequence-parser -- string )
 | 
					
						
							| 
									
										
										
										
											2009-03-31 17:05:11 -04:00
										 |  |  |     skip-whitespace | 
					
						
							| 
									
										
										
										
											2009-04-01 03:33:38 -04:00
										 |  |  |     dup current quote? [ read-quote ] [ read-token ] if
 | 
					
						
							| 
									
										
										
										
											2008-08-17 11:38:34 -04:00
										 |  |  |     [ blank? ] trim ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : read-comment ( sequence-parser -- )
 | 
					
						
							| 
									
										
										
										
											2009-05-20 16:50:01 -04:00
										 |  |  |     [ "-->" take-until-sequence comment new-tag push-tag ] | 
					
						
							|  |  |  |     [ '[ _ advance drop ] 3 swap times ] bi ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : read-dtd ( sequence-parser -- )
 | 
					
						
							| 
									
										
										
										
											2009-05-20 16:50:01 -04:00
										 |  |  |     [ ">" take-until-sequence dtd new-tag push-tag ] | 
					
						
							|  |  |  |     [ advance drop ] bi ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : read-bang ( sequence-parser -- )
 | 
					
						
							| 
									
										
										
										
											2009-04-01 13:52:02 -04:00
										 |  |  |     advance dup { [ current CHAR: - = ] [ peek-next CHAR: - = ] } 1&& | 
					
						
							|  |  |  |     [ advance advance read-comment ] [ read-dtd ] if ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : read-tag ( sequence-parser -- string )
 | 
					
						
							| 
									
										
										
										
											2009-04-01 04:13:38 -04:00
										 |  |  |     [ [ current "><" member? ] take-until ] | 
					
						
							| 
									
										
										
										
											2009-04-01 13:52:02 -04:00
										 |  |  |     [ dup current CHAR: < = [ advance ] unless drop ] bi ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : read-until-< ( sequence-parser -- string )
 | 
					
						
							| 
									
										
										
										
											2009-04-01 04:13:38 -04:00
										 |  |  |     [ current CHAR: < = ] take-until ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : parse-text ( sequence-parser -- )
 | 
					
						
							| 
									
										
										
										
											2009-04-01 12:43:30 -04:00
										 |  |  |     read-until-< [ text new-tag push-tag ] unless-empty ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : parse-key/value ( sequence-parser -- key value )
 | 
					
						
							| 
									
										
										
										
											2009-04-01 13:44:38 -04:00
										 |  |  |     [ read-key >lower ] | 
					
						
							|  |  |  |     [ skip-whitespace "=" take-sequence ] | 
					
						
							| 
									
										
										
										
											2009-04-01 14:42:38 -04:00
										 |  |  |     [ swap [ read-value ] [ drop dup ] if ] tri ;
 | 
					
						
							| 
									
										
										
										
											2009-04-01 13:44:38 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : (parse-attributes) ( sequence-parser -- )
 | 
					
						
							| 
									
										
										
										
											2009-03-31 17:05:11 -04:00
										 |  |  |     skip-whitespace | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  |     dup sequence-parse-end? [ | 
					
						
							| 
									
										
										
										
											2009-03-31 17:05:11 -04:00
										 |  |  |         drop
 | 
					
						
							|  |  |  |     ] [ | 
					
						
							| 
									
										
										
										
											2009-04-01 13:44:38 -04:00
										 |  |  |         [ parse-key/value swap set ] [ (parse-attributes) ] bi
 | 
					
						
							| 
									
										
										
										
											2009-03-31 17:05:11 -04:00
										 |  |  |     ] if ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : parse-attributes ( sequence-parser -- hashtable )
 | 
					
						
							| 
									
										
										
										
											2009-04-01 12:43:30 -04:00
										 |  |  |     [ (parse-attributes) ] H{ } make-assoc ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-06-14 01:41:48 -04:00
										 |  |  | : (parse-tag) ( string -- string' hashtable )
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  |     [ | 
					
						
							| 
									
										
										
										
											2009-03-31 17:05:11 -04:00
										 |  |  |         [ read-token >lower ] [ parse-attributes ] bi
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  |     ] parse-sequence ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : read-< ( sequence-parser -- string/f )
 | 
					
						
							| 
									
										
										
										
											2009-04-01 13:52:02 -04:00
										 |  |  |     advance dup current [ | 
					
						
							| 
									
										
										
										
											2009-03-31 17:05:11 -04:00
										 |  |  |         CHAR: ! = [ read-bang f ] [ read-tag ] if
 | 
					
						
							|  |  |  |     ] [ | 
					
						
							|  |  |  |         drop f
 | 
					
						
							|  |  |  |     ] if* ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : parse-tag ( sequence-parser -- )
 | 
					
						
							| 
									
										
										
										
											2009-03-31 17:05:11 -04:00
										 |  |  |     read-< [ (parse-tag) make-tag push-tag ] unless-empty ;
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  | : (parse-html) ( sequence-parser -- )
 | 
					
						
							| 
									
										
										
										
											2009-04-01 03:33:38 -04:00
										 |  |  |     dup peek-next [ | 
					
						
							| 
									
										
										
										
											2009-03-31 17:05:11 -04:00
										 |  |  |         [ parse-text ] [ parse-tag ] [ (parse-html) ] tri
 | 
					
						
							|  |  |  |     ] [ drop ] if ;
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | : tag-parse ( quot -- vector )
 | 
					
						
							| 
									
										
										
										
											2009-04-11 13:11:00 -04:00
										 |  |  |     V{ } clone tagstack [ parse-sequence ] with-variable ; inline
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-04-01 13:48:51 -04:00
										 |  |  | PRIVATE>
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-09-20 18:09:08 -04:00
										 |  |  | : parse-html ( string -- vector )
 | 
					
						
							| 
									
										
										
										
											2008-08-17 11:38:34 -04:00
										 |  |  |     [ (parse-html) tagstack get ] tag-parse ;
 |