2008-09-25 03:02:47 -04:00
|
|
|
! Copyright (C) 2008 Doug Coleman.
|
|
|
|
! See http://factorcode.org/license.txt for BSD license.
|
2008-08-14 00:09:43 -04:00
|
|
|
USING: accessors arrays html.parser.utils hashtables io kernel
|
2008-09-10 23:11:40 -04:00
|
|
|
namespaces make prettyprint quotations sequences splitting
|
2009-01-21 22:57:44 -05:00
|
|
|
html.parser.state strings unicode.categories unicode.case ;
|
2007-12-04 15:14:33 -05:00
|
|
|
IN: html.parser
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
TUPLE: tag name attributes text closing? ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
SINGLETON: text
|
|
|
|
SINGLETON: dtd
|
|
|
|
SINGLETON: comment
|
2007-09-20 18:09:08 -04:00
|
|
|
SYMBOL: tagstack
|
|
|
|
|
|
|
|
: push-tag ( tag -- )
|
|
|
|
tagstack get push ;
|
|
|
|
|
|
|
|
: closing-tag? ( string -- ? )
|
2008-08-17 11:38:34 -04:00
|
|
|
[ f ]
|
|
|
|
[ [ first ] [ peek ] bi [ CHAR: / = ] bi@ or ] if-empty ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
|
|
|
: <tag> ( name attributes closing? -- tag )
|
2008-08-14 00:09:43 -04:00
|
|
|
tag new
|
|
|
|
swap >>closing?
|
|
|
|
swap >>attributes
|
|
|
|
swap >>name ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: make-tag ( string attribs -- tag )
|
2008-11-29 13:18:28 -05:00
|
|
|
[ [ closing-tag? ] keep "/" trim1 ] dip rot <tag> ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: make-text-tag ( string -- tag )
|
|
|
|
tag new
|
|
|
|
text >>name
|
|
|
|
swap >>text ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: make-comment-tag ( string -- tag )
|
|
|
|
tag new
|
|
|
|
comment >>name
|
|
|
|
swap >>text ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: make-dtd-tag ( string -- tag )
|
|
|
|
tag new
|
|
|
|
dtd >>name
|
|
|
|
swap >>text ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: read-whitespace ( -- string )
|
2007-09-20 18:09:08 -04:00
|
|
|
[ get-char blank? not ] take-until ;
|
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: read-whitespace* ( -- ) read-whitespace drop ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: read-token ( -- string )
|
2007-09-20 18:09:08 -04:00
|
|
|
read-whitespace*
|
|
|
|
[ get-char blank? ] take-until ;
|
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: read-single-quote ( -- string )
|
2007-09-20 18:09:08 -04:00
|
|
|
[ get-char CHAR: ' = ] take-until ;
|
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: read-double-quote ( -- string )
|
2007-09-20 18:09:08 -04:00
|
|
|
[ get-char CHAR: " = ] take-until ;
|
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: read-quote ( -- string )
|
2009-01-21 22:57:44 -05:00
|
|
|
get-char next CHAR: ' =
|
|
|
|
[ read-single-quote ] [ read-double-quote ] if next ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: read-key ( -- string )
|
2007-09-20 18:09:08 -04:00
|
|
|
read-whitespace*
|
2008-08-17 11:38:34 -04:00
|
|
|
[ get-char [ CHAR: = = ] [ blank? ] bi or ] take-until ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
|
|
|
: read-= ( -- )
|
|
|
|
read-whitespace*
|
2009-01-21 22:57:44 -05:00
|
|
|
[ get-char CHAR: = = ] take-until drop next ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: read-value ( -- string )
|
2007-09-20 18:09:08 -04:00
|
|
|
read-whitespace*
|
2008-08-17 11:38:34 -04:00
|
|
|
get-char quote? [ read-quote ] [ read-token ] if
|
|
|
|
[ blank? ] trim ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
|
|
|
: read-comment ( -- )
|
2009-01-21 22:57:44 -05:00
|
|
|
"-->" take-string make-comment-tag push-tag ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
|
|
|
: read-dtd ( -- )
|
2009-01-21 22:57:44 -05:00
|
|
|
">" take-string make-dtd-tag push-tag ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
|
|
|
: read-bang ( -- )
|
2009-01-21 22:57:44 -05:00
|
|
|
next get-char CHAR: - = get-next CHAR: - = and [
|
|
|
|
next next
|
2007-09-20 18:09:08 -04:00
|
|
|
read-comment
|
|
|
|
] [
|
|
|
|
read-dtd
|
|
|
|
] if ;
|
|
|
|
|
2008-05-19 19:58:56 -04:00
|
|
|
: read-tag ( -- string )
|
2007-09-20 18:09:08 -04:00
|
|
|
[ get-char CHAR: > = get-char CHAR: < = or ] take-until
|
2009-01-21 22:57:44 -05:00
|
|
|
get-char CHAR: < = [ next ] unless ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: read-< ( -- string )
|
2009-01-21 22:57:44 -05:00
|
|
|
next get-char CHAR: ! = [
|
2007-09-20 18:09:08 -04:00
|
|
|
read-bang f
|
|
|
|
] [
|
|
|
|
read-tag
|
|
|
|
] if ;
|
|
|
|
|
2008-08-17 11:38:34 -04:00
|
|
|
: read-until-< ( -- string )
|
2007-09-20 18:09:08 -04:00
|
|
|
[ get-char CHAR: < = ] take-until ;
|
|
|
|
|
|
|
|
: parse-text ( -- )
|
2008-09-06 18:15:25 -04:00
|
|
|
read-until-< [
|
2007-09-20 18:09:08 -04:00
|
|
|
make-text-tag push-tag
|
2008-09-06 18:15:25 -04:00
|
|
|
] unless-empty ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
|
|
|
: (parse-attributes) ( -- )
|
|
|
|
read-whitespace*
|
|
|
|
string-parse-end? [
|
|
|
|
read-key >lower read-= read-value
|
|
|
|
2array , (parse-attributes)
|
|
|
|
] unless ;
|
|
|
|
|
|
|
|
: parse-attributes ( -- hashtable )
|
2007-12-04 15:14:33 -05:00
|
|
|
[ (parse-attributes) ] { } make >hashtable ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-06-14 01:41:48 -04:00
|
|
|
: (parse-tag) ( string -- string' hashtable )
|
2007-09-20 18:09:08 -04:00
|
|
|
[
|
|
|
|
read-token >lower
|
|
|
|
parse-attributes
|
|
|
|
] string-parse ;
|
|
|
|
|
|
|
|
: parse-tag ( -- )
|
2008-08-17 11:38:34 -04:00
|
|
|
read-< [
|
2007-09-20 18:09:08 -04:00
|
|
|
(parse-tag) make-tag push-tag
|
2008-08-17 11:38:34 -04:00
|
|
|
] unless-empty ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
2008-05-19 19:58:56 -04:00
|
|
|
: (parse-html) ( -- )
|
2007-09-20 18:09:08 -04:00
|
|
|
get-next [
|
|
|
|
parse-text
|
|
|
|
parse-tag
|
|
|
|
(parse-html)
|
|
|
|
] when ;
|
|
|
|
|
|
|
|
: tag-parse ( quot -- vector )
|
2008-08-17 11:38:34 -04:00
|
|
|
V{ } clone tagstack [ string-parse ] with-variable ;
|
2007-09-20 18:09:08 -04:00
|
|
|
|
|
|
|
: parse-html ( string -- vector )
|
2008-08-17 11:38:34 -04:00
|
|
|
[ (parse-html) tagstack get ] tag-parse ;
|