156 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Factor
		
	
	
			
		
		
	
	
			156 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Factor
		
	
	
USING: arrays html.parser.utils hashtables io kernel
 | 
						|
namespaces prettyprint quotations
 | 
						|
sequences splitting state-parser strings ;
 | 
						|
IN: html.parser
 | 
						|
 | 
						|
TUPLE: tag name attributes text matched? closing? ;
 | 
						|
 | 
						|
SYMBOL: text
 | 
						|
SYMBOL: dtd
 | 
						|
SYMBOL: comment
 | 
						|
SYMBOL: javascript
 | 
						|
SYMBOL: tagstack
 | 
						|
 | 
						|
: push-tag ( tag -- )
 | 
						|
    tagstack get push ;
 | 
						|
 | 
						|
: closing-tag? ( string -- ? )
 | 
						|
    dup empty? [
 | 
						|
        drop f
 | 
						|
    ] [
 | 
						|
        dup first CHAR: / =
 | 
						|
        swap peek CHAR: / = or
 | 
						|
    ] if ;
 | 
						|
 | 
						|
: <tag> ( name attributes closing? -- tag )
 | 
						|
    { set-tag-name set-tag-attributes set-tag-closing? }
 | 
						|
    tag construct ;
 | 
						|
 | 
						|
: make-tag ( str attribs -- tag )
 | 
						|
    >r [ closing-tag? ] keep "/" trim1 r> rot <tag> ;
 | 
						|
 | 
						|
: make-text-tag ( str -- tag )
 | 
						|
    T{ tag f text } clone [ set-tag-text ] keep ;
 | 
						|
 | 
						|
: make-comment-tag ( str -- tag )
 | 
						|
    T{ tag f comment } clone [ set-tag-text ] keep ;
 | 
						|
 | 
						|
: make-dtd-tag ( str -- tag )
 | 
						|
    T{ tag f dtd } clone [ set-tag-text ] keep ;
 | 
						|
 | 
						|
: read-whitespace ( -- str )
 | 
						|
    [ get-char blank? not ] take-until ;
 | 
						|
 | 
						|
: read-whitespace* ( -- )
 | 
						|
    read-whitespace drop ;
 | 
						|
 | 
						|
: read-token ( -- str )
 | 
						|
    read-whitespace*
 | 
						|
    [ get-char blank? ] take-until ;
 | 
						|
 | 
						|
: read-single-quote ( -- str )
 | 
						|
    [ get-char CHAR: ' = ] take-until ;
 | 
						|
 | 
						|
: read-double-quote ( -- str )
 | 
						|
    [ get-char CHAR: " = ] take-until ;
 | 
						|
 | 
						|
: read-quote ( -- str )
 | 
						|
    get-char next* CHAR: ' = [
 | 
						|
        read-single-quote
 | 
						|
    ] [
 | 
						|
        read-double-quote
 | 
						|
    ] if next* ;
 | 
						|
 | 
						|
: read-key ( -- str )
 | 
						|
    read-whitespace*
 | 
						|
    [ get-char CHAR: = = get-char blank? or ] take-until ;
 | 
						|
 | 
						|
: read-= ( -- )
 | 
						|
    read-whitespace*
 | 
						|
    [ get-char CHAR: = = ] take-until drop next* ;
 | 
						|
 | 
						|
: read-value ( -- str )
 | 
						|
    read-whitespace*
 | 
						|
    get-char quote? [
 | 
						|
        read-quote
 | 
						|
    ] [
 | 
						|
        read-token
 | 
						|
    ] if ;
 | 
						|
 | 
						|
: read-comment ( -- )
 | 
						|
    "-->" take-string* make-comment-tag push-tag ;
 | 
						|
 | 
						|
: read-dtd ( -- )
 | 
						|
    ">" take-string* make-dtd-tag push-tag ;
 | 
						|
 | 
						|
: read-bang ( -- )
 | 
						|
    next* get-char CHAR: - = get-next CHAR: - = and [
 | 
						|
        next* next*
 | 
						|
        read-comment
 | 
						|
    ] [
 | 
						|
        read-dtd
 | 
						|
    ] if ;
 | 
						|
 | 
						|
: read-tag ( -- )
 | 
						|
    [ get-char CHAR: > = get-char CHAR: < = or ] take-until
 | 
						|
    get-char CHAR: < = [ next* ] unless ;
 | 
						|
 | 
						|
: read-< ( -- str )
 | 
						|
    next* get-char CHAR: ! = [
 | 
						|
        read-bang f
 | 
						|
    ] [
 | 
						|
        read-tag
 | 
						|
    ] if ;
 | 
						|
 | 
						|
: read-until-< ( -- str )
 | 
						|
    [ get-char CHAR: < = ] take-until ;
 | 
						|
 | 
						|
: parse-text ( -- )
 | 
						|
    read-until-< dup empty? [
 | 
						|
        drop
 | 
						|
    ] [
 | 
						|
        make-text-tag push-tag
 | 
						|
    ] if ;
 | 
						|
 | 
						|
: (parse-attributes) ( -- )
 | 
						|
    read-whitespace*
 | 
						|
    string-parse-end? [
 | 
						|
        read-key >lower read-= read-value
 | 
						|
        2array , (parse-attributes)
 | 
						|
    ] unless ;
 | 
						|
 | 
						|
: parse-attributes ( -- hashtable )
 | 
						|
    [ (parse-attributes) ] { } make >hashtable ;
 | 
						|
 | 
						|
: (parse-tag)
 | 
						|
    [
 | 
						|
        read-token >lower
 | 
						|
        parse-attributes
 | 
						|
    ] string-parse ;
 | 
						|
 | 
						|
: parse-tag ( -- )
 | 
						|
    read-< dup empty? [
 | 
						|
        drop
 | 
						|
    ] [
 | 
						|
        (parse-tag) make-tag push-tag
 | 
						|
    ] if ;
 | 
						|
 | 
						|
: (parse-html) ( tag -- )
 | 
						|
    get-next [
 | 
						|
        parse-text
 | 
						|
        parse-tag
 | 
						|
        (parse-html)
 | 
						|
    ] when ;
 | 
						|
 | 
						|
: tag-parse ( quot -- vector )
 | 
						|
    [
 | 
						|
        V{ } clone tagstack set
 | 
						|
        string-parse
 | 
						|
    ] with-scope ;
 | 
						|
 | 
						|
: parse-html ( string -- vector )
 | 
						|
    [
 | 
						|
        (parse-html)
 | 
						|
        tagstack get
 | 
						|
    ] tag-parse ;
 |