diff --git a/extra/html/parser/parser-tests.factor b/extra/html/parser/parser-tests.factor index 9757f70a67..25251159b1 100644 --- a/extra/html/parser/parser-tests.factor +++ b/extra/html/parser/parser-tests.factor @@ -42,6 +42,19 @@ V{ } ] [ "" parse-html ] unit-test +[ +V{ + T{ tag f "a" + H{ + { "a" "pirsqd" } + { "foo" "bar" } + { "href" "http://factorcode.org/" } + { "baz" "quux" } + { "nofollow" f } + } f f } +} +] [ "" parse-html ] unit-test + [ V{ T{ tag f "html" H{ } f f } diff --git a/extra/html/parser/parser.factor b/extra/html/parser/parser.factor index 61088d1b5e..4aae6a25c4 100644 --- a/extra/html/parser/parser.factor +++ b/extra/html/parser/parser.factor @@ -1,17 +1,19 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays hashtables html.parser.state -html.parser.utils kernel make namespaces sequences +html.parser.utils kernel namespaces sequences unicode.case unicode.categories combinators.short-circuit -quoting ; +quoting fry ; IN: html.parser - TUPLE: tag name attributes text closing? ; SINGLETON: text SINGLETON: dtd SINGLETON: comment + + ( name attributes closing? -- tag ) tag new @@ -30,22 +32,19 @@ SYMBOL: tagstack : make-tag ( string attribs -- tag ) [ [ closing-tag? ] keep "/" trim1 ] dip rot ; -: new-tag ( string type -- tag ) +: new-tag ( text name -- tag ) tag new swap >>name swap >>text ; inline -: make-text-tag ( string -- tag ) text new-tag ; inline - -: make-comment-tag ( string -- tag ) comment new-tag ; inline - -: make-dtd-tag ( string -- tag ) dtd new-tag ; inline +: (read-quote) ( state-parser ch -- string ) + '[ [ current _ = ] take-until ] [ advance drop ] bi ; : read-single-quote ( state-parser -- string ) - [ [ current CHAR: ' = ] take-until ] [ next drop ] bi ; + CHAR: ' (read-quote) ; : read-double-quote ( state-parser -- string ) - [ [ current CHAR: " = ] take-until ] [ next drop ] bi ; + CHAR: " (read-quote) ; : read-quote ( state-parser -- string ) dup get+increment CHAR: ' = @@ -55,10 +54,6 @@ SYMBOL: tagstack skip-whitespace [ current { [ CHAR: = = ] [ blank? ] } 1|| ] take-until ; -: read-= ( state-parser -- ) - skip-whitespace - [ [ current CHAR: = = ] take-until drop ] [ next drop ] bi ; - : read-token ( state-parser -- string ) [ current blank? ] take-until ; @@ -68,42 +63,40 @@ SYMBOL: tagstack [ blank? ] trim ; : read-comment ( state-parser -- ) - "-->" take-until-sequence make-comment-tag push-tag ; + "-->" take-until-sequence comment new-tag push-tag ; : read-dtd ( state-parser -- ) - ">" take-until-sequence make-dtd-tag push-tag ; + ">" take-until-sequence dtd new-tag push-tag ; : read-bang ( state-parser -- ) - next dup { [ current CHAR: - = ] [ peek-next CHAR: - = ] } 1&& [ - next next - read-comment - ] [ - read-dtd - ] if ; + advance dup { [ current CHAR: - = ] [ peek-next CHAR: - = ] } 1&& + [ advance advance read-comment ] [ read-dtd ] if ; : read-tag ( state-parser -- string ) [ [ current "><" member? ] take-until ] - [ dup current CHAR: < = [ next ] unless drop ] bi ; + [ dup current CHAR: < = [ advance ] unless drop ] bi ; : read-until-< ( state-parser -- string ) [ current CHAR: < = ] take-until ; : parse-text ( state-parser -- ) - read-until-< [ make-text-tag push-tag ] unless-empty ; + read-until-< [ text new-tag push-tag ] unless-empty ; + +: parse-key/value ( state-parser -- key value ) + [ read-key >lower ] + [ skip-whitespace "=" take-sequence ] + [ swap [ read-value ] [ drop f ] if ] tri ; : (parse-attributes) ( state-parser -- ) skip-whitespace dup state-parse-end? [ drop ] [ - [ - [ read-key >lower ] [ read-= ] [ read-value ] tri - 2array , - ] keep (parse-attributes) + [ parse-key/value swap set ] [ (parse-attributes) ] bi ] if ; : parse-attributes ( state-parser -- hashtable ) - [ (parse-attributes) ] { } make >hashtable ; + [ (parse-attributes) ] H{ } make-assoc ; : (parse-tag) ( string -- string' hashtable ) [ @@ -111,7 +104,7 @@ SYMBOL: tagstack ] state-parse ; : read-< ( state-parser -- string/f ) - next dup current [ + advance dup current [ CHAR: ! = [ read-bang f ] [ read-tag ] if ] [ drop f @@ -128,5 +121,7 @@ SYMBOL: tagstack : tag-parse ( quot -- vector ) V{ } clone tagstack [ state-parse ] with-variable ; inline +PRIVATE> + : parse-html ( string -- vector ) [ (parse-html) tagstack get ] tag-parse ; diff --git a/extra/html/parser/state/state-tests.factor b/extra/html/parser/state/state-tests.factor index 835b54d0d3..6766cfddc2 100644 --- a/extra/html/parser/state/state-tests.factor +++ b/extra/html/parser/state/state-tests.factor @@ -34,3 +34,21 @@ IN: html.parser.state.tests [ { 1 2 } ] [ { 1 2 3 4 } { 3 4 } take-until-sequence ] unit-test + +[ "ab" ] +[ "abcd" "ab" take-sequence ] unit-test + +[ f ] +[ "abcd" "lol" take-sequence ] unit-test + +[ "ab" ] +[ + "abcd" + [ "lol" take-sequence drop ] [ "ab" take-sequence ] bi +] unit-test + +[ "" ] +[ "abcd" "" take-sequence ] unit-test + +[ "cd" ] +[ "abcd" [ "ab" take-sequence drop ] [ "cd" take-sequence ] bi ] unit-test diff --git a/extra/html/parser/state/state.factor b/extra/html/parser/state/state.factor index 3f899446c0..4a050306e9 100644 --- a/extra/html/parser/state/state.factor +++ b/extra/html/parser/state/state.factor @@ -12,32 +12,32 @@ TUPLE: state-parser sequence n ; swap >>sequence 0 >>n ; -: state-parser-nth ( n state -- char/f ) +: state-parser-nth ( n state-parser -- char/f ) sequence>> ?nth ; inline -: current ( state -- char/f ) +: current ( state-parser -- char/f ) [ n>> ] keep state-parser-nth ; inline -: previous ( state -- char/f ) +: previous ( state-parser -- char/f ) [ n>> 1 - ] keep state-parser-nth ; inline -: peek-next ( state -- char/f ) +: peek-next ( state-parser -- char/f ) [ n>> 1 + ] keep state-parser-nth ; inline -: next ( state -- state ) +: advance ( state-parser -- state-parser ) [ 1 + ] change-n ; inline -: get+increment ( state -- char/f ) - [ current ] [ next drop ] bi ; inline +: get+increment ( state-parser -- char/f ) + [ current ] [ advance drop ] bi ; inline -:: skip-until ( state quot: ( obj -- ? ) -- ) - state current [ - state quot call [ state next quot skip-until ] unless +:: skip-until ( state-parser quot: ( obj -- ? ) -- ) + state-parser current [ + state-parser quot call [ state-parser advance quot skip-until ] unless ] when ; inline recursive -: state-parse-end? ( state -- ? ) peek-next not ; +: state-parse-end? ( state-parser -- ? ) peek-next not ; -: take-until ( state quot: ( obj -- ? ) -- sequence/f ) +: take-until ( state-parser quot: ( obj -- ? ) -- sequence/f ) over state-parse-end? [ 2drop f ] [ @@ -46,9 +46,18 @@ TUPLE: state-parser sequence n ; [ drop [ n>> ] [ sequence>> ] bi ] 2tri subseq ] if ; inline -: take-while ( state quot: ( obj -- ? ) -- sequence/f ) +: take-while ( state-parser quot: ( obj -- ? ) -- sequence/f ) [ not ] compose take-until ; inline +:: take-sequence ( state-parser sequence -- obj/f ) + state-parser [ n>> dup sequence length + ] [ sequence>> ] bi + sequence sequence= [ + sequence + state-parser [ sequence length + ] change-n drop + ] [ + f + ] if ; + :: take-until-sequence ( state-parser sequence -- sequence' ) sequence length :> growing state-parser @@ -58,15 +67,15 @@ TUPLE: state-parser sequence n ; ] take-until :> found found dup length growing length 1- - head - state-parser next drop ; + state-parser advance drop ; -: skip-whitespace ( state -- state ) +: skip-whitespace ( state-parser -- state-parser ) [ [ current blank? not ] take-until drop ] keep ; -: take-rest ( state -- sequence ) +: take-rest ( state-parser -- sequence ) [ drop f ] take-until ; inline -: take-until-object ( state obj -- sequence ) +: take-until-object ( state-parser obj -- sequence ) '[ current _ = ] take-until ; : state-parse ( sequence quot -- )