diff --git a/extra/html/parser/parser.factor b/extra/html/parser/parser.factor index 60e5ddbf54..677737618b 100644 --- a/extra/html/parser/parser.factor +++ b/extra/html/parser/parser.factor @@ -1,10 +1,12 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays html.parser.utils hashtables io kernel -namespaces make prettyprint quotations sequences splitting -html.parser.state strings unicode.categories unicode.case ; +USING: accessors arrays hashtables html.parser.state +html.parser.utils kernel make namespaces sequences +unicode.case unicode.categories combinators.short-circuit +quoting ; IN: html.parser + TUPLE: tag name attributes text closing? ; SINGLETON: text @@ -28,113 +30,100 @@ SYMBOL: tagstack : make-tag ( string attribs -- tag ) [ [ closing-tag? ] keep "/" trim1 ] dip rot ; -: make-text-tag ( string -- tag ) +: new-tag ( string type -- tag ) tag new - text >>name - swap >>text ; + swap >>name + swap >>text ; inline -: make-comment-tag ( string -- tag ) - tag new - comment >>name - swap >>text ; +: make-text-tag ( string -- tag ) text new-tag ; inline -: make-dtd-tag ( string -- tag ) - tag new - dtd >>name - swap >>text ; +: make-comment-tag ( string -- tag ) comment new-tag ; inline -: read-whitespace ( -- string ) - [ get-char blank? not ] take-until ; +: make-dtd-tag ( string -- tag ) dtd new-tag ; inline -: read-whitespace* ( -- ) read-whitespace drop ; +: read-single-quote ( state-parser -- string ) + [ [ CHAR: ' = ] take-until ] [ next drop ] bi ; -: read-token ( -- string ) - read-whitespace* - [ get-char blank? ] take-until ; +: read-double-quote ( state-parser -- string ) + [ [ CHAR: " = ] take-until ] [ next drop ] bi ; -: read-single-quote ( -- string ) - [ get-char CHAR: ' = ] take-until ; +: read-quote ( state-parser -- string ) + dup get+increment CHAR: ' = + [ read-single-quote ] [ read-double-quote ] if ; -: read-double-quote ( -- string ) - [ get-char CHAR: " = ] take-until ; +: read-key ( state-parser -- string ) + skip-whitespace + [ { [ CHAR: = = ] [ blank? ] } 1|| ] take-until ; -: read-quote ( -- string ) - get-char next CHAR: ' = - [ read-single-quote ] [ read-double-quote ] if next ; +: read-= ( state-parser -- ) + skip-whitespace + [ [ CHAR: = = ] take-until drop ] [ next drop ] bi ; -: read-key ( -- string ) - read-whitespace* - [ get-char [ CHAR: = = ] [ blank? ] bi or ] take-until ; +: read-token ( state-parser -- string ) + [ blank? ] take-until ; -: read-= ( -- ) - read-whitespace* - [ get-char CHAR: = = ] take-until drop next ; - -: read-value ( -- string ) - read-whitespace* - get-char quote? [ read-quote ] [ read-token ] if +: read-value ( state-parser -- string ) + skip-whitespace + dup get-char quote? [ read-quote ] [ read-token ] if [ blank? ] trim ; -: read-comment ( -- ) - "-->" take-string make-comment-tag push-tag ; +: read-comment ( state-parser -- ) + "-->" take-until-string make-comment-tag push-tag ; -: read-dtd ( -- ) - ">" take-string make-dtd-tag push-tag ; +: read-dtd ( state-parser -- ) + ">" take-until-string make-dtd-tag push-tag ; -: read-bang ( -- ) - next get-char CHAR: - = get-next CHAR: - = and [ +: read-bang ( state-parser -- ) + next dup { [ get-char CHAR: - = ] [ get-next CHAR: - = ] } 1&& [ next next read-comment ] [ read-dtd ] if ; -: read-tag ( -- string ) - [ get-char CHAR: > = get-char CHAR: < = or ] take-until - get-char CHAR: < = [ next ] unless ; +: read-tag ( state-parser -- string ) + [ [ "><" member? ] take-until ] + [ dup get-char CHAR: < = [ next ] unless drop ] bi ; -: read-< ( -- string ) - next get-char CHAR: ! = [ - read-bang f +: read-until-< ( state-parser -- string ) + [ CHAR: < = ] take-until ; + +: parse-text ( state-parser -- ) + read-until-< [ make-text-tag push-tag ] unless-empty ; + +: (parse-attributes) ( state-parser -- ) + skip-whitespace + dup string-parse-end? [ + drop ] [ - read-tag + [ + [ read-key >lower ] [ read-= ] [ read-value ] tri + 2array , + ] keep (parse-attributes) ] if ; -: read-until-< ( -- string ) - [ get-char CHAR: < = ] take-until ; - -: parse-text ( -- ) - read-until-< [ - make-text-tag push-tag - ] unless-empty ; - -: (parse-attributes) ( -- ) - read-whitespace* - string-parse-end? [ - read-key >lower read-= read-value - 2array , (parse-attributes) - ] unless ; - -: parse-attributes ( -- hashtable ) +: parse-attributes ( state-parser -- hashtable ) [ (parse-attributes) ] { } make >hashtable ; : (parse-tag) ( string -- string' hashtable ) [ - read-token >lower - parse-attributes + [ read-token >lower ] [ parse-attributes ] bi ] string-parse ; -: parse-tag ( -- ) - read-< [ - (parse-tag) make-tag push-tag - ] unless-empty ; +: read-< ( state-parser -- string/f ) + next dup get-char [ + CHAR: ! = [ read-bang f ] [ read-tag ] if + ] [ + drop f + ] if* ; -: (parse-html) ( -- ) - get-next [ - parse-text - parse-tag - (parse-html) - ] when ; +: parse-tag ( state-parser -- ) + read-< [ (parse-tag) make-tag push-tag ] unless-empty ; + +: (parse-html) ( state-parser -- ) + dup get-next [ + [ parse-text ] [ parse-tag ] [ (parse-html) ] tri + ] [ drop ] if ; : tag-parse ( quot -- vector ) V{ } clone tagstack [ string-parse ] with-variable ; inline diff --git a/extra/html/parser/state/state-tests.factor b/extra/html/parser/state/state-tests.factor index da70d0fa12..f676649aa8 100644 --- a/extra/html/parser/state/state-tests.factor +++ b/extra/html/parser/state/state-tests.factor @@ -1,14 +1,30 @@ -USING: tools.test html.parser.state ascii kernel ; +USING: tools.test html.parser.state ascii kernel accessors ; IN: html.parser.state.tests -: take-rest ( -- string ) - [ f ] take-until ; +[ "hello" ] +[ "hello" [ take-rest ] string-parse ] unit-test -: take-char ( -- string ) - [ get-char = ] curry take-until ; +[ "hi" " how are you?" ] +[ + "hi how are you?" + [ [ [ blank? ] take-until ] [ take-rest ] bi ] string-parse +] unit-test + +[ "foo" ";bar" ] +[ + "foo;bar" [ + [ CHAR: ; take-until-char ] [ take-rest ] bi + ] string-parse +] unit-test -[ "hello" ] [ "hello" [ take-rest ] string-parse ] unit-test -[ "hi" " how are you?" ] [ "hi how are you?" [ [ get-char blank? ] take-until take-rest ] string-parse ] unit-test -[ "foo" ";bar" ] [ "foo;bar" [ CHAR: ; take-char take-rest ] string-parse ] unit-test [ "foo " " bar" ] -[ "foo and bar" [ "and" take-string take-rest ] string-parse ] unit-test +[ + "foo and bar" [ + [ "and" take-until-string ] [ take-rest ] bi + ] string-parse +] unit-test + +[ 6 ] +[ + " foo " [ skip-whitespace i>> ] string-parse +] unit-test diff --git a/extra/html/parser/state/state.factor b/extra/html/parser/state/state.factor index 1b3f188a78..c69fd76af5 100644 --- a/extra/html/parser/state/state.factor +++ b/extra/html/parser/state/state.factor @@ -1,41 +1,62 @@ ! Copyright (C) 2005, 2009 Daniel Ehrenberg ! See http://factorcode.org/license.txt for BSD license. -USING: namespaces math kernel sequences accessors fry circular ; +USING: namespaces math kernel sequences accessors fry circular +unicode.case unicode.categories locals ; IN: html.parser.state -TUPLE: state string i ; +TUPLE: state-parser string i ; -: get-i ( -- i ) state get i>> ; inline +: ( string -- state-parser ) + state-parser new + swap >>string + 0 >>i ; -: get-char ( -- char ) - state get [ i>> ] [ string>> ] bi ?nth ; inline +: (get-char) ( i state -- char/f ) + string>> ?nth ; inline -: get-next ( -- char ) - state get [ i>> 1+ ] [ string>> ] bi ?nth ; inline +: get-char ( state -- char/f ) + [ i>> ] keep (get-char) ; inline -: next ( -- ) - state get [ 1+ ] change-i drop ; inline +: get-next ( state -- char/f ) + [ i>> 1+ ] keep (get-char) ; inline + +: next ( state -- state ) + [ 1+ ] change-i ; inline + +: get+increment ( state -- char/f ) + [ get-char ] [ next drop ] bi ; inline : string-parse ( string quot -- ) - [ 0 state boa state ] dip with-variable ; inline + [ ] dip call ; inline -: short* ( n seq -- n' seq ) - over [ nip dup length swap ] unless ; inline +:: skip-until ( state quot: ( obj -- ? ) -- ) + state get-char [ + quot call [ state next quot skip-until ] unless + ] when* ; inline recursive -: skip-until ( quot: ( -- ? ) -- ) - get-char [ - [ call ] keep swap - [ drop ] [ next skip-until ] if - ] [ drop ] if ; inline recursive +: take-until ( state quot: ( obj -- ? ) -- string ) + [ drop i>> ] + [ skip-until ] + [ drop [ i>> ] [ string>> ] bi ] 2tri subseq ; inline -: take-until ( quot: ( -- ? ) -- ) - get-i [ skip-until ] dip get-i - state get string>> subseq ; inline +:: take-until-string ( state-parser string -- string' ) + string length :> growing + state-parser + [ + growing push-growing-circular + string growing sequence= + ] take-until :> found + found dup length + growing length 1- - head + state-parser next drop ; + +: skip-whitespace ( state -- state ) + [ [ blank? not ] take-until drop ] keep ; -: string-matches? ( string circular -- ? ) - get-char over push-growing-circular sequence= ; inline +: take-rest ( state -- string ) + [ drop f ] take-until ; inline -: take-string ( match -- string ) - dup length - [ 2dup string-matches? ] take-until nip - dup length rot length 1- - head next ; inline +: take-until-char ( state ch -- string ) + '[ _ = ] take-until ; + +: string-parse-end? ( state -- ? ) get-next not ; diff --git a/extra/html/parser/utils/utils-tests.factor b/extra/html/parser/utils/utils-tests.factor index 6d8e3bc05f..ec6780687d 100644 --- a/extra/html/parser/utils/utils-tests.factor +++ b/extra/html/parser/utils/utils-tests.factor @@ -1,20 +1,13 @@ USING: assocs combinators continuations hashtables hashtables.private io kernel math namespaces prettyprint quotations sequences splitting -strings tools.test ; -USING: html.parser.utils ; +strings tools.test html.parser.utils quoting ; IN: html.parser.utils.tests [ "'Rome'" ] [ "Rome" single-quote ] unit-test [ "\"Roma\"" ] [ "Roma" double-quote ] unit-test [ "'Firenze'" ] [ "Firenze" quote ] unit-test [ "\"Caesar's\"" ] [ "Caesar's" quote ] unit-test -[ f ] [ "" quoted? ] unit-test -[ t ] [ "''" quoted? ] unit-test -[ t ] [ "\"\"" quoted? ] unit-test -[ t ] [ "\"Circus Maximus\"" quoted? ] unit-test -[ t ] [ "'Circus Maximus'" quoted? ] unit-test -[ f ] [ "Circus Maximus" quoted? ] unit-test [ "'Italy'" ] [ "Italy" ?quote ] unit-test [ "'Italy'" ] [ "'Italy'" ?quote ] unit-test [ "\"Italy\"" ] [ "\"Italy\"" ?quote ] unit-test diff --git a/extra/html/parser/utils/utils.factor b/extra/html/parser/utils/utils.factor index c913b9d306..7abd2fcdf7 100644 --- a/extra/html/parser/utils/utils.factor +++ b/extra/html/parser/utils/utils.factor @@ -3,16 +3,12 @@ USING: assocs circular combinators continuations hashtables hashtables.private io kernel math namespaces prettyprint quotations sequences splitting html.parser.state strings -combinators.short-circuit ; +combinators.short-circuit quoting ; IN: html.parser.utils -: string-parse-end? ( -- ? ) get-next not ; - : trim1 ( seq ch -- newseq ) [ [ ?head-slice drop ] [ ?tail-slice drop ] bi ] 2keep drop like ; -: quote? ( ch -- ? ) "'\"" member? ; - : single-quote ( str -- newstr ) "'" dup surround ; : double-quote ( str -- newstr ) "\"" dup surround ; @@ -21,14 +17,4 @@ IN: html.parser.utils CHAR: ' over member? [ double-quote ] [ single-quote ] if ; -: quoted? ( str -- ? ) - { - [ length 1 > ] - [ first quote? ] - [ [ first ] [ peek ] bi = ] - } 1&& ; - : ?quote ( str -- newstr ) dup quoted? [ quote ] unless ; - -: unquote ( str -- newstr ) - dup quoted? [ but-last-slice rest-slice >string ] when ;