From 34c1170963820b19c728453d00c46ba870443b2a Mon Sep 17 00:00:00 2001 From: Doug Coleman <doug.coleman@gmail.com> Date: Mon, 11 Feb 2008 16:27:40 -0600 Subject: [PATCH] add a few utility words --- extra/html/parser/analyzer/analyzer.factor | 35 +++++++++------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/extra/html/parser/analyzer/analyzer.factor b/extra/html/parser/analyzer/analyzer.factor index e4f11cd91e..dcfbd1e197 100755 --- a/extra/html/parser/analyzer/analyzer.factor +++ b/extra/html/parser/analyzer/analyzer.factor @@ -1,5 +1,5 @@ -USING: assocs html.parser kernel math sequences strings unicode.categories - unicode.case ; +USING: assocs html.parser kernel math sequences strings ascii +arrays shuffle unicode.case namespaces ; IN: html.parser.analyzer : remove-blank-text ( vector -- vector' ) @@ -65,28 +65,21 @@ IN: html.parser.analyzer [ tag-attributes "href" swap at ] map [ ] subset ; +: (find-all) ( n seq quot -- ) + 2dup >r >r find* [ + dupd 2array , 1+ r> r> (find-all) + ] [ + r> r> 3drop + ] if* ; +: find-all ( seq quot -- alist ) + [ 0 -rot (find-all) ] { } make ; -! : find-last-tag ( name vector -- index tag ) - ! [ - ! dup tag-matched? [ 2drop f ] [ tag-name = ] if - ! ] with find-last ; +: find-opening-tags-by-name ( name seq -- seq ) + [ [ tag-name = ] keep tag-closing? not and ] with find-all ; -! : find-last-tag* ( name n vector -- tag ) - ! 0 -rot <slice> find-last-tag ; +: href-contains? ( str tag -- ? ) + tag-attributes "href" swap at* [ subseq? ] [ 2drop f ] if ; -! : find-matching-tag ( tag -- tag ) - ! dup tag-closing? [ - ! find-last-tag - ! ] [ - ! ] if ; - - -! clear "/Users/erg/web/fark.html" file-contents parse-html find-links [ "go.pl" swap start ] subset [ "=" split peek ] map ! clear "http://fark.com" http-get parse-html find-links [ "go.pl" swap start ] subset [ "=" split peek ] map -! clear "/Users/erg/web/hostels.html" file-contents parse-html "Currency" "name" pick find-first-attribute-key-value - -! clear "/Users/erg/web/hostels.html" file-contents parse-html -! "Currency" "name" pick find-first-attribute-key-value -! pick find-between remove-blank-text