diff --git a/extra/html/parser/analyzer/analyzer-tests.factor b/extra/html/parser/analyzer/analyzer-tests.factor index 4d2378c7ea..426fd75b26 100644 --- a/extra/html/parser/analyzer/analyzer-tests.factor +++ b/extra/html/parser/analyzer/analyzer-tests.factor @@ -1,6 +1,6 @@ ! Copyright (C) 2010 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. -USING: html.parser.analyzer math tools.test ; +USING: html.parser html.parser.analyzer math tools.test ; IN: html.parser.analyzer.tests [ 0 3 ] @@ -27,3 +27,46 @@ IN: html.parser.analyzer.tests [ 0 { 3 5 7 9 11 } [ odd? ] find-last-nth ] [ undefined-find-nth? ] must-fail-with + +[ V{ + T{ tag f text f "foo" f } +} +] [ + "foo" parse-html + "title" find-between-first +] unit-test + +[ V{ + T{ tag f "p" H{ } f f } + T{ tag f text f "para" f } + T{ tag f "p" H{ } f t } +} +] [ + "

para

" parse-html "div" find-between-first +] unit-test + +[ V{ + T{ tag f "div" H{ { "class" "foo" } } f f } + T{ tag f "p" H{ } f f } + T{ tag f text f "para" f } + T{ tag f "p" H{ } f t } + T{ tag f "div" H{ } f t } +} +] [ + "

para

" parse-html + "foo" find-by-class-between +] unit-test + +[ V{ + T{ tag f "div" H{ { "class" "foo" } } f f } + T{ tag f "div" H{ } f f } + T{ tag f "p" H{ } f f } + T{ tag f text f "para" f } + T{ tag f "p" H{ } f t } + T{ tag f "div" H{ } f t } + T{ tag f "div" H{ } f t } +} +] [ + "

para

" parse-html + "foo" find-by-class-between +] unit-test diff --git a/extra/html/parser/analyzer/analyzer.factor b/extra/html/parser/analyzer/analyzer.factor index d9ae88675a..eeb15950a7 100644 --- a/extra/html/parser/analyzer/analyzer.factor +++ b/extra/html/parser/analyzer/analyzer.factor @@ -1,8 +1,8 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors assocs combinators combinators.short-circuit -fry html.parser http.client io kernel locals math sequences -sets splitting unicode.case unicode.categories urls +fry html.parser http.client io kernel locals math math.statistics +sequences sets splitting unicode.case unicode.categories urls urls.encoding shuffle ; IN: html.parser.analyzer @@ -51,14 +51,24 @@ ERROR: undefined-find-nth m n seq quot ; : find-first-name ( vector string -- i/f tag/f ) >lower '[ name>> _ = ] find ; inline -: find-matching-close ( vector string -- i/f tag/f ) +! Takes a sequence and a quotation expected to return -1 if the +! element decrements the stack, 0 if it doesn't affect it and 1 if it +! increments it. Then finds the index of the first element at which +! the stack is empty. +: stack-find ( seq quot -- i/f ) + map cum-sum [ 0 = ] find drop ; inline + +! Produces a quotation which returns 1 if the input item is an opening +! tag element with the specified name, -1 if it is a closing tag of +! the same name and 0 otherwise. +: tag-classifier ( string -- quot ) >lower - '[ [ name>> _ = ] [ closing?>> ] bi and ] find ; inline + '[ dup name>> _ = [ closing?>> [ -1 ] [ 1 ] if ] [ drop 0 ] if ] ; inline : find-between* ( vector i/f tag/f -- vector ) over integer? [ [ tail-slice ] [ name>> ] bi* - dupd find-matching-close drop [ 1 + ] [ 1 ] if* + dupd tag-classifier stack-find [ 1 + ] [ 1 ] if* head ] [ 3drop V{ } clone