From a10c3f425c570a6c6726fce1a03704e1b82b0df9 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Wed, 1 Oct 2008 21:25:37 -0500 Subject: [PATCH 1/2] let the unique heap retain hashtable entries after elements are popped to ensure uniqueness forever --- extra/assoc-heaps/assoc-heaps-docs.factor | 9 ++++--- extra/assoc-heaps/assoc-heaps.factor | 3 +-- extra/spider/spider.factor | 33 ++++++++++++----------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/extra/assoc-heaps/assoc-heaps-docs.factor b/extra/assoc-heaps/assoc-heaps-docs.factor index 6a80bcc6c6..8beaf9c4b1 100644 --- a/extra/assoc-heaps/assoc-heaps-docs.factor +++ b/extra/assoc-heaps/assoc-heaps-docs.factor @@ -10,16 +10,17 @@ HELP: { $values { "unique-heap" assoc-heap } } -{ $description "Creates a new " { $link assoc-heap } " where the assoc is a hashtable and the heap is a max-heap." } ; +{ $description "Creates a new " { $link assoc-heap } " where the assoc is a hashtable and the heap is a max-heap. Popping an element from the heap leaves this element in the hashtable to ensure that the element will not be processed again." } ; HELP: { $values - { "unique-heap" assoc-heap } } -{ $description "Creates a new " { $link assoc-heap } " where the assoc is a hashtable and the heap is a min-heap." } ; +{ $description "Creates a new " { $link assoc-heap } " where the assoc is a hashtable and the heap is a min-heap. Popping an element from the heap leaves this element in the hashtable to ensure that the element will not be processed again." } ; + +{ } related-words HELP: assoc-heap -{ $description "A data structure containing an assoc and a heap to get certain properties with better time constraints at the expense of more space and complexity. For instance, a hashtable and a heap can be combined into one assoc-heap to get a sorted data structure with O(1) lookup. Operations on assoc-heap should update both the assoc and the heap." } ; +{ $description "A data structure containing an assoc and a heap to get certain properties with better time constraints at the expense of more space and complexity. For instance, a hashtable and a heap can be combined into one assoc-heap to get a sorted data structure with O(1) lookup. Operations on assoc-heap may update both the assoc and the heap or leave them out of sync if it's advantageous." } ; ARTICLE: "assoc-heaps" "Associative heaps" "The " { $vocab-link "assoc-heaps" } " vocabulary combines exists to synthesize data structures with better time properties than either of the two component data structures alone." $nl diff --git a/extra/assoc-heaps/assoc-heaps.factor b/extra/assoc-heaps/assoc-heaps.factor index d2154002b9..a495aed626 100644 --- a/extra/assoc-heaps/assoc-heaps.factor +++ b/extra/assoc-heaps/assoc-heaps.factor @@ -21,8 +21,7 @@ M: assoc-heap heap-push* ( value key assoc-heap -- entry ) ] if ; M: assoc-heap heap-pop ( assoc-heap -- value key ) - [ heap>> heap-pop ] keep - [ over ] dip assoc>> delete-at ; + heap>> heap-pop ; M: assoc-heap heap-peek ( assoc-heap -- value key ) heap>> heap-peek ; diff --git a/extra/spider/spider.factor b/extra/spider/spider.factor index 90326d21cd..fb7757d448 100644 --- a/extra/spider/spider.factor +++ b/extra/spider/spider.factor @@ -8,7 +8,7 @@ combinators.short-circuit continuations calendar prettyprint ; IN: spider TUPLE: spider base count max-count sleep max-depth initial-links -filters spidered todo nonmatching ; +filters spidered todo nonmatching quiet ; ! secure? agent page-timeout data-timeout overall-timeout TUPLE: spider-result url depth headers fetch-time parsed-html @@ -27,19 +27,19 @@ links processing-time timestamp ; > not ; + : apply-filters ( links spider -- links' ) filters>> [ '[ _ 1&& ] filter ] when* ; +: push-links ( links level assoc-heap -- ) + '[ _ _ heap-push ] each ; + : add-todo ( links level spider -- ) - tuck [ apply-filters ] 2dip - tuck - [ spidered>> keys diff ] - [ todo>> ] 2bi* '[ _ _ heap-push ] each ; + todo>> push-links ; : add-nonmatching ( links level spider -- ) - nonmatching>> '[ _ _ heap-push ] each ; - -: relative-url? ( url -- ? ) protocol>> not ; + nonmatching>> push-links ; : filter-base ( spider spider-result -- base-links nonmatching-links ) [ base>> host>> ] [ links>> prune ] bi* @@ -51,26 +51,27 @@ links processing-time timestamp ; [ filter-base ] 2keep depth>> 1+ swap [ add-nonmatching ] - [ add-todo ] 2bi ; - -: print-spidering ( url depth -- ) - "depth: " write number>string write - ", spidering: " write . yield ; + [ tuck [ apply-filters ] 2dip add-todo ] 2bi ; : normalize-hrefs ( links -- links' ) [ >url ] map spider get base>> swap [ derive-url ] with map ; +: print-spidering ( url depth -- ) + "depth: " write number>string write + ", spidering: " write . yield ; + : (spider-page) ( url depth -- spider-result ) - 2dup print-spidering f pick spider get spidered>> set-at over '[ _ http-get ] benchmark swap [ parse-html dup find-hrefs normalize-hrefs ] benchmark now spider-result boa - dup describe ; : spider-page ( url depth -- ) - (spider-page) spider get swap add-spidered ; + spider get quiet>> [ 2dup print-spidering ] unless + (spider-page) + spider get [ quiet>> [ dup describe ] unless ] + [ swap add-spidered ] bi ; \ spider-page ERROR add-error-logging From 7308f2fb52c9fdd34ed6fac7c0793b2fc7399546 Mon Sep 17 00:00:00 2001 From: Doug Coleman Date: Wed, 1 Oct 2008 21:30:12 -0500 Subject: [PATCH 2/2] fix shiz --- extra/spider/spider-docs.factor | 8 +------- extra/spider/spider.factor | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/extra/spider/spider-docs.factor b/extra/spider/spider-docs.factor index 458c1d14d8..5f820ca368 100644 --- a/extra/spider/spider-docs.factor +++ b/extra/spider/spider-docs.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2008 Doug Coleman. ! See http://factorcode.org/license.txt for BSD license. USING: help.markup help.syntax io.streams.string urls -multiline ; +multiline spider.private quotations ; IN: spider HELP: @@ -16,12 +16,6 @@ HELP: run-spider { "spider" spider } } { $description "Runs a spider until completion. See the " { $subsection "spider-tutorial" } " for a complete description of the tuple slots that affect how thet spider works." } ; -HELP: spider -{ $description "" } ; - -HELP: spider-result -{ $description "" } ; - HELP: slurp-heap-when { $values { "heap" "a heap" } { "quot1" quotation } { "quot2" quotation } } diff --git a/extra/spider/spider.factor b/extra/spider/spider.factor index fa1cd2ac0f..64ee081ecc 100644 --- a/extra/spider/spider.factor +++ b/extra/spider/spider.factor @@ -65,7 +65,7 @@ links processing-time timestamp ; f pick spider get spidered>> set-at over '[ _ http-get ] benchmark swap [ parse-html dup find-hrefs normalize-hrefs ] benchmark - now spider-result boa + now spider-result boa ; : spider-page ( url depth -- ) spider get quiet>> [ 2dup print-spidering ] unless