From ff04cf82fe1e55bc30803e45e1b0e1ec1a6812ea Mon Sep 17 00:00:00 2001 From: Alec Berryman Date: Fri, 8 May 2009 23:30:01 -0400 Subject: [PATCH] bloom-filters: clean up creation More readable, less allocation, signals invalid input. --- extra/bloom-filters/bloom-filters-docs.factor | 6 +- .../bloom-filters/bloom-filters-tests.factor | 24 +++++-- extra/bloom-filters/bloom-filters.factor | 66 ++++++++++++------- 3 files changed, 63 insertions(+), 33 deletions(-) diff --git a/extra/bloom-filters/bloom-filters-docs.factor b/extra/bloom-filters/bloom-filters-docs.factor index 4af1a82af6..bc5df8611c 100644 --- a/extra/bloom-filters/bloom-filters-docs.factor +++ b/extra/bloom-filters/bloom-filters-docs.factor @@ -3,9 +3,11 @@ IN: bloom-filters HELP: { $values { "error-rate" "The desired false positive rate. A " { $link float } " between 0 and 1." } - { "number-objects" "The expected number of object in the set. An " { $link integer } "." } + { "number-objects" "The expected number of objects in the set. A positive " { $link integer } "." } { "bloom-filter" bloom-filter } } -{ $description "Creates an empty Bloom filter." } ; +{ $description "Creates an empty Bloom filter." } +{ $errors "Throws a " { $link capacity-error } " when unable to produce a filter meeting the given constraints. Throws a " { $link invalid-error-rate } " or a " { $link invalid-n-objects } " when input is invalid." 
} ; + HELP: bloom-filter-insert { $values { "object" object } diff --git a/extra/bloom-filters/bloom-filters-tests.factor b/extra/bloom-filters/bloom-filters-tests.factor index 40fd1469b2..b4fd69d849 100644 --- a/extra/bloom-filters/bloom-filters-tests.factor +++ b/extra/bloom-filters/bloom-filters-tests.factor @@ -2,6 +2,10 @@ USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts math random sequences tools.test ; IN: bloom-filters.tests + +[ { 200 5 } ] [ { 100 7 } { 200 5 } smaller-second ] unit-test +[ { 200 5 } ] [ { 200 5 } { 100 7 } smaller-second ] unit-test + ! The sizing information was generated using the subroutine ! calculate_shortest_filter_length from ! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html. @@ -19,13 +23,19 @@ IN: bloom-filters.tests [ 32 ] [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test [ 4 32 ] [ 0.05 5 size-bloom-filter ] unit-test -! This is a lot of bits. On linux-x86-32, max-array-capacity is 134217727, -! which is about 16MB (assuming I can do math), which is sort of pithy. I'm -! not sure how to handle this case. Returning a smaller-than-requested -! arrays is not the least surprising behavior, but is still surprising. -[ 383718189 ] [ 7 0.01 40000000 bits-to-satisfy-error-rate ] unit-test -! [ 7 383718189 ] [ 0.01 40000000 size-bloom-filter ] unit-test -! [ 383718189 ] [ 0.01 40000000 bits>> length ] unit-test +! This is a lot of bits. +: oversized-filter-params ( -- error-rate n-objects ) + 0.00000001 400000000000000 ; +[ oversized-filter-params size-bloom-filter ] [ capacity-error? ] must-fail-with +[ oversized-filter-params ] [ capacity-error? ] must-fail-with + +! Other error conditions. +[ 1.0 2000 ] [ invalid-error-rate? ] must-fail-with +[ 20 2000 ] [ invalid-error-rate? ] must-fail-with +[ 0.0 2000 ] [ invalid-error-rate? ] must-fail-with +[ -2 2000 ] [ invalid-error-rate? ] must-fail-with +[ 0.5 0 ] [ invalid-n-objects? ] must-fail-with +[ 0.5 -5 ] [ invalid-n-objects? ] must-fail-with ! 
Should not generate bignum hash codes. Enhanced double hashing may generate a ! lot of hash codes, and it's better to do this earlier than later. diff --git a/extra/bloom-filters/bloom-filters.factor b/extra/bloom-filters/bloom-filters.factor index 3e0aba175c..5440461892 100644 --- a/extra/bloom-filters/bloom-filters.factor +++ b/extra/bloom-filters/bloom-filters.factor @@ -1,17 +1,16 @@ ! Copyright (C) 2009 Alec Berryman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays bit-arrays fry kernel layouts locals math math.functions -math.ranges multiline sequences ; +multiline sequences ; IN: bloom-filters +FROM: math.ranges => [1,b] [0,b) ; +FROM: math.intervals => (a,b) interval-contains? ; + /* TODO: -- How to singal an error when too many bits? It looks like a built-in for some - types of arrays, but bit-array just returns a zero-length array. What we do - now is completely broken: -1 hash codes? Really? - - The false positive rate is 10x what it should be, based on informal testing. Better object hashes or a better method of generating extra hash codes would help. Another way is to increase the number of bits used. @@ -25,7 +24,9 @@ TODO: - Be sure to adjust the test that asserts the number of false positives isn't unreasonable. -- Should round bits up to next power of two, use wrap instead of mod. +- Could round bits up to next power of two and use wrap instead of mod. This + would cost a lot of bits on 32-bit platforms, though, and limit the bit-array + to 8MB. - Should allow user to specify the hash codes, either as inputs to enhanced double hashing or for direct use. @@ -47,6 +48,10 @@ TUPLE: bloom-filter { maximum-n-objects fixnum read-only } { current-n-objects fixnum } ; +ERROR: capacity-error ; +ERROR: invalid-error-rate ; +ERROR: invalid-n-objects ; + integer ; ! should check that it's below max-array-capacity -! TODO: this should be a constant -! -! 
TODO: after very little experimentation, I never see this increase after about -! 20 or so. Maybe it should be smaller. +! 100 hashes ought to be enough for anybody. : n-hashes-range ( -- range ) 100 [1,b] ; -! Ends up with a list of arrays - { n-bits position } -: find-bloom-filter-sizes ( error-rate number-objects -- seq ) - [ bits-to-satisfy-error-rate ] 2curry - n-hashes-range swap - map - n-hashes-range zip ; +! { n-hashes n-bits } +: identity-configuration ( -- 2seq ) + 0 max-array-capacity 2array ; -: smallest-first ( seq1 seq2 -- seq ) - [ [ first ] bi@ <= ] most ; +: smaller-second ( 2seq 2seq -- 2seq ) + [ [ second ] bi@ <= ] most ; + +! If the number of hashes isn't positive, we haven't found anything smaller than the +! identity configuration. +: validate-sizes ( 2seq -- ) + first 0 <= [ capacity-error ] when* ; ! The consensus on the tradeoff between increasing the number of bits and ! increasing the number of hash functions seems to be "go for the smallest @@ -80,17 +84,31 @@ TUPLE: bloom-filter ! seen any usage studies from the implementations that made this tradeoff to ! support it, and I haven't done my own, but we'll go with it anyway. ! -! TODO: check that error-rate is reasonable. : size-bloom-filter ( error-rate number-objects -- number-hashes number-bits ) - find-bloom-filter-sizes - max-array-capacity -1 2array - [ smallest-first ] - reduce - [ second ] [ first ] bi ; + '[ _ _ bits-to-satisfy-error-rate ] + '[ dup _ call 2array smaller-second ] + '[ n-hashes-range identity-configuration _ reduce ] + call + dup validate-sizes + first2 ; + +: validate-n-objects ( n-objects -- ) + 0 <= [ invalid-n-objects ] when ; + +: valid-error-rate-interval ( -- interval ) + 0 1 (a,b) ; + +: validate-error-rate ( error-rate -- ) + valid-error-rate-interval interval-contains? 
+ [ invalid-error-rate ] unless ; + +: validate-constraints ( error-rate n-objects -- ) + validate-n-objects validate-error-rate ; PRIVATE> : ( error-rate number-objects -- bloom-filter ) + [ validate-constraints ] 2keep [ size-bloom-filter ] keep 0 ! initially empty bloom-filter boa ;