bloom-filters: clean up creation
More readable, less allocation, signals invalid input.
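
A quick listener sketch of the new behavior (an illustration based on the tests below, not part of the commit itself):

USING: bloom-filters ;
0.05 2000 <bloom-filter>  ! well-formed input: an empty filter sized for the constraints
1.0 2000 <bloom-filter>   ! now throws invalid-error-rate instead of silently misbehaving
0.05 -5 <bloom-filter>    ! now throws invalid-n-objects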
parent 2cf079b2d3
commit ff04cf82fe
@@ -3,9 +3,11 @@ IN: bloom-filters
 HELP: <bloom-filter>
 { $values { "error-rate" "The desired false positive rate. A " { $link float } " between 0 and 1." }
-          { "number-objects" "The expected number of object in the set. An " { $link integer } "." }
+          { "number-objects" "The expected number of objects in the set. A positive " { $link integer } "." }
           { "bloom-filter" bloom-filter } }
-{ $description "Creates an empty Bloom filter." } ;
+{ $description "Creates an empty Bloom filter." }
+{ $errors "Throws a " { $link capacity-error } " when unable to produce a filter meeting the given constraints. Throws a " { $link invalid-error-rate } " or a " { $link invalid-n-objects } " when input is invalid." } ;
 
 
 HELP: bloom-filter-insert
 { $values { "object" object }
@@ -2,6 +2,10 @@ USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts
 math random sequences tools.test ;
 IN: bloom-filters.tests
 
+
+[ { 200 5 } ] [ { 100 7 } { 200 5 } smaller-second ] unit-test
+[ { 200 5 } ] [ { 200 5 } { 100 7 } smaller-second ] unit-test
+
 ! The sizing information was generated using the subroutine
 ! calculate_shortest_filter_length from
 ! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html.
@@ -19,13 +23,19 @@ IN: bloom-filters.tests
 [ 32 ] [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test
 [ 4 32 ] [ 0.05 5 size-bloom-filter ] unit-test
 
-! This is a lot of bits. On linux-x86-32, max-array-capacity is 134217727,
-! which is about 16MB (assuming I can do math), which is sort of pithy. I'm
-! not sure how to handle this case. Returning a smaller-than-requested
-! array is not the least surprising behavior, but is still surprising.
 [ 383718189 ] [ 7 0.01 40000000 bits-to-satisfy-error-rate ] unit-test
-! [ 7 383718189 ] [ 0.01 40000000 size-bloom-filter ] unit-test
-! [ 383718189 ] [ 0.01 40000000 <bloom-filter> bits>> length ] unit-test
+! This is a lot of bits.
+: oversized-filter-params ( -- error-rate n-objects )
+    0.00000001 400000000000000 ;
+[ oversized-filter-params size-bloom-filter ] [ capacity-error? ] must-fail-with
+[ oversized-filter-params <bloom-filter> ] [ capacity-error? ] must-fail-with
+
+! Other error conditions.
+[ 1.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ 20 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ 0.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ -2 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ 0.5 0 <bloom-filter> ] [ invalid-n-objects? ] must-fail-with
+[ 0.5 -5 <bloom-filter> ] [ invalid-n-objects? ] must-fail-with
 
 ! Should not generate bignum hash codes. Enhanced double hashing may generate a
 ! lot of hash codes, and it's better to do this earlier than later.
@@ -1,17 +1,16 @@
 ! Copyright (C) 2009 Alec Berryman.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays bit-arrays fry kernel layouts locals math math.functions
-math.ranges multiline sequences ;
+multiline sequences ;
 IN: bloom-filters
 
+FROM: math.ranges => [1,b] [0,b) ;
+FROM: math.intervals => (a,b) interval-contains? ;
+
 /*
 
 TODO:
 
-- How to signal an error when too many bits? It looks like a built-in for some
-  types of arrays, but bit-array just returns a zero-length array. What we do
-  now is completely broken: -1 hash codes? Really?
-
 - The false positive rate is 10x what it should be, based on informal testing.
   Better object hashes or a better method of generating extra hash codes would
   help. Another way is to increase the number of bits used.
@@ -25,7 +24,9 @@ TODO:
 - Be sure to adjust the test that asserts the number of false positives isn't
   unreasonable.
 
-- Should round bits up to next power of two, use wrap instead of mod.
+- Could round bits up to next power of two and use wrap instead of mod. This
+  would cost a lot of bits on 32-bit platforms, though, and limit the bit-array
+  to 8MB.
 
 - Should allow user to specify the hash codes, either as inputs to enhanced
   double hashing or for direct use.
@@ -47,6 +48,10 @@ TUPLE: bloom-filter
 { maximum-n-objects fixnum read-only }
 { current-n-objects fixnum } ;
 
+ERROR: capacity-error ;
+ERROR: invalid-error-rate ;
+ERROR: invalid-n-objects ;
+
 <PRIVATE
 
 ! number-bits = -(n-objects * n-hashes) / ln(1 - error-rate ^ 1/n-hashes)
@@ -56,22 +61,21 @@ TUPLE: bloom-filter
     /
     ceiling >integer ; ! should check that it's below max-array-capacity
 
-! TODO: this should be a constant
-!
-! TODO: after very little experimentation, I never see this increase after about
-! 20 or so. Maybe it should be smaller.
+! 100 hashes ought to be enough for anybody.
 : n-hashes-range ( -- range )
     100 [1,b] ;
 
-! Ends up with a list of arrays - { n-bits position }
-: find-bloom-filter-sizes ( error-rate number-objects -- seq )
-    [ bits-to-satisfy-error-rate ] 2curry
-    n-hashes-range swap
-    map
-    n-hashes-range zip ;
+! { n-hashes n-bits }
+: identity-configuration ( -- 2seq )
+    0 max-array-capacity 2array ;
 
-: smallest-first ( seq1 seq2 -- seq )
-    [ [ first ] bi@ <= ] most ;
+: smaller-second ( 2seq 2seq -- 2seq )
+    [ [ second ] bi@ <= ] most ;
+
+! If the number of hashes isn't positive, we haven't found anything smaller than the
+! identity configuration.
+: validate-sizes ( 2seq -- )
+    first 0 <= [ capacity-error ] when* ;
 
 ! The consensus on the tradeoff between increasing the number of bits and
 ! increasing the number of hash functions seems to be "go for the smallest
@@ -80,17 +84,31 @@ TUPLE: bloom-filter
 ! seen any usage studies from the implementations that made this tradeoff to
 ! support it, and I haven't done my own, but we'll go with it anyway.
 !
-! TODO: check that error-rate is reasonable.
 : size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
-    find-bloom-filter-sizes
-    max-array-capacity -1 2array
-    [ smallest-first ]
-    reduce
-    [ second ] [ first ] bi ;
+    '[ _ _ bits-to-satisfy-error-rate ]
+    '[ dup _ call 2array smaller-second ]
+    '[ n-hashes-range identity-configuration _ reduce ]
+    call
+    dup validate-sizes
+    first2 ;
+
+: validate-n-objects ( n-objects -- )
+    0 <= [ invalid-n-objects ] when ;
+
+: valid-error-rate-interval ( -- interval )
+    0 1 (a,b) ;
+
+: validate-error-rate ( error-rate -- )
+    valid-error-rate-interval interval-contains?
+    [ invalid-error-rate ] unless ;
+
+: validate-constraints ( error-rate n-objects -- )
+    validate-n-objects validate-error-rate ;
 
 PRIVATE>
 
 : <bloom-filter> ( error-rate number-objects -- bloom-filter )
+    [ validate-constraints ] 2keep
     [ size-bloom-filter <bit-array> ] keep
     0 ! initially empty
     bloom-filter boa ;
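
For reference, a worked check of the sizing formula above against the existing unit test (editor's arithmetic, not from the commit):

! number-bits = -(n-objects * n-hashes) / ln(1 - error-rate ^ 1/n-hashes)
! For n-hashes = 5, error-rate = 0.05, n-objects = 5:
!   0.05 ^ (1/5) ~ 0.5493
!   -(5 * 5) / ln(1 - 0.5493) = -25 / -0.797 ~ 31.4
! ceiling >integer rounds this up to 32, matching
! [ 32 ] [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test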