bloom-filters: compact, probabilistic membership testing
parent
5cba4f694c
commit
99f7babcc0
|
@ -0,0 +1 @@
|
||||||
|
Alec Berryman
|
|
@ -0,0 +1,36 @@
|
||||||
|
USING: help.markup help.syntax kernel math ;
|
||||||
|
IN: bloom-filters
|
||||||
|
|
||||||
|
HELP: <bloom-filter>
{ $values { "error-rate" "The desired false positive rate. A " { $link float } " between 0 and 1." }
          { "number-objects" "The expected number of objects in the set. An " { $link integer } "." }
          { "bloom-filter" bloom-filter } }
{ $description "Creates an empty Bloom filter." } ;
|
||||||
|
|
||||||
|
! Help entry for bloom-filter-insert; the filter argument is modified in place.
HELP: bloom-filter-insert
{ $values
    { "object" object }
    { "bloom-filter" bloom-filter }
}
{ $description "Records the item as a member of the filter." }
{ $side-effects "bloom-filter" } ;
|
||||||
|
|
||||||
|
HELP: bloom-filter-member?
{ $values { "object" object }
          { "bloom-filter" bloom-filter }
          { "?" boolean } }
{ $description "Returns " { $link t } " if the object may be a member of the Bloom filter, " { $link f } " otherwise. The false positive rate is configurable; there are no false negatives." } ;
|
||||||
|
|
||||||
|
! Help entry for the bloom-filter tuple class.
HELP: bloom-filter
{ $class-description
    "This is the class for Bloom filters. These provide constant-time insertion and probabilistic membership-testing operations, but do not actually store any elements."
} ;
|
||||||
|
|
||||||
|
ARTICLE: "bloom-filters" "Bloom filters"
"This is a library for Bloom filters, sets that provide a constant-time insertion operation and probabilistic membership tests, but do not actually store any elements."
$nl
"The accuracy of the membership test is configurable; a Bloom filter will never incorrectly report an item is not a member of the set, but may incorrectly report that an item is a member of the set."
$nl
"Bloom filters cannot be resized and do not support removal."
$nl
{ $subsection <bloom-filter> }
{ $subsection bloom-filter-insert }
{ $subsection bloom-filter-member? } ;
|
||||||
|
|
||||||
|
ABOUT: "bloom-filters"
|
|
@ -0,0 +1,71 @@
|
||||||
|
USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts
|
||||||
|
math random sequences tools.test ;
|
||||||
|
IN: bloom-filters.tests
|
||||||
|
|
||||||
|
! The sizing information was generated using the subroutine
|
||||||
|
! calculate_shortest_filter_length from
|
||||||
|
! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html.
|
||||||
|
|
||||||
|
! Test bloom-filter creation.
! For a 0.01 error rate and 5000 objects the expected sizing is 7 hashes and
! 47965 bits; the constructor must store those, remember the requested
! capacity, and start with an object count of zero.
[ 47965 ] [ 7 0.01 5000 bits-to-satisfy-error-rate ] unit-test
[ 7 47965 ] [ 0.01 5000 size-bloom-filter ] unit-test
[ 7 ] [ 0.01 5000 <bloom-filter> n-hashes>> ] unit-test
[ 47965 ] [ 0.01 5000 <bloom-filter> bits>> length ] unit-test
[ 5000 ] [ 0.01 5000 <bloom-filter> maximum-n-objects>> ] unit-test
[ 0 ] [ 0.01 5000 <bloom-filter> current-n-objects>> ] unit-test
|
||||||
|
|
||||||
|
! Should return the fewest hashes to satisfy the bits requested, not the most.
! Both 4 and 5 hashes need 32 bits here, so size-bloom-filter must pick 4.
[ 32 ] [ 4 0.05 5 bits-to-satisfy-error-rate ] unit-test
[ 32 ] [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test
[ 4 32 ] [ 0.05 5 size-bloom-filter ] unit-test
|
||||||
|
|
||||||
|
! This is a lot of bits. On linux-x86-32, max-array-capacity is 134217727,
! which is about 16MB (assuming I can do math), which is sort of paltry. I'm
! not sure how to handle this case. Returning a smaller-than-requested
! array is not the least surprising behavior, but is still surprising.
[ 383718189 ] [ 7 0.01 40000000 bits-to-satisfy-error-rate ] unit-test
! The two tests below are disabled: they need a bit array larger than the
! max-array-capacity quoted above.
! [ 7 383718189 ] [ 0.01 40000000 size-bloom-filter ] unit-test
! [ 383718189 ] [ 0.01 40000000 <bloom-filter> bits>> length ] unit-test
|
||||||
|
|
||||||
|
! Should not generate bignum hash codes. Enhanced double hashing may generate a
|
||||||
|
! lot of hash codes, and it's better to do this earlier than later.
|
||||||
|
[ t ] [ 10000 iota [ hashcodes-from-object [ fixnum? ] both? ] all? ] unit-test
|
||||||
|
|
||||||
|
[ ?{ t f t f t f } ] [ 6 <bit-array> { 0 2 4 } over set-indices ] unit-test
|
||||||
|
|
||||||
|
! A fresh filter sized for 2000 objects at a 0.01 error rate.
: empty-bloom-filter ( -- bloom-filter ) 0.01 2000 <bloom-filter> ;
|
||||||
|
|
||||||
|
[ 1 ] [ empty-bloom-filter dup increment-n-objects current-n-objects>> ] unit-test
|
||||||
|
|
||||||
|
! An otherwise empty filter into which the single object 1 has been inserted.
: basic-insert-test-setup ( -- bloom-filter )
    empty-bloom-filter 1 over bloom-filter-insert ;
|
||||||
|
|
||||||
|
! Basic tests that insert does something: at least one bit gets set and the
! object counter advances to 1.
[ t ] [ basic-insert-test-setup bits>> [ ] any? ] unit-test
[ 1 ] [ basic-insert-test-setup current-n-objects>> ] unit-test
|
||||||
|
|
||||||
|
! A filter sized for 2000 objects holding the integers 0..999.
: non-empty-bloom-filter ( -- bloom-filter )
    empty-bloom-filter
    1000 iota over [ bloom-filter-insert ] curry each ;
|
||||||
|
|
||||||
|
! A filter sized for 2000 objects holding the integers 0..1999, i.e. filled
! to the capacity it was created for.
: full-bloom-filter ( -- bloom-filter )
    empty-bloom-filter
    2000 iota over [ bloom-filter-insert ] curry each ;
|
||||||
|
|
||||||
|
! Should find what we put in there: every inserted key must test as a member.
[ t ] [
    2000 iota
    full-bloom-filter [ bloom-filter-member? ] curry all?
] unit-test
|
||||||
|
|
||||||
|
! We shouldn't have more than 0.01 false-positive rate.
!
! The random probes are offset by 2000 so that none of them can coincide with
! the keys 0..1999 that full-bloom-filter inserts; every hit counted below is
! therefore a genuine false positive.
[ t ] [ 1000 iota [ drop most-positive-fixnum random 2000 + ] map
        full-bloom-filter
        [ bloom-filter-member? ] curry map
        [ t = ] filter
        ! TODO: This should be 10, but the false positive rate is currently very
        ! high. It shouldn't be much more than this.
        length 150 <= ] unit-test
|
|
@ -0,0 +1,161 @@
|
||||||
|
! Copyright (C) 2009 Alec Berryman.
|
||||||
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
|
USING: accessors arrays assocs bit-arrays kernel layouts locals math
|
||||||
|
math.functions math.ranges multiline sequences ;
|
||||||
|
IN: bloom-filters
|
||||||
|
|
||||||
|
/*
|
||||||
|
|
||||||
|
TODO:
|
||||||
|
|
||||||
|
- How to signal an error when too many bits? It looks like a built-in for some
|
||||||
|
types of arrays, but bit-array just returns a zero-length array. What we do
|
||||||
|
now is completely broken: -1 hash codes? Really?
|
||||||
|
|
||||||
|
- The false positive rate is 10x what it should be, based on informal testing.
|
||||||
|
Better object hashes or a better method of generating extra hash codes would
|
||||||
|
help. Another way is to increase the number of bits used.
|
||||||
|
|
||||||
|
- Try something smarter than the bitwise complement for a second hash code.
|
||||||
|
|
||||||
|
- http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
|
||||||
|
makes a case for http://murmurhash.googlepages.com/ instead of enhanced
|
||||||
|
double-hashing.
|
||||||
|
|
||||||
|
- Be sure to adjust the test that asserts the number of false positives isn't
|
||||||
|
unreasonable.
|
||||||
|
|
||||||
|
- Should round bits up to next power of two, use wrap instead of mod.
|
||||||
|
|
||||||
|
- Should allow user to specify the hash codes, either as inputs to enhanced
|
||||||
|
double hashing or for direct use.
|
||||||
|
|
||||||
|
- Support for serialization.
|
||||||
|
|
||||||
|
- Wrappers for combining filters.
|
||||||
|
|
||||||
|
- Should we signal an error when inserting past the number of objects the filter
|
||||||
|
is sized for? The filter will continue to work, just not very well.
|
||||||
|
|
||||||
|
- The other TODOs sprinkled through the code.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
! A Bloom filter: a fixed bit array plus the bookkeeping needed to probe it.
TUPLE: bloom-filter
! number of array indices generated per object (see relevant-indices)
{ n-hashes fixnum read-only }
! the bit storage; its length is the modulus used for the hash indices
{ bits bit-array read-only }
! the number-objects value the filter was created with
{ maximum-n-objects fixnum read-only }
! starts at 0 and is bumped by one on every bloom-filter-insert
{ current-n-objects fixnum } ;
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
|
! Number of bits needed so that a filter probing n-hashes positions per object
! meets error-rate once n-objects objects have been inserted:
!
! number-bits = -(n-objects * n-hashes) / ln(1 - error-rate ^ (1 / n-hashes))
!
! The result is rounded up to a whole number of bits.
:: bits-to-satisfy-error-rate ( n-hashes error-rate n-objects -- size )
    n-objects n-hashes * neg                ! numerator
    1 error-rate n-hashes recip ^ - log     ! denominator
    / ceiling >integer ; ! should check that it's below max-array-capacity
|
||||||
|
|
||||||
|
! The candidate hash counts tried when sizing a filter: 1 through 100.
!
! TODO: this should be a constant
!
! TODO: after very little experimentation, I never see this increase after about
! 20 or so. Maybe it should be smaller.
: n-hashes-range ( -- range )
    1 100 [a,b] ;
|
||||||
|
|
||||||
|
! For every candidate hash count, computes the bits required to meet
! error-rate for number-objects objects. Ends up with a list of pairs,
! { n-bits n-hashes }, in increasing n-hashes order.
: find-bloom-filter-sizes ( error-rate number-objects -- seq )
    [ bits-to-satisfy-error-rate ] 2curry
    n-hashes-range [ swap map ] keep zip ;
|
||||||
|
|
||||||
|
! Of two sequences, returns the one whose first element is smaller; on a tie
! seq1 wins.
: smallest-first ( seq1 seq2 -- seq )
    2dup [ first ] bi@ <= [ drop ] [ nip ] if ;
|
||||||
|
|
||||||
|
! Thrown when no candidate configuration needs fewer than max-array-capacity
! bits, i.e. the requested filter cannot be stored in a bit-array.
ERROR: capacity-error ;

! The consensus on the tradeoff between increasing the number of bits and
! increasing the number of hash functions seems to be "go for the smallest
! number of bits", probably because most implementations just generate one hash
! value and cheaply mangle it into the number of hashes they need. I have not
! seen any usage studies from the implementations that made this tradeoff to
! support it, and I haven't done my own, but we'll go with it anyway.
!
! Picks the { n-bits n-hashes } candidate with the fewest bits; because ties
! keep the earlier candidate, the fewest hashes win among equal sizes.
!
! The reduction starts from the sentinel { max-array-capacity -1 }. If the
! sentinel survives, every candidate was at least max-array-capacity bits, so
! capacity-error is thrown instead of handing back -1 hashes.
!
! TODO: check that error-rate is reasonable.
: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
    find-bloom-filter-sizes
    max-array-capacity -1 2array
    [ smallest-first ] reduce
    dup second 0 < [ capacity-error ] when
    [ second ] [ first ] bi ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
|
! Creates an empty filter: sizes the bit array for the requested error rate
! and object count, records number-objects as the maximum, and starts the
! current object count at 0.
:: <bloom-filter> ( error-rate number-objects -- bloom-filter )
    error-rate number-objects size-bloom-filter <bit-array>
    number-objects
    0 ! initially empty
    bloom-filter boa ;
|
||||||
|
|
||||||
|
<PRIVATE
|
||||||
|
|
||||||
|
! See "Bloom Filters in Probabilistic Verification" by Peter C. Dillinger and
! Panagiotis Manolios, section 5.2, "Enhanced Double Hashing":
! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
!
! This is taken from the definition at the top of page 12:
!
! F(i) = (A(s) + (i * B(s)) + ((i^3 - i) / 6)) mod m
!
! Where i is the hash number, A and B are hash functions for object s, and m is
! the length of the array.
:: enhanced-double-hash ( index hash0 hash1 array-size -- hash )
    index 3 ^ index - 6 /       ! (i^3 - i) / 6
    index hash1 * +             ! + i * B(s)
    hash0 +                     ! + A(s)
    array-size mod ;
|
||||||
|
|
||||||
|
! Applies enhanced-double-hash for each index 0..n-1, giving n array indices
! derived from the two base hash codes.
:: enhanced-double-hashes ( n hash0 hash1 array-size -- seq )
    n [0,b) [ hash0 hash1 array-size enhanced-double-hash ] map ;
|
||||||
|
|
||||||
|
! Stupid, should pick something good.
!
! Returns the hash code itself plus a second code made by xor-ing it with
! most-positive-fixnum, which flips every bit below the sign bit. A
! non-negative fixnum input therefore yields two non-negative fixnums.
: hashcodes-from-hashcode ( n -- n n )
    ! we could be running this through a lot of double hashing, so both
    ! codes should stay fixnums here
    dup most-positive-fixnum bitxor ;
|
||||||
|
|
||||||
|
! Produces the two base hash codes for obj that enhanced double hashing is
! seeded with.
!
! TODO: This code calls abs because all the double-hashing stuff outputs array
! indices, and negative values are no good as indices. Are we throwing away
! bits? -1000 b. actually prints -1111101000 (a minus sign followed by the
! magnitude), which confuses me.
: hashcodes-from-object ( obj -- n n )
    hashcode abs hashcodes-from-hashcode ;
|
||||||
|
|
||||||
|
! Sets the bit at every position listed in indices to t, modifying bit-array
! in place.
: set-indices ( indices bit-array -- )
    [ [ t ] 2dip set-nth ] curry each ;
|
||||||
|
|
||||||
|
! Adds one to the filter's current-n-objects slot.
: increment-n-objects ( bloom-filter -- )
    [ 1 + ] change-current-n-objects drop ;
|
||||||
|
|
||||||
|
! Computes the bit positions that value maps to in bloom-filter: n-hashes
! indices, each taken modulo the length of the bit array.
!
! This would be better as an each-relevant-hash that didn't cons.
:: relevant-indices ( value bloom-filter -- indices )
    bloom-filter n-hashes>>             ! n
    value hashcodes-from-object         ! hash0 hash1
    bloom-filter bits>> length          ! array-size
    enhanced-double-hashes ;
|
||||||
|
|
||||||
|
PRIVATE>
|
||||||
|
|
||||||
|
! Records object in the filter: sets every bit it hashes to, then bumps the
! filter's object counter.
: bloom-filter-insert ( object bloom-filter -- )
    [ relevant-indices ] keep
    [ bits>> set-indices ] [ increment-n-objects ] bi ;
|
||||||
|
|
||||||
|
! Tests whether object may have been inserted: t only if every bit it hashes
! to is set, f as soon as one of them is clear.
!
! The input is named "object" to agree with bloom-filter-insert and with the
! $values of this word's help entry.
: bloom-filter-member? ( object bloom-filter -- ? )
    [ relevant-indices ] keep
    bits>> [ nth ] curry all? ;
|
Loading…
Reference in New Issue