bloom-filters: performance improvements.
parent
d47a306557
commit
2e40bffccf
|
@ -1,43 +1,45 @@
|
||||||
! Copyright (C) 2009 Alec Berryman.
|
! Copyright (C) 2009 Alec Berryman.
|
||||||
! See http://factorcode.org/license.txt for BSD license.
|
! See http://factorcode.org/license.txt for BSD license.
|
||||||
USING: accessors arrays bit-arrays fry infix kernel layouts locals math
|
USING: accessors arrays bit-arrays fry kernel layouts locals
|
||||||
math.functions multiline sequences ;
|
math math.functions math.order multiline sequences
|
||||||
IN: bloom-filters
|
sequences.private typed ;
|
||||||
|
|
||||||
FROM: math.ranges => [1,b] ;
|
FROM: math.ranges => [1,b] ;
|
||||||
FROM: math.intervals => (a,b) interval-contains? ;
|
|
||||||
FROM: sequences => change-nth ;
|
IN: bloom-filters
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
||||||
TODO:
|
TODO:
|
||||||
|
|
||||||
- The false positive rate is 10x what it should be, based on informal testing.
|
- The false positive rate is 10x what it should be, based on
|
||||||
Better object hashes or a better method of generating extra hash codes would
|
informal testing. Better object hashes or a better method of
|
||||||
help. Another way is to increase the number of bits used.
|
generating extra hash codes would help. Another way is to
|
||||||
|
increase the number of bits used.
|
||||||
|
|
||||||
- Try something smarter than the bitwise complement for a second hash code.
|
- Try something smarter than the bitwise complement for a
|
||||||
|
second hash code.
|
||||||
|
|
||||||
- http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
|
- http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
|
||||||
makes a case for http://murmurhash.googlepages.com/ instead of enhanced
|
makes a case for http://murmurhash.googlepages.com/ instead
|
||||||
double-hashing.
|
of enhanced double-hashing.
|
||||||
|
|
||||||
- Be sure to adjust the test that asserts the number of false positives isn't
|
- Be sure to adjust the test that asserts the number of false
|
||||||
unreasonable.
|
positives isn't unreasonable.
|
||||||
|
|
||||||
- Could round bits up to next power of two and use wrap instead of mod. This
|
- Could round bits up to next power of two and use wrap instead
|
||||||
would cost a lot of bits on 32-bit platforms, though, and limit the bit-array
|
of mod. This would cost a lot of bits on 32-bit platforms,
|
||||||
to 8MB.
|
though, and limit the bit-array to 8MB.
|
||||||
|
|
||||||
- Should allow user to specify the hash codes, either as inputs to enhanced
|
- Should allow user to specify the hash codes, either as inputs
|
||||||
double hashing or for direct use.
|
to enhanced double hashing or for direct use.
|
||||||
|
|
||||||
- Support for serialization.
|
- Support for serialization.
|
||||||
|
|
||||||
- Wrappers for combining filters.
|
- Wrappers for combining filters.
|
||||||
|
|
||||||
- Should we signal an error when inserting past the number of objects the filter
|
- Should we signal an error when inserting past the number of
|
||||||
is sized for? The filter will continue to work, just not very well.
|
objects the filter is sized for? The filter will continue to
|
||||||
|
work, just not very well.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -48,17 +50,13 @@ TUPLE: bloom-filter
|
||||||
{ current-n-objects fixnum } ;
|
{ current-n-objects fixnum } ;
|
||||||
|
|
||||||
ERROR: capacity-error ;
|
ERROR: capacity-error ;
|
||||||
ERROR: invalid-error-rate ;
|
ERROR: invalid-error-rate error-rate ;
|
||||||
ERROR: invalid-n-objects ;
|
ERROR: invalid-n-objects n-objects ;
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
||||||
! infix doesn't like ^
|
|
||||||
: pow ( x y -- z )
|
|
||||||
^ ; inline
|
|
||||||
|
|
||||||
:: bits-to-satisfy-error-rate ( hashes error objects -- size )
|
:: bits-to-satisfy-error-rate ( hashes error objects -- size )
|
||||||
[infix -(objects * hashes) / log(1 - pow(error, (1/hashes))) infix]
|
objects hashes * neg error hashes recip ^ 1 swap - log /
|
||||||
ceiling >integer ;
|
ceiling >integer ;
|
||||||
|
|
||||||
! 100 hashes ought to be enough for anybody.
|
! 100 hashes ought to be enough for anybody.
|
||||||
|
@ -72,17 +70,19 @@ ERROR: invalid-n-objects ;
|
||||||
: smaller-second ( 2seq 2seq -- 2seq )
|
: smaller-second ( 2seq 2seq -- 2seq )
|
||||||
[ [ second ] bi@ <= ] most ;
|
[ [ second ] bi@ <= ] most ;
|
||||||
|
|
||||||
! If the number of hashes isn't positive, we haven't found anything smaller than the
|
! If the number of hashes isn't positive, we haven't found
|
||||||
! identity configuration.
|
! anything smaller than the identity configuration.
|
||||||
: validate-sizes ( 2seq -- )
|
: validate-sizes ( 2seq -- )
|
||||||
first 0 <= [ capacity-error ] when ;
|
first 0 <= [ capacity-error ] when ;
|
||||||
|
|
||||||
! The consensus on the tradeoff between increasing the number of bits and
|
! The consensus on the tradeoff between increasing the number of
|
||||||
! increasing the number of hash functions seems to be "go for the smallest
|
! bits and increasing the number of hash functions seems to be
|
||||||
! number of bits", probably because most implementations just generate one hash
|
! "go for the smallest number of bits", probably because most
|
||||||
! value and cheaply mangle it into the number of hashes they need. I have not
|
! implementations just generate one hash value and cheaply
|
||||||
! seen any usage studies from the implementations that made this tradeoff to
|
! mangle it into the number of hashes they need. I have not
|
||||||
! support it, and I haven't done my own, but we'll go with it anyway.
|
! seen any usage studies from the implementations that made this
|
||||||
|
! tradeoff to support it, and I haven't done my own, but we'll
|
||||||
|
! go with it anyway.
|
||||||
!
|
!
|
||||||
: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
|
: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
|
||||||
[ n-hashes-range identity-configuration ] 2dip
|
[ n-hashes-range identity-configuration ] 2dip
|
||||||
|
@ -92,57 +92,50 @@ ERROR: invalid-n-objects ;
|
||||||
dup validate-sizes
|
dup validate-sizes
|
||||||
first2 ;
|
first2 ;
|
||||||
|
|
||||||
: validate-n-objects ( n-objects -- )
|
: check-n-objects ( n-objects -- n-objects )
|
||||||
0 <= [ invalid-n-objects ] when ;
|
dup 0 <= [ invalid-n-objects ] when ;
|
||||||
|
|
||||||
: valid-error-rate-interval ( -- interval )
|
: check-error-rate ( error-rate -- error-rate )
|
||||||
0 1 (a,b) ;
|
dup [ 0 after? ] [ 1 before? ] bi and
|
||||||
|
|
||||||
: validate-error-rate ( error-rate -- )
|
|
||||||
valid-error-rate-interval interval-contains?
|
|
||||||
[ invalid-error-rate ] unless ;
|
[ invalid-error-rate ] unless ;
|
||||||
|
|
||||||
: validate-constraints ( error-rate n-objects -- )
|
|
||||||
validate-n-objects validate-error-rate ;
|
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
: <bloom-filter> ( error-rate number-objects -- bloom-filter )
|
: <bloom-filter> ( error-rate number-objects -- bloom-filter )
|
||||||
[ validate-constraints ] 2keep
|
[ check-error-rate ] [ check-n-objects ] bi*
|
||||||
[ size-bloom-filter <bit-array> ] keep
|
[ size-bloom-filter <bit-array> ] keep
|
||||||
0 ! initially empty
|
0 ! initially empty
|
||||||
bloom-filter boa ;
|
bloom-filter boa ;
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
||||||
! See "Bloom Filters in Probabilistic Verification" by Peter C. Dillinger and
|
! See "Bloom Filters in Probabilistic Verification" by Peter C.
|
||||||
! Panagiotis Manolios, section 5.2, "Enhanced Double Hashing":
|
! Dillinger and Panagiotis Manolios, section 5.2, "Enhanced
|
||||||
|
! Double Hashing":
|
||||||
! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
|
! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
|
||||||
:: enhanced-double-hash ( index hash0 hash1 -- hash )
|
:: enhanced-double-hash ( index hash0 hash1 -- hash )
|
||||||
[infix hash0 + (index * hash1) + ((pow(index, 3) - index) / 6) infix] ;
|
! Enhanced double hashing: hash0 + index*hash1 + (index^3 - index)/6.
! index multiplies hash1 (not hash0), matching the infix expression this
! line replaces; (index^3 - index) is always divisible by 6, so /i is exact.
hash0 index hash1 * + index 3 ^ index - 6 /i + ;
|
||||||
|
|
||||||
: enhanced-double-hashes ( hash0 hash1 n -- seq )
|
: enhanced-double-hashes ( hash0 hash1 n -- seq )
|
||||||
iota
|
-rot '[ _ _ enhanced-double-hash ] { } map-integers ;
|
||||||
[ '[ _ _ enhanced-double-hash ] ] dip
|
|
||||||
swap map ;
|
|
||||||
|
|
||||||
! Make sure it's a fixnum here to speed up double-hashing.
|
! Make sure it's a fixnum here to speed up double-hashing.
|
||||||
: hashcodes-from-hashcode ( n -- n n )
|
: hashcodes-from-hashcode ( hash0 -- hash0 hash1 )
|
||||||
dup most-positive-fixnum bitxor ;
|
dup most-positive-fixnum bitxor ;
|
||||||
|
|
||||||
: hashcodes-from-object ( obj -- n n )
|
: hashcodes-from-object ( obj -- n n )
|
||||||
hashcode abs hashcodes-from-hashcode ;
|
hashcode abs hashcodes-from-hashcode ;
|
||||||
|
|
||||||
: set-indices ( indices bit-array -- )
|
TYPED: set-indices ( indices: array bit-array: bit-array -- )
|
||||||
[ [ drop t ] change-nth ] curry each ;
|
[ t ] 2dip [ set-nth-unsafe ] curry with each ; inline
|
||||||
|
|
||||||
: increment-n-objects ( bloom-filter -- )
|
TYPED: increment-n-objects ( bloom-filter: bloom-filter -- )
|
||||||
[ 1 + ] change-current-n-objects drop ;
|
[ 1 + ] change-current-n-objects drop ; inline
|
||||||
|
|
||||||
: n-hashes-and-length ( bloom-filter -- n-hashes length )
|
TYPED: n-hashes-and-length ( bloom-filter: bloom-filter -- n-hashes length )
|
||||||
[ n-hashes>> ] [ bits>> length ] bi ;
|
[ n-hashes>> ] [ bits>> length ] bi ;
|
||||||
|
|
||||||
: relevant-indices ( value bloom-filter -- indices )
|
TYPED: relevant-indices ( value bloom-filter: bloom-filter -- indices )
|
||||||
[ hashcodes-from-object ] [ n-hashes-and-length ] bi*
|
[ hashcodes-from-object ] [ n-hashes-and-length ] bi*
|
||||||
[ enhanced-double-hashes ] dip '[ _ mod ] map ;
|
[ enhanced-double-hashes ] dip '[ _ mod ] map ;
|
||||||
|
|
||||||
|
@ -155,5 +148,5 @@ PRIVATE>
|
||||||
tri ;
|
tri ;
|
||||||
|
|
||||||
: bloom-filter-member? ( object bloom-filter -- ? )
|
: bloom-filter-member? ( object bloom-filter -- ? )
|
||||||
[ relevant-indices ] keep
|
[ relevant-indices ] [ bits>> ] bi
|
||||||
bits>> nths [ ] all? ;
|
[ nth-unsafe ] curry all? ;
|
||||||
|
|
Loading…
Reference in New Issue