From 2e40bffccf49141ac2297f7e72d11a65ac60a7aa Mon Sep 17 00:00:00 2001
From: John Benediktsson <mrjbq7@gmail.com>
Date: Wed, 3 Apr 2013 15:11:08 -0700
Subject: [PATCH] bloom-filters: performance improvements.

---
 extra/bloom-filters/bloom-filters.factor | 117 +++++++++++------------
 1 file changed, 55 insertions(+), 62 deletions(-)

diff --git a/extra/bloom-filters/bloom-filters.factor b/extra/bloom-filters/bloom-filters.factor
index 022bac47c6..6f9aaf4f79 100644
--- a/extra/bloom-filters/bloom-filters.factor
+++ b/extra/bloom-filters/bloom-filters.factor
@@ -1,43 +1,45 @@
 ! Copyright (C) 2009 Alec Berryman.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays bit-arrays fry infix kernel layouts locals math
-math.functions multiline sequences ;
-IN: bloom-filters
-
+USING: accessors arrays bit-arrays fry kernel layouts locals
+math math.functions math.order multiline sequences
+sequences.private typed ;
 FROM: math.ranges => [1,b] ;
-FROM: math.intervals => (a,b) interval-contains? ;
-FROM: sequences => change-nth ;
+
+IN: bloom-filters
 
 /*
 
 TODO:
 
-- The false positive rate is 10x what it should be, based on informal testing.
-  Better object hashes or a better method of generating extra hash codes would
-  help. Another way is to increase the number of bits used.
+- The false positive rate is 10x what it should be, based on
+  informal testing. Better object hashes or a better method of
+  generating extra hash codes would help. Another way is to
+  increase the number of bits used.
 
-  - Try something smarter than the bitwise complement for a second hash code.
+  - Try something smarter than the bitwise complement for a
+    second hash code.
 
   - http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
-    makes a case for http://murmurhash.googlepages.com/ instead of enhanced
-    double-hashing.
+    makes a case for http://murmurhash.googlepages.com/ instead
+    of enhanced double-hashing.
 
-  - Be sure to adjust the test that asserts the number of false positives isn't
-    unreasonable.
+  - Be sure to adjust the test that asserts the number of false
+    positives isn't unreasonable.
 
-- Could round bits up to next power of two and use wrap instead of mod. This
-  would cost a lot of bits on 32-bit platforms, though, and limit the bit-array
-  to 8MB.
+- Could round bits up to next power of two and use wrap instead
+  of mod. This would cost a lot of bits on 32-bit platforms,
+  though, and limit the bit-array to 8MB.
 
-- Should allow user to specify the hash codes, either as inputs to enhanced
-  double hashing or for direct use.
+- Should allow user to specify the hash codes, either as inputs
+  to enhanced double hashing or for direct use.
 
 - Support for serialization.
 
 - Wrappers for combining filters.
 
-- Should we signal an error when inserting past the number of objects the filter
-  is sized for? The filter will continue to work, just not very well.
+- Should we signal an error when inserting past the number of
+  objects the filter is sized for? The filter will continue to
+  work, just not very well.
 
 */
 
@@ -48,17 +50,13 @@ TUPLE: bloom-filter
 { current-n-objects fixnum } ;
 
 ERROR: capacity-error ;
-ERROR: invalid-error-rate ;
-ERROR: invalid-n-objects ;
+ERROR: invalid-error-rate error-rate ;
+ERROR: invalid-n-objects n-objects ;
 
 <PRIVATE
 
-! infix doesn't like ^
-: pow ( x y -- z )
-    ^ ; inline
-
 :: bits-to-satisfy-error-rate ( hashes error objects -- size )
-    [infix -(hashes * objects) / log(1 - pow(error, (1/hashes))) [/infix]
+    objects hashes * neg 1 error hashes recip ^ - log /
     ceiling >integer ;
 
 ! 100 hashes ought to be enough for anybody.
@@ -72,17 +70,19 @@ ERROR: invalid-n-objects ;
 : smaller-second ( 2seq 2seq -- 2seq )
     [ [ second ] bi@ <= ] most ;
 
-! If the number of hashes isn't positive, we haven't found anything smaller than the
-! identity configuration.
+! If the number of hashes isn't positive, we haven't found
+! anything smaller than the identity configuration.
 : validate-sizes ( 2seq -- )
     first 0 <= [ capacity-error ] when ;
 
-! The consensus on the tradeoff between increasing the number of bits and
-! increasing the number of hash functions seems to be "go for the smallest
-! number of bits", probably because most implementations just generate one hash
-! value and cheaply mangle it into the number of hashes they need. I have not
-! seen any usage studies from the implementations that made this tradeoff to
-! support it, and I haven't done my own, but we'll go with it anyway.
+! The consensus on the tradeoff between increasing the number of
+! bits and increasing the number of hash functions seems to be
+! "go for the smallest number of bits", probably because most
+! implementations just generate one hash value and cheaply
+! mangle it into the number of hashes they need. I have not
+! seen any usage studies from the implementations that made this
+! tradeoff to support it, and I haven't done my own, but we'll
+! go with it anyway.
 !
 : size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
     [ n-hashes-range identity-configuration ] 2dip
@@ -92,57 +92,50 @@ ERROR: invalid-n-objects ;
     dup validate-sizes
     first2 ;
 
-: validate-n-objects ( n-objects -- )
-    0 <= [ invalid-n-objects ] when ;
+: check-n-objects ( n-objects -- n-objects )
+    dup 0 <= [ invalid-n-objects ] when ;
 
-: valid-error-rate-interval ( -- interval )
-    0 1 (a,b) ;
-
-: validate-error-rate ( error-rate -- )
-    valid-error-rate-interval interval-contains?
+: check-error-rate ( error-rate -- error-rate )
+    dup [ 0 after? ] [ 1 before? ] bi and
     [ invalid-error-rate ] unless ;
 
-: validate-constraints ( error-rate n-objects -- )
-    validate-n-objects validate-error-rate ;
-
 PRIVATE>
 
 : <bloom-filter> ( error-rate number-objects -- bloom-filter )
-    [ validate-constraints ] 2keep
+    [ check-error-rate ] [ check-n-objects ] bi*
     [ size-bloom-filter <bit-array> ] keep
     0 ! initially empty
     bloom-filter boa ;
 
 <PRIVATE
 
[...]
 
 : n-hashes-and-length ( bloom-filter -- n-hashes length )
     [ n-hashes>> ] [ bits>> length ] bi ;
 
-: relevant-indices ( value bloom-filter -- indices )
+TYPED: relevant-indices ( value bloom-filter: bloom-filter -- indices )
     [ hashcodes-from-object ] [ n-hashes-and-length ] bi*
     [ enhanced-double-hashes ] dip '[ _ mod ] map ;
@@ -155,5 +148,5 @@ PRIVATE>
     tri ;
 
 : bloom-filter-member? ( object bloom-filter -- ? )
-    [ relevant-indices ] keep
-    bits>> nths [ ] all? ;
+    [ relevant-indices ] [ bits>> ] bi
+    [ nth-unsafe ] curry all? ;
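
A quick usage sketch of the public words this patch touches. <bloom-filter> and bloom-filter-member? and their stack effects appear in the diff above; bloom-filter-insert ( object bloom-filter -- ) is assumed from the rest of the vocabulary (only the trailing "tri ;" of its definition shows up as hunk context), so treat this as illustrative rather than as part of the change:

    USING: bloom-filters kernel prettyprint ;

    ! 1% target false-positive rate, sized for 2000 objects.
    ! check-error-rate and check-n-objects validate these two
    ! inputs and pass them through unchanged to size-bloom-filter.
    0.01 2000 <bloom-filter>

    ! bloom-filter-insert is assumed from the vocabulary; its
    ! definition is not shown in this diff.
    "alice" over bloom-filter-insert
    "carol" over bloom-filter-insert

    "alice" over bloom-filter-member? .  ! t
    "bob" swap bloom-filter-member? .    ! f (almost always)

On the bloom-filter-member? rewrite: nth-unsafe is in bounds by construction, because relevant-indices reduces every hash with "_ mod" against the bit-array length before any lookup happens; the rewrite drops a redundant bounds check, and the intermediate sequence that nths used to build, from the membership fast path.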