bloom-filters: performance improvements.

2013-04-03 15:11:08 -07:00 · 2013-04-03 15:11:08 -07:00 · 2e40bffccf
parent d47a306557
commit 2e40bffccf
1 changed file with 55 additions and 62 deletions
--- a/extra/bloom-filters/bloom-filters.factor
+++ b/extra/bloom-filters/bloom-filters.factor
@@ -1,43 +1,45 @@
 ! Copyright (C) 2009 Alec Berryman.
 ! See http://factorcode.org/license.txt for BSD license.
-USING: accessors arrays bit-arrays fry infix kernel layouts locals math
-math.functions multiline sequences ;
-IN: bloom-filters
-
+USING: accessors arrays bit-arrays fry kernel layouts locals
+math math.functions math.order multiline sequences
+sequences.private typed ;
 FROM: math.ranges => [1,b] ;
-FROM: math.intervals => (a,b) interval-contains? ;
-FROM: sequences => change-nth ;
+
+IN: bloom-filters

 /*

 TODO:

-- The false positive rate is 10x what it should be, based on informal testing.
-  Better object hashes or a better method of generating extra hash codes would
-  help.  Another way is to increase the number of bits used.
+- The false positive rate is 10x what it should be, based on
+  informal testing.  Better object hashes or a better method of
+  generating extra hash codes would help.  Another way is to
+  increase the number of bits used.

-  - Try something smarter than the bitwise complement for a second hash code.
+  - Try something smarter than the bitwise complement for a
+    second hash code.

  - http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
-    makes a case for http://murmurhash.googlepages.com/ instead of enhanced
-    double-hashing.
+    makes a case for http://murmurhash.googlepages.com/ instead
+    of enhanced double-hashing.

-  - Be sure to adjust the test that asserts the number of false positives isn't
-    unreasonable.
+  - Be sure to adjust the test that asserts the number of false
+    positives isn't unreasonable.

-- Could round bits up to next power of two and use wrap instead of mod.  This
-  would cost a lot of bits on 32-bit platforms, though, and limit the bit-array
-  to 8MB.
+- Could round bits up to next power of two and use wrap instead
+  of mod.  This would cost a lot of bits on 32-bit platforms,
+  though, and limit the bit-array to 8MB.

-- Should allow user to specify the hash codes, either as inputs to enhanced
-  double hashing or for direct use.
+- Should allow user to specify the hash codes, either as inputs
+  to enhanced double hashing or for direct use.

 - Support for serialization.

 - Wrappers for combining filters.

-- Should we signal an error when inserting past the number of objects the filter
-  is sized for?  The filter will continue to work, just not very well.
+- Should we signal an error when inserting past the number of
+  objects the filter is sized for?  The filter will continue to
+  work, just not very well.

 */

@@ -48,17 +50,13 @@ TUPLE: bloom-filter
 { current-n-objects fixnum } ;

 ERROR: capacity-error ;
-ERROR: invalid-error-rate ;
-ERROR: invalid-n-objects ;
+ERROR: invalid-error-rate error-rate ;
+ERROR: invalid-n-objects n-objects ;

 <PRIVATE

-! infix doesn't like ^
-: pow ( x y -- z )
-    ^ ; inline
-
 :: bits-to-satisfy-error-rate ( hashes error objects -- size )
-    [infix -(objects * hashes) / log(1 - pow(error, (1/hashes))) infix]
+    objects hashes * neg error hashes recip ^ 1 swap - log /
    ceiling >integer ;

 ! 100 hashes ought to be enough for anybody.
@@ -72,17 +70,19 @@ ERROR: invalid-n-objects ;
 : smaller-second ( 2seq 2seq -- 2seq )
    [ [ second ] bi@ <= ] most ;

-! If the number of hashes isn't positive, we haven't found anything smaller than the
-! identity configuration.
+! If the number of hashes isn't positive, we haven't found
+! anything smaller than the identity configuration.
 : validate-sizes ( 2seq -- )
    first 0 <= [ capacity-error ] when ;

-! The consensus on the tradeoff between increasing the number of bits and
-! increasing the number of hash functions seems to be "go for the smallest
-! number of bits", probably because most implementations just generate one hash
-! value and cheaply mangle it into the number of hashes they need.  I have not
-! seen any usage studies from the implementations that made this tradeoff to
-! support it, and I haven't done my own, but we'll go with it anyway.
+! The consensus on the tradeoff between increasing the number of
+! bits and increasing the number of hash functions seems to be
+! "go for the smallest number of bits", probably because most
+! implementations just generate one hash value and cheaply
+! mangle it into the number of hashes they need.  I have not
+! seen any usage studies from the implementations that made this
+! tradeoff to support it, and I haven't done my own, but we'll
+! go with it anyway.
 !
 : size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
    [ n-hashes-range identity-configuration ] 2dip
@@ -92,57 +92,50 @@ ERROR: invalid-n-objects ;
    dup validate-sizes
    first2 ;

-: validate-n-objects ( n-objects -- )
-    0 <= [ invalid-n-objects ] when ;
+: check-n-objects ( n-objects -- n-objects )
+    dup 0 <= [ invalid-n-objects ] when ;

-: valid-error-rate-interval ( -- interval )
-    0 1 (a,b) ;
-
-: validate-error-rate ( error-rate -- )
-    valid-error-rate-interval interval-contains?
+: check-error-rate ( error-rate -- error-rate )
+    dup [ 0 after? ] [ 1 before? ] bi and
    [ invalid-error-rate ] unless ;

-: validate-constraints ( error-rate n-objects -- )
-    validate-n-objects validate-error-rate ;
-
 PRIVATE>

 : <bloom-filter> ( error-rate number-objects -- bloom-filter )
-    [ validate-constraints ] 2keep
+    [ check-error-rate ] [ check-n-objects ] bi*
    [ size-bloom-filter <bit-array> ] keep
    0 ! initially empty
    bloom-filter boa ;

 <PRIVATE

-! See "Bloom Filters in Probabilistic Verification" by Peter C. Dillinger and
-! Panagiotis Manolios, section 5.2, "Enhanced Double Hashing":
+! See "Bloom Filters in Probabilistic Verification" by Peter C.
+! Dillinger and Panagiotis Manolios, section 5.2, "Enhanced
+! Double Hashing": hash0 + index*hash1 + (index^3 - index)/6.
 ! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
 :: enhanced-double-hash ( index hash0 hash1 -- hash )
-    [infix hash0 + (index * hash1) + ((pow(index, 3) - index) / 6) infix] ;
+    hash0 index hash1 * + index 3 ^ index - 6 /i + ;

 : enhanced-double-hashes ( hash0 hash1 n -- seq )
-    iota
-    [ '[ _ _ enhanced-double-hash ] ] dip
-    swap map ;
+    -rot '[ _ _ enhanced-double-hash ] { } map-integers ;

 ! Make sure it's a fixnum here to speed up double-hashing.
-: hashcodes-from-hashcode ( n -- n n )
+: hashcodes-from-hashcode ( hash0 -- hash0 hash1 )
    dup most-positive-fixnum bitxor ;

 : hashcodes-from-object ( obj -- n n )
    hashcode abs hashcodes-from-hashcode ;

-: set-indices ( indices bit-array -- )
-    [ [ drop t ] change-nth ] curry each ;
+TYPED: set-indices ( indices: array bit-array: bit-array -- )
+    [ t ] 2dip [ set-nth-unsafe ] curry with each ; inline

-: increment-n-objects ( bloom-filter -- )
-    [ 1 + ] change-current-n-objects drop ;
+TYPED: increment-n-objects ( bloom-filter: bloom-filter -- )
+    [ 1 + ] change-current-n-objects drop ; inline

-: n-hashes-and-length ( bloom-filter -- n-hashes length )
+TYPED: n-hashes-and-length ( bloom-filter: bloom-filter -- n-hashes length )
    [ n-hashes>> ] [ bits>> length ] bi ;

-: relevant-indices ( value bloom-filter -- indices )
+TYPED: relevant-indices ( value bloom-filter: bloom-filter -- indices )
    [ hashcodes-from-object ] [ n-hashes-and-length ] bi*
    [ enhanced-double-hashes ] dip '[ _ mod ] map ;

@@ -155,5 +148,5 @@ PRIVATE>
    tri ;

 : bloom-filter-member? ( object bloom-filter -- ? )
-    [ relevant-indices ] keep
-    bits>> nths [ ] all? ;
+    [ relevant-indices ] [ bits>> ] bi
+    [ nth-unsafe ] curry all? ;