bloom-filters: compact, probabilistic membership testing

Alec Berryman 2009-05-07 22:45:02 -04:00
parent 5cba4f694c
commit 99f7babcc0
4 changed files with 269 additions and 0 deletions


@@ -0,0 +1 @@
Alec Berryman


@@ -0,0 +1,36 @@
USING: help.markup help.syntax kernel math ;
IN: bloom-filters
HELP: <bloom-filter>
{ $values { "error-rate" "The desired false positive rate. A " { $link float } " between 0 and 1." }
{ "number-objects" "The expected number of objects in the set. An " { $link integer } "." }
{ "bloom-filter" bloom-filter } }
{ $description "Creates an empty Bloom filter." } ;
HELP: bloom-filter-insert
{ $values { "object" object }
{ "bloom-filter" bloom-filter } }
{ $description "Records the item as a member of the filter." }
{ $side-effects "bloom-filter" } ;
HELP: bloom-filter-member?
{ $values { "object" object }
{ "bloom-filter" bloom-filter }
{ "?" boolean } }
{ $description "Returns " { $link t } " if the object may be a member of the Bloom filter, " { $link f } " otherwise. The false positive rate is configurable; there are no false negatives." } ;
HELP: bloom-filter
{ $class-description "This is the class for Bloom filters. These provide constant-time insertion and probabilistic membership-testing operations, but do not actually store any elements." } ;
ARTICLE: "bloom-filters" "Bloom filters"
"This is a library for Bloom filters, sets that provide a constant-time insertion operation and probabilistic membership tests, but do not actually store any elements."
$nl
"The accuracy of the membership test is configurable; a Bloom filter will never incorrectly report that an item is not a member of the set, but may incorrectly report that an item is a member of the set."
$nl
"Bloom filters cannot be resized and do not support removal."
$nl
{ $subsection <bloom-filter> }
{ $subsection bloom-filter-insert }
{ $subsection bloom-filter-member? } ;
ABOUT: "bloom-filters"
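For readers unfamiliar with the data structure, the behavior documented above can be sketched in Python. This is illustrative only, not this vocabulary's API; the index derivation below uses plain double hashing with an arbitrary mixing constant rather than the enhanced variant the implementation uses.

```python
import math

# Minimal Bloom-filter sketch; illustrative only, not this vocabulary's API.
class BloomSketch:
    def __init__(self, error_rate, n_objects, n_hashes=7):
        # Same sizing formula as bits-to-satisfy-error-rate.
        m = math.ceil(-(n_objects * n_hashes)
                      / math.log(1 - error_rate ** (1 / n_hashes)))
        self.n_hashes = n_hashes
        self.bits = [False] * m

    def _indices(self, obj):
        # Two base hashes, mangled into n_hashes probe positions.
        h0 = hash(obj) & 0x7FFFFFFF
        h1 = h0 ^ 0x5BD1E995  # arbitrary mixing constant
        return [(h0 + i * h1) % len(self.bits) for i in range(self.n_hashes)]

    def insert(self, obj):
        for i in self._indices(obj):
            self.bits[i] = True

    def member(self, obj):
        # False positives possible; false negatives impossible.
        return all(self.bits[i] for i in self._indices(obj))

bf = BloomSketch(0.01, 5000)
bf.insert("factor")
assert bf.member("factor")  # anything inserted is always found
```

No elements are stored: membership is decided entirely by the bit array, which is why removal and resizing are unsupported.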


@@ -0,0 +1,71 @@
USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts
math random sequences tools.test ;
IN: bloom-filters.tests
! The sizing information was generated using the subroutine
! calculate_shortest_filter_length from
! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html.
! Test bloom-filter creation
[ 47965 ] [ 7 0.01 5000 bits-to-satisfy-error-rate ] unit-test
[ 7 47965 ] [ 0.01 5000 size-bloom-filter ] unit-test
[ 7 ] [ 0.01 5000 <bloom-filter> n-hashes>> ] unit-test
[ 47965 ] [ 0.01 5000 <bloom-filter> bits>> length ] unit-test
[ 5000 ] [ 0.01 5000 <bloom-filter> maximum-n-objects>> ] unit-test
[ 0 ] [ 0.01 5000 <bloom-filter> current-n-objects>> ] unit-test
! Should return the fewest hashes to satisfy the bits requested, not the most.
[ 32 ] [ 4 0.05 5 bits-to-satisfy-error-rate ] unit-test
[ 32 ] [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test
[ 4 32 ] [ 0.05 5 size-bloom-filter ] unit-test
! This is a lot of bits. On linux-x86-32, max-array-capacity is 134217727,
! which is about 16MB (assuming I can do math), which is sort of pithy. I'm
! not sure how to handle this case. Returning a smaller-than-requested
! array is not the least surprising behavior, but it is still surprising.
[ 383718189 ] [ 7 0.01 40000000 bits-to-satisfy-error-rate ] unit-test
! [ 7 383718189 ] [ 0.01 40000000 size-bloom-filter ] unit-test
! [ 383718189 ] [ 0.01 40000000 <bloom-filter> bits>> length ] unit-test
! Should not generate bignum hash codes. Enhanced double hashing may generate a
! lot of hash codes, and it's better to do this earlier than later.
[ t ] [ 10000 iota [ hashcodes-from-object [ fixnum? ] both? ] map [ t = ] all? ] unit-test
[ ?{ t f t f t f } ] [ { 0 2 4 } 6 <bit-array> [ set-indices ] keep ] unit-test
: empty-bloom-filter ( -- bloom-filter )
0.01 2000 <bloom-filter> ;
[ 1 ] [ empty-bloom-filter [ increment-n-objects ] keep current-n-objects>> ] unit-test
: basic-insert-test-setup ( -- bloom-filter )
1 empty-bloom-filter [ bloom-filter-insert ] keep ;
! Basic tests that insert does something
[ t ] [ basic-insert-test-setup bits>> [ t = ] any? ] unit-test
[ 1 ] [ basic-insert-test-setup current-n-objects>> ] unit-test
: non-empty-bloom-filter ( -- bloom-filter )
1000 iota
empty-bloom-filter
[ [ bloom-filter-insert ] curry each ] keep ;
: full-bloom-filter ( -- bloom-filter )
2000 iota
empty-bloom-filter
[ [ bloom-filter-insert ] curry each ] keep ;
! Should find what we put in there.
[ t ] [ 2000 iota
full-bloom-filter
[ bloom-filter-member? ] curry map
[ t = ] all? ] unit-test
! We shouldn't have more than 0.01 false-positive rate.
[ t ] [ 1000 iota [ drop most-positive-fixnum random 1000 + ] map
full-bloom-filter
[ bloom-filter-member? ] curry map
[ t = ] filter
! TODO: This should be 10, but the false positive rate is currently very
! high. It shouldn't be much more than this.
length 150 <= ] unit-test


@@ -0,0 +1,161 @@
! Copyright (C) 2009 Alec Berryman.
! See http://factorcode.org/license.txt for BSD license.
USING: accessors arrays assocs bit-arrays kernel layouts locals math
math.functions math.ranges multiline sequences ;
IN: bloom-filters
/*
TODO:
- How to signal an error when there are too many bits? It looks like a built-in for some
types of arrays, but bit-array just returns a zero-length array. What we do
now is completely broken: -1 hash codes? Really?
- The false positive rate is 10x what it should be, based on informal testing.
Better object hashes or a better method of generating extra hash codes would
help. Another way is to increase the number of bits used.
- Try something smarter than the bitwise complement for a second hash code.
- http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
makes a case for http://murmurhash.googlepages.com/ instead of enhanced
double-hashing.
- Be sure to adjust the test that asserts the number of false positives isn't
unreasonable.
- Should round bits up to next power of two, use wrap instead of mod.
- Should allow user to specify the hash codes, either as inputs to enhanced
double hashing or for direct use.
- Support for serialization.
- Wrappers for combining filters.
- Should we signal an error when inserting past the number of objects the filter
is sized for? The filter will continue to work, just not very well.
- The other TODOs sprinkled through the code.
*/
TUPLE: bloom-filter
{ n-hashes fixnum read-only }
{ bits bit-array read-only }
{ maximum-n-objects fixnum read-only }
{ current-n-objects fixnum } ;
<PRIVATE
! number-bits = -(n-objects * n-hashes) / ln(1 - error-rate ^ (1/n-hashes))
:: bits-to-satisfy-error-rate ( n-hashes error-rate n-objects -- size )
n-objects n-hashes * -1 *
1 error-rate 1 n-hashes / ^ - log
/
ceiling >integer ; ! should check that it's below max-array-capacity
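The formula in the comment is easy to sanity-check outside Factor. This Python sketch (the helper name `bits_for` is illustrative) reproduces the sizes asserted in the test file:

```python
import math

def bits_for(n_hashes, error_rate, n_objects):
    # number-bits = -(n-objects * n-hashes) / ln(1 - error-rate ^ (1/n-hashes))
    return math.ceil(-(n_objects * n_hashes)
                     / math.log(1 - error_rate ** (1 / n_hashes)))

print(bits_for(7, 0.01, 5000))  # 47965, as asserted in the tests
print(bits_for(4, 0.05, 5))     # 32
```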
! TODO: this should be a constant
!
! TODO: after very little experimentation, I never see this increase after about
! 20 or so. Maybe it should be smaller.
: n-hashes-range ( -- range )
100 [1,b] ;
! Ends up with a sequence of { n-bits n-hashes } pairs.
: find-bloom-filter-sizes ( error-rate number-objects -- seq )
[ bits-to-satisfy-error-rate ] 2curry
n-hashes-range swap
map
n-hashes-range zip ;
:: smallest-first ( seq1 seq2 -- seq )
seq1 first seq2 first <= [ seq1 ] [ seq2 ] if ;
! The consensus on the tradeoff between increasing the number of bits and
! increasing the number of hash functions seems to be "go for the smallest
! number of bits", probably because most implementations just generate one hash
! value and cheaply mangle it into the number of hashes they need. I have not
! seen any usage studies from the implementations that made this tradeoff to
! support it, and I haven't done my own, but we'll go with it anyway.
!
! TODO: check that error-rate is reasonable.
: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
find-bloom-filter-sizes
max-array-capacity -1 2array
[ smallest-first ]
reduce
[ second ] [ first ] bi ;
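The search above amounts to minimizing the bit count over candidate hash counts, breaking ties toward fewer hashes. A Python equivalent (a sketch with illustrative names, not the library's API):

```python
import math

def size_bloom_filter(error_rate, n_objects, max_hashes=100):
    # Bits needed for a given hash count k (same formula as the comment above).
    def bits(k):
        return math.ceil(-(n_objects * k)
                         / math.log(1 - error_rate ** (1 / k)))
    # Prefer the smallest bit count; break ties toward fewer hashes.
    best_k = min(range(1, max_hashes + 1), key=lambda k: (bits(k), k))
    return best_k, bits(best_k)

print(size_bloom_filter(0.01, 5000))  # (7, 47965)
print(size_bloom_filter(0.05, 5))     # (4, 32)
```

The (0.05, 5) case exercises the tie-break: 4 and 5 hashes both need 32 bits, and the smaller hash count wins.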
PRIVATE>
: <bloom-filter> ( error-rate number-objects -- bloom-filter )
[ size-bloom-filter <bit-array> ] keep
0 ! initially empty
bloom-filter boa ;
<PRIVATE
! See "Bloom Filters in Probabilistic Verification" by Peter C. Dillinger and
! Panagiotis Manolios, section 5.2, "Enhanced Double Hashing":
! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
!
! This is taken from the definition at the top of page 12:
!
! F(i) = (A(s) + (i * B(s)) + ((i^3 - i) / 6)) mod m
!
! Where i is the hash number, A and B are hash functions for object s, and m is
! the length of the array.
:: enhanced-double-hash ( index hash0 hash1 array-size -- hash )
hash0
index hash1 *
+
index 3 ^ index -
6 /
+
array-size mod ;
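The enhanced double hashing formula is easy to check numerically. In this Python sketch the base hashes 17 and 42 and the table size 100 are arbitrary illustrative values:

```python
def enhanced_double_hash(i, hash0, hash1, array_size):
    # F(i) = (A(s) + i*B(s) + (i^3 - i)/6) mod m; i^3 - i is a product of
    # three consecutive integers, so it is always divisible by 6 and the
    # integer division is exact.
    return (hash0 + i * hash1 + (i ** 3 - i) // 6) % array_size

# Probe sequence for arbitrary base hashes 17 and 42 in a 100-bit table:
print([enhanced_double_hash(i, 17, 42, 100) for i in range(5)])
# [17, 59, 2, 47, 95]
```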
: enhanced-double-hashes ( n hash0 hash1 array-size -- seq )
[ enhanced-double-hash ] 3curry
[ [0,b) ] dip
map ;
! Stupid, should pick something good.
: hashcodes-from-hashcode ( n -- n n )
dup
! we could be running this through a lot of double hashing, make sure it's a
! fixnum here
most-positive-fixnum >fixnum bitxor ;
! TODO: This code calls abs because all the double-hashing stuff outputs array
! indices, and negative indices are no good. Are we throwing away bits? -1000
! b. actually prints -1111101000, which confuses me.
: hashcodes-from-object ( obj -- n n )
hashcode abs hashcodes-from-hashcode ;
: set-indices ( indices bit-array -- )
[ [ drop t ] change-nth ] curry each ;
: increment-n-objects ( bloom-filter -- )
dup current-n-objects>> 1 + >>current-n-objects drop ;
! This would be better as an each-relevant-hash that didn't cons.
: relevant-indices ( value bloom-filter -- indices )
[ n-hashes>> ] [ bits>> length ] bi ! value n array-size
swapd [ hashcodes-from-object ] dip ! n value1 value2 array-size
enhanced-double-hashes ;
PRIVATE>
: bloom-filter-insert ( object bloom-filter -- )
[ relevant-indices ]
[ bits>> set-indices ]
[ increment-n-objects ]
tri ;
: bloom-filter-member? ( value bloom-filter -- ? )
[ relevant-indices ]
[ bits>> [ nth ] curry map [ t = ] all? ]
bi ;