Merge branch 'bloom-filters' of git://github.com/alec/factor
						commit
						c0f5dba77a
					
				| 
						 | 
				
			
			@ -0,0 +1 @@
 | 
			
		|||
Alec Berryman
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,38 @@
 | 
			
		|||
USING: help.markup help.syntax kernel math ;
 | 
			
		||||
IN: bloom-filters
 | 
			
		||||
 | 
			
		||||
! Help entry for the Bloom filter constructor: inputs, result and the three
! errors it can throw.
HELP: <bloom-filter>
{ $values { "error-rate" "The desired false positive rate.  A " { $link float } " between 0 and 1." }
          { "number-objects" "The expected number of objects in the set.  A positive " { $link integer } "." }
          { "bloom-filter" bloom-filter } }
{ $description "Creates an empty Bloom filter." }
{ $errors "Throws a " { $link capacity-error } " when unable to produce a filter meeting the given constraints.  Throws a " { $link invalid-error-rate } " or a " { $link invalid-n-objects } " when input is invalid." } ;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
! Help entry for insertion; the filter argument is mutated in place.
HELP: bloom-filter-insert
{ $values
    { "object" object }
    { "bloom-filter" bloom-filter }
}
{ $description "Records the item as a member of the filter." }
{ $side-effects "bloom-filter" } ;
 | 
			
		||||
 | 
			
		||||
! Help entry for the probabilistic membership test.
HELP: bloom-filter-member?
{ $values { "object" object }
          { "bloom-filter" bloom-filter }
          { "?" boolean } }
{ $description "Returns " { $link t } " if the object may be a member of the Bloom filter, " { $link f } " otherwise.  The false positive rate is configurable; there are no false negatives." } ;
 | 
			
		||||
 | 
			
		||||
! Help entry for the bloom-filter tuple class itself.
HELP: bloom-filter
{ $class-description
    "This is the class for Bloom filters. These provide constant-time insertion and probabilistic membership-testing operations, but do not actually store any elements."
} ;
 | 
			
		||||
 | 
			
		||||
! Overview article for the vocabulary; links the constructor and the two
! public operations.
ARTICLE: "bloom-filters" "Bloom filters"
"This is a library for Bloom filters, sets that provide a constant-time insertion operation and probabilistic membership tests, but do not actually store any elements."
$nl
"The accuracy of the membership test is configurable; a Bloom filter will never incorrectly report an item is not a member of the set, but may incorrectly report that an item is a member of the set."
$nl
"Bloom filters cannot be resized and do not support removal."
$nl
{ $subsection <bloom-filter> }
{ $subsection bloom-filter-insert }
{ $subsection bloom-filter-member? } ;
 | 
			
		||||
 | 
			
		||||
ABOUT: "bloom-filters"
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,81 @@
 | 
			
		|||
USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts
 | 
			
		||||
math random sequences tools.test ;
 | 
			
		||||
IN: bloom-filters.tests
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
[ { 200 5 } ] [ { 100 7 } { 200 5 } smaller-second ] unit-test
 | 
			
		||||
[ { 200 5 } ] [ { 200 5 } { 100 7 } smaller-second ] unit-test
 | 
			
		||||
 | 
			
		||||
! The sizing information was generated using the subroutine
 | 
			
		||||
! calculate_shortest_filter_length from
 | 
			
		||||
! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html.
 | 
			
		||||
 | 
			
		||||
! Test bloom-filter creation
 | 
			
		||||
[ 47965 ] [ 7 0.01 5000 bits-to-satisfy-error-rate ] unit-test
 | 
			
		||||
[ 7 47965 ] [ 0.01 5000 size-bloom-filter ] unit-test
 | 
			
		||||
[ 7 ] [ 0.01 5000 <bloom-filter> n-hashes>> ] unit-test
 | 
			
		||||
[ 47965 ] [ 0.01 5000 <bloom-filter> bits>> length ] unit-test
 | 
			
		||||
[ 5000 ] [ 0.01 5000 <bloom-filter> maximum-n-objects>> ] unit-test
 | 
			
		||||
[ 0 ] [ 0.01 5000 <bloom-filter> current-n-objects>> ] unit-test
 | 
			
		||||
 | 
			
		||||
! Should return the fewest hashes to satisfy the bits requested, not the most.
 | 
			
		||||
[ 32 ] [ 4 0.05 5 bits-to-satisfy-error-rate ] unit-test
 | 
			
		||||
[ 32 ] [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test
 | 
			
		||||
[ 4 32 ] [ 0.05 5 size-bloom-filter ] unit-test
 | 
			
		||||
 | 
			
		||||
! Constructor arguments that ask for a lot of bits; the tests that use them
! expect a capacity-error.
: oversized-filter-params ( -- error-rate n-objects ) 0.00000001 400000000000000 ;
 | 
			
		||||
[ oversized-filter-params size-bloom-filter ] [ capacity-error? ]  must-fail-with
 | 
			
		||||
[ oversized-filter-params <bloom-filter> ] [ capacity-error? ] must-fail-with
 | 
			
		||||
 | 
			
		||||
! Other error conditions.
 | 
			
		||||
[ 1.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
 | 
			
		||||
[ 20 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
 | 
			
		||||
[ 0.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
 | 
			
		||||
[ -2 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
 | 
			
		||||
[ 0.5 0 <bloom-filter> ] [ invalid-n-objects? ] must-fail-with
 | 
			
		||||
[ 0.5 -5 <bloom-filter> ] [ invalid-n-objects? ] must-fail-with
 | 
			
		||||
 | 
			
		||||
! Should not generate bignum hash codes.  Enhanced double hashing may generate a
 | 
			
		||||
! lot of hash codes, and it's better to do this earlier than later.
 | 
			
		||||
[ t ] [ 10000 iota [ hashcodes-from-object [ fixnum? ] both? ] map [ ] all? ] unit-test
 | 
			
		||||
 | 
			
		||||
[ ?{ t f t f t f } ] [ { 0 2 4 } 6 <bit-array> [ set-indices ] keep ] unit-test
 | 
			
		||||
 | 
			
		||||
! Fresh filter sized for 2000 objects at a 0.01 error rate; nothing inserted.
: empty-bloom-filter ( -- bloom-filter ) 0.01 2000 <bloom-filter> ;
 | 
			
		||||
 | 
			
		||||
[ 1 ] [ empty-bloom-filter dup increment-n-objects current-n-objects>> ] unit-test
 | 
			
		||||
 | 
			
		||||
! Builds a fresh filter, inserts the single object 1, and returns the filter.
: basic-insert-test-setup ( -- bloom-filter )
    empty-bloom-filter 1 over bloom-filter-insert ;
 | 
			
		||||
 | 
			
		||||
! Basic tests that insert does something
 | 
			
		||||
[ t ] [ basic-insert-test-setup bits>> [ ] any? ] unit-test
 | 
			
		||||
[ 1 ] [ basic-insert-test-setup current-n-objects>> ] unit-test
 | 
			
		||||
 | 
			
		||||
! Filter holding the integers 0..999, i.e. half of the 2000 objects that
! empty-bloom-filter is sized for.
: non-empty-bloom-filter ( -- bloom-filter )
    empty-bloom-filter
    1000 iota over [ bloom-filter-insert ] curry each ;
 | 
			
		||||
 | 
			
		||||
! Filter holding the integers 0..1999, exactly the 2000 objects that
! empty-bloom-filter is sized for.
: full-bloom-filter ( -- bloom-filter )
    empty-bloom-filter
    2000 iota over [ bloom-filter-insert ] curry each ;
 | 
			
		||||
 | 
			
		||||
! Should find what we put in there: every inserted integer must be reported
! as a member.
[ t ] [
    2000 iota full-bloom-filter
    [ bloom-filter-member? ] curry all?
] unit-test
 | 
			
		||||
 | 
			
		||||
! We shouldn't have more than 0.01 false-positive rate.
! Probes start at 2000 because full-bloom-filter holds 0..1999; a smaller
! offset could draw a genuine member and miscount it as a false positive.
[ t ] [ 1000 iota [ drop most-positive-fixnum random 2000 + ] map
        full-bloom-filter
        [ bloom-filter-member? ] curry map
        [ ] filter
        ! TODO: This should be 10, but the false positive rate is currently very
        ! high.  It shouldn't be much more than this.
        length 150 <= ] unit-test
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,158 @@
 | 
			
		|||
! Copyright (C) 2009 Alec Berryman.
 | 
			
		||||
! See http://factorcode.org/license.txt for BSD license.
 | 
			
		||||
USING: accessors arrays bit-arrays fry infix kernel layouts locals math
 | 
			
		||||
math.functions multiline sequences ;
 | 
			
		||||
IN: bloom-filters
 | 
			
		||||
 | 
			
		||||
FROM: math.ranges => [1,b] [0,b) ;
 | 
			
		||||
FROM: math.intervals => (a,b) interval-contains? ;
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 | 
			
		||||
TODO:
 | 
			
		||||
 | 
			
		||||
- The false positive rate is 10x what it should be, based on informal testing.
 | 
			
		||||
  Better object hashes or a better method of generating extra hash codes would
 | 
			
		||||
  help.  Another way is to increase the number of bits used.
 | 
			
		||||
 | 
			
		||||
  - Try something smarter than the bitwise complement for a second hash code.
 | 
			
		||||
 | 
			
		||||
  - http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html
 | 
			
		||||
    makes a case for http://murmurhash.googlepages.com/ instead of enhanced
 | 
			
		||||
    double-hashing.
 | 
			
		||||
 | 
			
		||||
  - Be sure to adjust the test that asserts the number of false positives isn't
 | 
			
		||||
    unreasonable.
 | 
			
		||||
 | 
			
		||||
- Could round bits up to next power of two and use wrap instead of mod.  This
 | 
			
		||||
  would cost a lot of bits on 32-bit platforms, though, and limit the bit-array
 | 
			
		||||
  to 8MB.
 | 
			
		||||
 | 
			
		||||
- Should allow user to specify the hash codes, either as inputs to enhanced
 | 
			
		||||
  double hashing or for direct use.
 | 
			
		||||
 | 
			
		||||
- Support for serialization.
 | 
			
		||||
 | 
			
		||||
- Wrappers for combining filters.
 | 
			
		||||
 | 
			
		||||
- Should we signal an error when inserting past the number of objects the filter
 | 
			
		||||
  is sized for?  The filter will continue to work, just not very well.
 | 
			
		||||
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
! State of one Bloom filter, in the slot order used by <bloom-filter>:
!   n-hashes           number of hash values computed per object
!   bits               the bit array that insertions set and lookups read
!   maximum-n-objects  the object count the filter was sized for
!   current-n-objects  incremented on every insert; the only writable slot
TUPLE: bloom-filter
    { n-hashes fixnum read-only }
    { bits bit-array read-only }
    { maximum-n-objects fixnum read-only }
    { current-n-objects fixnum } ;
 | 
			
		||||
 | 
			
		||||
! Thrown by validate-sizes when sizing found no configuration smaller than
! the identity configuration.
ERROR: capacity-error ;
 | 
			
		||||
! Thrown by validate-error-rate when the requested error rate is not inside
! the open interval (0,1).
ERROR: invalid-error-rate ;
 | 
			
		||||
! Thrown by validate-n-objects when the requested object count is zero or
! negative.
ERROR: invalid-n-objects ;
 | 
			
		||||
 | 
			
		||||
<PRIVATE
 | 
			
		||||
 | 
			
		||||
! infix doesn't like ^, so exponentiation is exposed under a plain name that
! [infix ... infix] expressions can call as pow(x, y).
: pow ( x y -- z ) ^ ; inline
 | 
			
		||||
 | 
			
		||||
! Number of bits a filter needs so that `objects` insertions using `hashes`
! hash values stay within the false-positive rate `error`:
!     -(objects * hashes) / log(1 - error^(1/hashes))
! rounded up to a whole number of bits.
:: bits-to-satisfy-error-rate ( hashes error objects -- size )
    objects hashes * neg
    1 error 1 hashes / ^ - log
    / ceiling >integer ;
 | 
			
		||||
 | 
			
		||||
! Hash counts tried while sizing a filter: 1 through 100 inclusive.
! 100 hashes ought to be enough for anybody.
: n-hashes-range ( -- range ) 100 [1,b] ;
 | 
			
		||||
 | 
			
		||||
! Starting value for the sizing reduction, laid out as { n-hashes n-bits }:
! zero hashes paired with max-array-capacity bits.
: identity-configuration ( -- 2seq ) max-array-capacity [ 0 ] dip 2array ;
 | 
			
		||||
 | 
			
		||||
! Of two pairs, return the one whose second element is smaller.  On a tie the
! first (deeper) pair is kept.
: smaller-second ( 2seq 2seq -- 2seq )
    2dup [ second ] bi@ <= [ drop ] [ nip ] if ;
 | 
			
		||||
 | 
			
		||||
! Takes a { n-hashes n-bits } pair.  If the number of hashes isn't positive,
! we haven't found anything smaller than the identity configuration, so no
! usable filter exists: throw capacity-error.
: validate-sizes ( 2seq -- )
    first 0 swap >= [ capacity-error ] when ;
 | 
			
		||||
 | 
			
		||||
! The consensus on the tradeoff between increasing the number of bits and
 | 
			
		||||
! increasing the number of hash functions seems to be "go for the smallest
 | 
			
		||||
! number of bits", probably because most implementations just generate one hash
 | 
			
		||||
! value and cheaply mangle it into the number of hashes they need.  I have not
 | 
			
		||||
! seen any usage studies from the implementations that made this tradeoff to
 | 
			
		||||
! support it, and I haven't done my own, but we'll go with it anyway.
 | 
			
		||||
!
 | 
			
		||||
: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
    ! Step quotation: turn a candidate hash count into { n-hashes n-bits } and
    ! keep whichever of it and the running best needs fewer bits.
    '[ dup _ _ bits-to-satisfy-error-rate 2array smaller-second ]
    [ n-hashes-range identity-configuration ] dip
    reduce
    ! Throws capacity-error if nothing beat the identity configuration.
    [ validate-sizes ] [ first2 ] bi ;
 | 
			
		||||
 | 
			
		||||
! Throws invalid-n-objects unless the requested object count is positive.
: validate-n-objects ( n-objects -- )
    0 swap >= [ invalid-n-objects ] when ;
 | 
			
		||||
 | 
			
		||||
! Acceptable error rates: the open interval (0,1), both endpoints excluded.
: valid-error-rate-interval ( -- interval ) 0 1 (a,b) ;
 | 
			
		||||
 | 
			
		||||
! Throws invalid-error-rate when the rate lies outside
! valid-error-rate-interval.
: validate-error-rate ( error-rate -- )
    valid-error-rate-interval interval-contains? not
    [ invalid-error-rate ] when ;
 | 
			
		||||
 | 
			
		||||
! Checks both constructor arguments.  The object count is checked first, so
! invalid-n-objects is the error thrown when both inputs are bad.
: validate-constraints ( error-rate n-objects -- )
    validate-n-objects
    validate-error-rate ;
 | 
			
		||||
 | 
			
		||||
PRIVATE>
 | 
			
		||||
 | 
			
		||||
! Creates an empty filter sized for number-objects insertions at the given
! false-positive rate.  Throws invalid-n-objects or invalid-error-rate on bad
! input and capacity-error when no acceptable size exists.
: <bloom-filter> ( error-rate number-objects -- bloom-filter )
    2dup validate-constraints
    ! Leaves n-hashes and the bit array, then restores number-objects so it
    ! becomes the maximum-n-objects slot.
    [ size-bloom-filter <bit-array> ] keep
    0 ! initially empty
    bloom-filter boa ;
 | 
			
		||||
 | 
			
		||||
<PRIVATE
 | 
			
		||||
 | 
			
		||||
! See "Bloom Filters in Probabilistic Verification" by Peter C. Dillinger and
 | 
			
		||||
! Panagiotis Manolios, section 5.2, "Enhanced Double Hashing":
 | 
			
		||||
! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
 | 
			
		||||
! The index-th hash value derived from two base hashes:
!     hash0 + index * hash1 + (index^3 - index) / 6
:: enhanced-double-hash ( index hash0 hash1 -- hash )
    hash0
    index hash1 * +
    index 3 ^ index - 6 / + ;
 | 
			
		||||
 | 
			
		||||
! Sequence of n hash values, one per index 0..n-1, all derived from the same
! pair of base hashes.
:: enhanced-double-hashes ( hash0 hash1 n -- seq )
    n [0,b) [ hash0 hash1 enhanced-double-hash ] map ;
 | 
			
		||||
 | 
			
		||||
! Derives a second hash code by XOR-ing the first with most-positive-fixnum.
! The mask is coerced with >fixnum to make sure it's a fixnum here, which
! speeds up double-hashing.
: hashcodes-from-hashcode ( n -- n n )
    most-positive-fixnum >fixnum over bitxor ;
 | 
			
		||||
 | 
			
		||||
! The two base hash codes for an object: the absolute value of its hashcode,
! plus the code hashcodes-from-hashcode derives from that.
! NOTE(review): abs of the most negative fixnum would presumably not fit in a
! fixnum -- confirm whether hashcode can ever return it.
: hashcodes-from-object ( obj -- n n ) hashcode abs hashcodes-from-hashcode ;
 | 
			
		||||
 | 
			
		||||
! Sets the bit at every listed index of bit-array to t, in place.
: set-indices ( indices bit-array -- )
    '[ t swap _ set-nth ] each ;
 | 
			
		||||
 | 
			
		||||
! Adds one to the filter's current-n-objects slot, in place.
: increment-n-objects ( bloom-filter -- )
    dup current-n-objects>> 1 + >>current-n-objects drop ;
 | 
			
		||||
 | 
			
		||||
! The filter's hash count and the length of its bit array.
: n-hashes-and-length ( bloom-filter -- n-hashes length )
    [ n-hashes>> ] keep bits>> length ;
 | 
			
		||||
 | 
			
		||||
! Bit positions that represent `value` in this filter: n-hashes enhanced
! double hashes of the value, each reduced modulo the bit-array length.
: relevant-indices ( value bloom-filter -- indices )
    [ hashcodes-from-object ] dip
    [ n-hashes>> enhanced-double-hashes ] [ bits>> length ] bi
    '[ _ mod ] map ;
 | 
			
		||||
 | 
			
		||||
PRIVATE>
 | 
			
		||||
 | 
			
		||||
! Records object as a member: increments the object count, then sets every
! bit position that relevant-indices yields for the object.
: bloom-filter-insert ( object bloom-filter -- )
    [ increment-n-objects ] keep
    [ relevant-indices ] keep
    bits>> set-indices ;
 | 
			
		||||
 | 
			
		||||
! True when every bit position that relevant-indices yields for object is
! set in the filter's bit array.
: bloom-filter-member? ( object bloom-filter -- ? )
    [ relevant-indices ] [ bits>> ] bi
    nths [ ] all? ;
 | 
			
		||||
		Loading…
	
		Reference in New Issue