bloom-filters: clean up indices code
Extricating mod from hash creation makes it a little nicer.
db4
							parent
							
								
									8c26783455
								
							
						
					
					
						commit
						713f0db0a2
					
				| 
						 | 
					@ -46,7 +46,7 @@ IN: bloom-filters.tests
 | 
				
			||||||
: empty-bloom-filter ( -- bloom-filter )
 | 
					: empty-bloom-filter ( -- bloom-filter )
 | 
				
			||||||
    0.01 2000 <bloom-filter> ;
 | 
					    0.01 2000 <bloom-filter> ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[ 1 ] [ empty-bloom-filter increment-n-objects current-n-objects>> ] unit-test
 | 
					[ 1 ] [ empty-bloom-filter dup increment-n-objects current-n-objects>> ] unit-test
 | 
				
			||||||
 | 
					
 | 
				
			||||||
: basic-insert-test-setup ( -- bloom-filter )
 | 
					: basic-insert-test-setup ( -- bloom-filter )
 | 
				
			||||||
    1 empty-bloom-filter [ bloom-filter-insert ] keep ;
 | 
					    1 empty-bloom-filter [ bloom-filter-insert ] keep ;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -38,8 +38,6 @@ TODO:
 | 
				
			||||||
- Should we signal an error when inserting past the number of objects the filter
 | 
					- Should we signal an error when inserting past the number of objects the filter
 | 
				
			||||||
  is sized for?  The filter will continue to work, just not very well.
 | 
					  is sized for?  The filter will continue to work, just not very well.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
- The other TODOs sprinkled through the code.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
*/
 | 
					*/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
TUPLE: bloom-filter
 | 
					TUPLE: bloom-filter
 | 
				
			||||||
| 
						 | 
					@ -76,7 +74,7 @@ ERROR: invalid-n-objects ;
 | 
				
			||||||
! If the number of hashes isn't positive, we haven't found anything smaller than the
 | 
					! If the number of hashes isn't positive, we haven't found anything smaller than the
 | 
				
			||||||
! identity configuration.
 | 
					! identity configuration.
 | 
				
			||||||
: validate-sizes ( 2seq -- )
 | 
					: validate-sizes ( 2seq -- )
 | 
				
			||||||
    first 0 <= [ capacity-error ] when* ;
 | 
					    first 0 <= [ capacity-error ] when ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
! The consensus on the tradeoff between increasing the number of bits and
 | 
					! The consensus on the tradeoff between increasing the number of bits and
 | 
				
			||||||
! increasing the number of hash functions seems to be "go for the smallest
 | 
					! increasing the number of hash functions seems to be "go for the smallest
 | 
				
			||||||
| 
						 | 
					@ -119,45 +117,41 @@ PRIVATE>
 | 
				
			||||||
! See "Bloom Filters in Probabilistic Verification" by Peter C. Dillinger and
 | 
					! See "Bloom Filters in Probabilistic Verification" by Peter C. Dillinger and
 | 
				
			||||||
! Panagiotis Manolios, section 5.2, "Enhanced Double Hashing":
 | 
					! Panagiotis Manolios, section 5.2, "Enhanced Double Hashing":
 | 
				
			||||||
! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
 | 
					! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
 | 
				
			||||||
:: enhanced-double-hash ( index hash0 hash1 array-size -- hash )
 | 
					:: enhanced-double-hash ( index hash0 hash1 -- hash )
 | 
				
			||||||
    [infix hash0 + (index * hash1) + ((pow(index, 3) - index) / 6) infix]
 | 
					    [infix hash0 + (index * hash1) + ((pow(index, 3) - index) / 6) infix] ;
 | 
				
			||||||
    array-size mod ;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
: enhanced-double-hashes ( n hash0 hash1 array-size -- seq )
 | 
					: enhanced-double-hashes ( hash0 hash1 n -- seq )
 | 
				
			||||||
    '[ _ _ _ enhanced-double-hash ] [ [0,b) ] dip map ;
 | 
					    [0,b)
 | 
				
			||||||
 | 
					    [ '[ _ _ enhanced-double-hash ] ] dip
 | 
				
			||||||
 | 
					    swap map ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
! Stupid, should pick something good.
 | 
					! Make sure it's a fixnum here to speed up double-hashing.
 | 
				
			||||||
: hashcodes-from-hashcode ( n -- n n )
 | 
					: hashcodes-from-hashcode ( n -- n n )
 | 
				
			||||||
    dup
 | 
					    dup most-positive-fixnum >fixnum bitxor ;
 | 
				
			||||||
    ! we could be running this through a lot of double hashing, make sure it's a
 | 
					 | 
				
			||||||
    ! fixnum here
 | 
					 | 
				
			||||||
    most-positive-fixnum >fixnum bitxor ;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
! TODO: This code calls abs because all the double-hashing stuff outputs array
 | 
					 | 
				
			||||||
! indices and those aren't good negative.  Are we throwing away bits?  -1000
 | 
					 | 
				
			||||||
! b. actually prints -1111101000, which confuses me.
 | 
					 | 
				
			||||||
: hashcodes-from-object ( obj -- n n )
 | 
					: hashcodes-from-object ( obj -- n n )
 | 
				
			||||||
    hashcode abs hashcodes-from-hashcode ;
 | 
					    hashcode abs hashcodes-from-hashcode ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
: set-indices ( indices bit-array -- )
 | 
					: set-indices ( indices bit-array -- )
 | 
				
			||||||
    [ [ drop t ] change-nth ] curry each ;
 | 
					    [ [ drop t ] change-nth ] curry each ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
: increment-n-objects ( bloom-filter -- bloom-filter )
 | 
					: increment-n-objects ( bloom-filter -- )
 | 
				
			||||||
    [ 1 + ] change-current-n-objects ;
 | 
					    [ 1 + ] change-current-n-objects drop ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
: n-hashes-and-bits ( bloom-filter -- n-hashes n-bits )
 | 
					: n-hashes-and-length ( bloom-filter -- n-hashes length )
 | 
				
			||||||
    [ n-hashes>> ] [ bits>> length ] bi ;
 | 
					    [ n-hashes>> ] [ bits>> length ] bi ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
: relevant-indices ( value bloom-filter -- indices )
 | 
					: relevant-indices ( value bloom-filter -- indices )
 | 
				
			||||||
    n-hashes-and-bits
 | 
					    [ hashcodes-from-object ] [ n-hashes-and-length ] bi*
 | 
				
			||||||
    [ swap hashcodes-from-object ] dip
 | 
					    [ enhanced-double-hashes ] dip '[ _ mod ] map ;
 | 
				
			||||||
    enhanced-double-hashes ;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
PRIVATE>
 | 
					PRIVATE>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
: bloom-filter-insert ( object bloom-filter -- )
 | 
					: bloom-filter-insert ( object bloom-filter -- )
 | 
				
			||||||
    increment-n-objects
 | 
					    [ increment-n-objects ]
 | 
				
			||||||
    [ relevant-indices ] [ bits>> set-indices ] bi ;
 | 
					    [ relevant-indices ]
 | 
				
			||||||
 | 
					    [ bits>> set-indices ]
 | 
				
			||||||
 | 
					    tri ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
: bloom-filter-member? ( object bloom-filter -- ? )
 | 
					: bloom-filter-member? ( object bloom-filter -- ? )
 | 
				
			||||||
    [ relevant-indices ] keep
 | 
					    [ relevant-indices ] keep
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue