bloom-filters: cleanup interface a little bit.
parent
55e0be5c7a
commit
66693a786f
|
@ -2,7 +2,6 @@ USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts
|
|||
math random sequences tools.test ;
|
||||
IN: bloom-filters.tests
|
||||
|
||||
|
||||
[ { 200 5 } ] [ { 100 7 } { 200 5 } smaller-second ] unit-test
|
||||
[ { 200 5 } ] [ { 200 5 } { 100 7 } smaller-second ] unit-test
|
||||
|
||||
|
@ -13,10 +12,10 @@ IN: bloom-filters.tests
|
|||
! Test bloom-filter creation
|
||||
[ 47965 ] [ 7 0.01 5000 bits-to-satisfy-error-rate ] unit-test
|
||||
[ 7 47965 ] [ 0.01 5000 size-bloom-filter ] unit-test
|
||||
[ 7 ] [ 0.01 5000 <bloom-filter> n-hashes>> ] unit-test
|
||||
[ 7 ] [ 0.01 5000 <bloom-filter> #hashes>> ] unit-test
|
||||
[ 47965 ] [ 0.01 5000 <bloom-filter> bits>> length ] unit-test
|
||||
[ 5000 ] [ 0.01 5000 <bloom-filter> maximum-n-objects>> ] unit-test
|
||||
[ 0 ] [ 0.01 5000 <bloom-filter> current-n-objects>> ] unit-test
|
||||
[ 5000 ] [ 0.01 5000 <bloom-filter> capacity>> ] unit-test
|
||||
[ 0 ] [ 0.01 5000 <bloom-filter> count>> ] unit-test
|
||||
|
||||
! Should return the fewest hashes to satisfy the bits requested, not the most.
|
||||
[ 32 ] [ 4 0.05 5 bits-to-satisfy-error-rate ] unit-test
|
||||
|
@ -31,24 +30,24 @@ IN: bloom-filters.tests
|
|||
[ 20 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
|
||||
[ 0.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
|
||||
[ -2 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
|
||||
[ 0.5 0 <bloom-filter> ] [ invalid-n-objects? ] must-fail-with
|
||||
[ 0.5 -5 <bloom-filter> ] [ invalid-n-objects? ] must-fail-with
|
||||
[ 0.5 0 <bloom-filter> ] [ invalid-capacity? ] must-fail-with
|
||||
[ 0.5 -5 <bloom-filter> ] [ invalid-capacity? ] must-fail-with
|
||||
|
||||
! Should not generate bignum hash codes. Enhanced double hashing may generate a
|
||||
! lot of hash codes, and it's better to do this earlier than later.
|
||||
[ t ] [ 10000 iota [ hashcodes-from-object [ fixnum? ] both? ] map [ ] all? ] unit-test
|
||||
[ t ] [ 10000 iota [ double-hashcodes [ fixnum? ] both? ] map [ ] all? ] unit-test
|
||||
|
||||
: empty-bloom-filter ( -- bloom-filter )
|
||||
0.01 2000 <bloom-filter> ;
|
||||
|
||||
[ 1 ] [ empty-bloom-filter dup increment-n-objects current-n-objects>> ] unit-test
|
||||
[ 1 ] [ empty-bloom-filter dup increment-count count>> ] unit-test
|
||||
|
||||
: basic-insert-test-setup ( -- bloom-filter )
|
||||
1 empty-bloom-filter [ bloom-filter-insert ] keep ;
|
||||
|
||||
! Basic tests that insert does something
|
||||
[ t ] [ basic-insert-test-setup bits>> [ ] any? ] unit-test
|
||||
[ 1 ] [ basic-insert-test-setup current-n-objects>> ] unit-test
|
||||
[ 1 ] [ basic-insert-test-setup count>> ] unit-test
|
||||
|
||||
: non-empty-bloom-filter ( -- bloom-filter )
|
||||
1000 iota
|
||||
|
|
|
@ -44,14 +44,14 @@ TODO:
|
|||
*/
|
||||
|
||||
TUPLE: bloom-filter
|
||||
{ n-hashes fixnum read-only }
|
||||
{ #hashes fixnum read-only }
|
||||
{ bits bit-array read-only }
|
||||
{ maximum-n-objects fixnum read-only }
|
||||
{ current-n-objects fixnum } ;
|
||||
{ capacity fixnum read-only }
|
||||
{ count fixnum } ;
|
||||
|
||||
ERROR: capacity-error ;
|
||||
ERROR: invalid-error-rate error-rate ;
|
||||
ERROR: invalid-n-objects n-objects ;
|
||||
ERROR: invalid-capacity capacity ;
|
||||
|
||||
<PRIVATE
|
||||
|
||||
|
@ -60,10 +60,10 @@ ERROR: invalid-n-objects n-objects ;
|
|||
ceiling >integer ;
|
||||
|
||||
! 100 hashes ought to be enough for anybody.
|
||||
: n-hashes-range ( -- range )
|
||||
: #hashes-range ( -- range )
|
||||
100 [1,b] ;
|
||||
|
||||
! { n-hashes n-bits }
|
||||
! { #hashes #bits }
|
||||
: identity-configuration ( -- 2seq )
|
||||
0 max-array-capacity 2array ;
|
||||
|
||||
|
@ -72,7 +72,7 @@ ERROR: invalid-n-objects n-objects ;
|
|||
|
||||
! If the number of hashes isn't positive, we haven't found
|
||||
! anything smaller than the identity configuration.
|
||||
: check-capacity ( 2seq -- 2seq )
|
||||
: check-hashes ( 2seq -- 2seq )
|
||||
dup first 0 <= [ capacity-error ] when ;
|
||||
|
||||
! The consensus on the tradeoff between increasing the number of
|
||||
|
@ -84,13 +84,13 @@ ERROR: invalid-n-objects n-objects ;
|
|||
! tradeoff to support it, and I haven't done my own, but we'll
|
||||
! go with it anyway.
|
||||
: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
|
||||
[ n-hashes-range identity-configuration ] 2dip '[
|
||||
[ #hashes-range identity-configuration ] 2dip '[
|
||||
dup _ _ bits-to-satisfy-error-rate
|
||||
2array smaller-second
|
||||
] reduce check-capacity first2 ;
|
||||
] reduce check-hashes first2 ;
|
||||
|
||||
: check-n-objects ( n-objects -- n-objects )
|
||||
dup 0 <= [ invalid-n-objects ] when ;
|
||||
: check-capacity ( capacity -- capacity )
|
||||
dup 0 <= [ invalid-capacity ] when ;
|
||||
|
||||
: check-error-rate ( error-rate -- error-rate )
|
||||
dup [ 0 after? ] [ 1 before? ] bi and
|
||||
|
@ -98,8 +98,8 @@ ERROR: invalid-n-objects n-objects ;
|
|||
|
||||
PRIVATE>
|
||||
|
||||
: <bloom-filter> ( error-rate number-objects -- bloom-filter )
|
||||
[ check-error-rate ] [ check-n-objects ] bi*
|
||||
: <bloom-filter> ( error-rate capacity -- bloom-filter )
|
||||
[ check-error-rate ] [ check-capacity ] bi*
|
||||
[ size-bloom-filter <bit-array> ] keep
|
||||
0 ! initially empty
|
||||
bloom-filter boa ;
|
||||
|
@ -110,32 +110,28 @@ PRIVATE>
|
|||
! Dillinger and Panagiotis Manolios, section 5.2, "Enhanced
|
||||
! Double Hashing":
|
||||
! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
|
||||
: enhanced-double-hash ( index hash0 hash1 -- hash )
|
||||
: combine-hashcodes ( index hash0 hash1 -- hash )
|
||||
{ fixnum fixnum fixnum } declare
|
||||
[ [ [ 3 ^ ] [ - ] bi 6 /i ] keep ]
|
||||
[ fixnum*fast ] [ fixnum+fast ] tri* + abs ;
|
||||
|
||||
: enhanced-double-hashes ( hash0 hash1 length -- quot: ( elt -- n ) )
|
||||
'[ _ _ enhanced-double-hash _ mod ] ; inline
|
||||
|
||||
! Make sure it's a fixnum here to speed up double-hashing.
|
||||
: hashcodes-from-object ( object -- n n )
|
||||
: double-hashcodes ( object -- hash0 hash1 )
|
||||
hashcode >fixnum dup most-positive-fixnum bitxor >fixnum ;
|
||||
|
||||
: increment-n-objects ( bloom-filter -- )
|
||||
[ 1 + ] change-current-n-objects drop ; inline
|
||||
: increment-count ( bloom-filter -- )
|
||||
[ 1 + ] change-count drop ; inline
|
||||
|
||||
: n-hashes-and-length ( bloom-filter -- n-hashes length )
|
||||
[ n-hashes>> ] [ bits>> length ] bi ; inline
|
||||
: #hashes-and-length ( bloom-filter -- #hashes length )
|
||||
[ #hashes>> ] [ bits>> length ] bi ; inline
|
||||
|
||||
: relevant-indices ( object bloom-filter -- n quot: ( elt -- n ) )
|
||||
[ hashcodes-from-object ] [ n-hashes-and-length ] bi*
|
||||
[ -rot ] dip enhanced-double-hashes ; inline
|
||||
[ double-hashcodes ] [ #hashes-and-length ] bi*
|
||||
[ -rot ] dip '[ _ _ combine-hashcodes _ mod ] ; inline
|
||||
|
||||
PRIVATE>
|
||||
|
||||
TYPED: bloom-filter-insert ( object bloom-filter: bloom-filter -- )
|
||||
[ increment-n-objects ]
|
||||
[ increment-count ]
|
||||
[ relevant-indices ]
|
||||
[ bits>> [ [ t ] 2dip set-nth-unsafe ] curry ]
|
||||
tri compose each-integer ;
|
||||
|
|
Loading…
Reference in New Issue