bloom-filters: clean up the interface a little bit.
parent
55e0be5c7a
commit
66693a786f
|
@ -2,7 +2,6 @@ USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts
|
||||||
math random sequences tools.test ;
|
math random sequences tools.test ;
|
||||||
IN: bloom-filters.tests
|
IN: bloom-filters.tests
|
||||||
|
|
||||||
|
|
||||||
[ { 200 5 } ] [ { 100 7 } { 200 5 } smaller-second ] unit-test
|
[ { 200 5 } ] [ { 100 7 } { 200 5 } smaller-second ] unit-test
|
||||||
[ { 200 5 } ] [ { 200 5 } { 100 7 } smaller-second ] unit-test
|
[ { 200 5 } ] [ { 200 5 } { 100 7 } smaller-second ] unit-test
|
||||||
|
|
||||||
|
@ -13,10 +12,10 @@ IN: bloom-filters.tests
|
||||||
! Test bloom-filter creation
|
! Test bloom-filter creation
|
||||||
[ 47965 ] [ 7 0.01 5000 bits-to-satisfy-error-rate ] unit-test
|
[ 47965 ] [ 7 0.01 5000 bits-to-satisfy-error-rate ] unit-test
|
||||||
[ 7 47965 ] [ 0.01 5000 size-bloom-filter ] unit-test
|
[ 7 47965 ] [ 0.01 5000 size-bloom-filter ] unit-test
|
||||||
[ 7 ] [ 0.01 5000 <bloom-filter> n-hashes>> ] unit-test
|
[ 7 ] [ 0.01 5000 <bloom-filter> #hashes>> ] unit-test
|
||||||
[ 47965 ] [ 0.01 5000 <bloom-filter> bits>> length ] unit-test
|
[ 47965 ] [ 0.01 5000 <bloom-filter> bits>> length ] unit-test
|
||||||
[ 5000 ] [ 0.01 5000 <bloom-filter> maximum-n-objects>> ] unit-test
|
[ 5000 ] [ 0.01 5000 <bloom-filter> capacity>> ] unit-test
|
||||||
[ 0 ] [ 0.01 5000 <bloom-filter> current-n-objects>> ] unit-test
|
[ 0 ] [ 0.01 5000 <bloom-filter> count>> ] unit-test
|
||||||
|
|
||||||
! Should return the fewest hashes to satisfy the bits requested, not the most.
|
! Should return the fewest hashes to satisfy the bits requested, not the most.
|
||||||
[ 32 ] [ 4 0.05 5 bits-to-satisfy-error-rate ] unit-test
|
[ 32 ] [ 4 0.05 5 bits-to-satisfy-error-rate ] unit-test
|
||||||
|
@ -31,24 +30,24 @@ IN: bloom-filters.tests
|
||||||
[ 20 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
|
[ 20 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
|
||||||
[ 0.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
|
[ 0.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
|
||||||
[ -2 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
|
[ -2 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
|
||||||
[ 0.5 0 <bloom-filter> ] [ invalid-n-objects? ] must-fail-with
|
[ 0.5 0 <bloom-filter> ] [ invalid-capacity? ] must-fail-with
|
||||||
[ 0.5 -5 <bloom-filter> ] [ invalid-n-objects? ] must-fail-with
|
[ 0.5 -5 <bloom-filter> ] [ invalid-capacity? ] must-fail-with
|
||||||
|
|
||||||
! Should not generate bignum hash codes. Enhanced double hashing may generate a
|
! Should not generate bignum hash codes. Enhanced double hashing may generate a
|
||||||
! lot of hash codes, and it's better to coerce them to fixnums earlier than later.
|
! lot of hash codes, and it's better to coerce them to fixnums earlier than later.
|
||||||
[ t ] [ 10000 iota [ hashcodes-from-object [ fixnum? ] both? ] map [ ] all? ] unit-test
|
[ t ] [ 10000 iota [ double-hashcodes [ fixnum? ] both? ] map [ ] all? ] unit-test
|
||||||
|
|
||||||
: empty-bloom-filter ( -- bloom-filter )
|
: empty-bloom-filter ( -- bloom-filter )
|
||||||
0.01 2000 <bloom-filter> ;
|
0.01 2000 <bloom-filter> ;
|
||||||
|
|
||||||
[ 1 ] [ empty-bloom-filter dup increment-n-objects current-n-objects>> ] unit-test
|
[ 1 ] [ empty-bloom-filter dup increment-count count>> ] unit-test
|
||||||
|
|
||||||
: basic-insert-test-setup ( -- bloom-filter )
|
: basic-insert-test-setup ( -- bloom-filter )
|
||||||
1 empty-bloom-filter [ bloom-filter-insert ] keep ;
|
1 empty-bloom-filter [ bloom-filter-insert ] keep ;
|
||||||
|
|
||||||
! Basic tests that insert does something
|
! Basic tests that insert does something
|
||||||
[ t ] [ basic-insert-test-setup bits>> [ ] any? ] unit-test
|
[ t ] [ basic-insert-test-setup bits>> [ ] any? ] unit-test
|
||||||
[ 1 ] [ basic-insert-test-setup current-n-objects>> ] unit-test
|
[ 1 ] [ basic-insert-test-setup count>> ] unit-test
|
||||||
|
|
||||||
: non-empty-bloom-filter ( -- bloom-filter )
|
: non-empty-bloom-filter ( -- bloom-filter )
|
||||||
1000 iota
|
1000 iota
|
||||||
|
|
|
@ -44,14 +44,14 @@ TODO:
|
||||||
*/
|
*/
|
||||||
|
|
||||||
TUPLE: bloom-filter
|
TUPLE: bloom-filter
|
||||||
{ n-hashes fixnum read-only }
|
{ #hashes fixnum read-only }
|
||||||
{ bits bit-array read-only }
|
{ bits bit-array read-only }
|
||||||
{ maximum-n-objects fixnum read-only }
|
{ capacity fixnum read-only }
|
||||||
{ current-n-objects fixnum } ;
|
{ count fixnum } ;
|
||||||
|
|
||||||
ERROR: capacity-error ;
|
ERROR: capacity-error ;
|
||||||
ERROR: invalid-error-rate error-rate ;
|
ERROR: invalid-error-rate error-rate ;
|
||||||
ERROR: invalid-n-objects n-objects ;
|
ERROR: invalid-capacity capacity ;
|
||||||
|
|
||||||
<PRIVATE
|
<PRIVATE
|
||||||
|
|
||||||
|
@ -60,10 +60,10 @@ ERROR: invalid-n-objects n-objects ;
|
||||||
ceiling >integer ;
|
ceiling >integer ;
|
||||||
|
|
||||||
! 100 hashes ought to be enough for anybody.
|
! 100 hashes ought to be enough for anybody.
|
||||||
: n-hashes-range ( -- range )
|
: #hashes-range ( -- range )
|
||||||
100 [1,b] ;
|
100 [1,b] ;
|
||||||
|
|
||||||
! { n-hashes n-bits }
|
! { #hashes #bits }
|
||||||
: identity-configuration ( -- 2seq )
|
: identity-configuration ( -- 2seq )
|
||||||
0 max-array-capacity 2array ;
|
0 max-array-capacity 2array ;
|
||||||
|
|
||||||
|
@ -72,7 +72,7 @@ ERROR: invalid-n-objects n-objects ;
|
||||||
|
|
||||||
! If the number of hashes isn't positive, we haven't found
|
! If the number of hashes isn't positive, we haven't found
|
||||||
! anything smaller than the identity configuration.
|
! anything smaller than the identity configuration.
|
||||||
: check-capacity ( 2seq -- 2seq )
|
: check-hashes ( 2seq -- 2seq )
|
||||||
dup first 0 <= [ capacity-error ] when ;
|
dup first 0 <= [ capacity-error ] when ;
|
||||||
|
|
||||||
! The consensus on the tradeoff between increasing the number of
|
! The consensus on the tradeoff between increasing the number of
|
||||||
|
@ -84,13 +84,13 @@ ERROR: invalid-n-objects n-objects ;
|
||||||
! tradeoff to support it, and I haven't done my own, but we'll
|
! tradeoff to support it, and I haven't done my own, but we'll
|
||||||
! go with it anyway.
|
! go with it anyway.
|
||||||
: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
|
: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
|
||||||
[ n-hashes-range identity-configuration ] 2dip '[
|
[ #hashes-range identity-configuration ] 2dip '[
|
||||||
dup _ _ bits-to-satisfy-error-rate
|
dup _ _ bits-to-satisfy-error-rate
|
||||||
2array smaller-second
|
2array smaller-second
|
||||||
] reduce check-capacity first2 ;
|
] reduce check-hashes first2 ;
|
||||||
|
|
||||||
: check-n-objects ( n-objects -- n-objects )
|
: check-capacity ( capacity -- capacity )
|
||||||
dup 0 <= [ invalid-n-objects ] when ;
|
dup 0 <= [ invalid-capacity ] when ;
|
||||||
|
|
||||||
: check-error-rate ( error-rate -- error-rate )
|
: check-error-rate ( error-rate -- error-rate )
|
||||||
dup [ 0 after? ] [ 1 before? ] bi and
|
dup [ 0 after? ] [ 1 before? ] bi and
|
||||||
|
@ -98,8 +98,8 @@ ERROR: invalid-n-objects n-objects ;
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
: <bloom-filter> ( error-rate number-objects -- bloom-filter )
|
: <bloom-filter> ( error-rate capacity -- bloom-filter )
|
||||||
[ check-error-rate ] [ check-n-objects ] bi*
|
[ check-error-rate ] [ check-capacity ] bi*
|
||||||
[ size-bloom-filter <bit-array> ] keep
|
[ size-bloom-filter <bit-array> ] keep
|
||||||
0 ! initially empty
|
0 ! initially empty
|
||||||
bloom-filter boa ;
|
bloom-filter boa ;
|
||||||
|
@ -110,32 +110,28 @@ PRIVATE>
|
||||||
! Dillinger and Panagiotis Manolios, section 5.2, "Enhanced
|
! Dillinger and Panagiotis Manolios, section 5.2, "Enhanced
|
||||||
! Double Hashing":
|
! Double Hashing":
|
||||||
! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
|
! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html
|
||||||
: enhanced-double-hash ( index hash0 hash1 -- hash )
|
: combine-hashcodes ( index hash0 hash1 -- hash )
|
||||||
{ fixnum fixnum fixnum } declare
|
{ fixnum fixnum fixnum } declare
|
||||||
[ [ [ 3 ^ ] [ - ] bi 6 /i ] keep ]
|
[ [ [ 3 ^ ] [ - ] bi 6 /i ] keep ]
|
||||||
[ fixnum*fast ] [ fixnum+fast ] tri* + abs ;
|
[ fixnum*fast ] [ fixnum+fast ] tri* + abs ;
|
||||||
|
|
||||||
: enhanced-double-hashes ( hash0 hash1 length -- quot: ( elt -- n ) )
|
: double-hashcodes ( object -- hash0 hash1 )
|
||||||
'[ _ _ enhanced-double-hash _ mod ] ; inline
|
|
||||||
|
|
||||||
! Make sure it's a fixnum here to speed up double-hashing.
|
|
||||||
: hashcodes-from-object ( object -- n n )
|
|
||||||
hashcode >fixnum dup most-positive-fixnum bitxor >fixnum ;
|
hashcode >fixnum dup most-positive-fixnum bitxor >fixnum ;
|
||||||
|
|
||||||
: increment-n-objects ( bloom-filter -- )
|
: increment-count ( bloom-filter -- )
|
||||||
[ 1 + ] change-current-n-objects drop ; inline
|
[ 1 + ] change-count drop ; inline
|
||||||
|
|
||||||
: n-hashes-and-length ( bloom-filter -- n-hashes length )
|
: #hashes-and-length ( bloom-filter -- #hashes length )
|
||||||
[ n-hashes>> ] [ bits>> length ] bi ; inline
|
[ #hashes>> ] [ bits>> length ] bi ; inline
|
||||||
|
|
||||||
: relevant-indices ( object bloom-filter -- n quot: ( elt -- n ) )
|
: relevant-indices ( object bloom-filter -- n quot: ( elt -- n ) )
|
||||||
[ hashcodes-from-object ] [ n-hashes-and-length ] bi*
|
[ double-hashcodes ] [ #hashes-and-length ] bi*
|
||||||
[ -rot ] dip enhanced-double-hashes ; inline
|
[ -rot ] dip '[ _ _ combine-hashcodes _ mod ] ; inline
|
||||||
|
|
||||||
PRIVATE>
|
PRIVATE>
|
||||||
|
|
||||||
TYPED: bloom-filter-insert ( object bloom-filter: bloom-filter -- )
|
TYPED: bloom-filter-insert ( object bloom-filter: bloom-filter -- )
|
||||||
[ increment-n-objects ]
|
[ increment-count ]
|
||||||
[ relevant-indices ]
|
[ relevant-indices ]
|
||||||
[ bits>> [ [ t ] 2dip set-nth-unsafe ] curry ]
|
[ bits>> [ [ t ] 2dip set-nth-unsafe ] curry ]
|
||||||
tri compose each-integer ;
|
tri compose each-integer ;
|
||||||
|
|
Loading…
Reference in New Issue