From 66693a786fdafdb7b3f25efc56f75c6ccb0312fa Mon Sep 17 00:00:00 2001 From: John Benediktsson Date: Thu, 4 Apr 2013 13:26:24 -0700 Subject: [PATCH] bloom-filters: cleanup interface a little bit. --- .../bloom-filters/bloom-filters-tests.factor | 17 ++++--- extra/bloom-filters/bloom-filters.factor | 48 +++++++++---------- 2 files changed, 30 insertions(+), 35 deletions(-) diff --git a/extra/bloom-filters/bloom-filters-tests.factor b/extra/bloom-filters/bloom-filters-tests.factor index 92ec4bea31..06a84e3dd3 100644 --- a/extra/bloom-filters/bloom-filters-tests.factor +++ b/extra/bloom-filters/bloom-filters-tests.factor @@ -2,7 +2,6 @@ USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts math random sequences tools.test ; IN: bloom-filters.tests - [ { 200 5 } ] [ { 100 7 } { 200 5 } smaller-second ] unit-test [ { 200 5 } ] [ { 200 5 } { 100 7 } smaller-second ] unit-test @@ -13,10 +12,10 @@ IN: bloom-filters.tests ! Test bloom-filter creation [ 47965 ] [ 7 0.01 5000 bits-to-satisfy-error-rate ] unit-test [ 7 47965 ] [ 0.01 5000 size-bloom-filter ] unit-test -[ 7 ] [ 0.01 5000 n-hashes>> ] unit-test +[ 7 ] [ 0.01 5000 #hashes>> ] unit-test [ 47965 ] [ 0.01 5000 bits>> length ] unit-test -[ 5000 ] [ 0.01 5000 maximum-n-objects>> ] unit-test -[ 0 ] [ 0.01 5000 current-n-objects>> ] unit-test +[ 5000 ] [ 0.01 5000 capacity>> ] unit-test +[ 0 ] [ 0.01 5000 count>> ] unit-test ! Should return the fewest hashes to satisfy the bits requested, not the most. [ 32 ] [ 4 0.05 5 bits-to-satisfy-error-rate ] unit-test @@ -31,24 +30,24 @@ IN: bloom-filters.tests [ 20 2000 ] [ invalid-error-rate? ] must-fail-with [ 0.0 2000 ] [ invalid-error-rate? ] must-fail-with [ -2 2000 ] [ invalid-error-rate? ] must-fail-with -[ 0.5 0 ] [ invalid-n-objects? ] must-fail-with -[ 0.5 -5 ] [ invalid-n-objects? ] must-fail-with +[ 0.5 0 ] [ invalid-capacity? ] must-fail-with +[ 0.5 -5 ] [ invalid-capacity? ] must-fail-with ! Should not generate bignum hash codes. Enhanced double hashing may generate a ! lot of hash codes, and it's better to do this earlier than later. -[ t ] [ 10000 iota [ hashcodes-from-object [ fixnum? ] both? ] map [ ] all? ] unit-test +[ t ] [ 10000 iota [ double-hashcodes [ fixnum? ] both? ] map [ ] all? ] unit-test : empty-bloom-filter ( -- bloom-filter ) 0.01 2000 ; -[ 1 ] [ empty-bloom-filter dup increment-n-objects current-n-objects>> ] unit-test +[ 1 ] [ empty-bloom-filter dup increment-count count>> ] unit-test : basic-insert-test-setup ( -- bloom-filter ) 1 empty-bloom-filter [ bloom-filter-insert ] keep ; ! Basic tests that insert does something [ t ] [ basic-insert-test-setup bits>> [ ] any? ] unit-test -[ 1 ] [ basic-insert-test-setup current-n-objects>> ] unit-test +[ 1 ] [ basic-insert-test-setup count>> ] unit-test : non-empty-bloom-filter ( -- bloom-filter ) 1000 iota diff --git a/extra/bloom-filters/bloom-filters.factor b/extra/bloom-filters/bloom-filters.factor index b0920f0374..04d5ec1d37 100644 --- a/extra/bloom-filters/bloom-filters.factor +++ b/extra/bloom-filters/bloom-filters.factor @@ -44,14 +44,14 @@ TODO: */ TUPLE: bloom-filter -{ n-hashes fixnum read-only } +{ #hashes fixnum read-only } { bits bit-array read-only } -{ maximum-n-objects fixnum read-only } -{ current-n-objects fixnum } ; +{ capacity fixnum read-only } +{ count fixnum } ; ERROR: capacity-error ; ERROR: invalid-error-rate error-rate ; -ERROR: invalid-n-objects n-objects ; +ERROR: invalid-capacity capacity ; integer ; ! 100 hashes ought to be enough for anybody. -: n-hashes-range ( -- range ) +: #hashes-range ( -- range ) 100 [1,b] ; -! { n-hashes n-bits } +! { #hashes #bits } : identity-configuration ( -- 2seq ) 0 max-array-capacity 2array ; @@ -72,7 +72,7 @@ ERROR: invalid-n-objects n-objects ; ! If the number of hashes isn't positive, we haven't found ! anything smaller than the identity configuration. -: check-capacity ( 2seq -- 2seq ) +: check-hashes ( 2seq -- 2seq ) dup first 0 <= [ capacity-error ] when ; ! The consensus on the tradeoff between increasing the number of @@ -84,13 +84,13 @@ ERROR: invalid-n-objects n-objects ; ! tradeoff to support it, and I haven't done my own, but we'll ! go with it anyway. : size-bloom-filter ( error-rate number-objects -- number-hashes number-bits ) - [ n-hashes-range identity-configuration ] 2dip '[ + [ #hashes-range identity-configuration ] 2dip '[ dup _ _ bits-to-satisfy-error-rate 2array smaller-second - ] reduce check-capacity first2 ; + ] reduce check-hashes first2 ; -: check-n-objects ( n-objects -- n-objects ) - dup 0 <= [ invalid-n-objects ] when ; +: check-capacity ( capacity -- capacity ) + dup 0 <= [ invalid-capacity ] when ; : check-error-rate ( error-rate -- error-rate ) dup [ 0 after? ] [ 1 before? ] bi and @@ -98,8 +98,8 @@ ERROR: invalid-n-objects n-objects ; PRIVATE> -: ( error-rate number-objects -- bloom-filter ) - [ check-error-rate ] [ check-n-objects ] bi* +: ( error-rate capacity -- bloom-filter ) + [ check-error-rate ] [ check-capacity ] bi* [ size-bloom-filter ] keep 0 ! initially empty bloom-filter boa ; @@ -110,32 +110,28 @@ PRIVATE> ! Dillinger and Panagiotis Manolios, section 5.2, "Enhanced ! Double Hashing": ! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html -: enhanced-double-hash ( index hash0 hash1 -- hash ) +: combine-hashcodes ( index hash0 hash1 -- hash ) { fixnum fixnum fixnum } declare [ [ [ 3 ^ ] [ - ] bi 6 /i ] keep ] [ fixnum*fast ] [ fixnum+fast ] tri* + abs ; -: enhanced-double-hashes ( hash0 hash1 length -- quot: ( elt -- n ) ) - '[ _ _ enhanced-double-hash _ mod ] ; inline - -! Make sure it's a fixnum here to speed up double-hashing. -: hashcodes-from-object ( object -- n n ) +: double-hashcodes ( object -- hash0 hash1 ) hashcode >fixnum dup most-positive-fixnum bitxor >fixnum ; -: increment-n-objects ( bloom-filter -- ) - [ 1 + ] change-current-n-objects drop ; inline +: increment-count ( bloom-filter -- ) + [ 1 + ] change-count drop ; inline -: n-hashes-and-length ( bloom-filter -- n-hashes length ) - [ n-hashes>> ] [ bits>> length ] bi ; inline +: #hashes-and-length ( bloom-filter -- #hashes length ) + [ #hashes>> ] [ bits>> length ] bi ; inline : relevant-indices ( object bloom-filter -- n quot: ( elt -- n ) ) - [ hashcodes-from-object ] [ n-hashes-and-length ] bi* - [ -rot ] dip enhanced-double-hashes ; inline + [ double-hashcodes ] [ #hashes-and-length ] bi* + [ -rot ] dip '[ _ _ combine-hashcodes _ mod ] ; inline PRIVATE> TYPED: bloom-filter-insert ( object bloom-filter: bloom-filter -- ) - [ increment-n-objects ] + [ increment-count ] [ relevant-indices ] [ bits>> [ [ t ] 2dip set-nth-unsafe ] curry ] tri compose each-integer ;