From e3d5d8bef08e231f0579ffe6fe5432675cd878d2 Mon Sep 17 00:00:00 2001 From: Alec Berryman Date: Thu, 7 May 2009 22:45:02 -0400 Subject: [PATCH 1/7] bloom-filters: compact, probabilistic membership testing --- extra/bloom-filters/authors.txt | 1 + extra/bloom-filters/bloom-filters-docs.factor | 36 ++++ .../bloom-filters/bloom-filters-tests.factor | 71 ++++++++ extra/bloom-filters/bloom-filters.factor | 161 ++++++++++++++++++ 4 files changed, 269 insertions(+) create mode 100644 extra/bloom-filters/authors.txt create mode 100644 extra/bloom-filters/bloom-filters-docs.factor create mode 100644 extra/bloom-filters/bloom-filters-tests.factor create mode 100644 extra/bloom-filters/bloom-filters.factor diff --git a/extra/bloom-filters/authors.txt b/extra/bloom-filters/authors.txt new file mode 100644 index 0000000000..528e5dfe6b --- /dev/null +++ b/extra/bloom-filters/authors.txt @@ -0,0 +1 @@ +Alec Berryman diff --git a/extra/bloom-filters/bloom-filters-docs.factor b/extra/bloom-filters/bloom-filters-docs.factor new file mode 100644 index 0000000000..4af1a82af6 --- /dev/null +++ b/extra/bloom-filters/bloom-filters-docs.factor @@ -0,0 +1,36 @@ +USING: help.markup help.syntax kernel math ; +IN: bloom-filters + +HELP: +{ $values { "error-rate" "The desired false positive rate. A " { $link float } " between 0 and 1." } + { "number-objects" "The expected number of object in the set. An " { $link integer } "." } + { "bloom-filter" bloom-filter } } +{ $description "Creates an empty Bloom filter." } ; + +HELP: bloom-filter-insert +{ $values { "object" object } + { "bloom-filter" bloom-filter } } +{ $description "Records the item as a member of the filter." } +{ $side-effects "bloom-filter" } ; + +HELP: bloom-filter-member? +{ $values { "object" object } + { "bloom-filter" bloom-filter } + { "?" boolean } } +{ $description "Returns " { $link t } " if the object may be a member of Bloom filter, " { $link f } " otherwise. The false positive rate is configurable; there are no false negatives." } ; + +HELP: bloom-filter +{ $class-description "This is the class for Bloom filters. These provide constant-time insertion and probabilistic membership-testing operations, but do not actually store any elements." } ; + +ARTICLE: "bloom-filters" "Bloom filters" +"This is a library for Bloom filters, sets that provide a constant-time insertion operation and probabilistic membership tests, but do not actually store any elements." +$nl +"The accuracy of the membership test is configurable; a Bloom filter will never incorrectly report an item is not a member of the set, but may incorrectly report than an item is a member of the set." +$nl +"Bloom filters cannot be resized and do not support removal." +$nl +{ $subsection } +{ $subsection bloom-filter-insert } +{ $subsection bloom-filter-member? } ; + +ABOUT: "bloom-filters" diff --git a/extra/bloom-filters/bloom-filters-tests.factor b/extra/bloom-filters/bloom-filters-tests.factor new file mode 100644 index 0000000000..b7a5d7ebc2 --- /dev/null +++ b/extra/bloom-filters/bloom-filters-tests.factor @@ -0,0 +1,71 @@ +USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts +math random sequences tools.test ; +IN: bloom-filters.tests + +! The sizing information was generated using the subroutine +! calculate_shortest_filter_length from +! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html. + +! 
Test bloom-filter creation +[ 47965 ] [ 7 0.01 5000 bits-to-satisfy-error-rate ] unit-test +[ 7 47965 ] [ 0.01 5000 size-bloom-filter ] unit-test +[ 7 ] [ 0.01 5000 n-hashes>> ] unit-test +[ 47965 ] [ 0.01 5000 bits>> length ] unit-test +[ 5000 ] [ 0.01 5000 maximum-n-objects>> ] unit-test +[ 0 ] [ 0.01 5000 current-n-objects>> ] unit-test + +! Should return the fewest hashes to satisfy the bits requested, not the most. +[ 32 ] [ 4 0.05 5 bits-to-satisfy-error-rate ] unit-test +[ 32 ] [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test +[ 4 32 ] [ 0.05 5 size-bloom-filter ] unit-test + +! This is a lot of bits. On linux-x86-32, max-array-capacity is 134217727, +! which is about 16MB (assuming I can do math), which is sort of pithy. I'm +! not sure how to handle this case. Returning a smaller-than-requested +! arrays is not the least surprising behavior, but is still surprising. +[ 383718189 ] [ 7 0.01 40000000 bits-to-satisfy-error-rate ] unit-test +! [ 7 383718189 ] [ 0.01 40000000 size-bloom-filter ] unit-test +! [ 383718189 ] [ 0.01 40000000 bits>> length ] unit-test + +! Should not generate bignum hash codes. Enhanced double hashing may generate a +! lot of hash codes, and it's better to do this earlier than later. +[ t ] [ 10000 iota [ hashcodes-from-object [ fixnum? ] both? ] map [ t = ] all? ] unit-test + +[ ?{ t f t f t f } ] [ { 0 2 4 } 6 [ set-indices ] keep ] unit-test + +: empty-bloom-filter ( -- bloom-filter ) + 0.01 2000 ; + +[ 1 ] [ empty-bloom-filter [ increment-n-objects ] keep current-n-objects>> ] unit-test + +: basic-insert-test-setup ( -- bloom-filter ) + 1 empty-bloom-filter [ bloom-filter-insert ] keep ; + +! Basic tests that insert does something +[ t ] [ basic-insert-test-setup bits>> [ t = ] any? ] unit-test +[ 1 ] [ basic-insert-test-setup current-n-objects>> ] unit-test + +: non-empty-bloom-filter ( -- bloom-filter ) + 1000 iota + empty-bloom-filter + [ [ bloom-filter-insert ] curry each ] keep ; + +: full-bloom-filter ( -- bloom-filter ) + 2000 iota + empty-bloom-filter + [ [ bloom-filter-insert ] curry each ] keep ; + +! Should find what we put in there. +[ t ] [ 2000 iota + full-bloom-filter + [ bloom-filter-member? ] curry map + [ t = ] all? ] unit-test + +! We shouldn't have more than 0.01 false-positive rate. +[ t ] [ 1000 iota [ drop most-positive-fixnum random 1000 + ] map + full-bloom-filter + [ bloom-filter-member? ] curry map + [ t = ] filter + ! TODO: This should be 10, but the false positive rate is currently very + ! high. It shouldn't be much more than this. + length 150 <= ] unit-test diff --git a/extra/bloom-filters/bloom-filters.factor b/extra/bloom-filters/bloom-filters.factor new file mode 100644 index 0000000000..94d0dd070f --- /dev/null +++ b/extra/bloom-filters/bloom-filters.factor @@ -0,0 +1,161 @@ +! Copyright (C) 2009 Alec Berryman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays assocs bit-arrays kernel layouts locals math +math.functions math.ranges multiline sequences ; +IN: bloom-filters + +/* + +TODO: + +- How to singal an error when too many bits? It looks like a built-in for some + types of arrays, but bit-array just returns a zero-length array. What we do + now is completely broken: -1 hash codes? Really? + +- The false positive rate is 10x what it should be, based on informal testing. + Better object hashes or a better method of generating extra hash codes would + help. Another way is to increase the number of bits used. 
+ + - Try something smarter than the bitwise complement for a second hash code. + + - http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html + makes a case for http://murmurhash.googlepages.com/ instead of enhanced + double-hashing. + + - Be sure to adjust the test that asserts the number of false positives isn't + unreasonable. + +- Should round bits up to next power of two, use wrap instead of mod. + +- Should allow user to specify the hash codes, either as inputs to enhanced + double hashing or for direct use. + +- Support for serialization. + +- Wrappers for combining filters. + +- Should we signal an error when inserting past the number of objects the filter + is sized for? The filter will continue to work, just not very well. + +- The other TODOs sprinkled through the code. + +*/ + +TUPLE: bloom-filter +{ n-hashes fixnum read-only } +{ bits bit-array read-only } +{ maximum-n-objects fixnum read-only } +{ current-n-objects fixnum } ; + +integer ; ! should check that it's below max-array-capacity + +! TODO: this should be a constant +! +! TODO: after very little experimentation, I never see this increase after about +! 20 or so. Maybe it should be smaller. +: n-hashes-range ( -- range ) + 100 [1,b] ; + +! Ends up with a list of arrays - { n-bits position } +: find-bloom-filter-sizes ( error-rate number-objects -- seq ) + [ bits-to-satisfy-error-rate ] 2curry + n-hashes-range swap + map + n-hashes-range zip ; + +:: smallest-first ( seq1 seq2 -- seq ) + seq1 first seq2 first <= [ seq1 ] [ seq2 ] if ; + +! The consensus on the tradeoff between increasing the number of bits and +! increasing the number of hash functions seems to be "go for the smallest +! number of bits", probably because most implementations just generate one hash +! value and cheaply mangle it into the number of hashes they need. I have not +! seen any usage studies from the implementations that made this tradeoff to +! support it, and I haven't done my own, but we'll go with it anyway. +! +! TODO: check that error-rate is reasonable. +: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits ) + find-bloom-filter-sizes + max-array-capacity -1 2array + [ smallest-first ] + reduce + [ second ] [ first ] bi ; + +PRIVATE> + +: ( error-rate number-objects -- bloom-filter ) + [ size-bloom-filter ] keep + 0 ! initially empty + bloom-filter boa ; + +fixnum bitxor ; + +! TODO: This code calls abs because all the double-hashing stuff outputs array +! indices and those aren't good negative. Are we throwing away bits? -1000 +! b. actually prints -1111101000, which confuses me. +: hashcodes-from-object ( obj -- n n ) + hashcode abs hashcodes-from-hashcode ; + +: set-indices ( indices bit-array -- ) + [ [ drop t ] change-nth ] curry each ; + +: increment-n-objects ( bloom-filter -- ) + dup current-n-objects>> 1 + >>current-n-objects drop ; + +! This would be better as an each-relevant-hash that didn't cons. +: relevant-indices ( value bloom-filter -- indices ) + [ n-hashes>> ] [ bits>> length ] bi ! value n array-size + swapd [ hashcodes-from-object ] dip ! n value1 value2 array-size + enhanced-double-hashes ; + +PRIVATE> + +: bloom-filter-insert ( object bloom-filter -- ) + [ relevant-indices ] + [ bits>> set-indices ] + [ increment-n-objects ] + tri ; + +: bloom-filter-member? ( value bloom-filter -- ? ) + [ relevant-indices ] + [ bits>> [ nth ] curry map [ t = ] all? 
] + bi ; From c2482fe2bf1cf03b8f3a100ecc23db6f3e49adc2 Mon Sep 17 00:00:00 2001 From: Alec Berryman Date: Fri, 8 May 2009 22:14:07 -0400 Subject: [PATCH 2/7] bloom-filters: simplify several functions --- .../bloom-filters/bloom-filters-tests.factor | 10 +++--- extra/bloom-filters/bloom-filters.factor | 35 +++++++++---------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/extra/bloom-filters/bloom-filters-tests.factor b/extra/bloom-filters/bloom-filters-tests.factor index b7a5d7ebc2..40fd1469b2 100644 --- a/extra/bloom-filters/bloom-filters-tests.factor +++ b/extra/bloom-filters/bloom-filters-tests.factor @@ -29,20 +29,20 @@ IN: bloom-filters.tests ! Should not generate bignum hash codes. Enhanced double hashing may generate a ! lot of hash codes, and it's better to do this earlier than later. -[ t ] [ 10000 iota [ hashcodes-from-object [ fixnum? ] both? ] map [ t = ] all? ] unit-test +[ t ] [ 10000 iota [ hashcodes-from-object [ fixnum? ] both? ] map [ ] all? ] unit-test [ ?{ t f t f t f } ] [ { 0 2 4 } 6 [ set-indices ] keep ] unit-test : empty-bloom-filter ( -- bloom-filter ) 0.01 2000 ; -[ 1 ] [ empty-bloom-filter [ increment-n-objects ] keep current-n-objects>> ] unit-test +[ 1 ] [ empty-bloom-filter increment-n-objects current-n-objects>> ] unit-test : basic-insert-test-setup ( -- bloom-filter ) 1 empty-bloom-filter [ bloom-filter-insert ] keep ; ! Basic tests that insert does something -[ t ] [ basic-insert-test-setup bits>> [ t = ] any? ] unit-test +[ t ] [ basic-insert-test-setup bits>> [ ] any? ] unit-test [ 1 ] [ basic-insert-test-setup current-n-objects>> ] unit-test : non-empty-bloom-filter ( -- bloom-filter ) @@ -59,13 +59,13 @@ IN: bloom-filters.tests [ t ] [ 2000 iota full-bloom-filter [ bloom-filter-member? ] curry map - [ t = ] all? ] unit-test + [ ] all? ] unit-test ! We shouldn't have more than 0.01 false-positive rate. [ t ] [ 1000 iota [ drop most-positive-fixnum random 1000 + ] map full-bloom-filter [ bloom-filter-member? ] curry map - [ t = ] filter + [ ] filter ! TODO: This should be 10, but the false positive rate is currently very ! high. It shouldn't be much more than this. length 150 <= ] unit-test diff --git a/extra/bloom-filters/bloom-filters.factor b/extra/bloom-filters/bloom-filters.factor index 94d0dd070f..3e0aba175c 100644 --- a/extra/bloom-filters/bloom-filters.factor +++ b/extra/bloom-filters/bloom-filters.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2009 Alec Berryman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays assocs bit-arrays kernel layouts locals math -math.functions math.ranges multiline sequences ; +USING: accessors arrays bit-arrays fry kernel layouts locals math math.functions +math.ranges multiline sequences ; IN: bloom-filters /* @@ -70,8 +70,8 @@ TUPLE: bloom-filter map n-hashes-range zip ; -:: smallest-first ( seq1 seq2 -- seq ) - seq1 first seq2 first <= [ seq1 ] [ seq2 ] if ; +: smallest-first ( seq1 seq2 -- seq ) + [ [ first ] bi@ <= ] most ; ! The consensus on the tradeoff between increasing the number of bits and ! increasing the number of hash functions seems to be "go for the smallest @@ -118,9 +118,7 @@ PRIVATE> array-size mod ; : enhanced-double-hashes ( n hash0 hash1 array-size -- seq ) - [ enhanced-double-hash ] 3curry - [ [0,b) ] dip - map ; + '[ _ _ _ enhanced-double-hash ] [ [0,b) ] dip map ; ! Stupid, should pick something good. 
: hashcodes-from-hashcode ( n -- n n ) @@ -138,24 +136,23 @@ PRIVATE> : set-indices ( indices bit-array -- ) [ [ drop t ] change-nth ] curry each ; -: increment-n-objects ( bloom-filter -- ) - dup current-n-objects>> 1 + >>current-n-objects drop ; +: increment-n-objects ( bloom-filter -- bloom-filter ) + [ 1 + ] change-current-n-objects ; + +: n-hashes-and-bits ( bloom-filter -- n-hashes n-bits ) + [ n-hashes>> ] [ bits>> length ] bi ; -! This would be better as an each-relevant-hash that didn't cons. : relevant-indices ( value bloom-filter -- indices ) - [ n-hashes>> ] [ bits>> length ] bi ! value n array-size - swapd [ hashcodes-from-object ] dip ! n value1 value2 array-size + n-hashes-and-bits + [ swap hashcodes-from-object ] dip enhanced-double-hashes ; PRIVATE> : bloom-filter-insert ( object bloom-filter -- ) - [ relevant-indices ] - [ bits>> set-indices ] - [ increment-n-objects ] - tri ; + increment-n-objects + [ relevant-indices ] [ bits>> set-indices ] bi ; : bloom-filter-member? ( value bloom-filter -- ? ) - [ relevant-indices ] - [ bits>> [ nth ] curry map [ t = ] all? ] - bi ; + [ relevant-indices ] keep + bits>> nths [ ] all? ; From 3e3f08c6e5b70633d400a57c836debd46b0adba7 Mon Sep 17 00:00:00 2001 From: Alec Berryman Date: Fri, 8 May 2009 23:30:01 -0400 Subject: [PATCH 3/7] bloom-filters: clean up creation More readable, less allocation, signals invalid input. --- extra/bloom-filters/bloom-filters-docs.factor | 6 +- .../bloom-filters/bloom-filters-tests.factor | 24 +++++-- extra/bloom-filters/bloom-filters.factor | 66 ++++++++++++------- 3 files changed, 63 insertions(+), 33 deletions(-) diff --git a/extra/bloom-filters/bloom-filters-docs.factor b/extra/bloom-filters/bloom-filters-docs.factor index 4af1a82af6..bc5df8611c 100644 --- a/extra/bloom-filters/bloom-filters-docs.factor +++ b/extra/bloom-filters/bloom-filters-docs.factor @@ -3,9 +3,11 @@ IN: bloom-filters HELP: { $values { "error-rate" "The desired false positive rate. A " { $link float } " between 0 and 1." } - { "number-objects" "The expected number of object in the set. An " { $link integer } "." } + { "number-objects" "The expected number of object in the set. A positive " { $link integer } "." } { "bloom-filter" bloom-filter } } -{ $description "Creates an empty Bloom filter." } ; +{ $description "Creates an empty Bloom filter." } +{ $errors "Throws a " { $link capacity-error } " when unable to produce a filter meeting the given constraints. Throws a " { $link invalid-error-rate } " or a " { $link invalid-n-objects } " when input is invalid." } ; + HELP: bloom-filter-insert { $values { "object" object } diff --git a/extra/bloom-filters/bloom-filters-tests.factor b/extra/bloom-filters/bloom-filters-tests.factor index 40fd1469b2..b4fd69d849 100644 --- a/extra/bloom-filters/bloom-filters-tests.factor +++ b/extra/bloom-filters/bloom-filters-tests.factor @@ -2,6 +2,10 @@ USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts math random sequences tools.test ; IN: bloom-filters.tests + +[ { 200 5 } ] [ { 100 7 } { 200 5 } smaller-second ] unit-test +[ { 200 5 } ] [ { 200 5 } { 100 7 } smaller-second ] unit-test + ! The sizing information was generated using the subroutine ! calculate_shortest_filter_length from ! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html. @@ -19,13 +23,19 @@ IN: bloom-filters.tests [ 32 ] [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test [ 4 32 ] [ 0.05 5 size-bloom-filter ] unit-test -! This is a lot of bits. On linux-x86-32, max-array-capacity is 134217727, -! 
which is about 16MB (assuming I can do math), which is sort of pithy. I'm -! not sure how to handle this case. Returning a smaller-than-requested -! arrays is not the least surprising behavior, but is still surprising. -[ 383718189 ] [ 7 0.01 40000000 bits-to-satisfy-error-rate ] unit-test -! [ 7 383718189 ] [ 0.01 40000000 size-bloom-filter ] unit-test -! [ 383718189 ] [ 0.01 40000000 bits>> length ] unit-test +! This is a lot of bits. +: oversized-filter-params ( -- error-rate n-objects ) + 0.00000001 400000000000000 ; +[ oversized-filter-params size-bloom-filter ] [ capacity-error? ] must-fail-with +[ oversized-filter-params ] [ capacity-error? ] must-fail-with + +! Other error conditions. +[ 1.0 2000 ] [ invalid-error-rate? ] must-fail-with +[ 20 2000 ] [ invalid-error-rate? ] must-fail-with +[ 0.0 2000 ] [ invalid-error-rate? ] must-fail-with +[ -2 2000 ] [ invalid-error-rate? ] must-fail-with +[ 0.5 0 ] [ invalid-n-objects? ] must-fail-with +[ 0.5 -5 ] [ invalid-n-objects? ] must-fail-with ! Should not generate bignum hash codes. Enhanced double hashing may generate a ! lot of hash codes, and it's better to do this earlier than later. diff --git a/extra/bloom-filters/bloom-filters.factor b/extra/bloom-filters/bloom-filters.factor index 3e0aba175c..5440461892 100644 --- a/extra/bloom-filters/bloom-filters.factor +++ b/extra/bloom-filters/bloom-filters.factor @@ -1,17 +1,16 @@ ! Copyright (C) 2009 Alec Berryman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays bit-arrays fry kernel layouts locals math math.functions -math.ranges multiline sequences ; +multiline sequences ; IN: bloom-filters +FROM: math.ranges => [1,b] [0,b) ; +FROM: math.intervals => (a,b) interval-contains? ; + /* TODO: -- How to singal an error when too many bits? It looks like a built-in for some - types of arrays, but bit-array just returns a zero-length array. What we do - now is completely broken: -1 hash codes? Really? - - The false positive rate is 10x what it should be, based on informal testing. Better object hashes or a better method of generating extra hash codes would help. Another way is to increase the number of bits used. @@ -25,7 +24,9 @@ TODO: - Be sure to adjust the test that asserts the number of false positives isn't unreasonable. -- Should round bits up to next power of two, use wrap instead of mod. +- Could round bits up to next power of two and use wrap instead of mod. This + would cost a lot of bits on 32-bit platforms, though, and limit the bit-array + to 8MB. - Should allow user to specify the hash codes, either as inputs to enhanced double hashing or for direct use. @@ -47,6 +48,10 @@ TUPLE: bloom-filter { maximum-n-objects fixnum read-only } { current-n-objects fixnum } ; +ERROR: capacity-error ; +ERROR: invalid-error-rate ; +ERROR: invalid-n-objects ; + integer ; ! should check that it's below max-array-capacity -! TODO: this should be a constant -! -! TODO: after very little experimentation, I never see this increase after about -! 20 or so. Maybe it should be smaller. +! 100 hashes ought to be enough for anybody. : n-hashes-range ( -- range ) 100 [1,b] ; -! Ends up with a list of arrays - { n-bits position } -: find-bloom-filter-sizes ( error-rate number-objects -- seq ) - [ bits-to-satisfy-error-rate ] 2curry - n-hashes-range swap - map - n-hashes-range zip ; +! 
{ n-hashes n-bits } +: identity-configuration ( -- 2seq ) + 0 max-array-capacity 2array ; -: smallest-first ( seq1 seq2 -- seq ) - [ [ first ] bi@ <= ] most ; +: smaller-second ( 2seq 2seq -- 2seq ) + [ [ second ] bi@ <= ] most ; + +! If the number of hashes isn't positive, we haven't found anything smaller than the +! identity configuration. +: validate-sizes ( 2seq -- ) + first 0 <= [ capacity-error ] when* ; ! The consensus on the tradeoff between increasing the number of bits and ! increasing the number of hash functions seems to be "go for the smallest @@ -80,17 +84,31 @@ TUPLE: bloom-filter ! seen any usage studies from the implementations that made this tradeoff to ! support it, and I haven't done my own, but we'll go with it anyway. ! -! TODO: check that error-rate is reasonable. : size-bloom-filter ( error-rate number-objects -- number-hashes number-bits ) - find-bloom-filter-sizes - max-array-capacity -1 2array - [ smallest-first ] - reduce - [ second ] [ first ] bi ; + '[ _ _ bits-to-satisfy-error-rate ] + '[ dup _ call 2array smaller-second ] + '[ n-hashes-range identity-configuration _ reduce ] + call + dup validate-sizes + first2 ; + +: validate-n-objects ( n-objects -- ) + 0 <= [ invalid-n-objects ] when ; + +: valid-error-rate-interval ( -- interval ) + 0 1 (a,b) ; + +: validate-error-rate ( error-rate -- ) + valid-error-rate-interval interval-contains? + [ invalid-error-rate ] unless ; + +: validate-constraints ( error-rate n-objects -- ) + validate-n-objects validate-error-rate ; PRIVATE> : ( error-rate number-objects -- bloom-filter ) + [ validate-constraints ] 2keep [ size-bloom-filter ] keep 0 ! initially empty bloom-filter boa ; From e6f8aafe5f27c52f7cd3611aae4032aa3c3fd56a Mon Sep 17 00:00:00 2001 From: Alec Berryman Date: Sun, 10 May 2009 11:58:57 -0400 Subject: [PATCH 4/7] bloom-filters: use infix syntax --- extra/bloom-filters/bloom-filters.factor | 32 ++++++++---------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/extra/bloom-filters/bloom-filters.factor b/extra/bloom-filters/bloom-filters.factor index 5440461892..b82bf46d36 100644 --- a/extra/bloom-filters/bloom-filters.factor +++ b/extra/bloom-filters/bloom-filters.factor @@ -1,7 +1,7 @@ ! Copyright (C) 2009 Alec Berryman. ! See http://factorcode.org/license.txt for BSD license. -USING: accessors arrays bit-arrays fry kernel layouts locals math math.functions -multiline sequences ; +USING: accessors arrays bit-arrays fry infix kernel layouts locals math +math.functions multiline sequences ; IN: bloom-filters FROM: math.ranges => [1,b] [0,b) ; @@ -54,12 +54,13 @@ ERROR: invalid-n-objects ; integer ; ! should check that it's below max-array-capacity +! infix doesn't like ^ +: pow ( x y -- z ) + ^ ; inline + +:: bits-to-satisfy-error-rate ( hashes error objects -- size ) + [infix -(objects * hashes) / log(1 - pow(error, (1/hashes))) infix] + ceiling >integer ; ! 100 hashes ought to be enough for anybody. : n-hashes-range ( -- range ) @@ -118,21 +119,8 @@ PRIVATE> ! See "Bloom Filters in Probabilistic Verification" by Peter C. Dillinger and ! Panagiotis Manolios, section 5.2, "Enhanced Double Hashing": ! http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html -! -! This is taken from the definition at the top of page 12: -! -! F(i) = (A(s) + (i * B(s)) + ((i^3 - i) / 6)) mod m -! -! Where i is the hash number, A and B are hash functions for object s, and m is -! the length of the array. 
- :: enhanced-double-hash ( index hash0 hash1 array-size -- hash ) - hash0 - index hash1 * - + - index 3 ^ index - - 6 / - + + [infix hash0 + (index * hash1) + ((pow(index, 3) - index) / 6) infix] array-size mod ; : enhanced-double-hashes ( n hash0 hash1 array-size -- seq ) From 8c267834557aa5b73e777553c4af7e99f36abf05 Mon Sep 17 00:00:00 2001 From: Alec Berryman Date: Sun, 10 May 2009 12:50:26 -0400 Subject: [PATCH 5/7] bloom-filters: clean help-lint --- extra/bloom-filters/bloom-filters.factor | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extra/bloom-filters/bloom-filters.factor b/extra/bloom-filters/bloom-filters.factor index b82bf46d36..de7aa75a06 100644 --- a/extra/bloom-filters/bloom-filters.factor +++ b/extra/bloom-filters/bloom-filters.factor @@ -159,6 +159,6 @@ PRIVATE> increment-n-objects [ relevant-indices ] [ bits>> set-indices ] bi ; -: bloom-filter-member? ( value bloom-filter -- ? ) +: bloom-filter-member? ( object bloom-filter -- ? ) [ relevant-indices ] keep bits>> nths [ ] all? ; From 713f0db0a2ba2b5fb1234d9d2fbed5278e277de5 Mon Sep 17 00:00:00 2001 From: Alec Berryman Date: Sun, 10 May 2009 18:04:47 -0400 Subject: [PATCH 6/7] bloom-filters: clean up indices code Extricating mod from hash creation makes it a little nicer. --- .../bloom-filters/bloom-filters-tests.factor | 2 +- extra/bloom-filters/bloom-filters.factor | 42 ++++++++----------- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/extra/bloom-filters/bloom-filters-tests.factor b/extra/bloom-filters/bloom-filters-tests.factor index b4fd69d849..90fbc81f55 100644 --- a/extra/bloom-filters/bloom-filters-tests.factor +++ b/extra/bloom-filters/bloom-filters-tests.factor @@ -46,7 +46,7 @@ IN: bloom-filters.tests : empty-bloom-filter ( -- bloom-filter ) 0.01 2000 ; -[ 1 ] [ empty-bloom-filter increment-n-objects current-n-objects>> ] unit-test +[ 1 ] [ empty-bloom-filter dup increment-n-objects current-n-objects>> ] unit-test : basic-insert-test-setup ( -- bloom-filter ) 1 empty-bloom-filter [ bloom-filter-insert ] keep ; diff --git a/extra/bloom-filters/bloom-filters.factor b/extra/bloom-filters/bloom-filters.factor index de7aa75a06..46c2a3f8c1 100644 --- a/extra/bloom-filters/bloom-filters.factor +++ b/extra/bloom-filters/bloom-filters.factor @@ -38,8 +38,6 @@ TODO: - Should we signal an error when inserting past the number of objects the filter is sized for? The filter will continue to work, just not very well. -- The other TODOs sprinkled through the code. - */ TUPLE: bloom-filter @@ -76,7 +74,7 @@ ERROR: invalid-n-objects ; ! If the number of hashes isn't positive, we haven't found anything smaller than the ! identity configuration. : validate-sizes ( 2seq -- ) - first 0 <= [ capacity-error ] when* ; + first 0 <= [ capacity-error ] when ; ! The consensus on the tradeoff between increasing the number of bits and ! increasing the number of hash functions seems to be "go for the smallest @@ -119,45 +117,41 @@ PRIVATE> ! See "Bloom Filters in Probabilistic Verification" by Peter C. Dillinger and ! Panagiotis Manolios, section 5.2, "Enhanced Double Hashing": ! 
http://www.cc.gatech.edu/~manolios/research/bloom-filters-verification.html -:: enhanced-double-hash ( index hash0 hash1 array-size -- hash ) - [infix hash0 + (index * hash1) + ((pow(index, 3) - index) / 6) infix] - array-size mod ; +:: enhanced-double-hash ( index hash0 hash1 -- hash ) + [infix hash0 + (index * hash1) + ((pow(index, 3) - index) / 6) infix] ; -: enhanced-double-hashes ( n hash0 hash1 array-size -- seq ) - '[ _ _ _ enhanced-double-hash ] [ [0,b) ] dip map ; +: enhanced-double-hashes ( hash0 hash1 n -- seq ) + [0,b) + [ '[ _ _ enhanced-double-hash ] ] dip + swap map ; -! Stupid, should pick something good. +! Make sure it's a fixnum here to speed up double-hashing. : hashcodes-from-hashcode ( n -- n n ) - dup - ! we could be running this through a lot of double hashing, make sure it's a - ! fixnum here - most-positive-fixnum >fixnum bitxor ; + dup most-positive-fixnum >fixnum bitxor ; -! TODO: This code calls abs because all the double-hashing stuff outputs array -! indices and those aren't good negative. Are we throwing away bits? -1000 -! b. actually prints -1111101000, which confuses me. : hashcodes-from-object ( obj -- n n ) hashcode abs hashcodes-from-hashcode ; : set-indices ( indices bit-array -- ) [ [ drop t ] change-nth ] curry each ; -: increment-n-objects ( bloom-filter -- bloom-filter ) - [ 1 + ] change-current-n-objects ; +: increment-n-objects ( bloom-filter -- ) + [ 1 + ] change-current-n-objects drop ; -: n-hashes-and-bits ( bloom-filter -- n-hashes n-bits ) +: n-hashes-and-length ( bloom-filter -- n-hashes length ) [ n-hashes>> ] [ bits>> length ] bi ; : relevant-indices ( value bloom-filter -- indices ) - n-hashes-and-bits - [ swap hashcodes-from-object ] dip - enhanced-double-hashes ; + [ hashcodes-from-object ] [ n-hashes-and-length ] bi* + [ enhanced-double-hashes ] dip '[ _ mod ] map ; PRIVATE> : bloom-filter-insert ( object bloom-filter -- ) - increment-n-objects - [ relevant-indices ] [ bits>> set-indices ] bi ; + [ increment-n-objects ] + [ relevant-indices ] + [ bits>> set-indices ] + tri ; : bloom-filter-member? ( object bloom-filter -- ? ) [ relevant-indices ] keep From 5a9aa07f15a409afb85c2230c1144fbb23996a09 Mon Sep 17 00:00:00 2001 From: Alec Berryman Date: Sun, 10 May 2009 19:41:39 -0400 Subject: [PATCH 7/7] bloom-filters: fewer fried quots --- extra/bloom-filters/bloom-filters.factor | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/extra/bloom-filters/bloom-filters.factor b/extra/bloom-filters/bloom-filters.factor index 46c2a3f8c1..308d10ad84 100644 --- a/extra/bloom-filters/bloom-filters.factor +++ b/extra/bloom-filters/bloom-filters.factor @@ -84,10 +84,10 @@ ERROR: invalid-n-objects ; ! support it, and I haven't done my own, but we'll go with it anyway. ! : size-bloom-filter ( error-rate number-objects -- number-hashes number-bits ) - '[ _ _ bits-to-satisfy-error-rate ] - '[ dup _ call 2array smaller-second ] - '[ n-hashes-range identity-configuration _ reduce ] - call + [ n-hashes-range identity-configuration ] 2dip + '[ dup [ _ _ bits-to-satisfy-error-rate ] + call 2array smaller-second ] + reduce dup validate-sizes first2 ;
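
A short usage sketch of the vocabulary these patches add. Word names follow the
docs and tests above; the constructor is written here as <bloom-filter>, the
angle-bracketed word that the docs ("Creates an empty Bloom filter.") and the
tests' "0.01 2000 ... ;" definitions refer to, so treat that spelling as an
assumption rather than something confirmed by the text above.

    USING: bloom-filters kernel prettyprint sequences ;

    ! Size the filter for roughly 1000 members at a 1% target false-positive
    ! rate. Per patch 4, the bit count comes from
    !   bits = ceiling( -(objects * hashes) / log(1 - error ^ (1/hashes)) )
    ! and size-bloom-filter picks the configuration with the fewest bits; for
    ! 0.01 and 5000 that settles on 7 hashes and 47965 bits, matching the
    ! unit tests above.
    0.01 1000 <bloom-filter>

    ! Insert some members, using the same [ bloom-filter-insert ] curry each
    ! idiom as the tests.
    { "apple" "banana" "cherry" } over [ bloom-filter-insert ] curry each

    ! Membership tests: lookups never return a false negative, but may return
    ! a false positive (the TODO above notes the observed rate is currently
    ! higher than the configured rate).
    "apple" over bloom-filter-member? .   ! t
    "durian" swap bloom-filter-member? .  ! almost always f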