diff --git a/extra/bloom-filters/authors.txt b/extra/bloom-filters/authors.txt new file mode 100644 index 0000000000..528e5dfe6b --- /dev/null +++ b/extra/bloom-filters/authors.txt @@ -0,0 +1 @@ +Alec Berryman diff --git a/extra/bloom-filters/bloom-filters-docs.factor b/extra/bloom-filters/bloom-filters-docs.factor new file mode 100644 index 0000000000..bc5df8611c --- /dev/null +++ b/extra/bloom-filters/bloom-filters-docs.factor @@ -0,0 +1,38 @@ +USING: help.markup help.syntax kernel math ; +IN: bloom-filters + +HELP: +{ $values { "error-rate" "The desired false positive rate. A " { $link float } " between 0 and 1." } + { "number-objects" "The expected number of objects in the set. A positive " { $link integer } "." } + { "bloom-filter" bloom-filter } } +{ $description "Creates an empty Bloom filter." } +{ $errors "Throws a " { $link capacity-error } " when unable to produce a filter meeting the given constraints. Throws an " { $link invalid-error-rate } " or an " { $link invalid-n-objects } " when input is invalid." } ; + + +HELP: bloom-filter-insert +{ $values { "object" object } + { "bloom-filter" bloom-filter } } +{ $description "Records the item as a member of the filter." } +{ $side-effects "bloom-filter" } ; + +HELP: bloom-filter-member? +{ $values { "object" object } + { "bloom-filter" bloom-filter } + { "?" boolean } } +{ $description "Returns " { $link t } " if the object may be a member of the Bloom filter, " { $link f } " otherwise. The false positive rate is configurable; there are no false negatives." } ; + +HELP: bloom-filter +{ $class-description "This is the class for Bloom filters. These provide constant-time insertion and probabilistic membership-testing operations, but do not actually store any elements." } ; + +ARTICLE: "bloom-filters" "Bloom filters" +"This is a library for Bloom filters, sets that provide a constant-time insertion operation and probabilistic membership tests, but do not actually store any elements." 
+$nl +"The accuracy of the membership test is configurable; a Bloom filter will never incorrectly report an item is not a member of the set, but may incorrectly report that an item is a member of the set." +$nl +"Bloom filters cannot be resized and do not support removal." +$nl +{ $subsection } +{ $subsection bloom-filter-insert } +{ $subsection bloom-filter-member? } ; + +ABOUT: "bloom-filters" diff --git a/extra/bloom-filters/bloom-filters-tests.factor b/extra/bloom-filters/bloom-filters-tests.factor new file mode 100644 index 0000000000..90fbc81f55 --- /dev/null +++ b/extra/bloom-filters/bloom-filters-tests.factor @@ -0,0 +1,81 @@ +USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts +math random sequences tools.test ; +IN: bloom-filters.tests + + +[ { 200 5 } ] [ { 100 7 } { 200 5 } smaller-second ] unit-test +[ { 200 5 } ] [ { 200 5 } { 100 7 } smaller-second ] unit-test + +! The sizing information was generated using the subroutine +! calculate_shortest_filter_length from +! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html. + +! Test bloom-filter creation +[ 47965 ] [ 7 0.01 5000 bits-to-satisfy-error-rate ] unit-test +[ 7 47965 ] [ 0.01 5000 size-bloom-filter ] unit-test +[ 7 ] [ 0.01 5000 n-hashes>> ] unit-test +[ 47965 ] [ 0.01 5000 bits>> length ] unit-test +[ 5000 ] [ 0.01 5000 maximum-n-objects>> ] unit-test +[ 0 ] [ 0.01 5000 current-n-objects>> ] unit-test + +! Should return the fewest hashes to satisfy the bits requested, not the most. +[ 32 ] [ 4 0.05 5 bits-to-satisfy-error-rate ] unit-test +[ 32 ] [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test +[ 4 32 ] [ 0.05 5 size-bloom-filter ] unit-test + +! This is a lot of bits. +: oversized-filter-params ( -- error-rate n-objects ) + 0.00000001 400000000000000 ; +[ oversized-filter-params size-bloom-filter ] [ capacity-error? ] must-fail-with +[ oversized-filter-params ] [ capacity-error? ] must-fail-with + +! Other error conditions. +[ 1.0 2000 ] [ invalid-error-rate? 
] must-fail-with +[ 20 2000 ] [ invalid-error-rate? ] must-fail-with +[ 0.0 2000 ] [ invalid-error-rate? ] must-fail-with +[ -2 2000 ] [ invalid-error-rate? ] must-fail-with +[ 0.5 0 ] [ invalid-n-objects? ] must-fail-with +[ 0.5 -5 ] [ invalid-n-objects? ] must-fail-with + +! Should not generate bignum hash codes. Enhanced double hashing may generate a +! lot of hash codes, and it's better to do this earlier than later. +[ t ] [ 10000 iota [ hashcodes-from-object [ fixnum? ] both? ] map [ ] all? ] unit-test + +[ ?{ t f t f t f } ] [ { 0 2 4 } 6 [ set-indices ] keep ] unit-test + +: empty-bloom-filter ( -- bloom-filter ) + 0.01 2000 ; + +[ 1 ] [ empty-bloom-filter dup increment-n-objects current-n-objects>> ] unit-test + +: basic-insert-test-setup ( -- bloom-filter ) + 1 empty-bloom-filter [ bloom-filter-insert ] keep ; + +! Basic tests that insert does something +[ t ] [ basic-insert-test-setup bits>> [ ] any? ] unit-test +[ 1 ] [ basic-insert-test-setup current-n-objects>> ] unit-test + +: non-empty-bloom-filter ( -- bloom-filter ) + 1000 iota + empty-bloom-filter + [ [ bloom-filter-insert ] curry each ] keep ; + +: full-bloom-filter ( -- bloom-filter ) + 2000 iota + empty-bloom-filter + [ [ bloom-filter-insert ] curry each ] keep ; + +! Should find what we put in there. +[ t ] [ 2000 iota + full-bloom-filter + [ bloom-filter-member? ] curry map + [ ] all? ] unit-test + +! We shouldn't have more than 0.01 false-positive rate. +[ t ] [ 1000 iota [ drop most-positive-fixnum random 1000 + ] map + full-bloom-filter + [ bloom-filter-member? ] curry map + [ ] filter + ! TODO: This should be 10, but the false positive rate is currently very + ! high. It shouldn't be much more than this. + length 150 <= ] unit-test diff --git a/extra/bloom-filters/bloom-filters.factor b/extra/bloom-filters/bloom-filters.factor new file mode 100644 index 0000000000..308d10ad84 --- /dev/null +++ b/extra/bloom-filters/bloom-filters.factor @@ -0,0 +1,158 @@ +! 
Copyright (C) 2009 Alec Berryman. +! See http://factorcode.org/license.txt for BSD license. +USING: accessors arrays bit-arrays fry infix kernel layouts locals math +math.functions multiline sequences ; +IN: bloom-filters + +FROM: math.ranges => [1,b] [0,b) ; +FROM: math.intervals => (a,b) interval-contains? ; + +/* + +TODO: + +- The false positive rate is 10x what it should be, based on informal testing. + Better object hashes or a better method of generating extra hash codes would + help. Another way is to increase the number of bits used. + + - Try something smarter than the bitwise complement for a second hash code. + + - http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html + makes a case for http://murmurhash.googlepages.com/ instead of enhanced + double-hashing. + + - Be sure to adjust the test that asserts the number of false positives isn't + unreasonable. + +- Could round bits up to next power of two and use wrap instead of mod. This + would cost a lot of bits on 32-bit platforms, though, and limit the bit-array + to 8MB. + +- Should allow user to specify the hash codes, either as inputs to enhanced + double hashing or for direct use. + +- Support for serialization. + +- Wrappers for combining filters. + +- Should we signal an error when inserting past the number of objects the filter + is sized for? The filter will continue to work, just not very well. + +*/ + +TUPLE: bloom-filter +{ n-hashes fixnum read-only } +{ bits bit-array read-only } +{ maximum-n-objects fixnum read-only } +{ current-n-objects fixnum } ; + +ERROR: capacity-error ; +ERROR: invalid-error-rate ; +ERROR: invalid-n-objects ; + +integer ; + +! 100 hashes ought to be enough for anybody. +: n-hashes-range ( -- range ) + 100 [1,b] ; + +! { n-hashes n-bits } +: identity-configuration ( -- 2seq ) + 0 max-array-capacity 2array ; + +: smaller-second ( 2seq 2seq -- 2seq ) + [ [ second ] bi@ <= ] most ; + +! 
If the number of hashes isn't positive, we haven't found anything smaller than the +! identity configuration. +: validate-sizes ( 2seq -- ) + first 0 <= [ capacity-error ] when ; + +! The consensus on the tradeoff between increasing the number of bits and +! increasing the number of hash functions seems to be "go for the smallest +! number of bits", probably because most implementations just generate one hash +! value and cheaply mangle it into the number of hashes they need. I have not +! seen any usage studies from the implementations that made this tradeoff to +! support it, and I haven't done my own, but we'll go with it anyway. +! +: size-bloom-filter ( error-rate number-objects -- number-hashes number-bits ) + [ n-hashes-range identity-configuration ] 2dip + '[ dup [ _ _ bits-to-satisfy-error-rate ] + call 2array smaller-second ] + reduce + dup validate-sizes + first2 ; + +: validate-n-objects ( n-objects -- ) + 0 <= [ invalid-n-objects ] when ; + +: valid-error-rate-interval ( -- interval ) + 0 1 (a,b) ; + +: validate-error-rate ( error-rate -- ) + valid-error-rate-interval interval-contains? + [ invalid-error-rate ] unless ; + +: validate-constraints ( error-rate n-objects -- ) + validate-n-objects validate-error-rate ; + +PRIVATE> + +: ( error-rate number-objects -- bloom-filter ) + [ validate-constraints ] 2keep + [ size-bloom-filter ] keep + 0 ! 
initially empty + bloom-filter boa ; + +fixnum bitxor ; + +: hashcodes-from-object ( obj -- n n ) + hashcode abs hashcodes-from-hashcode ; + +: set-indices ( indices bit-array -- ) + [ [ drop t ] change-nth ] curry each ; + +: increment-n-objects ( bloom-filter -- ) + [ 1 + ] change-current-n-objects drop ; + +: n-hashes-and-length ( bloom-filter -- n-hashes length ) + [ n-hashes>> ] [ bits>> length ] bi ; + +: relevant-indices ( value bloom-filter -- indices ) + [ hashcodes-from-object ] [ n-hashes-and-length ] bi* + [ enhanced-double-hashes ] dip '[ _ mod ] map ; + +PRIVATE> + +: bloom-filter-insert ( object bloom-filter -- ) + [ increment-n-objects ] + [ relevant-indices ] + [ bits>> set-indices ] + tri ; + +: bloom-filter-member? ( object bloom-filter -- ? ) + [ relevant-indices ] keep + bits>> nths [ ] all? ;