bloom-filters: clean up creation
More readable, less allocation, signals invalid input.
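
A quick listener sketch of the new behavior (an illustration based on the tests below, not part of the commit itself):

USING: bloom-filters ;
0.05 2000 <bloom-filter>  ! well-formed input: an empty filter sized for the constraints
1.0 2000 <bloom-filter>   ! now throws invalid-error-rate instead of silently misbehaving
0.05 -5 <bloom-filter>    ! now throws invalid-n-objects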
parent 2cf079b2d3
commit ff04cf82fe
@@ -3,9 +3,11 @@ IN: bloom-filters
 HELP: <bloom-filter>
 { $values { "error-rate" "The desired false positive rate. A " { $link float } " between 0 and 1." }
-          { "number-objects" "The expected number of object in the set. An " { $link integer } "." }
+          { "number-objects" "The expected number of objects in the set. A positive " { $link integer } "." }
           { "bloom-filter" bloom-filter } }
-{ $description "Creates an empty Bloom filter." } ;
+{ $description "Creates an empty Bloom filter." }
+{ $errors "Throws a " { $link capacity-error } " when unable to produce a filter meeting the given constraints. Throws a " { $link invalid-error-rate } " or a " { $link invalid-n-objects } " when input is invalid." } ;
 
 
 HELP: bloom-filter-insert
 { $values { "object" object }
@@ -2,6 +2,10 @@ USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts
 math random sequences tools.test ;
 IN: bloom-filters.tests
 
+
+[ { 200 5 } ] [ { 100 7 } { 200 5 } smaller-second ] unit-test
+[ { 200 5 } ] [ { 200 5 } { 100 7 } smaller-second ] unit-test
+
 ! The sizing information was generated using the subroutine
 ! calculate_shortest_filter_length from
 ! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html.
@@ -19,13 +23,19 @@ IN: bloom-filters.tests
 [ 32 ] [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test
 [ 4 32 ] [ 0.05 5 size-bloom-filter ] unit-test
 
-! This is a lot of bits. On linux-x86-32, max-array-capacity is 134217727,
-! which is about 16MB (assuming I can do math), which is sort of pithy. I'm
-! not sure how to handle this case. Returning a smaller-than-requested
-! array is not the least surprising behavior, but is still surprising.
 [ 383718189 ] [ 7 0.01 40000000 bits-to-satisfy-error-rate ] unit-test
-! [ 7 383718189 ] [ 0.01 40000000 size-bloom-filter ] unit-test
-! [ 383718189 ] [ 0.01 40000000 <bloom-filter> bits>> length ] unit-test
+! This is a lot of bits.
+: oversized-filter-params ( -- error-rate n-objects )
+    0.00000001 400000000000000 ;
+[ oversized-filter-params size-bloom-filter ] [ capacity-error? ] must-fail-with
+[ oversized-filter-params <bloom-filter> ] [ capacity-error? ] must-fail-with
+
+! Other error conditions.
+[ 1.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ 20 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ 0.0 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ -2 2000 <bloom-filter> ] [ invalid-error-rate? ] must-fail-with
+[ 0.5 0 <bloom-filter> ] [ invalid-n-objects? ] must-fail-with
+[ 0.5 -5 <bloom-filter> ] [ invalid-n-objects? ] must-fail-with
 
 ! Should not generate bignum hash codes. Enhanced double hashing may generate a
 ! lot of hash codes, and it's better to do this earlier than later.
@@ -1,17 +1,16 @@
 ! Copyright (C) 2009 Alec Berryman.
 ! See http://factorcode.org/license.txt for BSD license.
 USING: accessors arrays bit-arrays fry kernel layouts locals math math.functions
-math.ranges multiline sequences ;
+multiline sequences ;
 IN: bloom-filters
 
+FROM: math.ranges => [1,b] [0,b) ;
+FROM: math.intervals => (a,b) interval-contains? ;
+
 /*
 
 TODO:
 
-- How to signal an error when too many bits? It looks like a built-in for some
-  types of arrays, but bit-array just returns a zero-length array. What we do
-  now is completely broken: -1 hash codes? Really?
-
 - The false positive rate is 10x what it should be, based on informal testing.
   Better object hashes or a better method of generating extra hash codes would
   help. Another way is to increase the number of bits used.
@@ -25,7 +24,9 @@ TODO:
 - Be sure to adjust the test that asserts the number of false positives isn't
   unreasonable.
 
-- Should round bits up to next power of two, use wrap instead of mod.
+- Could round bits up to next power of two and use wrap instead of mod. This
+  would cost a lot of bits on 32-bit platforms, though, and limit the bit-array
+  to 8MB.
 
 - Should allow user to specify the hash codes, either as inputs to enhanced
   double hashing or for direct use.
@@ -47,6 +48,10 @@ TUPLE: bloom-filter
 { maximum-n-objects fixnum read-only }
 { current-n-objects fixnum } ;
 
+ERROR: capacity-error ;
+ERROR: invalid-error-rate ;
+ERROR: invalid-n-objects ;
+
 <PRIVATE
 
 ! number-bits = -(n-objects * n-hashes) / ln(1 - error-rate ^ 1/n-hashes)
@@ -56,22 +61,21 @@ TUPLE: bloom-filter
     /
     ceiling >integer ; ! should check that it's below max-array-capacity
 
-! TODO: this should be a constant
-!
-! TODO: after very little experimentation, I never see this increase after about
-! 20 or so. Maybe it should be smaller.
+! 100 hashes ought to be enough for anybody.
 : n-hashes-range ( -- range )
     100 [1,b] ;
 
-! Ends up with a list of arrays - { n-bits position }
-: find-bloom-filter-sizes ( error-rate number-objects -- seq )
-    [ bits-to-satisfy-error-rate ] 2curry
-    n-hashes-range swap
-    map
-    n-hashes-range zip ;
+! { n-hashes n-bits }
+: identity-configuration ( -- 2seq )
+    0 max-array-capacity 2array ;
 
-: smallest-first ( seq1 seq2 -- seq )
-    [ [ first ] bi@ <= ] most ;
+: smaller-second ( 2seq 2seq -- 2seq )
+    [ [ second ] bi@ <= ] most ;
+
+! If the number of hashes isn't positive, we haven't found anything smaller than the
+! identity configuration.
+: validate-sizes ( 2seq -- )
+    first 0 <= [ capacity-error ] when* ;
 
 ! The consensus on the tradeoff between increasing the number of bits and
 ! increasing the number of hash functions seems to be "go for the smallest
@@ -80,17 +84,31 @@ TUPLE: bloom-filter
 ! seen any usage studies from the implementations that made this tradeoff to
 ! support it, and I haven't done my own, but we'll go with it anyway.
 !
-! TODO: check that error-rate is reasonable.
 : size-bloom-filter ( error-rate number-objects -- number-hashes number-bits )
-    find-bloom-filter-sizes
-    max-array-capacity -1 2array
-    [ smallest-first ]
-    reduce
-    [ second ] [ first ] bi ;
+    '[ _ _ bits-to-satisfy-error-rate ]
+    '[ dup _ call 2array smaller-second ]
+    '[ n-hashes-range identity-configuration _ reduce ]
+    call
+    dup validate-sizes
+    first2 ;
+
+: validate-n-objects ( n-objects -- )
+    0 <= [ invalid-n-objects ] when ;
+
+: valid-error-rate-interval ( -- interval )
+    0 1 (a,b) ;
+
+: validate-error-rate ( error-rate -- )
+    valid-error-rate-interval interval-contains?
+    [ invalid-error-rate ] unless ;
+
+: validate-constraints ( error-rate n-objects -- )
+    validate-n-objects validate-error-rate ;
 
 PRIVATE>
 
 : <bloom-filter> ( error-rate number-objects -- bloom-filter )
+    [ validate-constraints ] 2keep
     [ size-bloom-filter <bit-array> ] keep
     0 ! initially empty
     bloom-filter boa ;
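
For reference, a worked check of the sizing formula above against the existing unit test (editor's arithmetic, not from the commit):

! number-bits = -(n-objects * n-hashes) / ln(1 - error-rate ^ 1/n-hashes)
! For n-hashes = 5, error-rate = 0.05, n-objects = 5:
!   0.05 ^ (1/5) ~ 0.5493
!   -(5 * 5) / ln(1 - 0.5493) = -25 / -0.797 ~ 31.4
! ceiling >integer rounds this up to 32, matching
! [ 32 ] [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test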