From ff04cf82fe1e55bc30803e45e1b0e1ec1a6812ea Mon Sep 17 00:00:00 2001 From: Alec Berryman Date: Fri, 8 May 2009 23:30:01 -0400 Subject: [PATCH] bloom-filters: clean up creation More readable, less allocation, signals invalid input. --- extra/bloom-filters/bloom-filters-docs.factor | 6 +- .../bloom-filters/bloom-filters-tests.factor | 24 +++++-- extra/bloom-filters/bloom-filters.factor | 66 ++++++++++++------- 3 files changed, 63 insertions(+), 33 deletions(-) diff --git a/extra/bloom-filters/bloom-filters-docs.factor b/extra/bloom-filters/bloom-filters-docs.factor index 4af1a82af6..bc5df8611c 100644 --- a/extra/bloom-filters/bloom-filters-docs.factor +++ b/extra/bloom-filters/bloom-filters-docs.factor @@ -3,9 +3,11 @@ IN: bloom-filters HELP: { $values { "error-rate" "The desired false positive rate. A " { $link float } " between 0 and 1." } - { "number-objects" "The expected number of object in the set. An " { $link integer } "." } + { "number-objects" "The expected number of objects in the set. A positive " { $link integer } "." } { "bloom-filter" bloom-filter } } -{ $description "Creates an empty Bloom filter." } ; +{ $description "Creates an empty Bloom filter." } +{ $errors "Throws a " { $link capacity-error } " when unable to produce a filter meeting the given constraints. Throws a " { $link invalid-error-rate } " or a " { $link invalid-n-objects } " when input is invalid." 
} ; + HELP: bloom-filter-insert { $values { "object" object } diff --git a/extra/bloom-filters/bloom-filters-tests.factor b/extra/bloom-filters/bloom-filters-tests.factor index 40fd1469b2..b4fd69d849 100644 --- a/extra/bloom-filters/bloom-filters-tests.factor +++ b/extra/bloom-filters/bloom-filters-tests.factor @@ -2,6 +2,10 @@ USING: accessors bit-arrays bloom-filters bloom-filters.private kernel layouts math random sequences tools.test ; IN: bloom-filters.tests + +[ { 200 5 } ] [ { 100 7 } { 200 5 } smaller-second ] unit-test +[ { 200 5 } ] [ { 200 5 } { 100 7 } smaller-second ] unit-test + ! The sizing information was generated using the subroutine ! calculate_shortest_filter_length from ! http://www.perl.com/pub/a/2004/04/08/bloom_filters.html. @@ -19,13 +23,19 @@ IN: bloom-filters.tests [ 32 ] [ 5 0.05 5 bits-to-satisfy-error-rate ] unit-test [ 4 32 ] [ 0.05 5 size-bloom-filter ] unit-test -! This is a lot of bits. On linux-x86-32, max-array-capacity is 134217727, -! which is about 16MB (assuming I can do math), which is sort of pithy. I'm -! not sure how to handle this case. Returning a smaller-than-requested -! arrays is not the least surprising behavior, but is still surprising. -[ 383718189 ] [ 7 0.01 40000000 bits-to-satisfy-error-rate ] unit-test -! [ 7 383718189 ] [ 0.01 40000000 size-bloom-filter ] unit-test -! [ 383718189 ] [ 0.01 40000000 bits>> length ] unit-test +! This is a lot of bits. +: oversized-filter-params ( -- error-rate n-objects ) + 0.00000001 400000000000000 ; +[ oversized-filter-params size-bloom-filter ] [ capacity-error? ] must-fail-with +[ oversized-filter-params ] [ capacity-error? ] must-fail-with + +! Other error conditions. +[ 1.0 2000 ] [ invalid-error-rate? ] must-fail-with +[ 20 2000 ] [ invalid-error-rate? ] must-fail-with +[ 0.0 2000 ] [ invalid-error-rate? ] must-fail-with +[ -2 2000 ] [ invalid-error-rate? ] must-fail-with +[ 0.5 0 ] [ invalid-n-objects? ] must-fail-with +[ 0.5 -5 ] [ invalid-n-objects? ] must-fail-with ! 
Should not generate bignum hash codes. Enhanced double hashing may generate a ! lot of hash codes, and it's better to do this earlier than later. diff --git a/extra/bloom-filters/bloom-filters.factor b/extra/bloom-filters/bloom-filters.factor index 3e0aba175c..5440461892 100644 --- a/extra/bloom-filters/bloom-filters.factor +++ b/extra/bloom-filters/bloom-filters.factor @@ -1,17 +1,16 @@ ! Copyright (C) 2009 Alec Berryman. ! See http://factorcode.org/license.txt for BSD license. USING: accessors arrays bit-arrays fry kernel layouts locals math math.functions -math.ranges multiline sequences ; +multiline sequences ; IN: bloom-filters +FROM: math.ranges => [1,b] [0,b) ; +FROM: math.intervals => (a,b) interval-contains? ; + /* TODO: -- How to singal an error when too many bits? It looks like a built-in for some - types of arrays, but bit-array just returns a zero-length array. What we do - now is completely broken: -1 hash codes? Really? - - The false positive rate is 10x what it should be, based on informal testing. Better object hashes or a better method of generating extra hash codes would help. Another way is to increase the number of bits used. @@ -25,7 +24,9 @@ TODO: - Be sure to adjust the test that asserts the number of false positives isn't unreasonable. -- Should round bits up to next power of two, use wrap instead of mod. +- Could round bits up to next power of two and use wrap instead of mod. This + would cost a lot of bits on 32-bit platforms, though, and limit the bit-array + to 8MB. - Should allow user to specify the hash codes, either as inputs to enhanced double hashing or for direct use. @@ -47,6 +48,10 @@ TUPLE: bloom-filter { maximum-n-objects fixnum read-only } { current-n-objects fixnum } ; +ERROR: capacity-error ; +ERROR: invalid-error-rate ; +ERROR: invalid-n-objects ; + integer ; ! should check that it's below max-array-capacity -! TODO: this should be a constant -! -! 
TODO: after very little experimentation, I never see this increase after about -! 20 or so. Maybe it should be smaller. +! 100 hashes ought to be enough for anybody. : n-hashes-range ( -- range ) 100 [1,b] ; -! Ends up with a list of arrays - { n-bits position } -: find-bloom-filter-sizes ( error-rate number-objects -- seq ) - [ bits-to-satisfy-error-rate ] 2curry - n-hashes-range swap - map - n-hashes-range zip ; +! { n-hashes n-bits } +: identity-configuration ( -- 2seq ) + 0 max-array-capacity 2array ; -: smallest-first ( seq1 seq2 -- seq ) - [ [ first ] bi@ <= ] most ; +: smaller-second ( 2seq 2seq -- 2seq ) + [ [ second ] bi@ <= ] most ; + +! If the number of hashes isn't positive, we haven't found anything smaller than the +! identity configuration. +: validate-sizes ( 2seq -- ) + first 0 <= [ capacity-error ] when* ; ! The consensus on the tradeoff between increasing the number of bits and ! increasing the number of hash functions seems to be "go for the smallest @@ -80,17 +84,31 @@ TUPLE: bloom-filter ! seen any usage studies from the implementations that made this tradeoff to ! support it, and I haven't done my own, but we'll go with it anyway. ! -! TODO: check that error-rate is reasonable. : size-bloom-filter ( error-rate number-objects -- number-hashes number-bits ) - find-bloom-filter-sizes - max-array-capacity -1 2array - [ smallest-first ] - reduce - [ second ] [ first ] bi ; + '[ _ _ bits-to-satisfy-error-rate ] + '[ dup _ call 2array smaller-second ] + '[ n-hashes-range identity-configuration _ reduce ] + call + dup validate-sizes + first2 ; + +: validate-n-objects ( n-objects -- ) + 0 <= [ invalid-n-objects ] when ; + +: valid-error-rate-interval ( -- interval ) + 0 1 (a,b) ; + +: validate-error-rate ( error-rate -- ) + valid-error-rate-interval interval-contains? 
+ [ invalid-error-rate ] unless ; + +: validate-constraints ( error-rate n-objects -- ) + validate-n-objects validate-error-rate ; PRIVATE> : ( error-rate number-objects -- bloom-filter ) + [ validate-constraints ] 2keep [ size-bloom-filter ] keep 0 ! initially empty bloom-filter boa ;