From ca05d4cefb62e4deb428adf839a58096de8d7606 Mon Sep 17 00:00:00 2001 From: John Benediktsson Date: Mon, 8 Aug 2016 15:15:08 -0700 Subject: [PATCH] cuckoo-filters: adding some documentation. --- .../cuckoo-filters/cuckoo-filters-docs.factor | 28 +++++++++++++++ extra/cuckoo-filters/cuckoo-filters.factor | 36 +++++++++---------- 2 files changed, 46 insertions(+), 18 deletions(-) create mode 100644 extra/cuckoo-filters/cuckoo-filters-docs.factor diff --git a/extra/cuckoo-filters/cuckoo-filters-docs.factor b/extra/cuckoo-filters/cuckoo-filters-docs.factor new file mode 100644 index 0000000000..bc1c27952a --- /dev/null +++ b/extra/cuckoo-filters/cuckoo-filters-docs.factor @@ -0,0 +1,28 @@ +USING: byte-arrays checksums help.markup help.syntax kernel ; +IN: cuckoo-filters + +HELP: cuckoo-insert +{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } } +{ $description "Insert the data into the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data was inserted." } +{ $notes "Attempting to insert data twice will result in the hashed fingerprint of the data appearing twice and the " { $link cuckoo-filter } " size being incremented twice." } ; + +HELP: cuckoo-lookup +{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } } +{ $description "Lookup the data from the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data appears to be a member. This is a probabilistic test, meaning there is a possibility of false positives." } ; + +HELP: cuckoo-delete +{ $values { "bytes" byte-array } { "cuckoo-filter" cuckoo-filter } { "?" boolean } } +{ $description "Remove the data from the " { $snippet "cuckoo-filter" } ", returning " { $link t } " if the data appears to be removed." 
} ; + +ARTICLE: "cuckoo-filters" "Cuckoo Filters" +"Cuckoo Filters are probabilistic data structures similar to Bloom Filters that provide support for removing elements without significantly degrading space and performance." +$nl +"Instead of storing the elements themselves, it stores a fingerprint obtained by using a " { $link checksum } ". This allows for item removal without false negatives (assuming you do not try to remove an item not contained in the filter)." +$nl +"For applications that store many items and target low false-positive rates, Cuckoo Filters can have a lower space overhead than Bloom Filters." +$nl +"More information is available in the paper by Andersen, Kaminsky, and Mitzenmacher titled \"Cuckoo Filter: Practically Better Than Bloom\":" +$nl +{ $url "http://www.pdl.cmu.edu/PDL-FTP/FS/cuckoo-conext2014.pdf" } ; + +ABOUT: "cuckoo-filters" diff --git a/extra/cuckoo-filters/cuckoo-filters.factor b/extra/cuckoo-filters/cuckoo-filters.factor index 8c3db68220..e64b12115d 100644 --- a/extra/cuckoo-filters/cuckoo-filters.factor +++ b/extra/cuckoo-filters/cuckoo-filters.factor @@ -54,43 +54,43 @@ TUPLE: cuckoo-filter buckets checksum size ; : ( capacity -- cuckoo-filter ) sha1 0 cuckoo-filter boa ; -:: cuckoo-insert ( obj cuckoo-filter -- ? ) - obj cuckoo-filter tag-indices :> ( tag! i1 i2 ) +:: cuckoo-insert ( bytes cuckoo-filter -- ? ) + bytes cuckoo-filter tag-indices :> ( tag! i1 i2 ) cuckoo-filter buckets>> :> buckets - buckets length :> cuckoo-size + buckets length :> n { - [ tag i1 cuckoo-size mod buckets nth bucket-insert ] - [ tag i2 cuckoo-size mod buckets nth bucket-insert ] + [ tag i1 n mod buckets nth bucket-insert ] + [ tag i2 n mod buckets nth bucket-insert ] } 0|| [ cuckoo-filter [ 1 + ] change-size drop t ] [ cuckoo-filter checksum>> :> checksum - { i1 i2 } random :> i! + 2 random zero? i1 i2 ? :> i! max-cuckoo-count [ drop - tag i cuckoo-size mod buckets nth bucket-swap tag! 
tag i alt-index i! - tag i cuckoo-size mod buckets nth bucket-insert + tag i n mod buckets nth bucket-insert dup [ cuckoo-filter [ 1 + ] change-size drop ] when ] find-integer >boolean ] if ; -:: cuckoo-lookup ( obj cuckoo-filter -- ? ) - obj cuckoo-filter tag-indices :> ( tag i1 i2 ) +:: cuckoo-lookup ( bytes cuckoo-filter -- ? ) + bytes cuckoo-filter tag-indices :> ( tag i1 i2 ) cuckoo-filter buckets>> :> buckets - buckets length :> cuckoo-size + buckets length :> n { - [ tag i1 cuckoo-size mod buckets nth bucket-lookup ] - [ tag i2 cuckoo-size mod buckets nth bucket-lookup ] + [ tag i1 n mod buckets nth bucket-lookup ] + [ tag i2 n mod buckets nth bucket-lookup ] } 0|| ; -:: cuckoo-delete ( obj cuckoo-filter -- ? ) - obj cuckoo-filter tag-indices :> ( tag i1 i2 ) +:: cuckoo-delete ( bytes cuckoo-filter -- ? ) + bytes cuckoo-filter tag-indices :> ( tag i1 i2 ) cuckoo-filter buckets>> :> buckets - buckets length :> cuckoo-size + buckets length :> n { - [ tag i1 cuckoo-size mod buckets nth bucket-delete ] - [ tag i2 cuckoo-size mod buckets nth bucket-delete ] + [ tag i1 n mod buckets nth bucket-delete ] + [ tag i2 n mod buckets nth bucket-delete ] } 0|| dup [ cuckoo-filter [ 1 - ] change-size drop ] when ;