diff --git a/extra/math/cardinality/authors.txt b/extra/math/cardinality/authors.txt new file mode 100644 index 0000000000..e091bb8164 --- /dev/null +++ b/extra/math/cardinality/authors.txt @@ -0,0 +1 @@ +John Benediktsson diff --git a/extra/math/cardinality/cardinality-docs.factor b/extra/math/cardinality/cardinality-docs.factor new file mode 100644 index 0000000000..7cf473136e --- /dev/null +++ b/extra/math/cardinality/cardinality-docs.factor @@ -0,0 +1,12 @@ +! Copyright (C) 2012 John Benediktsson +! See http://factorcode.org/license.txt for BSD license +USING: help.markup help.syntax layouts math sequences ; +IN: math.cardinality + +HELP: trailing-zeros +{ $values { "m" number } { "n" number } } +{ $description "Counts the number of trailing 0 bits in " { $snippet "m" } ", returning " { $link fixnum-bits } " if the number is zero." } ; + +HELP: estimate-cardinality +{ $values { "seq" sequence } { "k" number } { "n" number } } +{ $description "Estimates the number of unique elements in " { $snippet "seq" } "." $nl "The number " { $snippet "k" } " controls how many bits of hash to use, creating " { $snippet "2^k" } " buckets." } ; diff --git a/extra/math/cardinality/cardinality.factor b/extra/math/cardinality/cardinality.factor new file mode 100644 index 0000000000..3976c4628e --- /dev/null +++ b/extra/math/cardinality/cardinality.factor @@ -0,0 +1,25 @@ +! Copyright (C) 2012 John Benediktsson +! See http://factorcode.org/license.txt for BSD license +USING: arrays kernel layouts locals math math.functions +math.order math.statistics sequences ; +IN: math.cardinality + +GENERIC: trailing-zeros ( m -- n ) + +M: fixnum trailing-zeros + [ fixnum-bits ] [ + 0 [ over even? ] [ [ 2/ ] [ 1 + ] bi* ] while nip + ] if-zero ; + +:: estimate-cardinality ( seq k -- n ) + k 2^ :> num_buckets + num_buckets 0 :> max_zeros + seq [ + hashcode >fixnum :> h + h num_buckets 1 - bitand :> bucket + h k neg shift :> bucket_hash + bucket max_zeros [ + bucket_hash trailing-zeros max + ] change-nth + ] each + max_zeros [ mean 2 swap ^ ] [ length * ] bi 0.79402 * ;