Use capacity / error_rate syntax
This commit is contained in:
parent
54b8905e6f
commit
b73b5692d3
|
@ -1,24 +1,50 @@
|
|||
/*
|
||||
* BloomFilters as implemented by https://github.com/jasondavies/bloomfilter.js
|
||||
*
|
||||
* Original license kept
|
||||
*
|
||||
* Modified by Phyks to be constructed using the (capacity, error_rate) syntax rather
|
||||
* than the explicit (number of bits, number of hash functions) syntax.
|
||||
*/
|
||||
|
||||
(function(exports) {
|
||||
exports.BloomFilter = BloomFilter;
|
||||
exports.fnv_1a = fnv_1a;
|
||||
exports.fnv_1a_b = fnv_1a_b;
|
||||
|
||||
var typedArrays = typeof ArrayBuffer !== "undefined";
|
||||
|
||||
// Creates a new bloom filter. If *m* is an array-like object, with a length
|
||||
// Creates a new bloom filter given its minimal capacity and an error_rate.
|
||||
// Calculation taken from https://en.wikipedia.org/wiki/Bloom_filter.
|
||||
// If *capacity* is an array-like object, with a length
|
||||
// property, then the bloom filter is loaded with data from the array, where
|
||||
// each element is a 32-bit integer. Otherwise, *m* should specify the
|
||||
// number of bits. Note that *m* is rounded up to the nearest multiple of
|
||||
// 32. *k* specifies the number of hashing functions.
|
||||
function BloomFilter(m, k) {
|
||||
var a;
|
||||
if (typeof m !== "number") a = m, m = a.length * 32;
|
||||
|
||||
var n = Math.ceil(m / 32),
|
||||
i = -1;
|
||||
this.m = m = n * 32;
|
||||
// each element is a 32-bit integer.
|
||||
// *error_rate* is the target false-positive rate of the filter (e.g. 0.1 for 10%).
|
||||
function BloomFilter(capacity, error_rate) {
|
||||
// *m* is the number of bits. Note that *m* is rounded up to
|
||||
// the nearest multiple of 32. *k* specifies the number of hashing functions.
|
||||
var a, i = -1;
|
||||
// Number of hashing functions (slices), k
|
||||
var k = Math.ceil(- Math.log(error_rate) / Math.log(2));
|
||||
// Total number of bits, m
|
||||
// Size of the UInt32 table, n
|
||||
var m, n;
|
||||
if (typeof capacity !== "number") {
|
||||
a = capacity;
|
||||
// Total number of bits, m
|
||||
m = a.length * 32;
|
||||
// Size of the UInt32 table, n
|
||||
n = a.length;
|
||||
}
|
||||
else {
|
||||
// Total number of bits, m
|
||||
m = Math.ceil(capacity * Math.abs(Math.log(error_rate)) / (k * Math.pow(Math.log(2), 2))) * k;
|
||||
// Size of the UInt32 table, n
|
||||
n = Math.ceil(m / 32);
|
||||
// Round the total number of bits up to the next multiple of 32
|
||||
m = n * 32;
|
||||
}
|
||||
this.m = m;
|
||||
this.k = k;
|
||||
|
||||
if (typedArrays) {
|
||||
var kbytes = 1 << Math.ceil(Math.log(Math.ceil(Math.log(m) / Math.LN2 / 8)) / Math.LN2),
|
||||
array = kbytes === 1 ? Uint8Array : kbytes === 2 ? Uint16Array : Uint32Array,
|
||||
|
@ -33,7 +59,6 @@
|
|||
this._locations = [];
|
||||
}
|
||||
}
|
||||
|
||||
// See http://willwhim.wordpress.com/2011/09/03/producing-n-hash-functions-by-hashing-only-once/
|
||||
BloomFilter.prototype.locations = function(v) {
|
||||
var k = this.k,
|
||||
|
@ -49,7 +74,6 @@
|
|||
}
|
||||
return r;
|
||||
};
|
||||
|
||||
BloomFilter.prototype.add = function(v) {
|
||||
var l = this.locations(v + ""),
|
||||
i = -1,
|
||||
|
@ -57,7 +81,6 @@
|
|||
buckets = this.buckets;
|
||||
while (++i < k) buckets[Math.floor(l[i] / 32)] |= 1 << (l[i] % 32);
|
||||
};
|
||||
|
||||
BloomFilter.prototype.test = function(v) {
|
||||
var l = this.locations(v + ""),
|
||||
i = -1,
|
||||
|
@ -72,7 +95,6 @@
|
|||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
// Estimated cardinality.
|
||||
BloomFilter.prototype.size = function() {
|
||||
var buckets = this.buckets,
|
||||
|
@ -80,14 +102,12 @@
|
|||
for (var i = 0, n = buckets.length; i < n; ++i) bits += popcnt(buckets[i]);
|
||||
return -this.m * Math.log(1 - bits / this.m) / this.k;
|
||||
};
|
||||
|
||||
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
|
||||
// Returns the number of bits set in *v*. The bitwise operators coerce *v* to
// a 32-bit integer, so only its low 32 bits are counted. Used by
// BloomFilter.prototype.size to count the set bits in each bucket.
function popcnt(v) {
|
||||
// After this step every 2-bit field holds the number of bits that were set in that pair.
v -= (v >> 1) & 0x55555555;
|
||||
// Add neighbouring 2-bit counts together, giving one count per 4-bit field.
v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
|
||||
// `+` binds tighter than `&`, so this is (v + (v >> 4)) & 0xF0F0F0F: one count per byte.
// Multiplying by 0x1010101 sums the four byte counts into the top byte,
// which `>> 24` then extracts.
return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
|
||||
}
|
||||
|
||||
// Fowler/Noll/Vo hashing.
|
||||
function fnv_1a(v) {
|
||||
var n = v.length,
|
||||
|
@ -120,7 +140,6 @@
|
|||
a += a << 5;
|
||||
return a & 0xffffffff;
|
||||
}
|
||||
|
||||
// One additional iteration of FNV, given a hash.
|
||||
function fnv_1a_b(a) {
|
||||
a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
/* These are some basic unit-tests for the bloom.js module */
// The script builds a filter, adds two strings, checks membership, then
// serialises the buckets, rebuilds a filter from them and repeats the checks.
// NOTE(review): BloomFilter is not defined in this script; presumably bloom.js
// is loaded first and exposes it as a global -- confirm against how the tests are run.
|
||||
|
||||
// Capacity of 4 elements with a target error rate of 0.1.
var bloom = new BloomFilter(4, 0.1);
|
||||
console.log(bloom);
|
||||
|
||||
// Add some elements to the filter.
|
||||
bloom.add("foo");
|
||||
bloom.add("bar");
|
||||
|
||||
// Test if an item is in our filter.
|
||||
// Returns true if an item is probably in the set,
|
||||
// or false if an item is definitely not in the set.
// Because false positives are possible, the `=== false` expectations below
// hold only as long as these particular strings do not collide in this filter.
|
||||
console.assert(bloom.test("foo") === true);
|
||||
console.assert(bloom.test("bar") === true);
|
||||
console.assert(bloom.test("blah") === false);
|
||||
console.assert(bloom.test("blahahvhzfeh") === false);
|
||||
console.assert(bloom.test("blahahvhzfehgfgahafgfa") === false);
|
||||
|
||||
// Serialisation. Note that bloom.buckets may be a typed array,
|
||||
// so we convert to a normal array first.
// NOTE: `json` is not read again in this script; the filter below is rebuilt
// from `array` itself.
|
||||
var array = [].slice.call(bloom.buckets),
|
||||
json = JSON.stringify(array);
|
||||
|
||||
console.log(array);
|
||||
|
||||
// Deserialisation. Note that any array-like object is supported, but
|
||||
// this will be used directly, so you may wish to use a typed array for
|
||||
// performance.
// The error rate passed here is what the constructor derives the number of
// hashing functions from, so it has to be the same value (0.1) that was used
// to build the original filter. This re-declares `bloom`, replacing the
// first filter.
|
||||
var bloom = new BloomFilter(array, 0.1);
|
||||
console.log(bloom);
|
||||
|
||||
// Same expectations as before serialisation.
console.assert(bloom.test("foo") === true);
|
||||
console.assert(bloom.test("bar") === true);
|
||||
console.assert(bloom.test("blah") === false);
|
||||
console.assert(bloom.test("blahahvhzfeh") === false);
|
||||
console.assert(bloom.test("blahahvhzfehgfgahafgfa") === false);
|
Loading…
Reference in New Issue