Use capacity / error_rate syntax

This commit is contained in:
Phyks 2014-10-27 22:14:45 +01:00
parent 54b8905e6f
commit b73b5692d3
2 changed files with 175 additions and 120 deletions

View File

@ -1,134 +1,153 @@
/*
* BloomFilters as implemented by https://github.com/jasondavies/bloomfilter.js
*
* Original license kept
*
* Modified by Phyks to be constructed using the (capacity, error_rate) syntax rather
* than the explicit (number of bits, number of hash functions) syntax.
*/
(function(exports) { (function(exports) {
exports.BloomFilter = BloomFilter; exports.BloomFilter = BloomFilter;
exports.fnv_1a = fnv_1a; exports.fnv_1a = fnv_1a;
exports.fnv_1a_b = fnv_1a_b; exports.fnv_1a_b = fnv_1a_b;
var typedArrays = typeof ArrayBuffer !== "undefined";
var typedArrays = typeof ArrayBuffer !== "undefined"; // Creates a new bloom filter given its minimal capacity and an error_rate.
// Calculation taken from https://en.wikipedia.org/wiki/Bloom_filter.
// Creates a new bloom filter. If *m* is an array-like object, with a length // If *capacity* is an array-like object, with a length
// property, then the bloom filter is loaded with data from the array, where // property, then the bloom filter is loaded with data from the array, where
// each element is a 32-bit integer. Otherwise, *m* should specify the // each element is a 32-bit integer.
// number of bits. Note that *m* is rounded up to the nearest multiple of // *error_rate* is an estimation of the required error_rate.
// 32. *k* specifies the number of hashing functions. function BloomFilter(capacity, error_rate) {
function BloomFilter(m, k) { // *m* is the number of bits. Note that *m* is rounded up to
var a; // the nearest multiple of 32. *k* specifies the number of hashing functions.
if (typeof m !== "number") a = m, m = a.length * 32; var a, i = -1;
// Number of slices, k
var n = Math.ceil(m / 32), var k = Math.ceil(- Math.log(error_rate) / Math.log(2));
i = -1; // Total number of bits, m
this.m = m = n * 32; // Size of the UInt32 table, n
this.k = k; var m, n;
if (typeof capacity !== "number") {
if (typedArrays) { a = capacity;
var kbytes = 1 << Math.ceil(Math.log(Math.ceil(Math.log(m) / Math.LN2 / 8)) / Math.LN2), // Total number of bits, m
array = kbytes === 1 ? Uint8Array : kbytes === 2 ? Uint16Array : Uint32Array, m = a.length * 32;
kbuffer = new ArrayBuffer(kbytes * k), // Size of the UInt32 table, n
buckets = this.buckets = new Int32Array(n); n = a.length;
if (a) while (++i < n) buckets[i] = a[i]; }
this._locations = new array(kbuffer); else {
} else { // Total number of bits, m
var buckets = this.buckets = []; m = Math.ceil(capacity * Math.abs(Math.log(error_rate)) / (k * Math.pow(Math.log(2), 2))) * k;
if (a) while (++i < n) buckets[i] = a[i]; // Size of the UInt32 table, n
else while (++i < n) buckets[i] = 0; n = Math.ceil(m / 32);
this._locations = []; // Round total number of bits to closest multiple of 32
m = n * 32;
}
this.m = m;
this.k = k;
if (typedArrays) {
var kbytes = 1 << Math.ceil(Math.log(Math.ceil(Math.log(m) / Math.LN2 / 8)) / Math.LN2),
array = kbytes === 1 ? Uint8Array : kbytes === 2 ? Uint16Array : Uint32Array,
kbuffer = new ArrayBuffer(kbytes * k),
buckets = this.buckets = new Int32Array(n);
if (a) while (++i < n) buckets[i] = a[i];
this._locations = new array(kbuffer);
} else {
var buckets = this.buckets = [];
if (a) while (++i < n) buckets[i] = a[i];
else while (++i < n) buckets[i] = 0;
this._locations = [];
}
} }
} // See http://willwhim.wordpress.com/2011/09/03/producing-n-hash-functions-by-hashing-only-once/
BloomFilter.prototype.locations = function(v) {
// See http://willwhim.wordpress.com/2011/09/03/producing-n-hash-functions-by-hashing-only-once/ var k = this.k,
BloomFilter.prototype.locations = function(v) {
var k = this.k,
m = this.m, m = this.m,
r = this._locations, r = this._locations,
a = fnv_1a(v), a = fnv_1a(v),
b = fnv_1a_b(a), b = fnv_1a_b(a),
i = -1, i = -1,
x = a % m; x = a % m;
while (++i < k) { while (++i < k) {
r[i] = x < 0 ? (x + m) : x; r[i] = x < 0 ? (x + m) : x;
x = (x + b) % m; x = (x + b) % m;
} }
return r; return r;
}; };
BloomFilter.prototype.add = function(v) {
BloomFilter.prototype.add = function(v) { var l = this.locations(v + ""),
var l = this.locations(v + ""),
i = -1, i = -1,
k = this.k, k = this.k,
buckets = this.buckets; buckets = this.buckets;
while (++i < k) buckets[Math.floor(l[i] / 32)] |= 1 << (l[i] % 32); while (++i < k) buckets[Math.floor(l[i] / 32)] |= 1 << (l[i] % 32);
}; };
BloomFilter.prototype.test = function(v) {
BloomFilter.prototype.test = function(v) { var l = this.locations(v + ""),
var l = this.locations(v + ""),
i = -1, i = -1,
k = this.k, k = this.k,
b, b,
buckets = this.buckets; buckets = this.buckets;
while (++i < k) { while (++i < k) {
b = l[i]; b = l[i];
if ((buckets[Math.floor(b / 32)] & (1 << (b % 32))) === 0) { if ((buckets[Math.floor(b / 32)] & (1 << (b % 32))) === 0) {
return false; return false;
} }
} }
return true; return true;
}; };
// Estimated cardinality.
// Estimated cardinality. BloomFilter.prototype.size = function() {
BloomFilter.prototype.size = function() { var buckets = this.buckets,
var buckets = this.buckets,
bits = 0; bits = 0;
for (var i = 0, n = buckets.length; i < n; ++i) bits += popcnt(buckets[i]); for (var i = 0, n = buckets.length; i < n; ++i) bits += popcnt(buckets[i]);
return -this.m * Math.log(1 - bits / this.m) / this.k; return -this.m * Math.log(1 - bits / this.m) / this.k;
}; };
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel function popcnt(v) {
function popcnt(v) { v -= (v >> 1) & 0x55555555;
v -= (v >> 1) & 0x55555555; v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
v = (v & 0x33333333) + ((v >> 2) & 0x33333333); return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; }
} // Fowler/Noll/Vo hashing.
function fnv_1a(v) {
// Fowler/Noll/Vo hashing. var n = v.length,
function fnv_1a(v) {
var n = v.length,
a = 2166136261, a = 2166136261,
c, c,
d, d,
i = -1; i = -1;
while (++i < n) { while (++i < n) {
c = v.charCodeAt(i); c = v.charCodeAt(i);
if (d = c & 0xff000000) { if (d = c & 0xff000000) {
a ^= d >> 24; a ^= d >> 24;
a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
} }
if (d = c & 0xff0000) { if (d = c & 0xff0000) {
a ^= d >> 16; a ^= d >> 16;
a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
} }
if (d = c & 0xff00) { if (d = c & 0xff00) {
a ^= d >> 8; a ^= d >> 8;
a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
} }
a ^= c & 0xff; a ^= c & 0xff;
a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
}
// From http://home.comcast.net/~bretm/hash/6.html
a += a << 13;
a ^= a >> 7;
a += a << 3;
a ^= a >> 17;
a += a << 5;
return a & 0xffffffff;
}
// One additional iteration of FNV, given a hash.
function fnv_1a_b(a) {
a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
a += a << 13;
a ^= a >> 7;
a += a << 3;
a ^= a >> 17;
a += a << 5;
return a & 0xffffffff;
} }
// From http://home.comcast.net/~bretm/hash/6.html
a += a << 13;
a ^= a >> 7;
a += a << 3;
a ^= a >> 17;
a += a << 5;
return a & 0xffffffff;
}
// One additional iteration of FNV, given a hash.
function fnv_1a_b(a) {
a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
a += a << 13;
a ^= a >> 7;
a += a << 3;
a ^= a >> 17;
a += a << 5;
return a & 0xffffffff;
}
})(typeof exports !== "undefined" ? exports : this); })(typeof exports !== "undefined" ? exports : this);

36
test/bloomfilter-test2.js Normal file
View File

@ -0,0 +1,36 @@
/* These are some basic unit-tests for the bloom.js module. */
/* BloomFilter is not defined in this script: it must already be in scope when it runs. */
var bloom = new BloomFilter(4, 0.1);
console.log(bloom);

// Add some elements to the filter.
bloom.add("foo");
bloom.add("bar");

// Test if an item is in our filter.
// Returns true if an item is probably in the set,
// or false if an item is definitely not in the set.
console.assert(bloom.test("foo") === true);
console.assert(bloom.test("bar") === true);
console.assert(bloom.test("blah") === false);
console.assert(bloom.test("blahahvhzfeh") === false);
console.assert(bloom.test("blahahvhzfehgfgahafgfa") === false);

// Serialisation. Note that bloom.buckets may be a typed array,
// so we convert to a normal array first.
var array = [].slice.call(bloom.buckets),
    json = JSON.stringify(array);
console.log(array);

// Deserialisation. Any array-like object of 32-bit integers is supported.
// The filter is rebuilt from the parsed JSON text (rather than from the
// in-memory array) so that the whole serialisation round-trip is exercised.
// The same error_rate as the original filter must be passed.
bloom = new BloomFilter(JSON.parse(json), 0.1);
console.log(bloom);
console.assert(bloom.test("foo") === true);
console.assert(bloom.test("bar") === true);
console.assert(bloom.test("blah") === false);
console.assert(bloom.test("blahahvhzfeh") === false);
console.assert(bloom.test("blahahvhzfehgfgahafgfa") === false);