From 775a8ab930a502f494be991a28a5e51f2f01a330 Mon Sep 17 00:00:00 2001 From: Phyks Date: Mon, 27 Oct 2014 22:17:19 +0100 Subject: [PATCH] Refactor JS view, working JS code for Bloom filters --- .gitignore | 1 + data/index.html | 0 index.html | 1 + js/bloom.js | 241 +++++++++++++++++++++++++++++------------------- js/test.js | 36 ++++++++ 5 files changed, 183 insertions(+), 96 deletions(-) delete mode 100644 data/index.html create mode 100644 js/test.js diff --git a/.gitignore b/.gitignore index 696776f..04b2924 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ */__pycache__ */search_index* */pages_index.json +*.pyc diff --git a/data/index.html b/data/index.html deleted file mode 100644 index e69de29..0000000 diff --git a/index.html b/index.html index 599e265..c348094 100644 --- a/index.html +++ b/index.html @@ -14,5 +14,6 @@
+ diff --git a/js/bloom.js b/js/bloom.js index ec89ac7..39dff2d 100644 --- a/js/bloom.js +++ b/js/bloom.js @@ -1,104 +1,153 @@ -var loading = false; -var usable = false; -var search_index = false; +/* + * BloomFilters as implemented by https://github.com/jasondavies/bloomfilter.js + * + * Original license kept + * + * Modified by Phyks to be constructed using the (capacity, error_rate) syntax rather + * than the explicit (number of bits, number of hash functions) syntax. +*/ -// Check endianness to serve right file -function checkEndian(){ - var a = new ArrayBuffer(4); - var b = new Uint8Array(a); - var c = new Uint32Array(a); - b[0] = 0xa1; - b[1] = 0xb2; - b[2] = 0xc3; - b[3] = 0xd4; - if(c[0] == 0xd4c3b2a1) return "little"; - if(c[0] == 0xa1b2c3d4) return "big"; - else return 0; -} +(function(exports) { + exports.BloomFilter = BloomFilter; + exports.fnv_1a = fnv_1a; + exports.fnv_1a_b = fnv_1a_b; + var typedArrays = typeof ArrayBuffer !== "undefined"; -document.getElementById('search_form').addEventListener('submit', function(e) { - e.preventDefault(); -}); - -document.getElementById('search').addEventListener('click', function() { - if(this.value == "Search for articles...") { - this.value = ""; + // Creates a new bloom filter given its minimal capacity and an error_rate. + // Calculation taken from https://en.wikipedia.org/wiki/Bloom_filter. + // If *capacity* is an array-like object, with a length + // property, then the bloom filter is loaded with data from the array, where + // each element is a 32-bit integer. + // *error_rate* is an estimation of the required error_rate. + function BloomFilter(capacity, error_rate) { + // *m* is the number of bits. Note that *m* is rounded up to + // the nearest multiple of 32. *k* specifies the number of hashing functions. + var a, i = -1; + // Number of slices, k + var k = Math.ceil(- Math.log(error_rate) / Math.log(2)); + // Total number of bits, m + // Size of the UInt32 table, n + var m, n; + if (typeof capacity !== "number") { + a = capacity; + // Total number of bits, m + m = a.length * 32; + // Size of the UInt32 table, n + n = a.length; + } + else { + // Total number of bits, m + m = Math.ceil(capacity * Math.abs(Math.log(error_rate)) / (k * Math.pow(Math.log(2), 2))) * k; + // Size of the UInt32 table, n + n = Math.ceil(m / 32); + // Round total number of bits to closest multiple of 32 + m = n * 32; + } + this.m = m; + this.k = k; + if (typedArrays) { + var kbytes = 1 << Math.ceil(Math.log(Math.ceil(Math.log(m) / Math.LN2 / 8)) / Math.LN2), + array = kbytes === 1 ? Uint8Array : kbytes === 2 ? Uint16Array : Uint32Array, + kbuffer = new ArrayBuffer(kbytes * k), + buckets = this.buckets = new Int32Array(n); + if (a) while (++i < n) buckets[i] = a[i]; + this._locations = new array(kbuffer); + } else { + var buckets = this.buckets = []; + if (a) while (++i < n) buckets[i] = a[i]; + else while (++i < n) buckets[i] = 0; + this._locations = []; + } } - - if(search_index === false) { - loading = true; - document.getElementById("loading").innerHTML = "Loading index file..."; - - var oReq = new XMLHttpRequest(); - oReq.open("GET", "data/search_index_"+checkEndian(), true); - oReq.responseType = "arraybuffer"; - - oReq.onload = function (oEvent) { - var arrayBuffer = oReq.response; // Note: not oReq.responseText - - if (arrayBuffer) { - var tmp = new Uint8Array(arrayBuffer); - var nb_filters = tmp[0]*256+tmp[1]; - - search_index = new Array(nb_filters); - - // For each of the bitarrays, parse it - var offset = 2; - for (var i = 0; i < nb_filters; i++) { - // Size of the filter - var length = tmp[offset]*256+tmp[offset+1]; // length is a number of bytes - - var length_offset = Math.ceil(length/8); - - search_index[i] = new Uint8Array(length_offset); - - // Parse filter - for (var j = 2; j < 2 + length_offset; j++) { - search_index[i][j] = tmp[offset + j]; - } - offset += 2 + length_offset; - } - console.log(search_index); - document.getElementById("loading").innerHTML = ""; - loading = false; - usable = true; - } - else { - document.getElementById("loading").innerHTML = "Error while loading search index."; - } - }; - oReq.send(null); - - var oReq2 = new XMLHttpRequest(); - oReq2.open("GET", "data/pages_index.json", true); - oReq2.onreadystatechange = function() { - if (this.readyState == 4) { - if (this.status == 200) { - pages_index = window.JSON ? JSON.parse(this.responseText) : eval("("+this.responseText+")"); - } - else { - document.getElementById("loading").innerHTML = "Error while loading pages index : HTTP error " + this.status + " " + this.statusText; - } + // See http://willwhim.wordpress.com/2011/09/03/producing-n-hash-functions-by-hashing-only-once/ + BloomFilter.prototype.locations = function(v) { + var k = this.k, + m = this.m, + r = this._locations, + a = fnv_1a(v), + b = fnv_1a_b(a), + i = -1, + x = a % m; + while (++i < k) { + r[i] = x < 0 ? (x + m) : x; + x = (x + b) % m; + } + return r; + }; + BloomFilter.prototype.add = function(v) { + var l = this.locations(v + ""), + i = -1, + k = this.k, + buckets = this.buckets; + while (++i < k) buckets[Math.floor(l[i] / 32)] |= 1 << (l[i] % 32); + }; + BloomFilter.prototype.test = function(v) { + var l = this.locations(v + ""), + i = -1, + k = this.k, + b, + buckets = this.buckets; + while (++i < k) { + b = l[i]; + if ((buckets[Math.floor(b / 32)] & (1 << (b % 32))) === 0) { + return false; } } - oReq2.send(); + return true; + }; + // Estimated cardinality. + BloomFilter.prototype.size = function() { + var buckets = this.buckets, + bits = 0; + for (var i = 0, n = buckets.length; i < n; ++i) bits += popcnt(buckets[i]); + return -this.m * Math.log(1 - bits / this.m) / this.k; + }; + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + function popcnt(v) { + v -= (v >> 1) & 0x55555555; + v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; } -}); - -/*function callback_change() { - if(!usable) { - return; - } - var search = document.getElementById("search").value; - document.getElementById("results").innerHTML = "

Results :

"; -//* for(var key in index) { - if(index[key].test(search)) { - document.getElementById("results").innerHTML += "

"+key+"

"; + // Fowler/Noll/Vo hashing. + function fnv_1a(v) { + var n = v.length, + a = 2166136261, + c, + d, + i = -1; + while (++i < n) { + c = v.charCodeAt(i); + if (d = c & 0xff000000) { + a ^= d >> 24; + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); + } + if (d = c & 0xff0000) { + a ^= d >> 16; + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); + } + if (d = c & 0xff00) { + a ^= d >> 8; + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); + } + a ^= c & 0xff; + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); } - }* // - if(!document.querySelectorAll("#results p").length) { - document.getElementById("results").innerHTML += "

No results...

"; + // From http://home.comcast.net/~bretm/hash/6.html + a += a << 13; + a ^= a >> 7; + a += a << 3; + a ^= a >> 17; + a += a << 5; + return a & 0xffffffff; } -} - -document.getElementById("search").addEventListener('input', callback_change);*/ + // One additional iteration of FNV, given a hash. + function fnv_1a_b(a) { + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); + a += a << 13; + a ^= a >> 7; + a += a << 3; + a ^= a >> 17; + a += a << 5; + return a & 0xffffffff; + } +})(typeof exports !== "undefined" ? exports : this); diff --git a/js/test.js b/js/test.js new file mode 100644 index 0000000..9441115 --- /dev/null +++ b/js/test.js @@ -0,0 +1,36 @@ +/* These are some basic unit-tests for the bloom.js module */ + +var bloom = new BloomFilter(4, 0.1); +console.log(bloom); + +// Add some elements to the filter. +bloom.add("foo"); +bloom.add("bar"); + +// Test if an item is in our filter. +// Returns true if an item is probably in the set, +// or false if an item is definitely not in the set. +console.assert(bloom.test("foo") === true); +console.assert(bloom.test("bar") === true); +console.assert(bloom.test("blah") === false); +console.assert(bloom.test("blahahvhzfeh") === false); +console.assert(bloom.test("blahahvhzfehgfgahafgfa") === false); + +// Serialisation. Note that bloom.buckets may be a typed array, +// so we convert to a normal array first. +var array = [].slice.call(bloom.buckets), + json = JSON.stringify(array); + +console.log(array); + +// Deserialisation. Note that the any array-like object is supported, but +// this will be used directly, so you may wish to use a typed array for +// performance. +var bloom = new BloomFilter(array, 0.1); +console.log(bloom); + +console.assert(bloom.test("foo") === true); +console.assert(bloom.test("bar") === true); +console.assert(bloom.test("blah") === false); +console.assert(bloom.test("blahahvhzfeh") === false); +console.assert(bloom.test("blahahvhzfehgfgahafgfa") === false);