diff --git a/.gitignore b/.gitignore index e8f0a64..696776f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ */__pycache__ -*/search_index +*/search_index* */pages_index.json diff --git a/index.html b/index.html index 7670303..599e265 100644 --- a/index.html +++ b/index.html @@ -13,7 +13,6 @@

- diff --git a/index_generation/generate_index.py b/index_generation/generate_index.py index 24a52fe..e4d7aca 100755 --- a/index_generation/generate_index.py +++ b/index_generation/generate_index.py @@ -23,15 +23,21 @@ def remove_common_words(words): returned = [word for word in words if len(word) > 3] return returned - -def bitfield(n, fill): - return [1 if digit=='1' else 0 for digit in bin(n)[2:].zfill(fill)] +def padding_16(x): + if x < 256: + return bytes([0,len(samples)]) + else: + return bytes([int(x/256), x%256]) # ============================================================================= samples = list_directory("../samples/") filters = {} p = stemmer.PorterStemmer() -write = bitarray(bitfield(len(samples), 16)) +write_little = bitarray(endian="little") +write_big = bitarray(endian="big") + +write_little.frombytes(padding_16(len(samples))) +write_big.frombytes(padding_16(len(samples))) if len(samples) > 65535: sys.exit("[ERROR] Too many articles to index. You will have to change the " @@ -62,11 +68,17 @@ for sample in samples: "will have to change the way data is stored in the binary " "file to handle such amount of text.") - write.extend(bitfield(filters[sample].bitarray.length(), 16)) - write.extend(filters[sample].bitarray) + #write_little.extend(bitfield(filters[sample].bitarray.length(), 16)) + #write_little.extend(filters[sample].bitarray) + #write_big.extend(bitfield(filters[sample].bitarray.length(), 16)) + #write_big.extend(filters[sample].bitarray) -with open('../data/search_index', 'wb') as index_fh: - index_fh.write(write.tobytes()) +with open('../data/search_index_little', 'wb') as index_fh: + print(write_little) + write_little.tofile(index_fh) +with open('../data/search_index_big', 'wb') as index_fh: + print(write_big) + write_big.tofile(index_fh) with open('../data/pages_index.json', 'w') as pages_fh: pages_fh.write(json.dumps(samples)) diff --git a/js/bloom.js b/js/bloom.js index c1344ac..fc08d16 100644 --- a/js/bloom.js +++ b/js/bloom.js @@ -2,8 +2,19 @@ var loading = false; var usable = false; var search_index = false; -window.onload = function() { -}; +// Check endianness to serve right file +function checkEndian(){ + var a = new ArrayBuffer(4); + var b = new Uint8Array(a); + var c = new Uint32Array(a); + b[0] = 0xa1; + b[1] = 0xb2; + b[2] = 0xc3; + b[3] = 0xd4; + if(c[0] == 0xd4c3b2a1) return "little"; + if(c[0] == 0xa1b2c3d4) return "big"; + else return 0; +} document.getElementById('search_form').addEventListener('submit', function(e) { e.preventDefault(); @@ -19,18 +30,17 @@ document.getElementById('search').addEventListener('click', function() { document.getElementById("loading").innerHTML = "Loading index file..."; var oReq = new XMLHttpRequest(); - oReq.open("GET", "/data/search_index", true); + oReq.open("GET", "data/search_index_"+checkEndian(), true); oReq.responseType = "arraybuffer"; oReq.onload = function (oEvent) { var arrayBuffer = oReq.response; // Note: not oReq.responseText - if (arrayBuffer) { - loading = false; - usable = true; - document.getElementById("loading").innerHTML = ""; + if (arrayBuffer) { var tmp = new Uint8Array(arrayBuffer); var nb_filters = 0; + console.log(tmp); + return; // First 16 bits == number of bitarrays for (var i = 0; i < 16; i++) { @@ -55,6 +65,9 @@ document.getElementById('search').addEventListener('click', function() { offset += 16 + length; } + document.getElementById("loading").innerHTML = ""; + loading = false; + usable = true; } else { document.getElementById("loading").innerHTML = "Error while loading search index."; @@ -78,20 +91,20 @@ document.getElementById('search').addEventListener('click', function() { } }); -function callback_change() { +/*function callback_change() { if(!usable) { return; } var search = document.getElementById("search").value; document.getElementById("results").innerHTML = "

Results :

"; -/*TODO for(var key in index) { +//* for(var key in index) { if(index[key].test(search)) { document.getElementById("results").innerHTML += "

"+key+"

"; } - }*/ + }* // if(!document.querySelectorAll("#results p").length) { document.getElementById("results").innerHTML += "

No results...

"; } } -document.getElementById("search").addEventListener('input', callback_change); +document.getElementById("search").addEventListener('input', callback_change);*/ diff --git a/js/bloomfilter.js b/js/bloomfilter.js deleted file mode 100644 index 4950155..0000000 --- a/js/bloomfilter.js +++ /dev/null @@ -1,133 +0,0 @@ -(function(exports) { - exports.BloomFilter = BloomFilter; - exports.fnv_1a = fnv_1a; - exports.fnv_1a_b = fnv_1a_b; - - var typedArrays = typeof ArrayBuffer !== "undefined"; - - // Creates a new bloom filter. If *m* is an array-like object, with a length - // property, then the bloom filter is loaded with data from the array, where - // each element is a 32-bit integer. Otherwise, *m* should specify the - // number of bits. *k* specifies the number of hashing functions. - function BloomFilter(m, k) { - var a; - if (typeof m !== "number") a = m, m = a.length * 32; - - this.m = m; - this.k = k; - var n = Math.ceil(m / 32), - i = -1; - - if (typedArrays) { - var kbytes = 1 << Math.ceil(Math.log(Math.ceil(Math.log(m) / Math.LN2 / 8)) / Math.LN2), - array = kbytes === 1 ? Uint8Array : kbytes === 2 ? Uint16Array : Uint32Array, - kbuffer = new ArrayBuffer(kbytes * k), - buckets = this.buckets = new Int32Array(n); - if (a) while (++i < n) buckets[i] = a[i]; - this._locations = new array(kbuffer); - } else { - var buckets = this.buckets = []; - if (a) while (++i < n) buckets[i] = a[i]; - else while (++i < n) buckets[i] = 0; - this._locations = []; - } - } - - // See http://willwhim.wordpress.com/2011/09/03/producing-n-hash-functions-by-hashing-only-once/ - BloomFilter.prototype.locations = function(v) { - var k = this.k, - m = this.m, - r = this._locations, - a = fnv_1a(v), - b = fnv_1a_b(a), - i = -1, - x = a % m; - while (++i < k) { - r[i] = x < 0 ? (x + m) : x; - x = (x + b) % m; - } - return r; - }; - - BloomFilter.prototype.add = function(v) { - var l = this.locations(v + ""), - i = -1, - k = this.k, - buckets = this.buckets; - while (++i < k) buckets[Math.floor(l[i] / 32)] |= 1 << (l[i] % 32); - }; - - BloomFilter.prototype.test = function(v) { - var l = this.locations(v + ""), - i = -1, - k = this.k, - b, - buckets = this.buckets; - while (++i < k) { - b = l[i]; - if ((buckets[Math.floor(b / 32)] & (1 << (b % 32))) === 0) { - return false; - } - } - return true; - }; - - // Estimated cardinality. - BloomFilter.prototype.size = function() { - var buckets = this.buckets, - bits = 0; - for (var i = 0, n = buckets.length; i < n; ++i) bits += popcnt(buckets[i]); - return -this.m * Math.log(1 - bits / this.m) / this.k; - }; - - // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel - function popcnt(v) { - v -= (v >> 1) & 0x55555555; - v = (v & 0x33333333) + ((v >> 2) & 0x33333333); - return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; - } - - // Fowler/Noll/Vo hashing. - function fnv_1a(v) { - var n = v.length, - a = 2166136261, - c, - d, - i = -1; - while (++i < n) { - c = v.charCodeAt(i); - if (d = c & 0xff000000) { - a ^= d >> 24; - a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); - } - if (d = c & 0xff0000) { - a ^= d >> 16; - a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); - } - if (d = c & 0xff00) { - a ^= d >> 8; - a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); - } - a ^= c & 0xff; - a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); - } - // From http://home.comcast.net/~bretm/hash/6.html - a += a << 13; - a ^= a >> 7; - a += a << 3; - a ^= a >> 17; - a += a << 5; - return a & 0xffffffff; - } - - // One additional iteration of FNV, given a hash. - function fnv_1a_b(a) { - a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24); - a += a << 13; - a ^= a >> 7; - a += a << 3; - a ^= a >> 17; - a += a << 5; - return a & 0xffffffff; - } -})(typeof exports !== "undefined" ? exports : this);