From 7eb55478ed62386212676fe1b2777c7ed9e3e397 Mon Sep 17 00:00:00 2001 From: Phyks Date: Sat, 11 Jan 2014 01:19:50 +0100 Subject: [PATCH] Finished handling of binary data by JS TODO : Recode the pybloom library in JS to load back the BloomFilters in the JS script. --- .gitignore | 1 + README.md | 14 +++++++- data/index.html | 0 index_generation/generate_index.py | 30 +++++++++++----- js/bloom.js | 57 +++++++++++++++++++++++++----- 5 files changed, 84 insertions(+), 18 deletions(-) create mode 100644 data/index.html diff --git a/.gitignore b/.gitignore index ee29b3d..e8f0a64 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ */__pycache__ */search_index +*/pages_index.json diff --git a/README.md b/README.md index e820858..f814ee8 100644 --- a/README.md +++ b/README.md @@ -21,12 +21,24 @@ An index is generated by a Python script, upon generation of the pages, and is d * `index.html` * `js/bloom.js`: main JS code * `js/bloomfilters.js`: JS library to use BloomFilters -* `js/jquery-2.0.3.min.js`: jQuery to have convenient functions, will mostly be dropped in the future. ### Examples * `samples/`: samples for testing purpose (taken from my blog articles) +## Data storing + +One of the main problems was to transmit the binary data from the Python script to the JS script. I found [an article about handling binary data in JavaScript](https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Sending_and_Receiving_Binary_Data) which helped me a lot. + +Data from the Python script is just the array of Bloom filter bitarrays, written as a binary file (`data/search_index`), which I open with JS. The list of articles is also written in JSON form in a specific file (`data/pages_index.json`). 
+ +Here's the format of the output from the python script: + +* [16 bits] : number of articles (== number of bitarrays) +* for each bitarray: + * [16 bits] : length of the bitarray + * […] : the bitarray itself + ## Notes * I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print) found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for code doing what I wanted and found these ones: diff --git a/data/index.html b/data/index.html new file mode 100644 index 0000000..e69de29 diff --git a/index_generation/generate_index.py b/index_generation/generate_index.py index a0464ba..24a52fe 100755 --- a/index_generation/generate_index.py +++ b/index_generation/generate_index.py @@ -1,9 +1,11 @@ #!/usr/bin/env python3 import os +import sys from lxml import html import re import stemmer +import json from bitarray import bitarray from pybloom import BloomFilter @@ -22,14 +24,19 @@ def remove_common_words(words): return returned -def bitfield(n): - return [1 if digit=='1' else 0 for digit in bin(n)[2:]] +def bitfield(n, fill): + return [1 if digit=='1' else 0 for digit in bin(n)[2:].zfill(fill)] # ============================================================================= samples = list_directory("../samples/") filters = {} p = stemmer.PorterStemmer() -write = bitarray() +write = bitarray(bitfield(len(samples), 16)) + +if len(samples) > 65535: + sys.exit("[ERROR] Too many articles to index. You will have to change the " + "way data is stored in the binary file to handle such amount of " + "files.") for sample in samples: with open(sample, 'r') as sample_fh: @@ -50,9 +57,16 @@ for sample in samples: for word in words: filters[sample].add(word) -with open('search_index', 'wb') as index_fh: - index_fh.write(filters[samples[0]].bitarray.tobytes()) # TODO + if filters[sample].bitarray.length() > 65535: + sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". 
You " + "will have to change the way data is stored in the binary " + "file to handle such amount of text.") -write.extend(bitfield(len(filters[samples[0]].bitarray))) -write.extend(filters[samples[0]].bitarray) -print(write) + write.extend(bitfield(filters[sample].bitarray.length(), 16)) + write.extend(filters[sample].bitarray) + +with open('../data/search_index', 'wb') as index_fh: + index_fh.write(write.tobytes()) + +with open('../data/pages_index.json', 'w') as pages_fh: + pages_fh.write(json.dumps(samples)) diff --git a/js/bloom.js b/js/bloom.js index f22e0fe..c1344ac 100644 --- a/js/bloom.js +++ b/js/bloom.js @@ -1,6 +1,6 @@ var loading = false; var usable = false; -var index = false; +var search_index = false; window.onload = function() { }; @@ -14,12 +14,12 @@ document.getElementById('search').addEventListener('click', function() { this.value = ""; } - if(index === false) { + if(search_index === false) { loading = true; document.getElementById("loading").innerHTML = "Loading index file..."; var oReq = new XMLHttpRequest(); - oReq.open("GET", "/index_generation/search_index", true); + oReq.open("GET", "/data/search_index", true); oReq.responseType = "arraybuffer"; oReq.onload = function (oEvent) { @@ -30,12 +30,51 @@ document.getElementById('search').addEventListener('click', function() { document.getElementById("loading").innerHTML = ""; var tmp = new Uint8Array(arrayBuffer); - for (var i = 0; i < tmp.byteLength; i++) { - // TODO + var nb_filters = 0; + + // First 16 bits == number of bitarrays + for (var i = 0; i < 16; i++) { + nb_filters += tmp[i] << i; } + search_index = new Array(nb_filters); + + // For each of the bitarrays, parse it + var offset = 0; + for (var i = 0; i < nb_filters; i++) { + // Size of the filter + var length = 0; + for (var j = offset; j < offset + 16; j++) { + length += tmp[j] << j; + } + search_index[i] = new Uint8Array(length); + + // Parse filter + for (var j = 16; j < 16 + length; j++) { + search_index[i][j] = tmp[j + offset]; 
+ } + + offset += 16 + length; + } + } + else { + document.getElementById("loading").innerHTML = "Error while loading search index."; } }; oReq.send(null); + + var oReq2 = new XMLHttpRequest(); + oReq2.open("GET", "data/pages_index.json", true); + oReq2.onreadystatechange = function() { + if (this.readyState == 4) { + if (this.status == 200) { + pages_index = window.JSON ? JSON.parse(this.responseText) : eval("("+this.responseText+")"); + } + else { + document.getElementById("loading").innerHTML = "Error while loading pages index : HTTP error " + this.status + " " + this.statusText; + } + } + } + oReq2.send(); } }); @@ -45,11 +84,11 @@ function callback_change() { } var search = document.getElementById("search").value; document.getElementById("results").innerHTML = "

Results :

"; - for(var key in index) { - //if(index[key].test(search)) { TODO +/*TODO for(var key in index) { + if(index[key].test(search)) { document.getElementById("results").innerHTML += "

"+key+"

"; - //} - } + } + }*/ if(!document.querySelectorAll("#results p").length) { document.getElementById("results").innerHTML += "

No results...

"; }