Finished handling of binary data by JS

TODO : Recode the pybloom library in JS to load back the BloomFilters in
the JS script.
This commit is contained in:
Phyks 2014-01-11 01:19:50 +01:00
parent 0c33477699
commit 7eb55478ed
5 changed files with 84 additions and 18 deletions

1
.gitignore vendored
View File

@ -1,2 +1,3 @@
*/__pycache__ */__pycache__
*/search_index */search_index
*/pages_index.json

View File

@ -21,12 +21,24 @@ An index is generated by a Python script, upon generation of the pages, and is d
* `index.html` * `index.html`
* `js/bloom.js`: main JS code * `js/bloom.js`: main JS code
* `js/bloomfilters.js`: JS library to use BloomFilters * `js/bloomfilters.js`: JS library to use BloomFilters
* `js/jquery-2.0.3.min.js`: jQuery, used for its convenience functions; it will mostly be dropped in the future.
### Examples ### Examples
* `samples/`: samples for testing purpose (taken from my blog articles) * `samples/`: samples for testing purpose (taken from my blog articles)
## Data storing
One of the main problems was transmitting the binary data from the Python script to the JS script. I found [an article about handling binary data in JavaScript](https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Sending_and_Receiving_Binary_Data) which helped me a lot.
The data produced by the Python script is simply the array of Bloom filter bitarrays, written as a binary file (`data/search_index`), which I open with JS. The list of articles is also written, as JSON, to a separate file (`data/pages_index.json`).
Here's the format of the output from the Python script:
* [16 bits] : number of articles (== number of bitarrays)
* for each bitarray:
    * [16 bits] : length of the bitarray
    * […] : the bitarray itself
## Notes ## Notes
* I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print) found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for code doing what I wanted and found these ones: * I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print) found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for code doing what I wanted and found these ones:

0
data/index.html Normal file
View File

View File

@ -1,9 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os import os
import sys
from lxml import html from lxml import html
import re import re
import stemmer import stemmer
import json
from bitarray import bitarray from bitarray import bitarray
from pybloom import BloomFilter from pybloom import BloomFilter
@ -22,14 +24,19 @@ def remove_common_words(words):
return returned return returned
def bitfield(n): def bitfield(n, fill):
return [1 if digit=='1' else 0 for digit in bin(n)[2:]] return [1 if digit=='1' else 0 for digit in bin(n)[2:].zfill(fill)]
# ============================================================================= # =============================================================================
samples = list_directory("../samples/") samples = list_directory("../samples/")
filters = {} filters = {}
p = stemmer.PorterStemmer() p = stemmer.PorterStemmer()
write = bitarray() write = bitarray(bitfield(len(samples), 16))
if len(samples) > 65535:
sys.exit("[ERROR] Too many articles to index. You will have to change the "
"way data is stored in the binary file to handle such amount of "
"files.")
for sample in samples: for sample in samples:
with open(sample, 'r') as sample_fh: with open(sample, 'r') as sample_fh:
@ -50,9 +57,16 @@ for sample in samples:
for word in words: for word in words:
filters[sample].add(word) filters[sample].add(word)
with open('search_index', 'wb') as index_fh: if filters[sample].bitarray.length() > 65535:
index_fh.write(filters[samples[0]].bitarray.tobytes()) # TODO sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". You "
"will have to change the way data is stored in the binary "
"file to handle such amount of text.")
write.extend(bitfield(len(filters[samples[0]].bitarray))) write.extend(bitfield(filters[sample].bitarray.length(), 16))
write.extend(filters[samples[0]].bitarray) write.extend(filters[sample].bitarray)
print(write)
with open('../data/search_index', 'wb') as index_fh:
index_fh.write(write.tobytes())
with open('../data/pages_index.json', 'w') as pages_fh:
pages_fh.write(json.dumps(samples))

View File

@ -1,6 +1,6 @@
var loading = false; var loading = false;
var usable = false; var usable = false;
var index = false; var search_index = false;
window.onload = function() { window.onload = function() {
}; };
@ -14,12 +14,12 @@ document.getElementById('search').addEventListener('click', function() {
this.value = ""; this.value = "";
} }
if(index === false) { if(search_index === false) {
loading = true; loading = true;
document.getElementById("loading").innerHTML = "Loading index file..."; document.getElementById("loading").innerHTML = "Loading index file...";
var oReq = new XMLHttpRequest(); var oReq = new XMLHttpRequest();
oReq.open("GET", "/index_generation/search_index", true); oReq.open("GET", "/data/search_index", true);
oReq.responseType = "arraybuffer"; oReq.responseType = "arraybuffer";
oReq.onload = function (oEvent) { oReq.onload = function (oEvent) {
@ -30,12 +30,51 @@ document.getElementById('search').addEventListener('click', function() {
document.getElementById("loading").innerHTML = ""; document.getElementById("loading").innerHTML = "";
var tmp = new Uint8Array(arrayBuffer); var tmp = new Uint8Array(arrayBuffer);
for (var i = 0; i < tmp.byteLength; i++) { var nb_filters = 0;
// TODO
// First 16 bits == number of bitarrays
for (var i = 0; i < 16; i++) {
nb_filters += tmp[i] << i;
} }
search_index = new Array(nb_filters);
// For each of the bitarrays, parse it
var offset = 0;
for (var i = 0; i < nb_filters; i++) {
// Size of the filter
var length = 0;
for (var j = offset; j < offset + 16; j++) {
length += tmp[j] << j;
}
search_index[i] = new Uint8Array(length);
// Parse filter
for (var j = 16; j < 16 + length; j++) {
search_index[i][j] = tmp[j + offset];
}
offset += 16 + length;
}
}
else {
document.getElementById("loading").innerHTML = "Error while loading search index.";
} }
}; };
oReq.send(null); oReq.send(null);
var oReq2 = new XMLHttpRequest();
oReq2.open("GET", "data/pages_index.json", true);
oReq2.onreadystatechange = function() {
if (this.readyState == 4) {
if (this.status == 200) {
pages_index = window.JSON ? JSON.parse(this.responseText) : eval("("+this.responseText+")");
}
else {
document.getElementById("loading").innerHTML = "Error while loading pages index : HTTP error " + this.status + " " + this.statusText;
}
}
}
oReq2.send();
} }
}); });
@ -45,11 +84,11 @@ function callback_change() {
} }
var search = document.getElementById("search").value; var search = document.getElementById("search").value;
document.getElementById("results").innerHTML = "<h2>Results :</h2>"; document.getElementById("results").innerHTML = "<h2>Results :</h2>";
for(var key in index) { /*TODO for(var key in index) {
//if(index[key].test(search)) { TODO if(index[key].test(search)) {
document.getElementById("results").innerHTML += "<p>"+key+"</p>"; document.getElementById("results").innerHTML += "<p>"+key+"</p>";
//} }
} }*/
if(!document.querySelectorAll("#results p").length) { if(!document.querySelectorAll("#results p").length) {
document.getElementById("results").innerHTML += "<p>No results...</p>"; document.getElementById("results").innerHTML += "<p>No results...</p>";
} }