Finished handling of binary data by JS

TODO : Recode the pybloom library in JS to load back the BloomFilters in
the JS script.
Phyks 2014-01-11 01:19:50 +01:00
parent 0c33477699
commit 7eb55478ed
5 changed files with 84 additions and 18 deletions

.gitignore

@ -1,2 +1,3 @@
*/__pycache__
*/search_index
*/pages_index.json


@ -21,12 +21,24 @@ An index is generated by a Python script, upon generation of the pages, and is d
* `index.html`
* `js/bloom.js`: main JS code
* `js/bloomfilters.js`: JS library to use BloomFilters
* `js/jquery-2.0.3.min.js`: jQuery, for convenience functions; it will mostly be dropped in the future.
### Examples
* `samples/`: samples for testing purposes (taken from my blog articles)
## Data storing
One of the main problems was transmitting the binary data from the Python script to the JS script. I found [an article about handling binary data in JavaScript](https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Sending_and_Receiving_Binary_Data) which helped me a lot.
The data from the Python script is just the array of Bloom filter bitarrays, written as a binary file (`data/search_index`) that I then open from JS. The list of articles is also written as JSON in a dedicated file (`data/pages_index.json`).
Here is the format of the output from the Python script (a small round-trip sketch follows the list):
* [16 bits] : number of articles (== number of bitarrays)
* for each bitarray:
* [16 bits] : length of the bitarray
* […] : the bitarray itself
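
As a rough illustration of this layout (this is not code from the repository; helper names like `int_to_bits`, `write_index` and `read_index` are made up, and it assumes the `bitarray` package), a round-trip could look like:

```python
from bitarray import bitarray

def int_to_bits(n, width=16):
    """Encode n as `width` bits, most significant bit first."""
    return bitarray(bin(n)[2:].zfill(width))

def write_index(filters_bits):
    """filters_bits: one bitarray per article."""
    out = int_to_bits(len(filters_bits))        # [16 bits] number of articles
    for bits in filters_bits:
        out.extend(int_to_bits(len(bits)))      # [16 bits] length of this bitarray
        out.extend(bits)                        # the bitarray itself
    return out.tobytes()

def read_index(raw):
    """Parse the bytes back into a list of bitarrays."""
    bits = bitarray()
    bits.frombytes(raw)                         # trailing padding bits are ignored
    count = int(bits[:16].to01(), 2)
    offset, filters_bits = 16, []
    for _ in range(count):
        length = int(bits[offset:offset + 16].to01(), 2)
        filters_bits.append(bits[offset + 16:offset + 16 + length])
        offset += 16 + length
    return filters_bits
```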
## Notes
* I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print), found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for existing code doing what I wanted and found the following:

data/index.html


@ -1,9 +1,11 @@
#!/usr/bin/env python3
import os
import sys
from lxml import html
import re
import stemmer
import json
from bitarray import bitarray
from pybloom import BloomFilter
@ -22,14 +24,19 @@ def remove_common_words(words):
    return returned
def bitfield(n):
    return [1 if digit=='1' else 0 for digit in bin(n)[2:]]
def bitfield(n, fill):
    return [1 if digit=='1' else 0 for digit in bin(n)[2:].zfill(fill)]
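# Illustrative example (not part of the commit): the new `fill` argument pads
# the bit list to a fixed width, e.g. bitfield(5, 16)
# == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]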
# =============================================================================
samples = list_directory("../samples/")
filters = {}
p = stemmer.PorterStemmer()
write = bitarray()
write = bitarray(bitfield(len(samples), 16))
if len(samples) > 65535:
    sys.exit("[ERROR] Too many articles to index. You will have to change the "
             "way data is stored in the binary file to handle such amount of "
             "files.")
for sample in samples:
    with open(sample, 'r') as sample_fh:
@ -50,9 +57,16 @@ for sample in samples:
        for word in words:
            filters[sample].add(word)
with open('search_index', 'wb') as index_fh:
    index_fh.write(filters[samples[0]].bitarray.tobytes()) # TODO
    if filters[sample].bitarray.length() > 65535:
        sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". You "
                 "will have to change the way data is stored in the binary "
                 "file to handle such amount of text.")
write.extend(bitfield(len(filters[samples[0]].bitarray)))
write.extend(filters[samples[0]].bitarray)
print(write)
    write.extend(bitfield(filters[sample].bitarray.length(), 16))
    write.extend(filters[sample].bitarray)
with open('../data/search_index', 'wb') as index_fh:
    index_fh.write(write.tobytes())
with open('../data/pages_index.json', 'w') as pages_fh:
    pages_fh.write(json.dumps(samples))
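# Illustrative sketch, not part of this commit: how one Bloom filter per
# article might be built with pybloom before being packed into `write` as
# above. The capacity and error_rate values are assumptions, not taken from
# the repository.
from pybloom import BloomFilter

def build_filter(words, error_rate=0.1):
    bf = BloomFilter(capacity=max(len(words), 1), error_rate=error_rate)
    for word in words:
        bf.add(word)
    return bf.bitarray  # the bitarray that gets length-prefixed into `write`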


@ -1,6 +1,6 @@
var loading = false;
var usable = false;
var index = false;
var search_index = false;
window.onload = function() {
};
@ -14,12 +14,12 @@ document.getElementById('search').addEventListener('click', function() {
        this.value = "";
    }
    if(index === false) {
    if(search_index === false) {
        loading = true;
        document.getElementById("loading").innerHTML = "Loading index file...";
        var oReq = new XMLHttpRequest();
        oReq.open("GET", "/index_generation/search_index", true);
        oReq.open("GET", "/data/search_index", true);
        oReq.responseType = "arraybuffer";
        oReq.onload = function (oEvent) {
@ -30,12 +30,51 @@ document.getElementById('search').addEventListener('click', function() {
                document.getElementById("loading").innerHTML = "";
                var tmp = new Uint8Array(arrayBuffer);
                for (var i = 0; i < tmp.byteLength; i++) {
                    // TODO
                var nb_filters = 0;
                // First 16 bits == number of bitarrays
                for (var i = 0; i < 16; i++) {
                    nb_filters += tmp[i] << i;
                }
                search_index = new Array(nb_filters);
                // For each of the bitarrays, parse it
                var offset = 0;
                for (var i = 0; i < nb_filters; i++) {
                    // Size of the filter
                    var length = 0;
                    for (var j = offset; j < offset + 16; j++) {
                        length += tmp[j] << j;
                    }
                    search_index[i] = new Uint8Array(length);
                    // Parse filter
                    for (var j = 16; j < 16 + length; j++) {
                        search_index[i][j] = tmp[j + offset];
                    }
                    offset += 16 + length;
                }
            }
            else {
                document.getElementById("loading").innerHTML = "Error while loading search index.";
            }
        };
        oReq.send(null);
        var oReq2 = new XMLHttpRequest();
        oReq2.open("GET", "data/pages_index.json", true);
        oReq2.onreadystatechange = function() {
            if (this.readyState == 4) {
                if (this.status == 200) {
                    pages_index = window.JSON ? JSON.parse(this.responseText) : eval("("+this.responseText+")");
                }
                else {
                    document.getElementById("loading").innerHTML = "Error while loading pages index : HTTP error " + this.status + " " + this.statusText;
                }
            }
        }
        oReq2.send();
    }
});
@ -45,11 +84,11 @@ function callback_change() {
    }
    var search = document.getElementById("search").value;
    document.getElementById("results").innerHTML = "<h2>Results :</h2>";
    for(var key in index) {
        //if(index[key].test(search)) { TODO
    /*TODO for(var key in index) {
        if(index[key].test(search)) {
            document.getElementById("results").innerHTML += "<p>"+key+"</p>";
        //}
        }
    }
    }*/
    if(!document.querySelectorAll("#results p").length) {
        document.getElementById("results").innerHTML += "<p>No results...</p>";
    }