Finished handling of binary data by JS
TODO: Recode the pybloom library in JS to load the BloomFilters back in the JS script.
parent 0c33477699
commit 7eb55478ed

.gitignore (vendored): 1 change
@@ -1,2 +1,3 @@
*/__pycache__
*/search_index
+*/pages_index.json

README.md: 14 changes
@@ -21,12 +21,24 @@ An index is generated by a Python script, upon generation of the pages, and is d
* `index.html`
* `js/bloom.js`: main JS code
* `js/bloomfilters.js`: JS library to use BloomFilters
* `js/jquery-2.0.3.min.js`: jQuery, for convenience functions; it will mostly be dropped in the future.

### Examples

* `samples/`: samples for testing purposes (taken from my blog articles)

## Data storing

One of the main problems was transmitting the binary data from the Python script to the JS script. I found [an article about handling binary data in JavaScript](https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Sending_and_Receiving_Binary_Data) which helped me a lot.

The data from the Python script is just the array of Bloom filter bitarrays, written as a binary file (`data/search_index`), which I then open from JS. The list of articles is also written, as JSON, to a dedicated file (`data/pages_index.json`).

Here's the format of the output of the Python script:

* [16 bits]: number of articles (== number of bitarrays)
* for each bitarray:
    * [16 bits]: length of the bitarray
    * […]: the bitarray itself

## Notes

* I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print) found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for code doing what I wanted and found these:
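
As a side note (not part of this commit's diff), here is a minimal sketch of how the `search_index` format described under "Data storing" could be decoded on the JS side. It assumes the Python script concatenates everything into a single bitarray and writes it with `tobytes()`, i.e. bits packed MSB-first into bytes with only the final byte zero-padded; the function and variable names are illustrative only. The parsing code the commit actually adds to `js/bloom.js` appears further down in the diff.

```js
// Sketch: bit-level decoder for the search_index format described above.
// Assumes bits were packed MSB-first into bytes by bitarray.tobytes().
function readBits(bytes, bitOffset, count) {
    // Read `count` bits starting at bit `bitOffset`, MSB first, as an unsigned integer.
    var value = 0;
    for (var i = 0; i < count; i++) {
        var pos = bitOffset + i;
        var bit = (bytes[pos >> 3] >> (7 - (pos & 7))) & 1;
        value = (value << 1) | bit;
    }
    return value;
}

function parseSearchIndex(arrayBuffer) {
    var bytes = new Uint8Array(arrayBuffer);
    var offset = 0;                               // current position, in bits
    var nbFilters = readBits(bytes, offset, 16);  // [16 bits] number of articles
    offset += 16;
    var filters = [];
    for (var i = 0; i < nbFilters; i++) {
        var length = readBits(bytes, offset, 16); // [16 bits] length of this bitarray
        offset += 16;
        var bits = new Uint8Array(length);        // one 0/1 entry per filter bit
        for (var j = 0; j < length; j++) {
            var pos = offset + j;
            bits[j] = (bytes[pos >> 3] >> (7 - (pos & 7))) & 1;
        }
        filters.push(bits);
        offset += length;
    }
    return filters;
}
```

Tracking the offset in bits rather than bytes matters here, because the per-article bitarrays are not necessarily byte-aligned.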

data/index.html: new file, 0 lines

Index generation script (Python):

@@ -1,9 +1,11 @@
#!/usr/bin/env python3

import os
+import sys
from lxml import html
import re
import stemmer
+import json
from bitarray import bitarray
from pybloom import BloomFilter
@@ -22,14 +24,19 @@ def remove_common_words(words):
    return returned


-def bitfield(n):
-    return [1 if digit=='1' else 0 for digit in bin(n)[2:]]
+def bitfield(n, fill):
+    return [1 if digit=='1' else 0 for digit in bin(n)[2:].zfill(fill)]

# =============================================================================
samples = list_directory("../samples/")
filters = {}
p = stemmer.PorterStemmer()
-write = bitarray()
+write = bitarray(bitfield(len(samples), 16))
+
+if len(samples) > 65535:
+    sys.exit("[ERROR] Too many articles to index. You will have to change the "
+             "way data is stored in the binary file to handle such amount of "
+             "files.")

for sample in samples:
    with open(sample, 'r') as sample_fh:
@@ -50,9 +57,16 @@ for sample in samples:
        for word in words:
            filters[sample].add(word)

-        with open('search_index', 'wb') as index_fh:
-            index_fh.write(filters[samples[0]].bitarray.tobytes()) # TODO
+        if filters[sample].bitarray.length() > 65535:
+            sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". You "
+                     "will have to change the way data is stored in the binary "
+                     "file to handle such amount of text.")

-        write.extend(bitfield(len(filters[samples[0]].bitarray)))
-        write.extend(filters[samples[0]].bitarray)
-        print(write)
+        write.extend(bitfield(filters[sample].bitarray.length(), 16))
+        write.extend(filters[sample].bitarray)

+with open('../data/search_index', 'wb') as index_fh:
+    index_fh.write(write.tobytes())
+
+with open('../data/pages_index.json', 'w') as pages_fh:
+    pages_fh.write(json.dumps(samples))
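
For reference, a hypothetical JS counterpart of the `bitfield(n, fill)` helper introduced above, producing the same MSB-first, zero-padded list of bits (assuming `n` fits in `fill` bits; the Python version does not truncate, which is why the script guards against more than 65535 articles or filter bits):

```js
// Mirrors Python's [1 if digit=='1' else 0 for digit in bin(n)[2:].zfill(fill)]
function bitfield(n, fill) {
    var bits = [];
    for (var i = fill - 1; i >= 0; i--) {
        bits.push((n >> i) & 1);   // most significant bit first
    }
    return bits;
}
// bitfield(3, 16) -> [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
```

As a worked example of the resulting stream: with, say, 2 articles whose filters are 100 and 200 bits long, the script emits 16 + (16 + 100) + (16 + 200) = 348 bits, which `tobytes()` pads to 44 bytes.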

js/bloom.js: 57 changes
@@ -1,6 +1,6 @@
var loading = false;
var usable = false;
-var index = false;
+var search_index = false;

window.onload = function() {
};
@@ -14,12 +14,12 @@ document.getElementById('search').addEventListener('click', function() {
        this.value = "";
    }

-    if(index === false) {
+    if(search_index === false) {
        loading = true;
        document.getElementById("loading").innerHTML = "Loading index file...";

        var oReq = new XMLHttpRequest();
-        oReq.open("GET", "/index_generation/search_index", true);
+        oReq.open("GET", "/data/search_index", true);
        oReq.responseType = "arraybuffer";

        oReq.onload = function (oEvent) {
@@ -30,12 +30,51 @@ document.getElementById('search').addEventListener('click', function() {
                document.getElementById("loading").innerHTML = "";

                var tmp = new Uint8Array(arrayBuffer);
-                for (var i = 0; i < tmp.byteLength; i++) {
-                    // TODO
+                var nb_filters = 0;
+
+                // First 16 bits == number of bitarrays
+                for (var i = 0; i < 16; i++) {
+                    nb_filters += tmp[i] << i;
+                }
+                search_index = new Array(nb_filters);
+
+                // For each of the bitarrays, parse it
+                var offset = 0;
+                for (var i = 0; i < nb_filters; i++) {
+                    // Size of the filter
+                    var length = 0;
+                    for (var j = offset; j < offset + 16; j++) {
+                        length += tmp[j] << j;
+                    }
+                    search_index[i] = new Uint8Array(length);
+
+                    // Parse filter
+                    for (var j = 16; j < 16 + length; j++) {
+                        search_index[i][j] = tmp[j + offset];
+                    }
+
+                    offset += 16 + length;
+                }
            }
            else {
                document.getElementById("loading").innerHTML = "Error while loading search index.";
            }
        };
        oReq.send(null);
+
+        var oReq2 = new XMLHttpRequest();
+        oReq2.open("GET", "data/pages_index.json", true);
+        oReq2.onreadystatechange = function() {
+            if (this.readyState == 4) {
+                if (this.status == 200) {
+                    pages_index = window.JSON ? JSON.parse(this.responseText) : eval("("+this.responseText+")");
+                }
+                else {
+                    document.getElementById("loading").innerHTML = "Error while loading pages index : HTTP error " + this.status + " " + this.statusText;
+                }
+            }
+        }
+        oReq2.send();
    }
});
@@ -45,11 +84,11 @@ function callback_change() {
    }
    var search = document.getElementById("search").value;
    document.getElementById("results").innerHTML = "<h2>Results :</h2>";
-    for(var key in index) {
-        //if(index[key].test(search)) { TODO
+    /*TODO for(var key in index) {
+        if(index[key].test(search)) {
            document.getElementById("results").innerHTML += "<p>"+key+"</p>";
-        //}
-    }
+        }
+    }*/
    if(!document.querySelectorAll("#results p").length) {
        document.getElementById("results").innerHTML += "<p>No results...</p>";
    }
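
The commit message leaves recoding pybloom in JS as a TODO, which is what the commented-out `index[key].test(search)` call above is waiting for. Purely as an illustration of the general shape such a membership test could take, here is a generic sketch; the hashing below is a placeholder and is not pybloom-compatible, so it would not match filters built by the Python script:

```js
// Generic Bloom-filter membership check over a decoded bit array
// (one 0/1 entry per bit). Placeholder hashing, not pybloom's scheme.
function fnv1a(str, seed) {
    var h = (seed >>> 0) ^ 0x811c9dc5;          // FNV-1a offset basis
    for (var i = 0; i < str.length; i++) {
        h ^= str.charCodeAt(i);
        h = Math.imul(h, 0x01000193) >>> 0;     // multiply by the FNV prime, mod 2^32
    }
    return h;
}

function bloomTest(bits, word, k) {
    // k: number of hash functions the filter was built with
    // (pybloom derives it from the capacity and error rate).
    var h1 = fnv1a(word, 0);
    var h2 = fnv1a(word, h1);
    for (var i = 0; i < k; i++) {
        var idx = ((h1 + i * h2) >>> 0) % bits.length;
        if (!bits[idx]) {
            return false;                       // definitely not in the filter
        }
    }
    return true;                                // possibly in the filter (false positives happen)
}
```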