Finished handling of binary data by JS
TODO : Recode the pybloom library in JS to load back the BloomFilters in the JS script.
This commit is contained in:
parent
0c33477699
commit
7eb55478ed
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,2 +1,3 @@
|
|||||||
*/__pycache__
|
*/__pycache__
|
||||||
*/search_index
|
*/search_index
|
||||||
|
*/pages_index.json
|
||||||
|
14
README.md
14
README.md
@ -21,12 +21,24 @@ An index is generated by a Python script, upon generation of the pages, and is d
|
|||||||
* `index.html`
|
* `index.html`
|
||||||
* `js/bloom.js`: main JS code
|
* `js/bloom.js`: main JS code
|
||||||
* `js/bloomfilters.js`: JS library to use BloomFilters
|
* `js/bloomfilters.js`: JS library to use BloomFilters
|
||||||
* `js/jquery-2.0.3.min.js`: jQuery to have convenient functions, will mostly be dropped in the future.
|
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
|
|
||||||
* `samples/`: samples for testing purpose (taken from my blog articles)
|
* `samples/`: samples for testing purpose (taken from my blog articles)
|
||||||
|
|
||||||
|
## Data storing
|
||||||
|
|
||||||
|
One of the main problems was transmitting the binary data from the Python script to the JS script. I found [an article about handling binary data in JavaScript](https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Sending_and_Receiving_Binary_Data) which helped me a lot.
|
||||||
|
|
||||||
|
The data produced by the Python script is simply the array of Bloom filter bitarrays, written to a binary file (`data/search_index`), which I then open with JS. The list of articles is also written, as JSON, to a separate file (`data/pages_index.json`).
|
||||||
|
|
||||||
|
Here's the format of the output from the Python script:
|
||||||
|
|
||||||
|
* [16 bits] : number of articles (== number of bitarrays)
|
||||||
|
* for each bitarray:
|
||||||
|
* [16 bits] : length of the bitarray
|
||||||
|
* […] : the bitarray itself
|
||||||
|
|
||||||
## Notes
|
## Notes
|
||||||
* I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print) found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for code doing what I wanted and found these ones:
|
* I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print) found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for code doing what I wanted and found these ones:
|
||||||
|
|
||||||
|
0
data/index.html
Normal file
0
data/index.html
Normal file
@ -1,9 +1,11 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
from lxml import html
|
from lxml import html
|
||||||
import re
|
import re
|
||||||
import stemmer
|
import stemmer
|
||||||
|
import json
|
||||||
from bitarray import bitarray
|
from bitarray import bitarray
|
||||||
from pybloom import BloomFilter
|
from pybloom import BloomFilter
|
||||||
|
|
||||||
@ -22,14 +24,19 @@ def remove_common_words(words):
|
|||||||
return returned
|
return returned
|
||||||
|
|
||||||
|
|
||||||
def bitfield(n):
|
def bitfield(n, fill):
|
||||||
return [1 if digit=='1' else 0 for digit in bin(n)[2:]]
|
return [1 if digit=='1' else 0 for digit in bin(n)[2:].zfill(fill)]
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
samples = list_directory("../samples/")
|
samples = list_directory("../samples/")
|
||||||
filters = {}
|
filters = {}
|
||||||
p = stemmer.PorterStemmer()
|
p = stemmer.PorterStemmer()
|
||||||
write = bitarray()
|
write = bitarray(bitfield(len(samples), 16))
|
||||||
|
|
||||||
|
if len(samples) > 65535:
|
||||||
|
sys.exit("[ERROR] Too many articles to index. You will have to change the "
|
||||||
|
"way data is stored in the binary file to handle such amount of "
|
||||||
|
"files.")
|
||||||
|
|
||||||
for sample in samples:
|
for sample in samples:
|
||||||
with open(sample, 'r') as sample_fh:
|
with open(sample, 'r') as sample_fh:
|
||||||
@ -50,9 +57,16 @@ for sample in samples:
|
|||||||
for word in words:
|
for word in words:
|
||||||
filters[sample].add(word)
|
filters[sample].add(word)
|
||||||
|
|
||||||
with open('search_index', 'wb') as index_fh:
|
if filters[sample].bitarray.length() > 65535:
|
||||||
index_fh.write(filters[samples[0]].bitarray.tobytes()) # TODO
|
sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". You "
|
||||||
|
"will have to change the way data is stored in the binary "
|
||||||
|
"file to handle such amount of text.")
|
||||||
|
|
||||||
write.extend(bitfield(len(filters[samples[0]].bitarray)))
|
write.extend(bitfield(filters[sample].bitarray.length(), 16))
|
||||||
write.extend(filters[samples[0]].bitarray)
|
write.extend(filters[sample].bitarray)
|
||||||
print(write)
|
|
||||||
|
with open('../data/search_index', 'wb') as index_fh:
|
||||||
|
index_fh.write(write.tobytes())
|
||||||
|
|
||||||
|
with open('../data/pages_index.json', 'w') as pages_fh:
|
||||||
|
pages_fh.write(json.dumps(samples))
|
||||||
|
57
js/bloom.js
57
js/bloom.js
@ -1,6 +1,6 @@
|
|||||||
var loading = false;
|
var loading = false;
|
||||||
var usable = false;
|
var usable = false;
|
||||||
var index = false;
|
var search_index = false;
|
||||||
|
|
||||||
window.onload = function() {
|
window.onload = function() {
|
||||||
};
|
};
|
||||||
@ -14,12 +14,12 @@ document.getElementById('search').addEventListener('click', function() {
|
|||||||
this.value = "";
|
this.value = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
if(index === false) {
|
if(search_index === false) {
|
||||||
loading = true;
|
loading = true;
|
||||||
document.getElementById("loading").innerHTML = "Loading index file...";
|
document.getElementById("loading").innerHTML = "Loading index file...";
|
||||||
|
|
||||||
var oReq = new XMLHttpRequest();
|
var oReq = new XMLHttpRequest();
|
||||||
oReq.open("GET", "/index_generation/search_index", true);
|
oReq.open("GET", "/data/search_index", true);
|
||||||
oReq.responseType = "arraybuffer";
|
oReq.responseType = "arraybuffer";
|
||||||
|
|
||||||
oReq.onload = function (oEvent) {
|
oReq.onload = function (oEvent) {
|
||||||
@ -30,12 +30,51 @@ document.getElementById('search').addEventListener('click', function() {
|
|||||||
document.getElementById("loading").innerHTML = "";
|
document.getElementById("loading").innerHTML = "";
|
||||||
|
|
||||||
var tmp = new Uint8Array(arrayBuffer);
|
var tmp = new Uint8Array(arrayBuffer);
|
||||||
for (var i = 0; i < tmp.byteLength; i++) {
|
var nb_filters = 0;
|
||||||
// TODO
|
|
||||||
|
// First 16 bits == number of bitarrays
|
||||||
|
for (var i = 0; i < 16; i++) {
|
||||||
|
nb_filters += tmp[i] << i;
|
||||||
}
|
}
|
||||||
|
search_index = new Array(nb_filters);
|
||||||
|
|
||||||
|
// For each of the bitarrays, parse it
|
||||||
|
var offset = 0;
|
||||||
|
for (var i = 0; i < nb_filters; i++) {
|
||||||
|
// Size of the filter
|
||||||
|
var length = 0;
|
||||||
|
for (var j = offset; j < offset + 16; j++) {
|
||||||
|
length += tmp[j] << j;
|
||||||
|
}
|
||||||
|
search_index[i] = new Uint8Array(length);
|
||||||
|
|
||||||
|
// Parse filter
|
||||||
|
for (var j = 16; j < 16 + length; j++) {
|
||||||
|
search_index[i][j] = tmp[j + offset];
|
||||||
|
}
|
||||||
|
|
||||||
|
offset += 16 + length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
document.getElementById("loading").innerHTML = "Error while loading search index.";
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
oReq.send(null);
|
oReq.send(null);
|
||||||
|
|
||||||
|
var oReq2 = new XMLHttpRequest();
|
||||||
|
oReq2.open("GET", "data/pages_index.json", true);
|
||||||
|
oReq2.onreadystatechange = function() {
|
||||||
|
if (this.readyState == 4) {
|
||||||
|
if (this.status == 200) {
|
||||||
|
pages_index = window.JSON ? JSON.parse(this.responseText) : eval("("+this.responseText+")");
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
document.getElementById("loading").innerHTML = "Error while loading pages index : HTTP error " + this.status + " " + this.statusText;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
oReq2.send();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -45,11 +84,11 @@ function callback_change() {
|
|||||||
}
|
}
|
||||||
var search = document.getElementById("search").value;
|
var search = document.getElementById("search").value;
|
||||||
document.getElementById("results").innerHTML = "<h2>Results :</h2>";
|
document.getElementById("results").innerHTML = "<h2>Results :</h2>";
|
||||||
for(var key in index) {
|
/*TODO for(var key in index) {
|
||||||
//if(index[key].test(search)) { TODO
|
if(index[key].test(search)) {
|
||||||
document.getElementById("results").innerHTML += "<p>"+key+"</p>";
|
document.getElementById("results").innerHTML += "<p>"+key+"</p>";
|
||||||
//}
|
}
|
||||||
}
|
}*/
|
||||||
if(!document.querySelectorAll("#results p").length) {
|
if(!document.querySelectorAll("#results p").length) {
|
||||||
document.getElementById("results").innerHTML += "<p>No results...</p>";
|
document.getElementById("results").innerHTML += "<p>No results...</p>";
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user