Finished handling of binary data by JS

TODO: Reimplement the pybloom library in JS so that the Bloom filters can be loaded back in the JS script.
2014-01-11 01:19:50 +01:00 · 2014-01-11 01:19:50 +01:00 · 7eb55478ed
parent 0c33477699
commit 7eb55478ed
5 changed files with 84 additions and 18 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,3 @@
 */__pycache__
 */search_index
 */pages_index.json
--- a/README.md
+++ b/README.md
@ -21,12 +21,24 @@ An index is generated by a Python script, upon generation of the pages, and is d
 * `index.html`
 * `js/bloom.js`: main JS code
 * `js/bloomfilters.js`: JS library to use BloomFilters
 * `js/jquery-2.0.3.min.js`: jQuery, for its convenience functions; will most likely be dropped in the future.
 ### Examples
 * `samples/`: samples for testing purposes (taken from my blog articles)
 ## Data storing
 One of the main problems was transmitting the binary data from the Python script to the JS script. I found [an article about handling binary data in JavaScript](https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Sending_and_Receiving_Binary_Data), which helped me a lot.
 The data from the Python script is just the array of the Bloom filters' bitarrays, written as a binary file (`data/search_index`), which I open with JS. The list of articles is also written, in JSON form, to a separate file (`data/pages_index.json`).
 Here's the format of the output from the Python script:
 * [16 bits] : number of articles (== number of bitarrays)
 * for each bitarray:
    * [16 bits] : length of the bitarray, in bits
    * […] : the bitarray itself
 ## Notes
 * I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print) found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for code doing what I wanted and found these ones:
--- a/data/index.html
+++ b/data/index.html
--- a/index_generation/generate_index.py
+++ b/index_generation/generate_index.py
@ -1,9 +1,11 @@
 #!/usr/bin/env python3
 import os
 import sys
 from lxml import html
 import re
 import stemmer
 import json
 from bitarray import bitarray
 from pybloom import BloomFilter
@ -22,14 +24,19 @@ def remove_common_words(words):
    return returned
-def bitfield(n):
+def bitfield(n, fill):
-    return [1 if digit=='1' else 0 for digit in bin(n)[2:]]
+    return [1 if digit=='1' else 0 for digit in bin(n)[2:].zfill(fill)]
 # =============================================================================
 samples = list_directory("../samples/")
 filters = {}
 p = stemmer.PorterStemmer()
-write = bitarray()
+write = bitarray(bitfield(len(samples), 16))
 if len(samples) > 65535:
    sys.exit("[ERROR] Too many articles to index. You will have to change the "
             "way data is stored in the binary file to handle such amount of "
             "files.")
 for sample in samples:
    with open(sample, 'r') as sample_fh:
@ -50,9 +57,16 @@ for sample in samples:
    for word in words:
        filters[sample].add(word)
-with open('search_index', 'wb') as index_fh:
+    if filters[sample].bitarray.length() > 65535:
-    index_fh.write(filters[samples[0]].bitarray.tobytes()) # TODO
+        sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". You "
                 "will have to change the way data is stored in the binary "
                 "file to handle such amount of text.")
-write.extend(bitfield(len(filters[samples[0]].bitarray)))
+    write.extend(bitfield(filters[sample].bitarray.length(), 16))
-write.extend(filters[samples[0]].bitarray)
+    write.extend(filters[sample].bitarray)
-print(write)
+
 with open('../data/search_index', 'wb') as index_fh:
    index_fh.write(write.tobytes())
 with open('../data/pages_index.json', 'w') as pages_fh:
    pages_fh.write(json.dumps(samples))
--- a/js/bloom.js
+++ b/js/bloom.js
@ -1,6 +1,6 @@
 var loading = false;
 var usable = false;
-var index = false;
+var search_index = false;
 window.onload = function() {
 };
@ -14,12 +14,12 @@ document.getElementById('search').addEventListener('click', function() {
        this.value = "";
    }
-    if(index === false) {
+    if(search_index === false) {
        loading = true;
        document.getElementById("loading").innerHTML = "Loading index file...";
        var oReq = new XMLHttpRequest();
-        oReq.open("GET", "/index_generation/search_index", true);
+        oReq.open("GET", "/data/search_index", true);
        oReq.responseType = "arraybuffer";
        oReq.onload = function (oEvent) {
@ -30,12 +30,51 @@ document.getElementById('search').addEventListener('click', function() {
                document.getElementById("loading").innerHTML = "";
                var tmp = new Uint8Array(arrayBuffer);
-                for (var i = 0; i < tmp.byteLength; i++) {
+                var nb_filters = 0;
-                    // TODO
+
                // First 16 bits == number of bitarrays
                for (var i = 0; i < 16; i++) {
                    nb_filters += tmp[i] << i;
                }
                search_index = new Array(nb_filters);
                // For each of the bitarrays, parse it
                var offset = 0;
                for (var i = 0; i < nb_filters; i++) {
                    // Size of the filter
                    var length = 0;
                    for (var j = offset; j < offset + 16; j++) {
                        length += tmp[j] << j;
                    }
                    search_index[i] = new Uint8Array(length);
                    // Parse filter
                    for (var j = 16; j < 16 + length; j++) {
                        search_index[i][j] = tmp[j + offset];
                    }
                    offset += 16 + length;
                }
            }
            else {
                document.getElementById("loading").innerHTML = "Error while loading search index.";
            }
        };
        oReq.send(null);
        var oReq2 = new XMLHttpRequest();
        oReq2.open("GET", "data/pages_index.json", true);
        oReq2.onreadystatechange = function() {
            if (this.readyState == 4) {
                if (this.status == 200) {
                    pages_index = window.JSON ? JSON.parse(this.responseText) : eval("("+this.responseText+")");
                }
                else {
                    document.getElementById("loading").innerHTML = "Error while loading pages index : HTTP error " + this.status + " " + this.statusText;
                }
            }
        }
        oReq2.send();
    }
 });
@ -45,11 +84,11 @@ function callback_change() {
    }
    var search = document.getElementById("search").value;
    document.getElementById("results").innerHTML = "<h2>Results :</h2>";
-    for(var key in index) {
+/*TODO    for(var key in index) {
-        //if(index[key].test(search)) { TODO
+        if(index[key].test(search)) {
            document.getElementById("results").innerHTML += "<p>"+key+"</p>";
-        //}
+        }
-    }
+    }*/
    if(!document.querySelectorAll("#results p").length) {
        document.getElementById("results").innerHTML += "<p>No results...</p>";
    }