Finished handling of binary data by JS

TODO : Recode the pybloom library in JS to load back the BloomFilters in
the JS script.
Phyks 2014-01-11 01:19:50 +01:00
parent 0c33477699
commit 7eb55478ed
5 changed files with 84 additions and 18 deletions

.gitignore

@ -1,2 +1,3 @@
*/__pycache__
*/search_index
*/pages_index.json


@ -21,12 +21,24 @@ An index is generated by a Python script, upon generation of the pages, and is d
* `index.html`
* `js/bloom.js`: main JS code
* `js/bloomfilters.js`: JS library to use BloomFilters
* `js/jquery-2.0.3.min.js`: jQuery, for convenience functions; it will mostly be dropped in the future.
### Examples
* `samples/`: samples for testing purposes (taken from my blog articles)
## Data storing
One of the main problems was transmitting the binary data from the Python script to the JS script. I found [an article about handling binary data in JavaScript](https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Sending_and_Receiving_Binary_Data) which helped me a lot.
The data from the Python script is just the array of Bloom filter bitarrays, written as a binary file (`data/search_index`) that I then open from JS. The list of articles is also written as JSON in a dedicated file (`data/pages_index.json`).
Here is the format of the output from the Python script (a small round-trip sketch follows the list):
* [16 bits] : number of articles (== number of bitarrays)
* for each bitarray:
* [16 bits] : length of the bitarray
* […] : the bitarray itself
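
As a rough illustration of this layout (this is not code from the repository; helper names like `int_to_bits`, `write_index` and `read_index` are made up, and it assumes the `bitarray` package), a round-trip could look like:

```python
from bitarray import bitarray

def int_to_bits(n, width=16):
    """Encode n as `width` bits, most significant bit first."""
    return bitarray(bin(n)[2:].zfill(width))

def write_index(filters_bits):
    """filters_bits: one bitarray per article."""
    out = int_to_bits(len(filters_bits))        # [16 bits] number of articles
    for bits in filters_bits:
        out.extend(int_to_bits(len(bits)))      # [16 bits] length of this bitarray
        out.extend(bits)                        # the bitarray itself
    return out.tobytes()

def read_index(raw):
    """Parse the bytes back into a list of bitarrays."""
    bits = bitarray()
    bits.frombytes(raw)                         # trailing padding bits are ignored
    count = int(bits[:16].to01(), 2)
    offset, filters_bits = 16, []
    for _ in range(count):
        length = int(bits[offset:offset + 16].to01(), 2)
        filters_bits.append(bits[offset + 16:offset + 16 + length])
        offset += 16 + length
    return filters_bits
```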
## Notes
* I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print), found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for existing code doing what I wanted and found the following:

data/index.html


@ -1,9 +1,11 @@
#!/usr/bin/env python3
import os
import sys
from lxml import html
import re
import stemmer
import json
from bitarray import bitarray
from pybloom import BloomFilter
@ -22,14 +24,19 @@ def remove_common_words(words):
    return returned
def bitfield(n):
    return [1 if digit=='1' else 0 for digit in bin(n)[2:]]
def bitfield(n, fill):
    return [1 if digit=='1' else 0 for digit in bin(n)[2:].zfill(fill)]
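# Illustrative example (not part of the commit): the new `fill` argument pads
# the bit list to a fixed width, e.g. bitfield(5, 16)
# == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]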
# =============================================================================
samples = list_directory("../samples/")
filters = {}
p = stemmer.PorterStemmer()
write = bitarray()
write = bitarray(bitfield(len(samples), 16))
if len(samples) > 65535:
    sys.exit("[ERROR] Too many articles to index. You will have to change the "
             "way data is stored in the binary file to handle such amount of "
             "files.")
for sample in samples:
    with open(sample, 'r') as sample_fh:
@ -50,9 +57,16 @@ for sample in samples:
        for word in words:
            filters[sample].add(word)
with open('search_index', 'wb') as index_fh:
    index_fh.write(filters[samples[0]].bitarray.tobytes()) # TODO
    if filters[sample].bitarray.length() > 65535:
        sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". You "
                 "will have to change the way data is stored in the binary "
                 "file to handle such amount of text.")
write.extend(bitfield(len(filters[samples[0]].bitarray)))
write.extend(filters[samples[0]].bitarray)
print(write)
    write.extend(bitfield(filters[sample].bitarray.length(), 16))
    write.extend(filters[sample].bitarray)
with open('../data/search_index', 'wb') as index_fh:
    index_fh.write(write.tobytes())
with open('../data/pages_index.json', 'w') as pages_fh:
    pages_fh.write(json.dumps(samples))
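# Illustrative sketch, not part of this commit: how one Bloom filter per
# article might be built with pybloom before being packed into `write` as
# above. The capacity and error_rate values are assumptions, not taken from
# the repository.
from pybloom import BloomFilter

def build_filter(words, error_rate=0.1):
    bf = BloomFilter(capacity=max(len(words), 1), error_rate=error_rate)
    for word in words:
        bf.add(word)
    return bf.bitarray  # the bitarray that gets length-prefixed into `write`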


@ -1,6 +1,6 @@
var loading = false;
var usable = false;
var index = false;
var search_index = false;
window.onload = function() {
};
@ -14,12 +14,12 @@ document.getElementById('search').addEventListener('click', function() {
        this.value = "";
    }
    if(index === false) {
    if(search_index === false) {
        loading = true;
        document.getElementById("loading").innerHTML = "Loading index file...";
        var oReq = new XMLHttpRequest();
        oReq.open("GET", "/index_generation/search_index", true);
        oReq.open("GET", "/data/search_index", true);
        oReq.responseType = "arraybuffer";
        oReq.onload = function (oEvent) {
@ -30,12 +30,51 @@ document.getElementById('search').addEventListener('click', function() {
                document.getElementById("loading").innerHTML = "";
                var tmp = new Uint8Array(arrayBuffer);
                for (var i = 0; i < tmp.byteLength; i++) {
                    // TODO
                var nb_filters = 0;
                // First 16 bits == number of bitarrays
                for (var i = 0; i < 16; i++) {
                    nb_filters += tmp[i] << i;
                }
                search_index = new Array(nb_filters);
                // For each of the bitarrays, parse it
                var offset = 0;
                for (var i = 0; i < nb_filters; i++) {
                    // Size of the filter
                    var length = 0;
                    for (var j = offset; j < offset + 16; j++) {
                        length += tmp[j] << j;
                    }
                    search_index[i] = new Uint8Array(length);
                    // Parse filter
                    for (var j = 16; j < 16 + length; j++) {
                        search_index[i][j] = tmp[j + offset];
                    }
                    offset += 16 + length;
                }
            }
            else {
                document.getElementById("loading").innerHTML = "Error while loading search index.";
            }
        };
        oReq.send(null);
        var oReq2 = new XMLHttpRequest();
        oReq2.open("GET", "data/pages_index.json", true);
        oReq2.onreadystatechange = function() {
            if (this.readyState == 4) {
                if (this.status == 200) {
                    pages_index = window.JSON ? JSON.parse(this.responseText) : eval("("+this.responseText+")");
                }
                else {
                    document.getElementById("loading").innerHTML = "Error while loading pages index : HTTP error " + this.status + " " + this.statusText;
                }
            }
        }
        oReq2.send();
    }
});
@ -45,11 +84,11 @@ function callback_change() {
    }
    var search = document.getElementById("search").value;
    document.getElementById("results").innerHTML = "<h2>Results :</h2>";
    for(var key in index) {
        //if(index[key].test(search)) { TODO
    /*TODO for(var key in index) {
        if(index[key].test(search)) {
            document.getElementById("results").innerHTML += "<p>"+key+"</p>";
        //}
        }
    }
    }*/
    if(!document.querySelectorAll("#results p").length) {
        document.getElementById("results").innerHTML += "<p>No results...</p>";
    }