From 7eb55478ed62386212676fe1b2777c7ed9e3e397 Mon Sep 17 00:00:00 2001
From: Phyks <webmaster@phyks.me>
Date: Sat, 11 Jan 2014 01:19:50 +0100
Subject: [PATCH] Finished handling of binary data by JS

TODO : Recode the pybloom library in JS to load back the BloomFilters in
the JS script.
---
 .gitignore                         |  1 +
 README.md                          | 14 +++++++-
 data/index.html                    |  0
 index_generation/generate_index.py | 30 +++++++++++-----
 js/bloom.js                        | 57 +++++++++++++++++++++++++-----
 5 files changed, 84 insertions(+), 18 deletions(-)
 create mode 100644 data/index.html

diff --git a/.gitignore b/.gitignore
index ee29b3d..e8f0a64 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 */__pycache__
 */search_index
+*/pages_index.json
diff --git a/README.md b/README.md
index e820858..f814ee8 100644
--- a/README.md
+++ b/README.md
@@ -21,12 +21,24 @@ An index is generated by a Python script, upon generation of the pages, and is d
 * `index.html`
 * `js/bloom.js`: main JS code
 * `js/bloomfilters.js`: JS library to use BloomFilters
-* `js/jquery-2.0.3.min.js`: jQuery to have convenient functions, will mostly be dropped in the future.
 
 ### Examples
 
 * `samples/`: samples for testing purpose (taken from my blog articles)
 
+## Data storing
+
+One of the main problems was transmitting the binary data from the Python script to the JS script. I found [an article about handling binary data in JavaScript](https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest/Sending_and_Receiving_Binary_Data) which helped me a lot.
+
+The data from the Python script is just the array of Bloom filter bitarrays, written as a binary file (`data/search_index`), which I open with JS. The list of articles is also written as JSON to a separate file (`data/pages_index.json`).
+
+Here's the format of the binary output from the Python script:
+
+* [16 bits]: number of articles (== number of bitarrays)
+* for each bitarray:
+    * [16 bits]: length of the bitarray, in bits
+    * […]: the bitarray itself
+
 ## Notes
 * I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print) found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for code doing what I wanted and found these ones:
 
diff --git a/data/index.html b/data/index.html
new file mode 100644
index 0000000..e69de29
diff --git a/index_generation/generate_index.py b/index_generation/generate_index.py
index a0464ba..24a52fe 100755
--- a/index_generation/generate_index.py
+++ b/index_generation/generate_index.py
@@ -1,9 +1,11 @@
 #!/usr/bin/env python3
 
 import os
+import sys
 from lxml import html
 import re
 import stemmer
+import json
 from bitarray import bitarray
 from pybloom import BloomFilter
 
@@ -22,14 +24,19 @@ def remove_common_words(words):
     return returned
 
 
-def bitfield(n):
-    return [1 if digit=='1' else 0 for digit in bin(n)[2:]]
+def bitfield(n, fill):
+    return [1 if digit=='1' else 0 for digit in bin(n)[2:].zfill(fill)]
 
 # =============================================================================
 samples = list_directory("../samples/")
 filters = {}
 p = stemmer.PorterStemmer()
-write = bitarray()
+write = bitarray(bitfield(len(samples), 16))
+
+if len(samples) > 65535:
+    sys.exit("[ERROR] Too many articles to index. You will have to change the "
+             "way data is stored in the binary file to handle such amount of "
+             "files.")
 
 for sample in samples:
     with open(sample, 'r') as sample_fh:
@@ -50,9 +57,16 @@ for sample in samples:
     for word in words:
         filters[sample].add(word)
 
-with open('search_index', 'wb') as index_fh:
-    index_fh.write(filters[samples[0]].bitarray.tobytes()) # TODO
+    if filters[sample].bitarray.length() > 65535:
+        sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". You "
+                 "will have to change the way data is stored in the binary "
+                 "file to handle such amount of text.")
 
-write.extend(bitfield(len(filters[samples[0]].bitarray)))
-write.extend(filters[samples[0]].bitarray)
-print(write)
+    write.extend(bitfield(filters[sample].bitarray.length(), 16))
+    write.extend(filters[sample].bitarray)
+
+with open('../data/search_index', 'wb') as index_fh:
+    index_fh.write(write.tobytes())
+
+with open('../data/pages_index.json', 'w') as pages_fh:
+    pages_fh.write(json.dumps(samples))
diff --git a/js/bloom.js b/js/bloom.js
index f22e0fe..c1344ac 100644
--- a/js/bloom.js
+++ b/js/bloom.js
@@ -1,6 +1,6 @@
 var loading = false;
 var usable = false;
-var index = false;
+var search_index = false;
 
 window.onload = function() {
 };
@@ -14,12 +14,12 @@ document.getElementById('search').addEventListener('click', function() {
         this.value = "";
     }
 
-    if(index === false) {
+    if(search_index === false) {
         loading = true;
         document.getElementById("loading").innerHTML = "Loading index file...";
 
         var oReq = new XMLHttpRequest();
-        oReq.open("GET", "/index_generation/search_index", true);
+        oReq.open("GET", "/data/search_index", true);
         oReq.responseType = "arraybuffer";
 
         oReq.onload = function (oEvent) {
@@ -30,12 +30,51 @@ document.getElementById('search').addEventListener('click', function() {
                 document.getElementById("loading").innerHTML = "";
 
                 var tmp = new Uint8Array(arrayBuffer);
-                for (var i = 0; i < tmp.byteLength; i++) {
-                    // TODO
+                var nb_filters = 0;
+
+                // First 16 bits == number of bitarrays
+                for (var i = 0; i < 16; i++) {
+                    nb_filters += tmp[i] << i;
                 }
+                search_index = new Array(nb_filters);
+
+                // For each of the bitarrays, parse it
+                var offset = 0;
+                for (var i = 0; i < nb_filters; i++) {
+                    // Size of the filter
+                    var length = 0;
+                    for (var j = offset; j < offset + 16; j++) {
+                        length += tmp[j] << j;
+                    }
+                    search_index[i] = new Uint8Array(length);
+
+                    // Parse filter
+                    for (var j = 16; j < 16 + length; j++) {
+                        search_index[i][j] = tmp[j + offset];
+                    }
+
+                    offset += 16 + length;
+                }
+            }
+            else {
+                document.getElementById("loading").innerHTML = "Error while loading search index.";
             }
         };
         oReq.send(null);
+
+        var oReq2 = new XMLHttpRequest();
+        oReq2.open("GET", "data/pages_index.json", true);
+        oReq2.onreadystatechange = function() {
+            if (this.readyState == 4) {
+                if (this.status == 200) {
+                    pages_index = window.JSON ? JSON.parse(this.responseText) : eval("("+this.responseText+")");
+                }
+                else {
+                    document.getElementById("loading").innerHTML = "Error while loading pages index : HTTP error " + this.status + " " + this.statusText;
+                }
+            }
+        }
+        oReq2.send();
     }
 });
 
@@ -45,11 +84,11 @@ function callback_change() {
     }
     var search = document.getElementById("search").value;
     document.getElementById("results").innerHTML = "<h2>Results :</h2>";
-    for(var key in index) {
-        //if(index[key].test(search)) { TODO
+/*TODO    for(var key in index) {
+        if(index[key].test(search)) {
             document.getElementById("results").innerHTML += "<p>"+key+"</p>";
-        //}
-    }
+        }
+    }*/
     if(!document.querySelectorAll("#results p").length) {
         document.getElementById("results").innerHTML += "<p>No results...</p>";
     }