From 496e2f823f07d90219593d0f3a4e125c29213b38 Mon Sep 17 00:00:00 2001 From: Phyks Date: Tue, 28 Oct 2014 02:36:28 +0100 Subject: [PATCH] Commit before debugging bloom.py --- data/filters | Bin 0 -> 180 bytes data/pages.json | 1 + data/test | Bin 0 -> 12 bytes data/words | Bin 0 -> 248 bytes index.html | 17 +- index_generation/bloom.py | 162 +++++++++++++++++ index_generation/generate_index.py | 125 ++++++------- index_generation/pybloom.py | 277 ----------------------------- js/app.js | 121 +++++++++++++ js/bloom.js | 9 +- js/stemmer.js | 186 +++++++++++++++++++ js/test.js | 2 +- js/test2.js | 25 +++ test.html | 13 ++ 14 files changed, 584 insertions(+), 354 deletions(-) create mode 100644 data/filters create mode 100644 data/pages.json create mode 100644 data/test create mode 100644 data/words create mode 100644 index_generation/bloom.py delete mode 100644 index_generation/pybloom.py create mode 100644 js/app.js create mode 100644 js/stemmer.js create mode 100644 js/test2.js create mode 100644 test.html diff --git a/data/filters b/data/filters new file mode 100644 index 0000000000000000000000000000000000000000..22b4dacd3de6294e95c847b195f8e1dba76e5f39 GIT binary patch literal 180 zcmV;l089S@0000E0000S0002h-?ls^>GtZSiYJ^c7Eg4b2qE!*W?h(*lUCnt==<_~ z1Zo9$-Cf^A{3mh_{@<&DQ5z{1L^bv9>d+LmtU>8928MbAg%lAbw{QgPri`$gn zy84RRC~Zym`PpTYZ8j(}(l(kZpT=T$Aj0%>c|0EC&8|o_`X`2|Smj^1XIb~^BkUem iQ1a`?MA3=6eFI}w{OErQHC=k^-v;m1!k?Mlp)c=5bzHsx literal 0 HcmV?d00001 diff --git a/data/pages.json b/data/pages.json new file mode 100644 index 0000000..a860202 --- /dev/null +++ b/data/pages.json @@ -0,0 +1 @@ +{"index": ["../samples/cryptdevice_multi.html", "../samples/highmon_weechat.html"]} \ No newline at end of file diff --git a/data/test b/data/test new file mode 100644 index 0000000000000000000000000000000000000000..658b8a864a8e88fcc42a08633e6ef57591378a8f GIT binary patch literal 12 RcmZQ%U|?Vb;*N#|O#lUg0xbXl literal 0 HcmV?d00001 diff --git a/data/words b/data/words new file mode 100644 index 0000000000000000000000000000000000000000..3058f6fd2ec64b78fc5423eb67378d3e4852f2f2 GIT binary patch literal 248 zcmeytz`($&r>9?>m|KvOTCAU3R9R4xl3JFToEo27np2Xgmr;_N!(a|n#<0i4Zr2In z-tMW_UH`sw=Q>$4KO*aUua{i?&xe!Oyg#bF{MsJZip3uLrdKq+`SWjc?W{W&C(qcn z3fcIK%=C=h{Ji+`)YRmR#1e?%c_72@JXz~yf4Hw+Wq#a`pJ&*AUEQ`ova+gt#lG|@ zg3hN;#((Uc8|Z)B$mX`}+m%Px8~c6P{4jq8cg^AX@2t<16)S4H#mtv|(7erV*D>yd pnz^?378@7W{jOR%DfCv_Cb@m-udBaqkJ38*onryhhL!u+832V>b_)Oi literal 0 HcmV?d00001 diff --git a/index.html b/index.html index c348094..20aff01 100644 --- a/index.html +++ b/index.html @@ -3,17 +3,22 @@ BloomJS demo +

<script type="text/javascript" src="js/bloom.js"></script>
+        <script type="text/javascript" src="js/stemmer.js"></script>
+        <script type="text/javascript" src="js/app.js"></script>
     </head>
     <body>
-        <h1>BloomJS demo</h1>
+        <h1>Bloom.JS demo</h1>
+        <p id="error"></p>
+        <div id="main">
+            Loading…
+        </div>
     </body>
 </html>
diff --git a/index_generation/bloom.py b/index_generation/bloom.py
new file mode 100644
index 0000000..562c2f1
--- /dev/null
+++ b/index_generation/bloom.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+
+"""
+This is a translation in Python of the bloom.js script (originally from
+https://github.com/jasondavies/bloomfilter.js).
+
+Since it is a translation of the JS code mentioned above, refer to that
+script for any documentation that may be missing from this implementation.
+
+Needs the numpy python module to work.
+
+Note: Depending on your use case, the pybloom module available on PyPI may
+suit your needs better. I reimplemented the above-mentioned JS script in
+Python mostly because I had to for this script, as the pybloom module uses
+advanced hashing techniques that are difficult to implement in JS.
+
+This script has been written by Phyks and is in the public domain (or
+whatever is closest to the public domain in your country).
+"""
+
+import math
+
+try:
+    import numpy as np
+except ImportError:
+    raise ImportError('This script requires numpy')
+
+
+class BloomFilter():
+    def __init__(self, capacity, error_rate=0.1):
+        """
+        Implements a space-efficient probabilistic data structure.
+
+        capacity
+            The capacity of the BloomFilter: it should be able to store
+            at least *capacity* elements.
+        error_rate
+            The rate of false positives the filter may return. Together
+            with *capacity*, this determines the size of the filter.
+            Inserting more than *capacity* elements greatly increases the
+            chance of false positives.
+        """
+        if not (0 < error_rate < 1):
+            raise ValueError("error_rate must be between 0 and 1.")
+        if not capacity > 0 or type(capacity) != int:
+            raise ValueError("capacity must be a positive integer.")
+
+        # Same calculation as in the JS file, see it for reference.
+        # Basically determines the number of bits and slices from the
+        # capacity and error_rate.
+        k = math.ceil(- math.log(error_rate, 2))
+        m = math.ceil(capacity * abs(math.log(error_rate)) /
+                      (k * (math.log(2) ** 2))) * k
+        n = math.ceil(m / 32)
+        m = n * 32
+        self.m = m
+        self.k = k
+
+        kbytes = 1 << math.ceil(math.log(math.ceil(math.log(m, 2) / 8), 2))
+        self.buckets = np.zeros(n, dtype=np.int32)
+        if kbytes == 1:
+            loc_type = np.uint8
+        elif kbytes == 2:
+            loc_type = np.uint16
+        else:
+            loc_type = np.int32
+        self._locations = np.zeros(k, dtype=loc_type)
+
+    def locations(self, v):
+        r = self._locations
+        a = self.fnv_1a(v)
+        b = self.fnv_1a_b(a)
+        i = 0
+        x = a % self.m
+        while i < self.k:
+            r[i] = (x + self.m) if x < 0 else x
+            x = (x + b) % self.m
+            i += 1
+        return r
+
+    def add(self, v):
+        l = self.locations(v + "")
+        i = 0
+        buckets = self.buckets
+        while i < self.k:
+            buckets[math.floor(l[i] / 32)] |= 1 << int(l[i] % 32)
+            i += 1
+
+    def test(self, v):
+        l = self.locations(v + "")
+        i = 0
+        buckets = self.buckets
+        while i < self.k:
+            b = l[i]
+            if buckets[math.floor(b / 32)] & (1 << int(b % 32)) == 0:
+                return False
+            i += 1
+        return True
+
+    def size(self):
+        """
+        Estimated cardinality.
+        """
+        bits = 0
+        buckets = self.buckets
+        for i in range(0, len(buckets)):
+            bits += self.popcnt(buckets[i])
+        return -self.m * math.log(1 - bits / self.m) / self.k
+
+    def popcnt(self, v):
+        """
+        http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+        """
+        v -= (v >> 1) & 0x55555555
+        v = (v & 0x33333333) + ((v >> 2) & 0x33333333)
+        return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24
+
+    def fnv_1a(self, v):
+        """
+        Fowler/Noll/Vo hashing.
+        """
+        n = len(v)
+        a = 2166136261
+        i = 0
+        while i < n:
+            c = ord(v[i])
+            d = c & 0xff000000
+            if d:
+                a ^= d >> 24
+                a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
+                # JS bitwise operators work on 32 bits: emulate the wrapping
+                a &= 0xffffffff
+            d = c & 0xff0000
+            if d:
+                a ^= d >> 16
+                a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
+                a &= 0xffffffff
+            d = c & 0xff00
+            if d:
+                a ^= d >> 8
+                a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
+                a &= 0xffffffff
+            a ^= c & 0xff
+            a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
+            a &= 0xffffffff
+            i += 1
+        # From http://home.comcast.net/~bretm/hash/6.html
+        a += a << 13
+        a &= 0xffffffff
+        a ^= a >> 7
+        a += a << 3
+        a &= 0xffffffff
+        a ^= a >> 17
+        a += a << 5
+        return a & 0xffffffff
+
+    def fnv_1a_b(self, a):
+        """
+        One additional iteration of FNV, given a hash.
+        """
+        a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
+        a &= 0xffffffff
+        a += a << 13
+        a &= 0xffffffff
+        a ^= a >> 7
+        a += a << 3
+        a &= 0xffffffff
+        a ^= a >> 17
+        a += a << 5
+        return a & 0xffffffff
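A minimal sketch of how the class above is meant to be used (illustrative values only, assuming bloom.py is importable from the current directory):

    import bloom

    f = bloom.BloomFilter(capacity=4, error_rate=0.1)
    for word in ("foo", "bar", "baz"):
        f.add(word)

    print(f.test("foo"))   # True: "foo" is probably in the set
    print(f.test("quux"))  # False, most likely: a never-added word only
                           # tests True at roughly the error rate
    print(f.buckets)       # the np.int32 buckets that generate_index.py
                           # packs into data/filters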
+ """ + n = len(v) + a = 2166136261 + i = 0 + while i < n: + c = ord(v[i]) + d = c & 0xff000000 + if d: + a ^= d >> 24 + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) + d = c & 0xff0000 + if d: + a ^= d >> 16 + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) + d = c & 0xff00 + if d: + a ^= d >> 8 + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) + a ^= c & 0xff + print(a ^ (c & 0xff)) + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) + i += 1 + # From http://home.comcast.net/~bretm/hash/6.html + a += a << 13 + a ^= a >> 7 + a += a << 3 + a ^= a >> 17 + a += a << 5 + return a & 0xffffffff + + def fnv_1a_b(self, a): + """ + One additional iteration of FNV, given a hash. + """ + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) + a += a << 13 + a ^= a >> 7 + a += a << 3 + a ^= a >> 17 + a += a << 5 + print(a) + return a & 0xffffffff diff --git a/index_generation/generate_index.py b/index_generation/generate_index.py index 3e5ae55..d399244 100755 --- a/index_generation/generate_index.py +++ b/index_generation/generate_index.py @@ -1,92 +1,83 @@ #!/usr/bin/env python3 +""" +Inspired by +http://www.stavros.io/posts/bloom-filter-search-engine/?print + + +You have to install the numpy python module for bloom to work. +""" + +import bloom +import json import os -import sys -from lxml import html import re import stemmer -import json -from bitarray import bitarray -from pybloom import BloomFilter +import struct +import sys + +from lxml import html -# List all files in path directory def list_directory(path): - fichier = [] + """Recursively list all files in a given directory.""" + files_list = [] for root, dirs, files in os.walk(path): for i in files: - fichier.append(os.path.join(root, i)) - return fichier + files_list.append(os.path.join(root, i)) + return files_list def remove_common_words(words): + """Removes all words that are less than 3 characters long.""" returned = [word for word in words if len(word) > 3] return returned -def padding_16(x): - if x < 256: - return bytes([0,x]) - else: - return bytes([int(x/256), x%256]) -# ============================================================================= -samples = list_directory("../samples/") -filters = {} -p = stemmer.PorterStemmer() -write_little = bitarray(endian="little") -write_big = bitarray(endian="big") +if __name__ == "__main__": + error_rate = 0.1 + samples = list_directory("../samples/") + filters = [] + p = stemmer.PorterStemmer() -write_little.frombytes(padding_16(len(samples))) -write_big.frombytes(padding_16(len(samples))) + for sample in samples: + with open(sample, 'r') as sample_fh: + content = sample_fh.read() -if len(samples) > 65535: - sys.exit("[ERROR] Too many articles to index. 
-def padding_16(x):
-    if x < 256:
-        return bytes([0,x])
-    else:
-        return bytes([int(x/256), x%256])
-# =============================================================================
-samples = list_directory("../samples/")
-filters = {}
-p = stemmer.PorterStemmer()
-write_little = bitarray(endian="little")
-write_big = bitarray(endian="big")
+if __name__ == "__main__":
+    error_rate = 0.1
+    samples = list_directory("../samples/")
+    filters = []
+    p = stemmer.PorterStemmer()
 
-write_little.frombytes(padding_16(len(samples)))
-write_big.frombytes(padding_16(len(samples)))
+    for sample in samples:
+        with open(sample, 'r') as sample_fh:
+            content = sample_fh.read()
 
-if len(samples) > 65535:
-    sys.exit("[ERROR] Too many articles to index. You will have to change the "
-             "way data is stored in the binary file to handle such amount of "
-             "files.")
+        # Get text from HTML content
+        words = html.fromstring(content).text_content().replace("\n", "")
+        words = re.findall(r"[\w]+", words)
+        # Remove all punctuation etc., convert words to lower case and
+        # delete duplicates
+        words = list(set([word.lower() for word in words]))
 
-for sample in samples:
-    with open(sample, 'r') as sample_fh:
-        content = sample_fh.read()
+        # Remove common words
+        words = remove_common_words(words)
+        # Stemming to reduce the number of words
+        words = list(set([p.stem(word, 0, len(word)-1) for word in words]))
 
-    # Get text from HTML content
-    words = html.fromstring(content).text_content().replace("\n", "")
-    words = re.findall(r"[\w]+", words)
-    # Remove all punctuation etc., convert words to lower and delete duplicates
-    words = list(set([word.lower() for word in words]))
+        tmp_filter = bloom.BloomFilter(capacity=len(words),
+                                       error_rate=error_rate)
+        for word in words:
+            tmp_filter.add(word)
 
-    # Remove common words
-    words = remove_common_words(words)
-    # Stemming to reduce the number of words
-    words = [p.stem(word, 0, len(word)-1) for word in words]
+        filters.append(tmp_filter.buckets)
 
-    filters[sample] = BloomFilter(capacity=len(words), error_rate=0.1)
-    for word in words:
-        filters[sample].add(word)
+    # First Int32 is the number of filters
+    filters_to_write = struct.pack("<i", len(filters))
+    # Then, one Int32 per filter, giving the length of its buckets array
+    for buckets in filters:
+        filters_to_write += struct.pack("<i", len(buckets))
+    # Then, the buckets themselves, as consecutive Int32
+    for buckets in filters:
+        filters_to_write += struct.pack("<%di" % len(buckets), *buckets)
 
-    if filters[sample].bitarray.length() > 65535:
-        sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". 
You " - "will have to change the way data is stored in the binary " - "file to handle such amount of text.") + # Write everything + with open("../data/filters", "wb") as index_fh: + index_fh.write(filters_to_write) - tmp = bitarray(endian="little") - tmp.frombytes(padding_16(filters[sample].bitarray.length())) - write_little.extend(tmp) - write_little.extend(filters[sample].bitarray) - write_little.extend([0 for i in range(filters[sample].bitarray.length() % - 8)]) - tmp = bitarray(endian="big") - tmp.frombytes(padding_16(filters[sample].bitarray.length())) - write_big.extend(tmp) - write_big.extend(filters[sample].bitarray) - write_big.extend([0 for i in range(filters[sample].bitarray.length() % - 8)]) - -with open('../data/search_index_little', 'wb') as index_fh: - print(write_little) - write_little.tofile(index_fh) -with open('../data/search_index_big', 'wb') as index_fh: - print(write_big) - write_big.tofile(index_fh) - -with open('../data/pages_index.json', 'w') as pages_fh: - pages_fh.write(json.dumps(samples)) + with open("../data/pages.json", "w") as pages_fh: + pages_fh.write(json.dumps({"index": samples})) diff --git a/index_generation/pybloom.py b/index_generation/pybloom.py deleted file mode 100644 index ce3d54a..0000000 --- a/index_generation/pybloom.py +++ /dev/null @@ -1,277 +0,0 @@ -import math -import hashlib -from struct import unpack, pack, calcsize - -try: - import bitarray -except ImportError: - raise ImportError('pybloom requires bitarray >= 0.3.4') - -__version__ = '2.0' -__author__ = "Jay Baird , Bob Ippolito ,\ -Marius Eriksen ,\ -Alex Brasetvik " - -def make_hashfuncs(num_slices, num_bits): - if num_bits >= (1 << 31): - fmt_code, chunk_size = 'Q', 8 - elif num_bits >= (1 << 15): - fmt_code, chunk_size = 'I', 4 - else: - fmt_code, chunk_size = 'H', 2 - total_hash_bits = 8 * num_slices * chunk_size - if total_hash_bits > 384: - hashfn = hashlib.sha512 - elif total_hash_bits > 256: - hashfn = hashlib.sha384 - elif total_hash_bits > 160: - hashfn = hashlib.sha256 - elif total_hash_bits > 128: - hashfn = hashlib.sha1 - else: - hashfn = hashlib.md5 - fmt = fmt_code * (hashfn().digest_size // chunk_size) - num_salts, extra = divmod(num_slices, len(fmt)) - if extra: - num_salts += 1 - salts = [hashfn(hashfn(pack('I', i)).digest()) for i in range(num_salts)] - def _make_hashfuncs(key): - key = str(key) - rval = [] - for salt in salts: - h = salt.copy() - h.update(key.encode('utf-8')) - rval.extend(uint % num_bits for uint in unpack(fmt, h.digest())) - del rval[num_slices:] - return rval - return _make_hashfuncs - - -class BloomFilter(object): - FILE_FMT = ' 0: - raise ValueError("Capacity must be > 0") - num_slices = int(math.ceil(math.log(1.0 / error_rate, 2))) - bits_per_slice = int(math.ceil( - (capacity * abs(math.log(error_rate))) / - (num_slices * (math.log(2) ** 2)))) - self._setup(error_rate, num_slices, bits_per_slice, capacity, 0) - self.bitarray = bitarray.bitarray(self.num_bits, endian='little') - self.bitarray.setall(False) - - def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count): - self.error_rate = error_rate - self.num_slices = num_slices - self.bits_per_slice = bits_per_slice - self.capacity = capacity - self.num_bits = num_slices * bits_per_slice - self.count = count - self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice) - - def __contains__(self, key): - bits_per_slice = self.bits_per_slice - bitarray = self.bitarray - if not isinstance(key, list): - hashes = self.make_hashes(key) - else: - hashes = key - offset = 
0 - for k in hashes: - if not bitarray[offset + k]: - return False - offset += bits_per_slice - return True - - def __len__(self): - """Return the number of keys stored by this bloom filter.""" - return self.count - - def add(self, key, skip_check=False): - bitarray = self.bitarray - bits_per_slice = self.bits_per_slice - hashes = self.make_hashes(key) - if not skip_check and hashes in self: - return True - if self.count > self.capacity: - raise IndexError("BloomFilter is at capacity") - offset = 0 - for k in hashes: - self.bitarray[offset + k] = True - offset += bits_per_slice - self.count += 1 - return False - - def copy(self): - """Return a copy of this bloom filter. -""" - new_filter = BloomFilter(self.capacity, self.error_rate) - new_filter.bitarray = self.bitarray.copy() - return new_filter - - def union(self, other): - """ Calculates the union of the two underlying bitarrays and returns -a new bloom filter object.""" - if self.capacity != other.capacity or \ - self.error_rate != other.error_rate: - raise ValueError("Unioning filters requires both filters to have \ -both the same capacity and error rate") - new_bloom = self.copy() - new_bloom.bitarray = new_bloom.bitarray | other.bitarray - return new_bloom - - def __or__(self, other): - return self.union(other) - - def intersection(self, other): - """ Calculates the intersection of the two underlying bitarrays and returns -a new bloom filter object.""" - if self.capacity != other.capacity or \ - self.error_rate != other.error_rate: - raise ValueError("Intersecting filters requires both filters to \ -have equal capacity and error rate") - new_bloom = self.copy() - new_bloom.bitarray = new_bloom.bitarray & other.bitarray - return new_bloom - - def __and__(self, other): - return self.intersection(other) - - def tofile(self, f): - """Write the bloom filter to file object `f'. Underlying bits -are written as machine values. This is much more space -efficient than pickling the object.""" - f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices, - self.bits_per_slice, self.capacity, self.count)) - self.bitarray.tofile(f) - - @classmethod - def fromfile(cls, f, n=-1): - """Read a bloom filter from file-object `f' serialized with -``BloomFilter.tofile''. If `n' > 0 read only so many bytes.""" - headerlen = calcsize(cls.FILE_FMT) - - if 0 < n < headerlen: - raise ValueError('n too small!') - - filter = cls(1) # Bogus instantiation, we will `_setup'. 
- filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen))) - filter.bitarray = bitarray.bitarray(endian='little') - if n > 0: - filter.bitarray.fromfile(f, n - headerlen) - else: - filter.bitarray.fromfile(f) - if filter.num_bits != filter.bitarray.length() and \ - (filter.num_bits + (8 - filter.num_bits % 8) - != filter.bitarray.length()): - raise ValueError('Bit length mismatch!') - - return filter - - def __getstate__(self): - d = self.__dict__.copy() - del d['make_hashes'] - return d - - def __setstate__(self, d): - self.__dict__.update(d) - self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice) - -class ScalableBloomFilter(object): - SMALL_SET_GROWTH = 2 # slower, but takes up less memory - LARGE_SET_GROWTH = 4 # faster, but takes up more memory faster - FILE_FMT = '= filter.capacity: - filter = BloomFilter( - capacity=filter.capacity * self.scale, - error_rate=filter.error_rate * self.ratio) - self.filters.append(filter) - filter.add(key, skip_check=True) - return False - - @property - def capacity(self): - """Returns the total capacity for all filters in this SBF""" - return sum([f.capacity for f in self.filters]) - - @property - def count(self): - return len(self) - - def tofile(self, f): - """Serialize this ScalableBloomFilter into the file-object -`f'.""" - f.write(pack(self.FILE_FMT, self.scale, self.ratio, - self.initial_capacity, self.error_rate)) - - # Write #-of-filters - f.write(pack(' 0: - # Then each filter directly, with a header describing - # their lengths. - headerpos = f.tell() - headerfmt = '<' + 'Q'*(len(self.filters)) - f.write('.' * calcsize(headerfmt)) - filter_sizes = [] - for filter in self.filters: - begin = f.tell() - filter.tofile(f) - filter_sizes.append(f.tell() - begin) - - f.seek(headerpos) - f.write(pack(headerfmt, *filter_sizes)) - - @classmethod - def fromfile(cls, f): - """Deserialize the ScalableBloomFilter in file object `f'.""" - filter = cls() - filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT)))) - nfilters, = unpack(' 0: - header_fmt = '<' + 'Q'*nfilters - bytes = f.read(calcsize(header_fmt)) - filter_lengths = unpack(header_fmt, bytes) - for fl in filter_lengths: - filter.filters.append(BloomFilter.fromfile(f, fl)) - else: - filter.filters = [] - - return filter - - def __len__(self): - """Returns the total number of elements stored in this SBF""" - return sum([f.count for f in self.filters]) diff --git a/js/app.js b/js/app.js new file mode 100644 index 0000000..e326a57 --- /dev/null +++ b/js/app.js @@ -0,0 +1,121 @@ +/* Params */ +var error_rate = 0.1; + + +/* Vars */ +var bloom = Array(), index; +var ready = false; + + +/* Functions */ +function callback() { + if (typeof(index) === 'undefined' || bloom.length == 0) { + return; + } + + // Sets up the page, that is now ready + ready = true; + document.getElementById('main').innerHTML = '

<input type="text" id="search" /><div id="results"></div>';
+
+    // Handle oninput actions
+    document.getElementById('search').oninput = function (e) {
+        if (!ready) {
+            return;
+        }
+
+        filter_results(e.target.value);
+    }
+}
+
+// Returns true iff all the terms in the array are in the bloom filter b
+function terms_in_bloom(terms, b) {
+    for (var i = 0; i < terms.length; i++) {
+        if (!b.test(terms[i])) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Filter the results to match the query
+function filter_results(query) {
+    var search_terms = query.trim();
+    if (search_terms === "") {
+        document.getElementById('results').innerHTML = "";
+        return;
+    }
+    search_terms = search_terms.split(" ").map(stemmer);
+
+    var results = Array();
+    for (var i = 0; i < index.length; i++) {
+        if (terms_in_bloom(search_terms, bloom[i])) {
+            results.push(index[i]);
+        }
+    }
+
+    var results_html;
+    if (results.length > 0) {
+        results_html = '<ul>';
+        for (var i = 0; i < results.length; i++) {
+            results_html += '<li>' + results[i] + '</li>';
+        }
+        results_html += '</ul>';
+    }
+    else {
+        results_html = '<p>No results.</p>';
+    }
+    document.getElementById('results').innerHTML = results_html;
+}
+
+
+/* App */
+
+// Get the words index (a.k.a. the Bloom filters)
+var oReq = new XMLHttpRequest();
+oReq.open("GET", "data/filters", true);
+oReq.responseType = "arraybuffer";
+oReq.onload = function (oEvent) {
+    var array_buffer = oReq.response;
+    if (array_buffer) {
+        var byte_array = new Int32Array(array_buffer);
+
+        // First element is the number of bloom filters in the binary file
+        var nb_bloom_filters = byte_array[0];
+        // The nb_bloom_filters next elements are the lengths of the arrays
+        var lengths = Array();
+        for (var i = 0; i < nb_bloom_filters; i++) {
+            lengths.push(byte_array[1 + i]);
+        }
+        // Then, build the Bloom filters
+        var l = 0, tmp_array;
+        for (var i = 0; i < nb_bloom_filters; i++) {
+            tmp_array = byte_array.subarray(1 + nb_bloom_filters + l, 1 + nb_bloom_filters + l + lengths[i]);
+            l += lengths[i];
+            bloom.push(new BloomFilter(tmp_array, error_rate));
+        }
+
+        callback();
+    }
+    else {
+        document.getElementById('error').innerHTML = 'Unable to load the bloom filters.';
+    }
+};
+oReq.send(null);
+
+// Get the pages index
+var req = new XMLHttpRequest();
+req.open('GET', 'data/pages.json', true);
+req.onreadystatechange = function () {
+    if (req.readyState == 4) {
+        if (req.status == 200) {
+            var tmp = JSON.parse(req.responseText);
+            index = tmp['index'];
+
+            callback();
+        }
+        else {
+            document.getElementById('error').innerHTML = 'Unable to load the index.';
+        }
+    }
+};
+req.send(null);
diff --git a/js/bloom.js b/js/bloom.js
index 4bf381a..e0cf72e 100644
--- a/js/bloom.js
+++ b/js/bloom.js
@@ -22,23 +22,26 @@
 function BloomFilter(capacity, error_rate) {
   // *m* is the number of bits. Note that *m* is rounded up to
   // the nearest multiple of 32. *k* specifies the number of hashing functions.
+  if (error_rate < 0 || error_rate > 1 || (typeof(capacity) === "number" && capacity < 0)) {
+    return false;
+  }
   var a, i = -1;
   // Number of slices, k
   var k = Math.ceil(- Math.log(error_rate) / Math.log(2));
   // Total number of bits, m
-  // Size of the UInt32 table, n
+  // Size of the Int32 table, n
   var m, n;
   if (typeof capacity !== "number") {
     a = capacity;
     // Total number of bits, m
     m = a.length * 32;
-    // Size of the UInt32 table, n
+    // Size of the Int32 table, n
     n = a.length;
   } else {
     // Total number of bits, m
     m = Math.ceil(capacity * Math.abs(Math.log(error_rate)) / (k * Math.pow(Math.log(2), 2))) * k;
-    // Size of the UInt32 table, n
+    // Size of the Int32 table, n
     n = Math.ceil(m / 32);
     // Round total number of bits to closest multiple of 32
     m = n * 32;
diff --git a/js/stemmer.js b/js/stemmer.js
new file mode 100644
index 0000000..ceade1a
--- /dev/null
+++ b/js/stemmer.js
@@ -0,0 +1,186 @@
+// Porter stemmer in Javascript. Few comments, but it's easy to follow
+// against the rules in the original paper, in
+//
+// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+// no.
3, pp 130-137, +// +// see also http://www.tartarus.org/~martin/PorterStemmer + +// Release 1 be 'andargor', Jul 2004 +// Release 2 (substantially revised) by Christopher McKenzie, Aug 2009 + +var stemmer = (function(){ + var step2list = { + "ational" : "ate", + "tional" : "tion", + "enci" : "ence", + "anci" : "ance", + "izer" : "ize", + "bli" : "ble", + "alli" : "al", + "entli" : "ent", + "eli" : "e", + "ousli" : "ous", + "ization" : "ize", + "ation" : "ate", + "ator" : "ate", + "alism" : "al", + "iveness" : "ive", + "fulness" : "ful", + "ousness" : "ous", + "aliti" : "al", + "iviti" : "ive", + "biliti" : "ble", + "logi" : "log" + }, + + step3list = { + "icate" : "ic", + "ative" : "", + "alize" : "al", + "iciti" : "ic", + "ical" : "ic", + "ful" : "", + "ness" : "" + }, + + c = "[^aeiou]", // consonant + v = "[aeiouy]", // vowel + C = c + "[^aeiouy]*", // consonant sequence + V = v + "[aeiou]*", // vowel sequence + + mgr0 = "^(" + C + ")?" + V + C, // [C]VC... is m>0 + meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1 + mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1 + s_v = "^(" + C + ")?" + v; // vowel in stem + + return function (w) { + var stem, + suffix, + firstch, + re, + re2, + re3, + re4, + origword = w; + + if (w.length < 3) { return w; } + + firstch = w.substr(0,1); + if (firstch == "y") { + w = firstch.toUpperCase() + w.substr(1); + } + + // Step 1a + re = /^(.+?)(ss|i)es$/; + re2 = /^(.+?)([^s])s$/; + + if (re.test(w)) { w = w.replace(re,"$1$2"); } + else if (re2.test(w)) { w = w.replace(re2,"$1$2"); } + + // Step 1b + re = /^(.+?)eed$/; + re2 = /^(.+?)(ed|ing)$/; + if (re.test(w)) { + var fp = re.exec(w); + re = new RegExp(mgr0); + if (re.test(fp[1])) { + re = /.$/; + w = w.replace(re,""); + } + } else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1]; + re2 = new RegExp(s_v); + if (re2.test(stem)) { + w = stem; + re2 = /(at|bl|iz)$/; + re3 = new RegExp("([^aeiouylsz])\\1$"); + re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re2.test(w)) { w = w + "e"; } + else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); } + else if (re4.test(w)) { w = w + "e"; } + } + } + + // Step 1c + re = /^(.+?)y$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(s_v); + if (re.test(stem)) { w = stem + "i"; } + } + + // Step 2 + re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) { + w = stem + step2list[suffix]; + } + } + + // Step 3 + re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) { + w = stem + step3list[suffix]; + } + } + + // Step 4 + re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; + re2 = /^(.+?)(s|t)(ion)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + if (re.test(stem)) { + w = stem; + } + } else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1] + fp[2]; + re2 = new RegExp(mgr1); + if (re2.test(stem)) { + w = stem; + } + } + + // Step 5 + re = /^(.+?)e$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + re2 = new RegExp(meq1); + re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) { + w = 
stem; + } + } + + re = /ll$/; + re2 = new RegExp(mgr1); + if (re.test(w) && re2.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + + // and turn initial Y back to y + + if (firstch == "y") { + w = firstch.toLowerCase() + w.substr(1); + } + + return w; + } +})(); diff --git a/js/test.js b/js/test.js index 9441115..c83bc6b 100644 --- a/js/test.js +++ b/js/test.js @@ -1,7 +1,6 @@ /* These are some basic unit-tests for the bloom.js module */ var bloom = new BloomFilter(4, 0.1); -console.log(bloom); // Add some elements to the filter. bloom.add("foo"); @@ -22,6 +21,7 @@ var array = [].slice.call(bloom.buckets), json = JSON.stringify(array); console.log(array); +console.log(json); // Deserialisation. Note that the any array-like object is supported, but // this will be used directly, so you may wish to use a typed array for diff --git a/js/test2.js b/js/test2.js new file mode 100644 index 0000000..8a12f76 --- /dev/null +++ b/js/test2.js @@ -0,0 +1,25 @@ +/* These are some basic unit-tests for the bloom.js module */ + +var words = JSON.parse('["solut", "devic", "cryptkey2", "contain", "chang", "thi", "conf", "ckeyfiin", "support", "load", "here", "laptop", "file", "exampl", "paramet", "cryptsetup", "when", "proce", "line", "cryptkei", "wiki", "edit", "present", "describ", "ckei", "grub", "first", "warn", "mkinitcpio", "with", "updat", "mount", "manual", "ckeybyif", "least", "need", "multipl", "also", "found", "arch", "then", "us", "encrypt", "packag", "that", "over", "someth", "hook", "doesn", "avail", "avoid", "work", "which", "provid", "order", "initcpio", "anoth", "setup", "mean", "necessari", "default", "disk", "best", "linemkdir", "luk", "system", "unlock", "occurr", "requir", "command", "abl", "cryptdevice2", "encrypt2", "instal", "multi", "last", "extend", "obsolet", "boot", "your", "achiev", "second", "mkdir", "stuff", "final", "displai", "concern", "ad", "cryptdevic", "more", "copi"]'); + +var bloom2 = new BloomFilter(words.length, 0.1); +console.log(bloom2); + +// Add some elements to the filter. +for (var i = 0; i < words.length; i++) { + bloom2.add(words[i]); +} + +// Test if an item is in our filter. +// Returns true if an item is probably in the set, +// or false if an item is definitely not in the set. +for (var i = 0; i < words.length; i++) { + console.log(words[i] + " : " + bloom2.test(words[i])); +} + +// Serialisation. Note that bloom.buckets may be a typed array, +// so we convert to a normal array first. +var array = [].slice.call(bloom2.buckets), + json = JSON.stringify(array); + +console.log(bloom2.buckets); diff --git a/test.html b/test.html new file mode 100644 index 0000000..09db9d2 --- /dev/null +++ b/test.html @@ -0,0 +1,13 @@ + + + + + BloomJS demo + + +

+<h1>Bloom.JS demo</h1>
+<p>This page runs the bloom.js library unit-tests. Look at your console
+output for assert errors and verbose debugging.</p>
+<script type="text/javascript" src="js/bloom.js"></script>
+<script type="text/javascript" src="js/test2.js"></script>
+</body>
+</html>
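For reference, a self-contained sketch of the data/filters layout as js/app.js reads it: one little-endian Int32 giving the number of filters, then one Int32 length per filter, then each filter's Int32 buckets back to back. The writer loop in generate_index.py above is partly reconstructed, so treat the exact packing as an assumption; the values below are made up.

    import struct

    # Hypothetical bucket arrays for two filters (not real index data).
    filters = [[3, 0, 7], [1, 2]]

    # Write: [count][len_0 .. len_n-1][buckets_0 .. buckets_n-1], little-endian.
    blob = struct.pack("<i", len(filters))
    for buckets in filters:
        blob += struct.pack("<i", len(buckets))
    for buckets in filters:
        blob += struct.pack("<%di" % len(buckets), *buckets)

    # Read it back, mirroring the Int32Array walk in js/app.js.
    count = struct.unpack_from("<i", blob, 0)[0]
    lengths = struct.unpack_from("<%di" % count, blob, 4)
    offset = 4 + 4 * count
    for n in lengths:
        print(list(struct.unpack_from("<%di" % n, blob, offset)))
        offset += 4 * n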