Commit before debugging bloom.py

2014-10-28 02:36:28 +01:00 · 2014-10-28 02:36:28 +01:00 · 496e2f823f
commit 496e2f823f
parent 5b4cc421df
14 changed files with 584 additions and 354 deletions
--- a/data/filters
+++ b/data/filters
--- a/data/pages.json
+++ b/data/pages.json
@ -0,0 +1 @@
 {"index": ["../samples/cryptdevice_multi.html", "../samples/highmon_weechat.html"]}
--- a/data/test
+++ b/data/test
--- a/data/words
+++ b/data/words
--- a/index.html
+++ b/index.html
@ -3,17 +3,22 @@
    <head>
        <meta charset="utf-8">
        <title>BloomJS demo</title>
        <style type="text/css">
            #error {
                font-weight: bold;
                color: red;
            }
        </style>
    </head>
 <body>
    <h1>Bloom.JS demo</h1>
-    <p id="loading"></p>
+    <p id="error"></p>
-    <form id="search_form">
+    <div id="main">
-        <p>
+        <p>Loading…</p>
-            <input type="text" id="search" name="search" value="Search for articles..."/>
+    </div>
        </p>
    </form>
    <div id="results"></div>
    <script type="text/javascript" src="js/bloom.js"></script>
    <script type="text/javascript" src="js/stemmer.js"></script>
    <script type="text/javascript" src="js/app.js"></script>
 </body>
 </html>
--- a/index_generation/bloom.py
+++ b/index_generation/bloom.py
@ -0,0 +1,162 @@
 #!/usr/bin/env python3
 """
 This is a translation of the bloom.js script (originally from
 https://github.com/jasondavies/bloomfilter.js) in Python.
 Since it is a translation of the previously mentioned JS code, you should
 refer to that script for any documentation that is missing in this
 implementation.
 Needs the numpy python module to work.
 Note: Depending on your use case, the pybloom module available on PyPI may
 better suit your needs. I reimplemented the above-mentioned JS script in
 Python mostly because I had to for this script, as the pybloom module uses
 advanced hashing techniques that are difficult to implement in JS.
 This script has been written by Phyks and is in the public domain (or whatever
 is closer to public domain in your country).
 """
 import math
 try:
    import numpy as np
 except ImportError:
    raise ImportError('This script requires numpy')
 class BloomFilter():
    """
    Space-efficient probabilistic set (Python port of bloom.js).

    The bits live in ``self.buckets``, a numpy ``int32`` array of 32-bit
    words. ``self.m`` is the total number of bits and ``self.k`` the number
    of bit positions derived for every element.
    """

    def __init__(self, capacity, error_rate=0.1):
        """
        Size an empty filter.

        capacity
            Number of elements the filter should be able to hold; must be a
            strictly positive ``int``.
        error_rate
            Target false-positive rate, strictly between 0 and 1. Inserting
            more than *capacity* elements greatly increases the chance of
            false positives.

        Raises ``ValueError`` when either argument is out of range.
        """
        if not (0 < error_rate < 1):
            raise ValueError("Error_Rate must be between 0 and 1.")
        # Check the type before comparing, so that a non-numeric capacity is
        # reported as a ValueError instead of failing inside the comparison.
        if (isinstance(capacity, bool) or not isinstance(capacity, int)
                or capacity <= 0):
            raise ValueError("Capacity must be > 0")
        # Same calculation as in the js file, see it for reference purpose.
        # k is the number of slices, m the number of bits (rounded up to a
        # multiple of 32) and n the number of 32-bit words.
        k = math.ceil(- math.log(error_rate, 2))
        m = math.ceil(capacity * abs(math.log(error_rate)) / (k * (math.log(2) ** 2))) * k
        n = math.ceil(m / 32)
        m = n * 32
        self.m = m
        self.k = k
        self.buckets = np.zeros(n, dtype=np.int32)
        # Smallest integer type able to hold a bit index (0 .. m - 1).
        if m <= 1 << 8:
            loc_type = np.uint8
        elif m <= 1 << 16:
            loc_type = np.uint16
        else:
            loc_type = np.int32
        # Scratch buffer reused (and returned) by every call to locations().
        self._locations = np.zeros(k, dtype=loc_type)

    def _words(self):
        """
        Return ``self.buckets`` viewed as unsigned 32-bit words.

        The view shares memory with the int32 array, so bit 31 of a word can
        be set or tested without overflowing a signed 32-bit value.
        """
        return self.buckets.view(np.uint32)

    def locations(self, v):
        """
        Return the ``k`` bit indexes associated with the string *v*.

        The returned array is the internal scratch buffer: it is overwritten
        by the next call.
        """
        r = self._locations
        a = self.fnv_1a(v)
        b = self.fnv_1a_b(a)
        # Python's % is never negative for a positive modulus, so no sign
        # correction is needed here.
        x = a % self.m
        for i in range(self.k):
            r[i] = x
            x = (x + b) % self.m
        return r

    def add(self, v):
        """Insert the string *v* in the filter."""
        words = self._words()
        # v + "" makes non-string input fail early with a TypeError.
        for loc in self.locations(v + ""):
            loc = int(loc)
            words[loc >> 5] |= np.uint32(1 << (loc & 31))

    def test(self, v):
        """
        Return False if *v* was definitely never added, True if it probably
        was (false positives are possible).
        """
        words = self._words()
        for loc in self.locations(v + ""):
            loc = int(loc)
            if not ((int(words[loc >> 5]) >> (loc & 31)) & 1):
                return False
        return True

    def size(self):
        """
        Estimated cardinality
        """
        bits = sum(self.popcnt(word) for word in self.buckets)
        return -self.m * math.log(1 - bits / self.m) / self.k

    def popcnt(self, v):
        """
        Return the number of bits set in the low 32 bits of *v*.

        Works for plain ints as well as (possibly negative) numpy int32
        words, which are interpreted through their 32-bit pattern.
        """
        return bin(int(v) & 0xffffffff).count("1")

    @staticmethod
    def _fnv_multiply(a):
        """Return a + (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)."""
        return a + (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)

    @staticmethod
    def _fnv_mix(a):
        """
        Final mixing step, from http://home.comcast.net/~bretm/hash/6.html
        The result is truncated to 32 bits.
        """
        a += a << 13
        a ^= a >> 7
        a += a << 3
        a ^= a >> 17
        a += a << 5
        return a & 0xffffffff

    def fnv_1a(self, v):
        """
        Fowler/Noll/Vo hashing of the string *v*.

        NOTE(review): intermediate values are arbitrary-precision Python
        ints and are only truncated to 32 bits on return, whereas JavaScript
        bitwise operators work on 32-bit values. The result may therefore
        differ from bloom.js -- confirm against the JS implementation.
        """
        a = 2166136261
        for char in v:
            c = ord(char)
            # Fold in the three high bytes of the code point, when non-zero.
            for shift in (24, 16, 8):
                d = c & (0xff << shift)
                if d:
                    a = self._fnv_multiply(a ^ (d >> shift))
            # The low byte is always folded in.
            a = self._fnv_multiply(a ^ (c & 0xff))
        return self._fnv_mix(a)

    def fnv_1a_b(self, a):
        """
        One additional iteration of FNV, given a hash.
        """
        return self._fnv_mix(self._fnv_multiply(a))
--- a/index_generation/generate_index.py
+++ b/index_generation/generate_index.py
@ -1,48 +1,44 @@
 #!/usr/bin/env python3
 """
 Inspired by
 http://www.stavros.io/posts/bloom-filter-search-engine/?print
 You have to install the numpy python module for bloom to work.
 """
 import bloom
 import json
 import os
 import sys
 from lxml import html
 import re
 import stemmer
-import json
+import struct
-from bitarray import bitarray
+import sys
-from pybloom import BloomFilter
+
 from lxml import html
 # List all files in path directory
 def list_directory(path):
-    fichier = []
+    """Recursively list all files in a given directory."""
    files_list = []
    for root, dirs, files in os.walk(path):
        for i in files:
-            fichier.append(os.path.join(root, i))
+            files_list.append(os.path.join(root, i))
-    return fichier
+    return files_list
 def remove_common_words(words):
    """Return the words longer than 3 characters, in their original order."""
    kept = []
    for candidate in words:
        if len(candidate) > 3:
            kept.append(candidate)
    return kept
 def padding_16(x):
    """Encode x as two bytes, most significant byte first."""
    high, low = divmod(x, 256)
    return bytes([high, low])
-# =============================================================================
+if __name__ == "__main__":
    error_rate = 0.1
    samples = list_directory("../samples/")
-filters = {}
+    filters = []
    p = stemmer.PorterStemmer()
 write_little = bitarray(endian="little")
 write_big = bitarray(endian="big")
 write_little.frombytes(padding_16(len(samples)))
 write_big.frombytes(padding_16(len(samples)))
 if len(samples) > 65535:
    sys.exit("[ERROR] Too many articles to index. You will have to change the "
             "way data is stored in the binary file to handle such amount of "
             "files.")
    for sample in samples:
        with open(sample, 'r') as sample_fh:
@ -51,42 +47,37 @@ for sample in samples:
        # Get text from HTML content
        words = html.fromstring(content).text_content().replace("\n", "")
        words = re.findall(r"[\w]+", words)
-    # Remove all punctuation etc., convert words to lower and delete duplicates
+        # Remove all punctuation etc., convert words to lower and delete
        # duplicates
        words = list(set([word.lower() for word in words]))
        # Remove common words
        words = remove_common_words(words)
        # Stemming to reduce the number of words
-    words = [p.stem(word, 0, len(word)-1) for word in words]
+        words = list(set([p.stem(word, 0, len(word)-1) for word in words]))
-    filters[sample] = BloomFilter(capacity=len(words), error_rate=0.1)
+        tmp_filter = bloom.BloomFilter(capacity=len(words),
                                       error_rate=error_rate)
        words = json.loads('["solut", "devic", "cryptkey2", "contain", "chang", "thi", "conf", "ckeyfiin", "support", "load", "here", "laptop", "file", "exampl", "paramet", "cryptsetup", "when", "proce", "line", "cryptkei", "wiki", "edit", "present", "describ", "ckei", "grub", "first", "warn", "mkinitcpio", "with", "updat", "mount", "manual", "ckeybyif", "least", "need", "multipl", "also", "found", "arch", "then", "us", "encrypt", "packag", "that", "over", "someth", "hook", "doesn", "avail", "avoid", "work", "which", "provid", "order", "initcpio", "anoth", "setup", "mean", "necessari", "default", "disk", "best", "linemkdir", "luk", "system", "unlock", "occurr", "requir", "command", "abl", "cryptdevice2", "encrypt2", "instal", "multi", "last", "extend", "obsolet", "boot", "your", "achiev", "second", "mkdir", "stuff", "final", "displai", "concern", "ad", "cryptdevic", "more", "copi"]')
        for word in words:
-        filters[sample].add(word)
+            tmp_filter.add(word)
-    if filters[sample].bitarray.length() > 65535:
+        filters.append(tmp_filter.buckets)
-        sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". You "
+        print(tmp_filter.buckets)
-                 "will have to change the way data is stored in the binary "
+        sys.exit()
                 "file to handle such amount of text.")
-    tmp = bitarray(endian="little")
+    # First Int32 is length
-    tmp.frombytes(padding_16(filters[sample].bitarray.length()))
+    filters_to_write = struct.pack("<i", len(filters))
-    write_little.extend(tmp)
+    # Then comes the length of each filter
-    write_little.extend(filters[sample].bitarray)
+    for i in filters:
-    write_little.extend([0 for i in range(filters[sample].bitarray.length() %
+        filters_to_write += struct.pack("<i", len(i))
-                                          8)])
+    # Finally comes the filters themselves
-    tmp = bitarray(endian="big")
+    for i in filters:
-    tmp.frombytes(padding_16(filters[sample].bitarray.length()))
+        filters_to_write += struct.pack("<%di" % len(i), *i)
    write_big.extend(tmp)
    write_big.extend(filters[sample].bitarray)
    write_big.extend([0 for i in range(filters[sample].bitarray.length() %
                                          8)])
-with open('../data/search_index_little', 'wb') as index_fh:
+    # Write everything
-    print(write_little)
+    with open("../data/filters", "wb") as index_fh:
-    write_little.tofile(index_fh)
+        index_fh.write(filters_to_write)
 with open('../data/search_index_big', 'wb') as index_fh:
    print(write_big)
    write_big.tofile(index_fh)
-with open('../data/pages_index.json', 'w') as pages_fh:
+    with open("../data/pages.json", "w") as pages_fh:
-    pages_fh.write(json.dumps(samples))
+        pages_fh.write(json.dumps({"index": samples}))
--- a/index_generation/pybloom.py
+++ b/index_generation/pybloom.py
@ -1,277 +0,0 @@
 import math
 import hashlib
 from struct import unpack, pack, calcsize
 try:
    import bitarray
 except ImportError:
    raise ImportError('pybloom requires bitarray >= 0.3.4')
 __version__ = '2.0'
 __author__ = "Jay Baird <jay.baird@me.com>, Bob Ippolito <bob@redivi.com>,\
 Marius Eriksen <marius@monkey.org>,\
 Alex Brasetvik <alex@brasetvik.com>"
 def make_hashfuncs(num_slices, num_bits):
    """
    Build a function mapping a key to ``num_slices`` integers, each in
    ``range(num_bits)``.

    The integer width read from the digests grows with ``num_bits`` and the
    hash algorithm is picked from the total number of hash bits required.
    """
    # Struct code and byte width of the integers unpacked from a digest.
    if num_bits < (1 << 15):
        fmt_code, chunk_size = 'H', 2
    elif num_bits < (1 << 31):
        fmt_code, chunk_size = 'I', 4
    else:
        fmt_code, chunk_size = 'Q', 8
    total_hash_bits = 8 * num_slices * chunk_size
    # First threshold exceeded wins; md5 is the fallback.
    thresholds = (
        (384, hashlib.sha512),
        (256, hashlib.sha384),
        (160, hashlib.sha256),
        (128, hashlib.sha1),
    )
    hashfn = next(
        (algo for limit, algo in thresholds if total_hash_bits > limit),
        hashlib.md5)
    fmt = fmt_code * (hashfn().digest_size // chunk_size)
    # Number of salted digests needed to produce num_slices integers
    # (ceiling division).
    num_salts = -(-num_slices // len(fmt))
    salts = []
    for i in range(num_salts):
        seed = hashfn(pack('I', i)).digest()
        salts.append(hashfn(seed))
    def _make_hashfuncs(key):
        payload = str(key).encode('utf-8')
        values = []
        for salt in salts:
            # Copy the pre-salted state so the salts stay reusable.
            digest = salt.copy()
            digest.update(payload)
            values.extend(
                uint % num_bits for uint in unpack(fmt, digest.digest()))
        return values[:num_slices]
    return _make_hashfuncs
 class BloomFilter(object):
    """
    Fixed-capacity Bloom filter stored in a little-endian bitarray.

    The bit array is split in ``num_slices`` slices of ``bits_per_slice``
    bits; every key sets (or tests) one bit in each slice.
    """
    # Header layout used by tofile/fromfile: error_rate, num_slices,
    # bits_per_slice, capacity, count.
    FILE_FMT = '<dQQQQ'
    def __init__(self, capacity, error_rate=0.001):
        """
        Create an empty filter.

        capacity
            Number of keys the filter is sized for; must be > 0.
        error_rate
            Target false-positive rate, strictly between 0 and 1.

        Raises ValueError when an argument is out of range.
        """
        if not (0 < error_rate < 1):
            raise ValueError("Error_Rate must be between 0 and 1.")
        if not capacity > 0:
            raise ValueError("Capacity must be > 0")
        num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
        bits_per_slice = int(math.ceil(
            (capacity * abs(math.log(error_rate))) /
            (num_slices * (math.log(2) ** 2))))
        self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
        self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
        self.bitarray.setall(False)
    def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
        """Store the filter parameters and rebuild the hashing function."""
        self.error_rate = error_rate
        self.num_slices = num_slices
        self.bits_per_slice = bits_per_slice
        self.capacity = capacity
        self.num_bits = num_slices * bits_per_slice
        self.count = count
        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
    def __contains__(self, key):
        """
        Return True if *key* is probably in the filter, False if it is
        definitely not. A list is used as-is as precomputed hash values.
        """
        hashes = key if isinstance(key, list) else self.make_hashes(key)
        bits = self.bitarray
        offset = 0
        for k in hashes:
            if not bits[offset + k]:
                return False
            offset += self.bits_per_slice
        return True
    def __len__(self):
        """Return the number of keys stored by this bloom filter."""
        return self.count
    def add(self, key, skip_check=False):
        """
        Add *key* to the filter.

        Returns True if the key already appeared to be present (nothing is
        changed), False otherwise. Unless *skip_check* is true, membership
        is tested first. Raises IndexError once count exceeds capacity.
        """
        hashes = self.make_hashes(key)
        if not skip_check and hashes in self:
            return True
        if self.count > self.capacity:
            raise IndexError("BloomFilter is at capacity")
        offset = 0
        for k in hashes:
            self.bitarray[offset + k] = True
            offset += self.bits_per_slice
        self.count += 1
        return False
    def copy(self):
        """Return a copy of this bloom filter.
 """
        # NOTE(review): only the bits are copied; the new filter's count
        # restarts at 0 -- confirm this is intended.
        new_filter = BloomFilter(self.capacity, self.error_rate)
        new_filter.bitarray = self.bitarray.copy()
        return new_filter
    def union(self, other):
        """ Calculates the union of the two underlying bitarrays and returns
 a new bloom filter object."""
        if self.capacity != other.capacity or \
            self.error_rate != other.error_rate:
            raise ValueError("Unioning filters requires both filters to have \
 both the same capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray | other.bitarray
        return new_bloom
    def __or__(self, other):
        """Operator form of union()."""
        return self.union(other)
    def intersection(self, other):
        """ Calculates the intersection of the two underlying bitarrays and returns
 a new bloom filter object."""
        if self.capacity != other.capacity or \
            self.error_rate != other.error_rate:
            raise ValueError("Intersecting filters requires both filters to \
 have equal capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray & other.bitarray
        return new_bloom
    def __and__(self, other):
        """Operator form of intersection()."""
        return self.intersection(other)
    def tofile(self, f):
        """Write the bloom filter to file object `f'. Underlying bits
 are written as machine values. This is much more space
 efficient than pickling the object."""
        f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
                     self.bits_per_slice, self.capacity, self.count))
        self.bitarray.tofile(f)
    @classmethod
    def fromfile(cls, f, n=-1):
        """Read a bloom filter from file-object `f' serialized with
 ``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
        headerlen = calcsize(cls.FILE_FMT)
        if 0 < n < headerlen:
            raise ValueError('n too small!')
        loaded = cls(1) # Bogus instantiation, we will `_setup'.
        loaded._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
        loaded.bitarray = bitarray.bitarray(endian='little')
        if n > 0:
            loaded.bitarray.fromfile(f, n - headerlen)
        else:
            loaded.bitarray.fromfile(f)
        # len() is the supported way to get the number of bits; it replaces
        # the deprecated bitarray.length() accessor.
        stored_bits = len(loaded.bitarray)
        # The stored length may include padding up to the next byte.
        padded_bits = loaded.num_bits + (8 - loaded.num_bits % 8)
        if loaded.num_bits != stored_bits and padded_bits != stored_bits:
            raise ValueError('Bit length mismatch!')
        return loaded
    def __getstate__(self):
        """Pickle support: drop the hashing closure, which is rebuilt on load."""
        d = self.__dict__.copy()
        del d['make_hashes']
        return d
    def __setstate__(self, d):
        """Pickle support: restore the attributes and rebuild the hashing closure."""
        self.__dict__.update(d)
        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
 class ScalableBloomFilter(object):
    """
    Bloom filter that grows by chaining BloomFilter instances.

    When the most recent filter is full, a new one is appended whose
    capacity is multiplied by ``scale`` and whose error rate is multiplied
    by ``ratio``.
    """
    SMALL_SET_GROWTH = 2 # slower, but takes up less memory
    LARGE_SET_GROWTH = 4 # faster, but takes up more memory faster
    # Header layout used by tofile/fromfile: scale, ratio, initial_capacity,
    # error_rate.
    FILE_FMT = '<idQd'
    def __init__(self, initial_capacity=100, error_rate=0.001,
                 mode=SMALL_SET_GROWTH):
        """
        initial_capacity
            Capacity of the first underlying filter.
        error_rate
            Overall error rate; must be greater than 0.
        mode
            Growth factor applied to the capacity of each new filter
            (SMALL_SET_GROWTH or LARGE_SET_GROWTH).

        Raises ValueError when error_rate is zero, None or negative.
        """
        if not error_rate or error_rate < 0:
            raise ValueError("Error_Rate must be a decimal greater than 0.")
        self._setup(mode, 0.9, initial_capacity, error_rate)
        self.filters = []
    def _setup(self, mode, ratio, initial_capacity, error_rate):
        """Store the growth parameters."""
        self.scale = mode
        self.ratio = ratio
        self.initial_capacity = initial_capacity
        self.error_rate = error_rate
    def __contains__(self, key):
        """Return True if any underlying filter (newest first) contains key."""
        for f in reversed(self.filters):
            if key in f:
                return True
        return False
    def add(self, key):
        """
        Add *key*; return True if it already appeared to be present, False
        otherwise. A new underlying filter is created when there is none yet
        or when the latest one has reached its capacity.
        """
        if key in self:
            return True
        if not self.filters:
            current = BloomFilter(
                capacity=self.initial_capacity,
                error_rate=self.error_rate * (1.0 - self.ratio))
            self.filters.append(current)
        else:
            current = self.filters[-1]
            if current.count >= current.capacity:
                current = BloomFilter(
                    capacity=current.capacity * self.scale,
                    error_rate=current.error_rate * self.ratio)
                self.filters.append(current)
        # Membership was already checked against every filter above.
        current.add(key, skip_check=True)
        return False
    @property
    def capacity(self):
        """Returns the total capacity for all filters in this SBF"""
        return sum([f.capacity for f in self.filters])
    @property
    def count(self):
        """Total number of stored elements (same as len(self))."""
        return len(self)
    def tofile(self, f):
        """Serialize this ScalableBloomFilter into the file-object
 `f'."""
        f.write(pack(self.FILE_FMT, self.scale, self.ratio,
                     self.initial_capacity, self.error_rate))
        # Write #-of-filters
        f.write(pack('<l', len(self.filters)))
        if len(self.filters) > 0:
            # Then each filter directly, with a header describing
            # their lengths.
            headerpos = f.tell()
            headerfmt = '<' + 'Q'*(len(self.filters))
            # Reserve room for the header; it is overwritten below once the
            # sizes are known. Must be bytes: f is a binary stream.
            f.write(b'.' * calcsize(headerfmt))
            filter_sizes = []
            for sub_filter in self.filters:
                begin = f.tell()
                sub_filter.tofile(f)
                filter_sizes.append(f.tell() - begin)
            f.seek(headerpos)
            f.write(pack(headerfmt, *filter_sizes))
    @classmethod
    def fromfile(cls, f):
        """Deserialize the ScalableBloomFilter in file object `f'."""
        loaded = cls()
        loaded._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
        nfilters, = unpack('<l', f.read(calcsize('<l')))
        if nfilters > 0:
            header_fmt = '<' + 'Q'*nfilters
            header = f.read(calcsize(header_fmt))
            filter_lengths = unpack(header_fmt, header)
            for fl in filter_lengths:
                loaded.filters.append(BloomFilter.fromfile(f, fl))
        else:
            loaded.filters = []
        return loaded
    def __len__(self):
        """Returns the total number of elements stored in this SBF"""
        return sum([f.count for f in self.filters])
--- a/js/app.js
+++ b/js/app.js
@ -0,0 +1,121 @@
 /* Params */
 var error_rate = 0.1;
 /* Vars */
 var bloom = Array(), index;
 var ready = false;
 /* Functions */
 function callback() {
    // Called after each download completes; does nothing until both the
    // pages index and at least one Bloom filter are available.
    var index_loaded = typeof(index) !== 'undefined';
    var filters_loaded = bloom.length != 0;
    if (!(index_loaded && filters_loaded)) {
        return;
    }
    // Everything is loaded: swap the placeholder for the search form
    ready = true;
    var main_node = document.getElementById('main');
    main_node.innerHTML = '<form id="search_form"><p><input type="text" id="search" name="search" placeholder="Search for articles..."/></p></form>';
    // Refresh the results on every keystroke
    var search_box = document.getElementById('search');
    search_box.oninput = function (e) {
        if (ready) {
            filter_results(e.target.value);
        }
    }
 }
 // Checks the terms one by one against the bloom filter b and stops at the
 // first miss. Returns true iff every term passes b.test (also true for an
 // empty array).
 function terms_in_bloom(terms, b) {
    var position = 0;
    while (position < terms.length) {
        if (b.test(terms[position])) {
            position += 1;
            continue;
        }
        return false;
    }
    return true;
 }
 // Filter the results to match the query: lists every page of the index
 // whose Bloom filter contains all the (stemmed) terms of the query. An
 // empty or blank query simply clears the results.
 function filter_results(query) {
    var results_node = document.getElementById('results');
    var trimmed = query.trim();
    if (trimmed === "") {
        results_node.innerHTML = "";
        return;
    }
    // The index generator lowercases the words before stemming them, so do
    // the same here. Splitting the trimmed query on runs of whitespace
    // avoids empty terms.
    var search_terms = trimmed.toLowerCase().split(/\s+/).map(function (term) {
        return stemmer(term);
    });
    var results = Array();
    for (var i = 0; i < index.length; i++) {
        if (terms_in_bloom(search_terms, bloom[i])) {
            results.push(index[i]);
        }
    }
    // Declared locally so it does not leak as a global
    var results_html;
    if (results.length > 0) {
        results_html = '<ul>';
        for (var j = 0; j < results.length; j++) {
            results_html += '<li>' + results[j] + '</li>';
        }
        results_html += '</ul>';
    }
    else {
        results_html = '<p>Aucun résultat.</p>';
    }
    results_node.innerHTML = results_html;
 }
 /* App */
 // Get the words index (a.k.a. Bloom Filter)
 // The binary file is read as a sequence of Int32 values:
 //   [number of filters][length of each filter][words of each filter]
 var oReq = new XMLHttpRequest();
 oReq.open("GET", "data/filters", true);
 oReq.responseType = "arraybuffer";
 oReq.onload = function (oEvent) {
    var array_buffer = oReq.response;
    if (array_buffer) {
        var byte_array = new Int32Array(array_buffer);
        // First element is the number of bloom filters in the binary file
        var nb_bloom_filters = byte_array[0];
        // nb_bloom_filters next elements are the lengths of the arrays
        var lengths = Array();
        for (var i = 0; i < nb_bloom_filters; i++) {
            lengths.push(byte_array[1 + i]);
        }
        // Then, builds Bloom filters. offset is the position of the first
        // word of the current filter; it has to accumulate the lengths of
        // all the previous filters.
        var offset = 1 + nb_bloom_filters, tmp_array;
        for (var j = 0; j < nb_bloom_filters; j++) {
            tmp_array = byte_array.subarray(offset, offset + lengths[j]);
            offset += lengths[j];
            bloom.push(new BloomFilter(tmp_array, error_rate));
        }
        callback();
    }
    else {
        document.getElementById('error').innerHTML = 'Unable to load the bloom filters.';
    }
 };
 oReq.send(null);
 // Get the pages index: a JSON document whose "index" key holds the list
 // of pages, stored in the global `index`
 var req = new XMLHttpRequest();
 req.open('GET', 'data/pages.json', true);
 req.onreadystatechange = function () {
    // Only act once the request is complete
    if (req.readyState != 4) {
        return;
    }
    if (req.status != 200) {
        document.getElementById('error').innerHTML = 'Unable to load the index.';
        return;
    }
    var parsed = JSON.parse(req.responseText);
    index = parsed['index'];
    callback();
 };
 req.send(null);
--- a/js/bloom.js
+++ b/js/bloom.js
@ -22,23 +22,26 @@
    function BloomFilter(capacity, error_rate) {
        // *m* is the number of bits. Note that *m* is rounded up to
        // the nearest multiple of 32. *k* specifies the number of hashing functions.
        if (error_rate < 0 || error_rate > 1 || (typeof(capacity) === "number" && capacity < 0)) {
            return false;
        }
        var a, i = -1;
        // Number of slices, k
        var k = Math.ceil(- Math.log(error_rate) / Math.log(2));
        // Total number of bits, m
-        // Size of the UInt32 table, n
+        // Size of the Int32 table, n
        var m, n;
        if (typeof capacity !== "number") {
            a = capacity;
            // Total number of bits, m
            m = a.length * 32;
-            // Size of the UInt32 table, n
+            // Size of the Int32 table, n
            n = a.length;
        }
        else {
            // Total number of bits, m
            m = Math.ceil(capacity * Math.abs(Math.log(error_rate)) / (k * Math.pow(Math.log(2), 2))) * k;
-            // Size of the UInt32 table, n
+            // Size of the Int32 table, n
            n = Math.ceil(m / 32);
            // Round total number of bits to closest multiple of 32
            m = n * 32;
--- a/js/stemmer.js
+++ b/js/stemmer.js
@ -0,0 +1,186 @@
 // Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
 // paper, in
 //
 //  Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
 //  no. 3, pp 130-137,
 //
 // see also http://www.tartarus.org/~martin/PorterStemmer
 // Release 1 by 'andargor', Jul 2004
 // Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
 // stemmer(w) returns the stem of the word w. The suffix tables and the
 // regular-expression fragments are built once, inside the enclosing
 // function, and shared by every call.
 var stemmer = (function(){
 	// Step 2 suffixes and their replacements
 	var step2list = {
 			"ational" : "ate",
 			"tional" : "tion",
 			"enci" : "ence",
 			"anci" : "ance",
 			"izer" : "ize",
 			"bli" : "ble",
 			"alli" : "al",
 			"entli" : "ent",
 			"eli" : "e",
 			"ousli" : "ous",
 			"ization" : "ize",
 			"ation" : "ate",
 			"ator" : "ate",
 			"alism" : "al",
 			"iveness" : "ive",
 			"fulness" : "ful",
 			"ousness" : "ous",
 			"aliti" : "al",
 			"iviti" : "ive",
 			"biliti" : "ble",
 			"logi" : "log"
 		},
 		// Step 3 suffixes and their replacements
 		step3list = {
 			"icate" : "ic",
 			"ative" : "",
 			"alize" : "al",
 			"iciti" : "ic",
 			"ical" : "ic",
 			"ful" : "",
 			"ness" : ""
 		},
 		// Regular-expression source fragments used to build the tests below
 		c = "[^aeiou]",          // consonant
 		v = "[aeiouy]",          // vowel
 		C = c + "[^aeiouy]*",    // consonant sequence
 		V = v + "[aeiou]*",      // vowel sequence
 		mgr0 = "^(" + C + ")?" + V + C,               // [C]VC... is m>0
 		meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$",  // [C]VC[V] is m=1
 		mgr1 = "^(" + C + ")?" + V + C + V + C,       // [C]VCVC... is m>1
 		s_v = "^(" + C + ")?" + v;                   // vowel in stem
 	return function (w) {
 		// NOTE: origword is assigned but never read in this function
 		var 	stem,
 			suffix,
 			firstch,
 			re,
 			re2,
 			re3,
 			re4,
 			origword = w;
 		// Words shorter than 3 characters are returned unchanged
 		if (w.length < 3) { return w; }
 		// An initial "y" is upper-cased so that it does not match the
 		// (lower-case) vowel class; it is restored before returning
 		firstch = w.substr(0,1);
 		if (firstch == "y") {
 			w = firstch.toUpperCase() + w.substr(1);
 		}
 		// Step 1a
 		re = /^(.+?)(ss|i)es$/;
 		re2 = /^(.+?)([^s])s$/;
 		if (re.test(w)) { w = w.replace(re,"$1$2"); }
 		else if (re2.test(w)) {	w = w.replace(re2,"$1$2"); }
 		// Step 1b
 		re = /^(.+?)eed$/;
 		re2 = /^(.+?)(ed|ing)$/;
 		if (re.test(w)) {
 			var fp = re.exec(w);
 			re = new RegExp(mgr0);
 			if (re.test(fp[1])) {
 				// Drop the last character
 				re = /.$/;
 				w = w.replace(re,"");
 			}
 		} else if (re2.test(w)) {
 			var fp = re2.exec(w);
 			stem = fp[1];
 			re2 = new RegExp(s_v);
 			if (re2.test(stem)) {
 				w = stem;
 				// Tidy up the stem: add an "e", or drop the last
 				// character of a doubled letter
 				re2 = /(at|bl|iz)$/;
 				re3 = new RegExp("([^aeiouylsz])\\1$");
 				re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
 				if (re2.test(w)) {	w = w + "e"; }
 				else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); }
 				else if (re4.test(w)) { w = w + "e"; }
 			}
 		}
 		// Step 1c
 		re = /^(.+?)y$/;
 		if (re.test(w)) {
 			var fp = re.exec(w);
 			stem = fp[1];
 			re = new RegExp(s_v);
 			if (re.test(stem)) { w = stem + "i"; }
 		}
 		// Step 2
 		re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
 		if (re.test(w)) {
 			var fp = re.exec(w);
 			stem = fp[1];
 			suffix = fp[2];
 			re = new RegExp(mgr0);
 			if (re.test(stem)) {
 				w = stem + step2list[suffix];
 			}
 		}
 		// Step 3
 		re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
 		if (re.test(w)) {
 			var fp = re.exec(w);
 			stem = fp[1];
 			suffix = fp[2];
 			re = new RegExp(mgr0);
 			if (re.test(stem)) {
 				w = stem + step3list[suffix];
 			}
 		}
 		// Step 4
 		re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
 		re2 = /^(.+?)(s|t)(ion)$/;
 		if (re.test(w)) {
 			var fp = re.exec(w);
 			stem = fp[1];
 			re = new RegExp(mgr1);
 			if (re.test(stem)) {
 				w = stem;
 			}
 		} else if (re2.test(w)) {
 			var fp = re2.exec(w);
 			stem = fp[1] + fp[2];
 			re2 = new RegExp(mgr1);
 			if (re2.test(stem)) {
 				w = stem;
 			}
 		}
 		// Step 5
 		re = /^(.+?)e$/;
 		if (re.test(w)) {
 			var fp = re.exec(w);
 			stem = fp[1];
 			re = new RegExp(mgr1);
 			re2 = new RegExp(meq1);
 			re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
 			if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {
 				w = stem;
 			}
 		}
 		// A final "ll" loses one "l" when the word matches mgr1
 		re = /ll$/;
 		re2 = new RegExp(mgr1);
 		if (re.test(w) && re2.test(w)) {
 			re = /.$/;
 			w = w.replace(re,"");
 		}
 		// and turn initial Y back to y
 		if (firstch == "y") {
 			w = firstch.toLowerCase() + w.substr(1);
 		}
 		return w;
 	}
 })();
--- a/js/test.js
+++ b/js/test.js
@ -1,7 +1,6 @@
 /* These are some basic unit-tests for the bloom.js module */
 var bloom = new BloomFilter(4, 0.1);
 console.log(bloom);
 // Add some elements to the filter.
 bloom.add("foo");
@ -22,6 +21,7 @@ var array = [].slice.call(bloom.buckets),
    json = JSON.stringify(array);
 console.log(array);
 console.log(json);
 // Deserialisation. Note that any array-like object is supported, but
 // this will be used directly, so you may wish to use a typed array for
--- a/js/test2.js
+++ b/js/test2.js
@ -0,0 +1,25 @@
 /* Manual check of the bloom.js module: fills a filter with a fixed list of
  * stemmed words and logs, for each of them, the result of the lookup. */
 // Word list parsed from an inline JSON array
 var words = JSON.parse('["solut", "devic", "cryptkey2", "contain", "chang", "thi", "conf", "ckeyfiin", "support", "load", "here", "laptop", "file", "exampl", "paramet", "cryptsetup", "when", "proce", "line", "cryptkei", "wiki", "edit", "present", "describ", "ckei", "grub", "first", "warn", "mkinitcpio", "with", "updat", "mount", "manual", "ckeybyif", "least", "need", "multipl", "also", "found", "arch", "then", "us", "encrypt", "packag", "that", "over", "someth", "hook", "doesn", "avail", "avoid", "work", "which", "provid", "order", "initcpio", "anoth", "setup", "mean", "necessari", "default", "disk", "best", "linemkdir", "luk", "system", "unlock", "occurr", "requir", "command", "abl", "cryptdevice2", "encrypt2", "instal", "multi", "last", "extend", "obsolet", "boot", "your", "achiev", "second", "mkdir", "stuff", "final", "displai", "concern", "ad", "cryptdevic", "more", "copi"]');
 // Filter sized for exactly this list, with a 0.1 error rate
 var bloom2 = new BloomFilter(words.length, 0.1);
 console.log(bloom2);
 // Add every word to the filter.
 for (var i = 0; i < words.length; i++) {
    bloom2.add(words[i]);
 }
 // Test if an item is in our filter.
 // Returns true if an item is probably in the set,
 // or false if an item is definitely not in the set.
 for (var i = 0; i < words.length; i++) {
    console.log(words[i] + " : " + bloom2.test(words[i]));
 }
 // Serialisation. Note that bloom.buckets may be a typed array,
 // so we convert to a normal array first.
 // NOTE: array and json are computed but only the raw buckets are logged.
 var array = [].slice.call(bloom2.buckets),
    json = JSON.stringify(array);
 console.log(bloom2.buckets);
--- a/test.html
+++ b/test.html
@ -0,0 +1,13 @@
 <!doctype html>
 <html lang="fr">
    <head>
        <meta charset="utf-8">
        <title>BloomJS demo</title>
    </head>
 <body>
    <h1>Bloom.JS demo</h1>
    <p>This page runs the <code>bloom.js</code> library unit-tests. Look at your console output for <code>assert</code> errors and verbose debugging.</p>
    <script type="text/javascript" src="js/bloom.js"></script>
    <script type="text/javascript" src="js/test2.js"></script>
 </body>
 </html>
		`@ -0,0 +1 @@`
							`{"index": ["../samples/cryptdevice_multi.html", "../samples/highmon_weechat.html"]}`