bloomysearch/index_generation/bloom.py

#!/usr/bin/env python3

"""
This is a translation of the bloom.js script (originally from
https://github.com/jasondavies/bloomfilter.js) in Python.

Due to its status of translation of the previously mentionned JS code, you
should refer to this one for any particular doc that should be missing in this
implementation.

Needs the bitarray python module to work.

Note : Depending on your use case, the pybloom module available on Pypi may
better suits your needs. I reimplemented the above mentionned JS script in
Python mostly because I had to for this script, as the pybloom module uses
advanced hashing techniques, difficult to implement in JS.

This script has been written by Phyks and is in the public domain (or whatever
is closer to public domain in your country).
"""

import ctypes
import math

try:
    import numpy as np
except ImportError:
    raise ImportError('This script requires numpy')


class BloomFilter():
    def __init__(self, capacity, error_rate=0.1):
        """
        Implements a space-efficient probabilistic data structure.

        capacity
            This is the capacity of the BloomFilter. So to speak, it should be
            able to store at least *capacity* elements
        error_rate
            the error rate of the filter returning false positives. This
            determines the filters capacity. Inserting more than capacity
            elements greatly increases the chance of false positive.
        """
        if not (0 < error_rate < 1):
            raise ValueError("Error_Rate must be between 0 and 1.")
        if not capacity > 0 or type(capacity) != int:
            raise ValueError("Capacity must be > 0")

        # Same calculation as in the js file, see it for reference purpose
        # Basically determines the number of bits and slices from the capacity
        # and error_rate.
        k = math.ceil(- math.log(error_rate, 2))
        m = math.ceil(capacity * abs(math.log(error_rate)) / (k * (math.log(2) ** 2))) * k
        n = math.ceil(m / 32)
        m = n * 32
        self.m = m
        self.k = k

        kbytes = ctypes.c_int(1 << math.ceil(math.log(math.ceil(math.log(m, 2) / 8), 2))).value
        self.buckets = np.zeros(n, dtype=np.int32)
        if kbytes == 1:
            loc_type = np.uint8
        elif kbytes == 2:
            loc_type = np.uint16
        else:
            loc_type = np.int32
        self._locations = np.zeros(k, dtype=loc_type)

    def mod(self, a, b):
        """
        Tweak the % operator so that it behaves like in C and in JS.
        """
        if a > 0:
            return a % b
        else:
            return - (abs(a) % b)

    def locations(self, v):
        r = self._locations
        a = self.fnv_1a(v)
        b = self.fnv_1a_b(a)
        i = 0
        x = self.mod(a, self.m)
        while i < self.k:
            r[i] = (x + self.m) if x < 0 else x
            x = self.mod(x + b, self.m)
            i += 1
        return r

    def add(self, v):
        l = self.locations(str(v))
        i = 0
        buckets = self.buckets
        while i < self.k:
            index = math.floor(l[i] / 32)
            buckets[index] |= ctypes.c_int(1 << self.mod(l[i], 32)).value
            buckets[index] = ctypes.c_int(buckets[index]).value
            i += 1

    def test(self, v):
        l = self.locations(str(v))
        i = 0
        buckets = self.buckets
        while i < self.k:
            b = l[i]
            if ctypes.c_int(buckets[math.floor(b / 32)] & ctypes.c_int(1 << (self.mod(b, 32))).value).value == 0:
                return False
            i += 1
        return True

    def size(self):
        """
        Estimated cardinality
        """
        bits = 0
        buckets = self.buckets
        for i in range(0, len(buckets)):
            bits += self.popcnt(buckets[i])
        return -self.m * math.log(1 - bits / self.m) / self.k

    def popcnt(self, v):
        """
        http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
        """
        v -= ctypes.c_int(ctypes.c_int(v >> 1).value & ctypes.c_int(0x55555555).value).value
        v = ctypes.c_int(v & 0x33333333).value + c_types.c_int(ctypes.c_int(v >> 2).value & 0x33333333).value
        return ctypes.c_int((ctypes.c_int((v + ctypes.c_int(v >> 4).value) & 0xF0F0F0F).value * 0x1010101) >> 24).value


    def rshift(self,val, n):
        """
        Implements the >>> JS operator.

        From https://stackoverflow.com/questions/5832982/how-to-get-the-logical-right-binary-shift-in-python
        """
        return (val % 0x100000000) >> n

    def fnv_1a(self, v):
        """
        Fowler/Noll/Vo hashing.

        Uses a lot of ctypes.c_int because int in JS are represented as 64 bits
        floats. This representation is used for every arithmetical operations
        but not for bitwise operations. In this case they are treated as 32 bits
        integers !
        """
        n = len(v)
        a = 2166136261
        i = 0
        while i < n:
            c = ord(v[i])
            d = ctypes.c_int(c & 0xff000000).value
            if d:
                a ^= ctypes.c_int(d >> 24).value
                a = ctypes.c_int(a).valu
                a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value
            d = ctypes.c_int(c & 0xff0000).value
            if d:
                a ^= ctypes.c_int(d >> 16).value
                a = ctypes.c_int(a).valu
                a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value
            d = ctypes.c_int(c & 0xff00).value
            if d:
                a ^= ctypes.c_int(d >> 8).value
                a = ctypes.c_int(a).value
                a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value
            a ^= ctypes.c_int(c & 0xff).value
            a = ctypes.c_int(a).value
            a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value
            i += 1
        # From http://home.comcast.net/~bretm/hash/6.html
        a += ctypes.c_int(a << 13).value
        a ^= ctypes.c_int(self.rshift(a, 7)).value
        a = ctypes.c_int(a).value
        a += ctypes.c_int(a << 3).value
        a ^= ctypes.c_int(self.rshift(a, 17)).value
        a = ctypes.c_int(a).value
        a += ctypes.c_int(a << 5).value
        return ctypes.c_int(a & 0xffffffff).value

    def fnv_1a_b(self, a):
        """
        One additional iteration of FNV, given a hash.

        ctypes used, as explained above.
        """
        a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value
        a += ctypes.c_int(a << 13).value
        a ^= ctypes.c_int(self.rshift(a, 7)).value
        a = ctypes.c_int(a).value
        a += ctypes.c_int(a << 3).value
        a ^= ctypes.c_int(self.rshift(a, 17)).value
        a = ctypes.c_int(a).value
        a += ctypes.c_int(a << 5).value
        return ctypes.c_int(a & 0xffffffff).value
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`#!/usr/bin/env python3`

			`"""`
			`This is a translation of the bloom.js script (originally from`
			`https://github.com/jasondavies/bloomfilter.js) in Python.`

			`Due to its status of translation of the previously mentionned JS code, you`
			`should refer to this one for any particular doc that should be missing in this`
			`implementation.`

			`Needs the bitarray python module to work.`

			`Note : Depending on your use case, the pybloom module available on Pypi may`
			`better suits your needs. I reimplemented the above mentionned JS script in`
			`Python mostly because I had to for this script, as the pybloom module uses`
			`advanced hashing techniques, difficult to implement in JS.`

			`This script has been written by Phyks and is in the public domain (or whatever`
			`is closer to public domain in your country).`
			`"""`

Working ! 2014-10-28 20:08:47 +01:00			`import ctypes`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`import math`

			`try:`
			`import numpy as np`
			`except ImportError:`
			`raise ImportError('This script requires numpy')`


			`class BloomFilter():`
			`def __init__(self, capacity, error_rate=0.1):`
			`"""`
			`Implements a space-efficient probabilistic data structure.`

			`capacity`
			`This is the capacity of the BloomFilter. So to speak, it should be`
			`able to store at least capacity elements`
			`error_rate`
			`the error rate of the filter returning false positives. This`
			`determines the filters capacity. Inserting more than capacity`
			`elements greatly increases the chance of false positive.`
			`"""`
			`if not (0 < error_rate < 1):`
			`raise ValueError("Error_Rate must be between 0 and 1.")`
			`if not capacity > 0 or type(capacity) != int:`
			`raise ValueError("Capacity must be > 0")`

			`# Same calculation as in the js file, see it for reference purpose`
			`# Basically determines the number of bits and slices from the capacity`
			`# and error_rate.`
			`k = math.ceil(- math.log(error_rate, 2))`
			`m = math.ceil(capacity * abs(math.log(error_rate)) / (k * (math.log(2) ** 2))) * k`
			`n = math.ceil(m / 32)`
			`m = n * 32`
			`self.m = m`
			`self.k = k`

Working ! 2014-10-28 20:08:47 +01:00			`kbytes = ctypes.c_int(1 << math.ceil(math.log(math.ceil(math.log(m, 2) / 8), 2))).value`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`self.buckets = np.zeros(n, dtype=np.int32)`
			`if kbytes == 1:`
			`loc_type = np.uint8`
			`elif kbytes == 2:`
			`loc_type = np.uint16`
			`else:`
			`loc_type = np.int32`
			`self._locations = np.zeros(k, dtype=loc_type)`

Working ! 2014-10-28 20:08:47 +01:00			`def mod(self, a, b):`
			`"""`
			`Tweak the % operator so that it behaves like in C and in JS.`
			`"""`
			`if a > 0:`
			`return a % b`
			`else:`
			`return - (abs(a) % b)`

Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`def locations(self, v):`
			`r = self._locations`
			`a = self.fnv_1a(v)`
			`b = self.fnv_1a_b(a)`
			`i = 0`
Working ! 2014-10-28 20:08:47 +01:00			`x = self.mod(a, self.m)`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`while i < self.k:`
			`r[i] = (x + self.m) if x < 0 else x`
Working ! 2014-10-28 20:08:47 +01:00			`x = self.mod(x + b, self.m)`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`i += 1`
			`return r`

			`def add(self, v):`
Working ! 2014-10-28 20:08:47 +01:00			`l = self.locations(str(v))`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`i = 0`
			`buckets = self.buckets`
			`while i < self.k:`
Working ! 2014-10-28 20:08:47 +01:00			`index = math.floor(l[i] / 32)`
			`buckets[index] \|= ctypes.c_int(1 << self.mod(l[i], 32)).value`
			`buckets[index] = ctypes.c_int(buckets[index]).value`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`i += 1`

			`def test(self, v):`
Working ! 2014-10-28 20:08:47 +01:00			`l = self.locations(str(v))`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`i = 0`
			`buckets = self.buckets`
			`while i < self.k:`
			`b = l[i]`
Working ! 2014-10-28 20:08:47 +01:00			`if ctypes.c_int(buckets[math.floor(b / 32)] & ctypes.c_int(1 << (self.mod(b, 32))).value).value == 0:`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`return False`
			`i += 1`
			`return True`

			`def size(self):`
			`"""`
			`Estimated cardinality`
			`"""`
			`bits = 0`
			`buckets = self.buckets`
			`for i in range(0, len(buckets)):`
			`bits += self.popcnt(buckets[i])`
			`return -self.m * math.log(1 - bits / self.m) / self.k`

			`def popcnt(self, v):`
			`"""`
			`http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel`
			`"""`
Working ! 2014-10-28 20:08:47 +01:00			`v -= ctypes.c_int(ctypes.c_int(v >> 1).value & ctypes.c_int(0x55555555).value).value`
			`v = ctypes.c_int(v & 0x33333333).value + c_types.c_int(ctypes.c_int(v >> 2).value & 0x33333333).value`
			`return ctypes.c_int((ctypes.c_int((v + ctypes.c_int(v >> 4).value) & 0xF0F0F0F).value * 0x1010101) >> 24).value`


			`def rshift(self,val, n):`
			`"""`
			`Implements the >>> JS operator.`

			`From https://stackoverflow.com/questions/5832982/how-to-get-the-logical-right-binary-shift-in-python`
			`"""`
			`return (val % 0x100000000) >> n`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00
			`def fnv_1a(self, v):`
			`"""`
			`Fowler/Noll/Vo hashing.`
Working ! 2014-10-28 20:08:47 +01:00
			`Uses a lot of ctypes.c_int because int in JS are represented as 64 bits`
			`floats. This representation is used for every arithmetical operations`
			`but not for bitwise operations. In this case they are treated as 32 bits`
			`integers !`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`"""`
			`n = len(v)`
TODO : https://stackoverflow.com/questions/1694507/difference-between-operator-in-js-and-python 2014-10-28 02:54:43 +01:00			`a = 2166136261`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`i = 0`
			`while i < n:`
TODO : https://stackoverflow.com/questions/1694507/difference-between-operator-in-js-and-python 2014-10-28 02:54:43 +01:00			`c = ord(v[i])`
Working ! 2014-10-28 20:08:47 +01:00			`d = ctypes.c_int(c & 0xff000000).value`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`if d:`
Working ! 2014-10-28 20:08:47 +01:00			`a ^= ctypes.c_int(d >> 24).value`
			`a = ctypes.c_int(a).valu`
			`a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value`
			`d = ctypes.c_int(c & 0xff0000).value`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`if d:`
Working ! 2014-10-28 20:08:47 +01:00			`a ^= ctypes.c_int(d >> 16).value`
			`a = ctypes.c_int(a).valu`
			`a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value`
			`d = ctypes.c_int(c & 0xff00).value`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`if d:`
Working ! 2014-10-28 20:08:47 +01:00			`a ^= ctypes.c_int(d >> 8).value`
			`a = ctypes.c_int(a).value`
			`a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value`
			`a ^= ctypes.c_int(c & 0xff).value`
			`a = ctypes.c_int(a).value`
			`a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`i += 1`
			`# From http://home.comcast.net/~bretm/hash/6.html`
Working ! 2014-10-28 20:08:47 +01:00			`a += ctypes.c_int(a << 13).value`
			`a ^= ctypes.c_int(self.rshift(a, 7)).value`
			`a = ctypes.c_int(a).value`
			`a += ctypes.c_int(a << 3).value`
			`a ^= ctypes.c_int(self.rshift(a, 17)).value`
			`a = ctypes.c_int(a).value`
			`a += ctypes.c_int(a << 5).value`
			`return ctypes.c_int(a & 0xffffffff).value`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00
			`def fnv_1a_b(self, a):`
			`"""`
			`One additional iteration of FNV, given a hash.`
Working ! 2014-10-28 20:08:47 +01:00
			`ctypes used, as explained above.`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`"""`
Working ! 2014-10-28 20:08:47 +01:00			`a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value`
			`a += ctypes.c_int(a << 13).value`
			`a ^= ctypes.c_int(self.rshift(a, 7)).value`
			`a = ctypes.c_int(a).value`
			`a += ctypes.c_int(a << 3).value`
			`a ^= ctypes.c_int(self.rshift(a, 17)).value`
			`a = ctypes.c_int(a).value`
			`a += ctypes.c_int(a << 5).value`
			`return ctypes.c_int(a & 0xffffffff).value`