2014-10-28 02:36:28 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
"""
|
|
|
|
This is a translation of the bloom.js script (originally from
|
|
|
|
https://github.com/jasondavies/bloomfilter.js) in Python.
|
|
|
|
|
|
|
|
Due to its status of translation of the previously mentionned JS code, you
|
|
|
|
should refer to this one for any particular doc that should be missing in this
|
|
|
|
implementation.
|
|
|
|
|
|
|
|
Needs the bitarray python module to work.
|
|
|
|
|
|
|
|
Note : Depending on your use case, the pybloom module available on Pypi may
|
|
|
|
better suits your needs. I reimplemented the above mentionned JS script in
|
|
|
|
Python mostly because I had to for this script, as the pybloom module uses
|
|
|
|
advanced hashing techniques, difficult to implement in JS.
|
|
|
|
|
|
|
|
This script has been written by Phyks and is in the public domain (or whatever
|
|
|
|
is closer to public domain in your country).
|
|
|
|
"""
|
|
|
|
|
2014-10-28 20:08:47 +01:00
|
|
|
import ctypes
|
2014-10-28 02:36:28 +01:00
|
|
|
import math
|
|
|
|
|
|
|
|
try:
|
|
|
|
import numpy as np
|
|
|
|
except ImportError:
|
|
|
|
raise ImportError('This script requires numpy')
|
|
|
|
|
|
|
|
|
|
|
|
class BloomFilter():
|
|
|
|
def __init__(self, capacity, error_rate=0.1):
|
|
|
|
"""
|
|
|
|
Implements a space-efficient probabilistic data structure.
|
|
|
|
|
|
|
|
capacity
|
|
|
|
This is the capacity of the BloomFilter. So to speak, it should be
|
|
|
|
able to store at least *capacity* elements
|
|
|
|
error_rate
|
|
|
|
the error rate of the filter returning false positives. This
|
|
|
|
determines the filters capacity. Inserting more than capacity
|
|
|
|
elements greatly increases the chance of false positive.
|
|
|
|
"""
|
|
|
|
if not (0 < error_rate < 1):
|
|
|
|
raise ValueError("Error_Rate must be between 0 and 1.")
|
|
|
|
if not capacity > 0 or type(capacity) != int:
|
|
|
|
raise ValueError("Capacity must be > 0")
|
|
|
|
|
|
|
|
# Same calculation as in the js file, see it for reference purpose
|
|
|
|
# Basically determines the number of bits and slices from the capacity
|
|
|
|
# and error_rate.
|
|
|
|
k = math.ceil(- math.log(error_rate, 2))
|
|
|
|
m = math.ceil(capacity * abs(math.log(error_rate)) / (k * (math.log(2) ** 2))) * k
|
|
|
|
n = math.ceil(m / 32)
|
|
|
|
m = n * 32
|
|
|
|
self.m = m
|
|
|
|
self.k = k
|
|
|
|
|
2014-10-28 20:08:47 +01:00
|
|
|
kbytes = ctypes.c_int(1 << math.ceil(math.log(math.ceil(math.log(m, 2) / 8), 2))).value
|
2014-10-28 02:36:28 +01:00
|
|
|
self.buckets = np.zeros(n, dtype=np.int32)
|
|
|
|
if kbytes == 1:
|
|
|
|
loc_type = np.uint8
|
|
|
|
elif kbytes == 2:
|
|
|
|
loc_type = np.uint16
|
|
|
|
else:
|
|
|
|
loc_type = np.int32
|
|
|
|
self._locations = np.zeros(k, dtype=loc_type)
|
|
|
|
|
2014-10-28 20:08:47 +01:00
|
|
|
def mod(self, a, b):
|
|
|
|
"""
|
|
|
|
Tweak the % operator so that it behaves like in C and in JS.
|
|
|
|
"""
|
|
|
|
if a > 0:
|
|
|
|
return a % b
|
|
|
|
else:
|
|
|
|
return - (abs(a) % b)
|
|
|
|
|
2014-10-28 02:36:28 +01:00
|
|
|
def locations(self, v):
|
|
|
|
r = self._locations
|
|
|
|
a = self.fnv_1a(v)
|
|
|
|
b = self.fnv_1a_b(a)
|
|
|
|
i = 0
|
2014-10-28 20:08:47 +01:00
|
|
|
x = self.mod(a, self.m)
|
2014-10-28 02:36:28 +01:00
|
|
|
while i < self.k:
|
|
|
|
r[i] = (x + self.m) if x < 0 else x
|
2014-10-28 20:08:47 +01:00
|
|
|
x = self.mod(x + b, self.m)
|
2014-10-28 02:36:28 +01:00
|
|
|
i += 1
|
|
|
|
return r
|
|
|
|
|
|
|
|
def add(self, v):
|
2014-10-28 20:08:47 +01:00
|
|
|
l = self.locations(str(v))
|
2014-10-28 02:36:28 +01:00
|
|
|
i = 0
|
|
|
|
buckets = self.buckets
|
|
|
|
while i < self.k:
|
2014-10-28 20:08:47 +01:00
|
|
|
index = math.floor(l[i] / 32)
|
|
|
|
buckets[index] |= ctypes.c_int(1 << self.mod(l[i], 32)).value
|
|
|
|
buckets[index] = ctypes.c_int(buckets[index]).value
|
2014-10-28 02:36:28 +01:00
|
|
|
i += 1
|
|
|
|
|
|
|
|
def test(self, v):
|
2014-10-28 20:08:47 +01:00
|
|
|
l = self.locations(str(v))
|
2014-10-28 02:36:28 +01:00
|
|
|
i = 0
|
|
|
|
buckets = self.buckets
|
|
|
|
while i < self.k:
|
|
|
|
b = l[i]
|
2014-10-28 20:08:47 +01:00
|
|
|
if ctypes.c_int(buckets[math.floor(b / 32)] & ctypes.c_int(1 << (self.mod(b, 32))).value).value == 0:
|
2014-10-28 02:36:28 +01:00
|
|
|
return False
|
|
|
|
i += 1
|
|
|
|
return True
|
|
|
|
|
|
|
|
def size(self):
|
|
|
|
"""
|
|
|
|
Estimated cardinality
|
|
|
|
"""
|
|
|
|
bits = 0
|
|
|
|
buckets = self.buckets
|
|
|
|
for i in range(0, len(buckets)):
|
|
|
|
bits += self.popcnt(buckets[i])
|
|
|
|
return -self.m * math.log(1 - bits / self.m) / self.k
|
|
|
|
|
|
|
|
def popcnt(self, v):
|
|
|
|
"""
|
|
|
|
http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
|
|
|
|
"""
|
2014-10-28 20:08:47 +01:00
|
|
|
v -= ctypes.c_int(ctypes.c_int(v >> 1).value & ctypes.c_int(0x55555555).value).value
|
|
|
|
v = ctypes.c_int(v & 0x33333333).value + c_types.c_int(ctypes.c_int(v >> 2).value & 0x33333333).value
|
|
|
|
return ctypes.c_int((ctypes.c_int((v + ctypes.c_int(v >> 4).value) & 0xF0F0F0F).value * 0x1010101) >> 24).value
|
|
|
|
|
|
|
|
|
|
|
|
def rshift(self,val, n):
|
|
|
|
"""
|
|
|
|
Implements the >>> JS operator.
|
|
|
|
|
|
|
|
From https://stackoverflow.com/questions/5832982/how-to-get-the-logical-right-binary-shift-in-python
|
|
|
|
"""
|
|
|
|
return (val % 0x100000000) >> n
|
2014-10-28 02:36:28 +01:00
|
|
|
|
|
|
|
def fnv_1a(self, v):
|
|
|
|
"""
|
|
|
|
Fowler/Noll/Vo hashing.
|
2014-10-28 20:08:47 +01:00
|
|
|
|
|
|
|
Uses a lot of ctypes.c_int because int in JS are represented as 64 bits
|
|
|
|
floats. This representation is used for every arithmetical operations
|
|
|
|
but not for bitwise operations. In this case they are treated as 32 bits
|
|
|
|
integers !
|
2014-10-28 02:36:28 +01:00
|
|
|
"""
|
|
|
|
n = len(v)
|
2014-10-28 02:54:43 +01:00
|
|
|
a = 2166136261
|
2014-10-28 02:36:28 +01:00
|
|
|
i = 0
|
|
|
|
while i < n:
|
2014-10-28 02:54:43 +01:00
|
|
|
c = ord(v[i])
|
2014-10-28 20:08:47 +01:00
|
|
|
d = ctypes.c_int(c & 0xff000000).value
|
2014-10-28 02:36:28 +01:00
|
|
|
if d:
|
2014-10-28 20:08:47 +01:00
|
|
|
a ^= ctypes.c_int(d >> 24).value
|
|
|
|
a = ctypes.c_int(a).valu
|
|
|
|
a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value
|
|
|
|
d = ctypes.c_int(c & 0xff0000).value
|
2014-10-28 02:36:28 +01:00
|
|
|
if d:
|
2014-10-28 20:08:47 +01:00
|
|
|
a ^= ctypes.c_int(d >> 16).value
|
|
|
|
a = ctypes.c_int(a).valu
|
|
|
|
a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value
|
|
|
|
d = ctypes.c_int(c & 0xff00).value
|
2014-10-28 02:36:28 +01:00
|
|
|
if d:
|
2014-10-28 20:08:47 +01:00
|
|
|
a ^= ctypes.c_int(d >> 8).value
|
|
|
|
a = ctypes.c_int(a).value
|
|
|
|
a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value
|
|
|
|
a ^= ctypes.c_int(c & 0xff).value
|
|
|
|
a = ctypes.c_int(a).value
|
|
|
|
a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value
|
2014-10-28 02:36:28 +01:00
|
|
|
i += 1
|
|
|
|
# From http://home.comcast.net/~bretm/hash/6.html
|
2014-10-28 20:08:47 +01:00
|
|
|
a += ctypes.c_int(a << 13).value
|
|
|
|
a ^= ctypes.c_int(self.rshift(a, 7)).value
|
|
|
|
a = ctypes.c_int(a).value
|
|
|
|
a += ctypes.c_int(a << 3).value
|
|
|
|
a ^= ctypes.c_int(self.rshift(a, 17)).value
|
|
|
|
a = ctypes.c_int(a).value
|
|
|
|
a += ctypes.c_int(a << 5).value
|
|
|
|
return ctypes.c_int(a & 0xffffffff).value
|
2014-10-28 02:36:28 +01:00
|
|
|
|
|
|
|
def fnv_1a_b(self, a):
|
|
|
|
"""
|
|
|
|
One additional iteration of FNV, given a hash.
|
2014-10-28 20:08:47 +01:00
|
|
|
|
|
|
|
ctypes used, as explained above.
|
2014-10-28 02:36:28 +01:00
|
|
|
"""
|
2014-10-28 20:08:47 +01:00
|
|
|
a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value
|
|
|
|
a += ctypes.c_int(a << 13).value
|
|
|
|
a ^= ctypes.c_int(self.rshift(a, 7)).value
|
|
|
|
a = ctypes.c_int(a).value
|
|
|
|
a += ctypes.c_int(a << 3).value
|
|
|
|
a ^= ctypes.c_int(self.rshift(a, 17)).value
|
|
|
|
a = ctypes.c_int(a).value
|
|
|
|
a += ctypes.c_int(a << 5).value
|
|
|
|
return ctypes.c_int(a & 0xffffffff).value
|