diff --git a/data/filters b/data/filters index 22b4dac..cac22af 100644 Binary files a/data/filters and b/data/filters differ diff --git a/index_generation/bloom.py b/index_generation/bloom.py index 5655c6d..982f76f 100644 --- a/index_generation/bloom.py +++ b/index_generation/bloom.py @@ -19,6 +19,7 @@ This script has been written by Phyks and is in the public domain (or whatever is closer to public domain in your country). """ +import ctypes import math try: @@ -55,7 +56,7 @@ class BloomFilter(): self.m = m self.k = k - kbytes = 1 << math.ceil(math.log(math.ceil(math.log(m, 2) / 8), 2)) + kbytes = ctypes.c_int(1 << math.ceil(math.log(math.ceil(math.log(m, 2) / 8), 2))).value self.buckets = np.zeros(n, dtype=np.int32) if kbytes == 1: loc_type = np.uint8 @@ -65,34 +66,44 @@ class BloomFilter(): loc_type = np.int32 self._locations = np.zeros(k, dtype=loc_type) + def mod(self, a, b): + """ + Tweak the % operator so that it behaves like in C and in JS. + """ + if a > 0: + return a % b + else: + return - (abs(a) % b) + def locations(self, v): r = self._locations a = self.fnv_1a(v) b = self.fnv_1a_b(a) - print(b) i = 0 - x = a % self.m + x = self.mod(a, self.m) while i < self.k: r[i] = (x + self.m) if x < 0 else x - x = (x + b) % self.m + x = self.mod(x + b, self.m) i += 1 return r def add(self, v): - l = self.locations(v + "") + l = self.locations(str(v)) i = 0 buckets = self.buckets while i < self.k: - buckets[math.floor(l[i] / 32)] |= 1 << int(l[i] % 32) + index = math.floor(l[i] / 32) + buckets[index] |= ctypes.c_int(1 << self.mod(l[i], 32)).value + buckets[index] = ctypes.c_int(buckets[index]).value i += 1 def test(self, v): - l = self.locations(v + "") + l = self.locations(str(v)) i = 0 buckets = self.buckets while i < self.k: b = l[i] - if buckets[math.floor(b / 32)] & (1 << int(b % 32)) == 0: + if ctypes.c_int(buckets[math.floor(b / 32)] & ctypes.c_int(1 << (self.mod(b, 32))).value).value == 0: return False i += 1 return True @@ -111,51 +122,74 @@ class 
BloomFilter(): """ http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel """ - v -= (v >> 1) & 0x55555555 - v = (v & 0x33333333) + ((v >> 2) & 0x33333333) - return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24 + v -= ctypes.c_int(ctypes.c_int(v >> 1).value & ctypes.c_int(0x55555555).value).value + v = ctypes.c_int(v & 0x33333333).value + ctypes.c_int(ctypes.c_int(v >> 2).value & 0x33333333).value + return ctypes.c_int((ctypes.c_int((v + ctypes.c_int(v >> 4).value) & 0xF0F0F0F).value * 0x1010101) >> 24).value + + + def rshift(self,val, n): + """ + Implements the >>> JS operator. + + From https://stackoverflow.com/questions/5832982/how-to-get-the-logical-right-binary-shift-in-python + """ + return (val % 0x100000000) >> n def fnv_1a(self, v): """ Fowler/Noll/Vo hashing. + + Uses a lot of ctypes.c_int because numbers in JS are represented as 64-bit + floats. This representation is used for all arithmetic operations + but not for bitwise operations. In that case they are treated as 32-bit + integers! 
""" n = len(v) a = 2166136261 i = 0 while i < n: c = ord(v[i]) - d = c & 0xff000000 + d = ctypes.c_int(c & 0xff000000).value if d: - a ^= d >> 24 - a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) - d = c & 0xff0000 + a ^= ctypes.c_int(d >> 24).value + a = ctypes.c_int(a).value + a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value + d = ctypes.c_int(c & 0xff0000).value if d: - a ^= d >> 16 - a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) - d = c & 0xff00 + a ^= ctypes.c_int(d >> 16).value + a = ctypes.c_int(a).value + a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value + d = ctypes.c_int(c & 0xff00).value if d: - a ^= d >> 8 - a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) - a ^= c & 0xff - a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) + a ^= ctypes.c_int(d >> 8).value + a = ctypes.c_int(a).value + a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value + a ^= ctypes.c_int(c & 0xff).value + a = ctypes.c_int(a).value + a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value i += 1 # From http://home.comcast.net/~bretm/hash/6.html - a += a << 13 - a ^= a >> 7 - a += a << 3 - a ^= a >> 17 - a += a << 5 - return a & 0xffffffff + a += ctypes.c_int(a << 13).value + a ^= ctypes.c_int(self.rshift(a, 7)).value + a = ctypes.c_int(a).value + a += ctypes.c_int(a << 3).value + a ^= ctypes.c_int(self.rshift(a, 17)).value + a = ctypes.c_int(a).value + a += ctypes.c_int(a << 5).value + return ctypes.c_int(a & 0xffffffff).value def fnv_1a_b(self, a): """ One additional iteration of FNV, given a hash. + + ctypes used, as explained above. 
""" - a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) - a += a << 13 - a ^= a >> 7 - a += a << 3 - a ^= a >> 17 - a += a << 5 - print(a) - return a & 0xffffffff + a += ctypes.c_int(a << 1).value + ctypes.c_int(a << 4).value + ctypes.c_int(a << 7).value + ctypes.c_int(a << 8).value + ctypes.c_int(a << 24).value + a += ctypes.c_int(a << 13).value + a ^= ctypes.c_int(self.rshift(a, 7)).value + a = ctypes.c_int(a).value + a += ctypes.c_int(a << 3).value + a ^= ctypes.c_int(self.rshift(a, 17)).value + a = ctypes.c_int(a).value + a += ctypes.c_int(a << 5).value + return ctypes.c_int(a & 0xffffffff).value diff --git a/index_generation/generate_index.py b/index_generation/generate_index.py index d399244..ce3e143 100755 --- a/index_generation/generate_index.py +++ b/index_generation/generate_index.py @@ -63,8 +63,6 @@ if __name__ == "__main__": tmp_filter.add(word) filters.append(tmp_filter.buckets) - print(tmp_filter.buckets) - sys.exit() # First Int32 is length filters_to_write = struct.pack("Bloom.JS demo
This page runs the bloom.js
library unit tests. Look at your console output for assertion
errors and verbose debugging output.
+