Commit before debugging bloom.py
This commit is contained in:
parent
5b4cc421df
commit
496e2f823f
BIN
data/filters
Normal file
BIN
data/filters
Normal file
Binary file not shown.
1
data/pages.json
Normal file
1
data/pages.json
Normal file
@ -0,0 +1 @@
|
||||
{"index": ["../samples/cryptdevice_multi.html", "../samples/highmon_weechat.html"]}
|
BIN
data/words
Normal file
BIN
data/words
Normal file
Binary file not shown.
17
index.html
17
index.html
@ -3,17 +3,22 @@
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>BloomJS demo</title>
|
||||
<style type="text/css">
|
||||
#error {
|
||||
font-weight: bold;
|
||||
color: red;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Bloom.JS demo</h1>
|
||||
<p id="loading"></p>
|
||||
<form id="search_form">
|
||||
<p>
|
||||
<input type="text" id="search" name="search" value="Search for articles..."/>
|
||||
</p>
|
||||
</form>
|
||||
<p id="error"></p>
|
||||
<div id="main">
|
||||
<p>Loading…</p>
|
||||
</div>
|
||||
<div id="results"></div>
|
||||
<script type="text/javascript" src="js/bloom.js"></script>
|
||||
<script type="text/javascript" src="js/stemmer.js"></script>
|
||||
<script type="text/javascript" src="js/app.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
|
162
index_generation/bloom.py
Normal file
162
index_generation/bloom.py
Normal file
@ -0,0 +1,162 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
This is a translation of the bloom.js script (originally from
|
||||
https://github.com/jasondavies/bloomfilter.js) in Python.
|
||||
|
||||
Due to its status of translation of the previously mentionned JS code, you
|
||||
should refer to this one for any particular doc that should be missing in this
|
||||
implementation.
|
||||
|
||||
Needs the bitarray python module to work.
|
||||
|
||||
Note : Depending on your use case, the pybloom module available on Pypi may
|
||||
better suits your needs. I reimplemented the above mentionned JS script in
|
||||
Python mostly because I had to for this script, as the pybloom module uses
|
||||
advanced hashing techniques, difficult to implement in JS.
|
||||
|
||||
This script has been written by Phyks and is in the public domain (or whatever
|
||||
is closer to public domain in your country).
|
||||
"""
|
||||
|
||||
import math
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
raise ImportError('This script requires numpy')
|
||||
|
||||
|
||||
class BloomFilter():
    """Pure-Python translation of bloomfilter.js (jasondavies).

    Bit storage uses a numpy int32 array (``self.buckets``) so the raw
    buffer can be serialized and read back as an Int32Array on the JS side.
    """

    def __init__(self, capacity, error_rate=0.1):
        """
        Implements a space-efficient probabilistic data structure.

        capacity
            This is the capacity of the BloomFilter. So to speak, it should be
            able to store at least *capacity* elements
        error_rate
            the error rate of the filter returning false positives. This
            determines the filters capacity. Inserting more than capacity
            elements greatly increases the chance of false positive.

        Raises ValueError on an error_rate outside (0, 1) or a non-int,
        non-positive capacity.
        """
        if not (0 < error_rate < 1):
            raise ValueError("Error_Rate must be between 0 and 1.")
        if not capacity > 0 or type(capacity) != int:
            raise ValueError("Capacity must be > 0")

        # Same calculation as in the js file, see it for reference purpose.
        # Determines the number of hash slices (k) and total bits (m, rounded
        # up to a multiple of 32) from the capacity and error_rate.
        k = math.ceil(- math.log(error_rate, 2))
        m = math.ceil(capacity * abs(math.log(error_rate)) /
                      (k * (math.log(2) ** 2))) * k
        n = math.ceil(m / 32)
        m = n * 32
        self.m = m
        self.k = k

        # Pick the smallest unsigned dtype able to hold a bit index < m.
        kbytes = 1 << math.ceil(math.log(math.ceil(math.log(m, 2) / 8), 2))
        self.buckets = np.zeros(n, dtype=np.int32)
        if kbytes == 1:
            loc_type = np.uint8
        elif kbytes == 2:
            loc_type = np.uint16
        else:
            loc_type = np.int32
        self._locations = np.zeros(k, dtype=loc_type)

    def locations(self, v):
        """Return the k bit positions for value *v*.

        Note: returns a shared scratch array, overwritten on every call.
        """
        r = self._locations
        a = self.fnv_1a(v)
        b = self.fnv_1a_b(a)
        x = a % self.m
        for i in range(self.k):
            # The x < 0 branch is a leftover from the JS original, where %
            # can yield negatives; Python's % is always non-negative here.
            r[i] = (x + self.m) if x < 0 else x
            x = (x + b) % self.m
        return r

    def add(self, v):
        """Insert value *v* (string) into the filter."""
        l = self.locations(v + "")
        buckets = self.buckets
        for i in range(self.k):
            shift = int(l[i] % 32)
            # Use the signed 32-bit representation of the mask: a plain
            # ``1 << 31`` does not fit in an int32 bucket and makes numpy
            # raise an OverflowError.
            mask = -0x80000000 if shift == 31 else 1 << shift
            buckets[int(l[i]) // 32] |= mask

    def test(self, v):
        """Return True if *v* is probably in the set, False if surely not."""
        l = self.locations(v + "")
        buckets = self.buckets
        for i in range(self.k):
            b = int(l[i])
            # int() lifts the bucket out of int32 so the (1 << 31) mask
            # test cannot overflow numpy's fixed-width arithmetic.
            if int(buckets[b // 32]) & (1 << (b % 32)) == 0:
                return False
        return True

    def size(self):
        """
        Estimated cardinality
        """
        bits = 0
        for bucket in self.buckets:
            bits += self.popcnt(bucket)
        return -self.m * math.log(1 - bits / self.m) / self.k

    def popcnt(self, v):
        """
        Count of set bits in the low 32 bits of *v*.

        http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
        """
        # Work on the unsigned 32-bit pattern: the bithack assumes 32-bit
        # wrap-around semantics, which Python ints do not provide natively,
        # so mask before and after the multiply step.
        v = int(v) & 0xffffffff
        v -= (v >> 1) & 0x55555555
        v = (v & 0x33333333) + ((v >> 2) & 0x33333333)
        return (((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) & 0xffffffff) >> 24

    def fnv_1a(self, v):
        """
        Fowler/Noll/Vo hashing.

        NOTE(review): unlike the JS original, intermediate values are not
        truncated to 32 bits at every step (only on return), so hashes may
        diverge from bloom.js for some inputs — confirm against the JS side.
        """
        a = 2166136261
        for ch in v:
            c = ord(ch)
            d = c & 0xff000000
            if d:
                a ^= d >> 24
                a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
            d = c & 0xff0000
            if d:
                a ^= d >> 16
                a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
            d = c & 0xff00
            if d:
                a ^= d >> 8
                a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
            a ^= c & 0xff
            a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
        # From http://home.comcast.net/~bretm/hash/6.html
        a += a << 13
        a ^= a >> 7
        a += a << 3
        a ^= a >> 17
        a += a << 5
        return a & 0xffffffff

    def fnv_1a_b(self, a):
        """
        One additional iteration of FNV, given a hash.
        """
        a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
        a += a << 13
        a ^= a >> 7
        a += a << 3
        a ^= a >> 17
        a += a << 5
        return a & 0xffffffff
|
@ -1,92 +1,83 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
Inspired by
|
||||
http://www.stavros.io/posts/bloom-filter-search-engine/?print
|
||||
|
||||
|
||||
You have to install the numpy python module for bloom to work.
|
||||
"""
|
||||
|
||||
import bloom
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from lxml import html
|
||||
import re
|
||||
import stemmer
|
||||
import json
|
||||
from bitarray import bitarray
|
||||
from pybloom import BloomFilter
|
||||
import struct
|
||||
import sys
|
||||
|
||||
from lxml import html
|
||||
|
||||
|
||||
# List all files in path directory
|
||||
def list_directory(path):
|
||||
fichier = []
|
||||
"""Recursively list all files in a given directory."""
|
||||
files_list = []
|
||||
for root, dirs, files in os.walk(path):
|
||||
for i in files:
|
||||
fichier.append(os.path.join(root, i))
|
||||
return fichier
|
||||
files_list.append(os.path.join(root, i))
|
||||
return files_list
|
||||
|
||||
|
||||
def remove_common_words(words):
|
||||
"""Removes all words that are less than 3 characters long."""
|
||||
returned = [word for word in words if len(word) > 3]
|
||||
return returned
|
||||
|
||||
def padding_16(x):
|
||||
if x < 256:
|
||||
return bytes([0,x])
|
||||
else:
|
||||
return bytes([int(x/256), x%256])
|
||||
|
||||
# =============================================================================
|
||||
samples = list_directory("../samples/")
|
||||
filters = {}
|
||||
p = stemmer.PorterStemmer()
|
||||
write_little = bitarray(endian="little")
|
||||
write_big = bitarray(endian="big")
|
||||
if __name__ == "__main__":
|
||||
error_rate = 0.1
|
||||
samples = list_directory("../samples/")
|
||||
filters = []
|
||||
p = stemmer.PorterStemmer()
|
||||
|
||||
write_little.frombytes(padding_16(len(samples)))
|
||||
write_big.frombytes(padding_16(len(samples)))
|
||||
for sample in samples:
|
||||
with open(sample, 'r') as sample_fh:
|
||||
content = sample_fh.read()
|
||||
|
||||
if len(samples) > 65535:
|
||||
sys.exit("[ERROR] Too many articles to index. You will have to change the "
|
||||
"way data is stored in the binary file to handle such amount of "
|
||||
"files.")
|
||||
# Get text from HTML content
|
||||
words = html.fromstring(content).text_content().replace("\n", "")
|
||||
words = re.findall(r"[\w]+", words)
|
||||
# Remove all punctuation etc., convert words to lower and delete
|
||||
# duplicates
|
||||
words = list(set([word.lower() for word in words]))
|
||||
|
||||
for sample in samples:
|
||||
with open(sample, 'r') as sample_fh:
|
||||
content = sample_fh.read()
|
||||
# Remove common words
|
||||
words = remove_common_words(words)
|
||||
# Stemming to reduce the number of words
|
||||
words = list(set([p.stem(word, 0, len(word)-1) for word in words]))
|
||||
|
||||
# Get text from HTML content
|
||||
words = html.fromstring(content).text_content().replace("\n", "")
|
||||
words = re.findall(r"[\w]+", words)
|
||||
# Remove all punctuation etc., convert words to lower and delete duplicates
|
||||
words = list(set([word.lower() for word in words]))
|
||||
tmp_filter = bloom.BloomFilter(capacity=len(words),
|
||||
error_rate=error_rate)
|
||||
words = json.loads('["solut", "devic", "cryptkey2", "contain", "chang", "thi", "conf", "ckeyfiin", "support", "load", "here", "laptop", "file", "exampl", "paramet", "cryptsetup", "when", "proce", "line", "cryptkei", "wiki", "edit", "present", "describ", "ckei", "grub", "first", "warn", "mkinitcpio", "with", "updat", "mount", "manual", "ckeybyif", "least", "need", "multipl", "also", "found", "arch", "then", "us", "encrypt", "packag", "that", "over", "someth", "hook", "doesn", "avail", "avoid", "work", "which", "provid", "order", "initcpio", "anoth", "setup", "mean", "necessari", "default", "disk", "best", "linemkdir", "luk", "system", "unlock", "occurr", "requir", "command", "abl", "cryptdevice2", "encrypt2", "instal", "multi", "last", "extend", "obsolet", "boot", "your", "achiev", "second", "mkdir", "stuff", "final", "displai", "concern", "ad", "cryptdevic", "more", "copi"]')
|
||||
for word in words:
|
||||
tmp_filter.add(word)
|
||||
|
||||
# Remove common words
|
||||
words = remove_common_words(words)
|
||||
# Stemming to reduce the number of words
|
||||
words = [p.stem(word, 0, len(word)-1) for word in words]
|
||||
filters.append(tmp_filter.buckets)
|
||||
print(tmp_filter.buckets)
|
||||
sys.exit()
|
||||
|
||||
filters[sample] = BloomFilter(capacity=len(words), error_rate=0.1)
|
||||
for word in words:
|
||||
filters[sample].add(word)
|
||||
# First Int32 is length
|
||||
filters_to_write = struct.pack("<i", len(filters))
|
||||
# Then comes the length of each filter
|
||||
for i in filters:
|
||||
filters_to_write += struct.pack("<i", len(i))
|
||||
# Finally comes the filters themselves
|
||||
for i in filters:
|
||||
filters_to_write += struct.pack("<%di" % len(i), *i)
|
||||
|
||||
if filters[sample].bitarray.length() > 65535:
|
||||
sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". You "
|
||||
"will have to change the way data is stored in the binary "
|
||||
"file to handle such amount of text.")
|
||||
# Write everything
|
||||
with open("../data/filters", "wb") as index_fh:
|
||||
index_fh.write(filters_to_write)
|
||||
|
||||
tmp = bitarray(endian="little")
|
||||
tmp.frombytes(padding_16(filters[sample].bitarray.length()))
|
||||
write_little.extend(tmp)
|
||||
write_little.extend(filters[sample].bitarray)
|
||||
write_little.extend([0 for i in range(filters[sample].bitarray.length() %
|
||||
8)])
|
||||
tmp = bitarray(endian="big")
|
||||
tmp.frombytes(padding_16(filters[sample].bitarray.length()))
|
||||
write_big.extend(tmp)
|
||||
write_big.extend(filters[sample].bitarray)
|
||||
write_big.extend([0 for i in range(filters[sample].bitarray.length() %
|
||||
8)])
|
||||
|
||||
with open('../data/search_index_little', 'wb') as index_fh:
|
||||
print(write_little)
|
||||
write_little.tofile(index_fh)
|
||||
with open('../data/search_index_big', 'wb') as index_fh:
|
||||
print(write_big)
|
||||
write_big.tofile(index_fh)
|
||||
|
||||
with open('../data/pages_index.json', 'w') as pages_fh:
|
||||
pages_fh.write(json.dumps(samples))
|
||||
with open("../data/pages.json", "w") as pages_fh:
|
||||
pages_fh.write(json.dumps({"index": samples}))
|
||||
|
@ -1,277 +0,0 @@
|
||||
import math
|
||||
import hashlib
|
||||
from struct import unpack, pack, calcsize
|
||||
|
||||
try:
|
||||
import bitarray
|
||||
except ImportError:
|
||||
raise ImportError('pybloom requires bitarray >= 0.3.4')
|
||||
|
||||
__version__ = '2.0'
|
||||
__author__ = "Jay Baird <jay.baird@me.com>, Bob Ippolito <bob@redivi.com>,\
|
||||
Marius Eriksen <marius@monkey.org>,\
|
||||
Alex Brasetvik <alex@brasetvik.com>"
|
||||
|
||||
def make_hashfuncs(num_slices, num_bits):
    """Build a function mapping a key to num_slices positions in [0, num_bits).

    The positions are carved out of one or more salted digests of the key,
    using the narrowest unsigned struct code able to index num_bits.
    """
    if num_bits >= (1 << 31):
        fmt_code, chunk_size = 'Q', 8
    elif num_bits >= (1 << 15):
        fmt_code, chunk_size = 'I', 4
    else:
        fmt_code, chunk_size = 'H', 2
    # Pick the smallest digest wide enough to feed every slice.
    total_hash_bits = 8 * num_slices * chunk_size
    for threshold, candidate in ((384, hashlib.sha512), (256, hashlib.sha384),
                                 (160, hashlib.sha256), (128, hashlib.sha1)):
        if total_hash_bits > threshold:
            hashfn = candidate
            break
    else:
        hashfn = hashlib.md5
    fmt = fmt_code * (hashfn().digest_size // chunk_size)
    quotient, remainder = divmod(num_slices, len(fmt))
    salt_count = quotient + (1 if remainder else 0)
    # Pre-hash the salts once; each call only copies and extends them.
    salts = [hashfn(hashfn(pack('I', idx)).digest()) for idx in range(salt_count)]

    def hasher(key):
        text = str(key)
        positions = []
        for salt in salts:
            digest = salt.copy()
            digest.update(text.encode('utf-8'))
            positions.extend(word % num_bits for word in unpack(fmt, digest.digest()))
        # The last salt may produce more positions than needed: trim.
        del positions[num_slices:]
        return positions

    return hasher
|
||||
|
||||
|
||||
class BloomFilter(object):
    """Fixed-capacity bloom filter backed by a ``bitarray``.

    The bit space is split into ``num_slices`` slices of ``bits_per_slice``
    bits; each hash function addresses exactly one slice.
    NOTE(review): relies on the module-level ``make_hashfuncs`` helper and
    the third-party ``bitarray`` module imported elsewhere in this file.
    """

    # Serialization header: error_rate (double), then num_slices,
    # bits_per_slice, capacity, count as four unsigned 64-bit ints.
    FILE_FMT = '<dQQQQ'

    def __init__(self, capacity, error_rate=0.001):
        """Create a filter able to hold *capacity* keys with at most
        *error_rate* false-positive probability.

        Raises ValueError for error_rate outside (0, 1) or capacity <= 0.
        """
        if not (0 < error_rate < 1):
            raise ValueError("Error_Rate must be between 0 and 1.")
        if not capacity > 0:
            raise ValueError("Capacity must be > 0")
        # Standard bloom sizing: k = log2(1/e) slices, each
        # ceil(capacity * |ln e| / (k * ln(2)^2)) bits.
        num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
        bits_per_slice = int(math.ceil(
            (capacity * abs(math.log(error_rate))) /
            (num_slices * (math.log(2) ** 2))))
        self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
        self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
        self.bitarray.setall(False)

    def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
        """Initialize shared attributes (also reused by ``fromfile``)."""
        self.error_rate = error_rate
        self.num_slices = num_slices
        self.bits_per_slice = bits_per_slice
        self.capacity = capacity
        self.num_bits = num_slices * bits_per_slice
        self.count = count
        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)

    def __contains__(self, key):
        """True if *key* is probably in the filter (false positives possible).

        A list argument is treated as precomputed hash positions, not a key.
        """
        bits_per_slice = self.bits_per_slice
        bitarray = self.bitarray
        if not isinstance(key, list):
            hashes = self.make_hashes(key)
        else:
            hashes = key
        offset = 0
        # One bit per slice; all of them must be set.
        for k in hashes:
            if not bitarray[offset + k]:
                return False
            offset += bits_per_slice
        return True

    def __len__(self):
        """Return the number of keys stored by this bloom filter."""
        return self.count

    def add(self, key, skip_check=False):
        """Add *key*; returns True if it was (probably) already present.

        Raises IndexError once count exceeds capacity.
        """
        bitarray = self.bitarray
        bits_per_slice = self.bits_per_slice
        hashes = self.make_hashes(key)
        # Reuse the computed hashes for the membership test (list fast path
        # in __contains__).
        if not skip_check and hashes in self:
            return True
        if self.count > self.capacity:
            raise IndexError("BloomFilter is at capacity")
        offset = 0
        for k in hashes:
            self.bitarray[offset + k] = True
            offset += bits_per_slice
        self.count += 1
        return False

    def copy(self):
        """Return a copy of this bloom filter.
        """
        new_filter = BloomFilter(self.capacity, self.error_rate)
        new_filter.bitarray = self.bitarray.copy()
        return new_filter

    def union(self, other):
        """ Calculates the union of the two underlying bitarrays and returns
        a new bloom filter object."""
        # Only meaningful when both filters share the exact same layout.
        if self.capacity != other.capacity or \
            self.error_rate != other.error_rate:
            raise ValueError("Unioning filters requires both filters to have \
both the same capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray | other.bitarray
        return new_bloom

    def __or__(self, other):
        return self.union(other)

    def intersection(self, other):
        """ Calculates the intersection of the two underlying bitarrays and returns
        a new bloom filter object."""
        # Only meaningful when both filters share the exact same layout.
        if self.capacity != other.capacity or \
            self.error_rate != other.error_rate:
            raise ValueError("Intersecting filters requires both filters to \
have equal capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray & other.bitarray
        return new_bloom

    def __and__(self, other):
        return self.intersection(other)

    def tofile(self, f):
        """Write the bloom filter to file object `f'. Underlying bits
        are written as machine values. This is much more space
        efficient than pickling the object."""
        f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
                     self.bits_per_slice, self.capacity, self.count))
        self.bitarray.tofile(f)

    @classmethod
    def fromfile(cls, f, n=-1):
        """Read a bloom filter from file-object `f' serialized with
        ``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
        headerlen = calcsize(cls.FILE_FMT)

        if 0 < n < headerlen:
            raise ValueError('n too small!')

        filter = cls(1)  # Bogus instantiation, we will `_setup'.
        filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
        filter.bitarray = bitarray.bitarray(endian='little')
        if n > 0:
            filter.bitarray.fromfile(f, n - headerlen)
        else:
            filter.bitarray.fromfile(f)
        # bitarray.fromfile pads to a whole byte; accept the padded length too.
        if filter.num_bits != filter.bitarray.length() and \
                (filter.num_bits + (8 - filter.num_bits % 8)
                 != filter.bitarray.length()):
            raise ValueError('Bit length mismatch!')

        return filter

    def __getstate__(self):
        # The make_hashes closure is not picklable; it is rebuilt in
        # __setstate__ from num_slices / bits_per_slice.
        d = self.__dict__.copy()
        del d['make_hashes']
        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
|
||||
|
||||
class ScalableBloomFilter(object):
    """Bloom filter that grows automatically by chaining ``BloomFilter``
    instances of increasing capacity and decreasing error rate.

    NOTE(review): depends on the module-level ``BloomFilter`` class defined
    elsewhere in this file.
    """
    SMALL_SET_GROWTH = 2  # slower, but takes up less memory
    LARGE_SET_GROWTH = 4  # faster, but takes up more memory faster
    # Serialization header: scale (int), ratio (double),
    # initial_capacity (uint64), error_rate (double).
    FILE_FMT = '<idQd'

    def __init__(self, initial_capacity=100, error_rate=0.001,
                 mode=SMALL_SET_GROWTH):
        """Create an empty scalable filter.

        initial_capacity -- capacity of the first underlying filter
        error_rate       -- overall target false-positive rate (> 0)
        mode             -- growth factor: SMALL_SET_GROWTH or LARGE_SET_GROWTH
        """
        if not error_rate or error_rate < 0:
            # Message fixed: the guard rejects rates <= 0; the original text
            # read "less than 0", contradicting the check itself.
            raise ValueError("Error_Rate must be a decimal greater than 0.")
        self._setup(mode, 0.9, initial_capacity, error_rate)
        self.filters = []

    def _setup(self, mode, ratio, initial_capacity, error_rate):
        """Initialize shared attributes (also reused by ``fromfile``)."""
        self.scale = mode
        self.ratio = ratio
        self.initial_capacity = initial_capacity
        self.error_rate = error_rate

    def __contains__(self, key):
        """True if *key* is probably present in any underlying filter."""
        # Newest filters hold the most recently added keys: search them first.
        for f in reversed(self.filters):
            if key in f:
                return True
        return False

    def add(self, key):
        """Add *key*; returns True if it was (probably) already present."""
        if key in self:
            return True
        if not self.filters:
            filter = BloomFilter(
                capacity=self.initial_capacity,
                error_rate=self.error_rate * (1.0 - self.ratio))
            self.filters.append(filter)
        else:
            filter = self.filters[-1]
            if filter.count >= filter.capacity:
                # Current filter is full: chain a larger, stricter one.
                filter = BloomFilter(
                    capacity=filter.capacity * self.scale,
                    error_rate=filter.error_rate * self.ratio)
                self.filters.append(filter)
        filter.add(key, skip_check=True)
        return False

    @property
    def capacity(self):
        """Returns the total capacity for all filters in this SBF"""
        return sum([f.capacity for f in self.filters])

    @property
    def count(self):
        # Alias for len(self): total number of stored elements.
        return len(self)

    def tofile(self, f):
        """Serialize this ScalableBloomFilter into the file-object
        `f'."""
        f.write(pack(self.FILE_FMT, self.scale, self.ratio,
                     self.initial_capacity, self.error_rate))

        # Write #-of-filters
        f.write(pack('<l', len(self.filters)))

        if len(self.filters) > 0:
            # Then each filter directly, with a header describing
            # their lengths.
            headerpos = f.tell()
            headerfmt = '<' + 'Q'*(len(self.filters))
            # Fixed: the placeholder must be bytes -- writing a str to a
            # binary stream raises TypeError on Python 3.
            f.write(b'.' * calcsize(headerfmt))
            filter_sizes = []
            for filter in self.filters:
                begin = f.tell()
                filter.tofile(f)
                filter_sizes.append(f.tell() - begin)

            f.seek(headerpos)
            f.write(pack(headerfmt, *filter_sizes))
            # NOTE(review): the stream is left positioned just after the size
            # header, not at EOF -- callers appending data should seek first.

    @classmethod
    def fromfile(cls, f):
        """Deserialize the ScalableBloomFilter in file object `f'."""
        filter = cls()
        filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
        nfilters, = unpack('<l', f.read(calcsize('<l')))
        if nfilters > 0:
            header_fmt = '<' + 'Q'*nfilters
            bytes = f.read(calcsize(header_fmt))
            filter_lengths = unpack(header_fmt, bytes)
            for fl in filter_lengths:
                filter.filters.append(BloomFilter.fromfile(f, fl))
        else:
            filter.filters = []

        return filter

    def __len__(self):
        """Returns the total number of elements stored in this SBF"""
        return sum([f.count for f in self.filters])
|
121
js/app.js
Normal file
121
js/app.js
Normal file
@ -0,0 +1,121 @@
|
||||
/* Params */
|
||||
var error_rate = 0.1;
|
||||
|
||||
|
||||
/* Vars */
|
||||
var bloom = Array(), index;
|
||||
var ready = false;
|
||||
|
||||
|
||||
/* Functions */
|
||||
/* Called after each asynchronous load completes; once BOTH the bloom
 * filters and the page index are available, swap in the live search form
 * and wire its input handler. Safe to call multiple times. */
function callback() {
    // Bail out while either resource is still loading.
    if (typeof(index) === 'undefined' || bloom.length == 0) {
        return;
    }

    // Sets up the page, that is now ready
    ready = true;
    document.getElementById('main').innerHTML = '<form id="search_form"><p><input type="text" id="search" name="search" placeholder="Search for articles..."/></p></form>';

    // Handle onchange actions
    document.getElementById('search').oninput = function (e) {
        if (!ready) {
            return;
        }

        filter_results(e.target.value);
    }
}
|
||||
|
||||
// Returns true iff every term of the array is present in the bloom filter b.
function terms_in_bloom(terms, b) {
    var remaining = terms.length;
    // Any single miss is enough to reject; an empty array is trivially true.
    while (remaining--) {
        if (!b.test(terms[remaining])) {
            return false;
        }
    }
    return true;
}
|
||||
|
||||
// Filter the index down to the pages whose bloom filter contains every
// stemmed term of the query, and render them into #results.
function filter_results(query) {
    var trimmed = query.trim();
    if (trimmed === "") {
        document.getElementById('results').innerHTML = "";
        // Fixed: the original fell through and searched for [""] here.
        return;
    }
    // Fixed: split the trimmed string (the original split the raw query,
    // so leading/trailing spaces produced empty search terms).
    var search_terms = trimmed.split(" ").map(stemmer);

    var results = Array();
    for (var i = 0; i < index.length; i++) {
        if (terms_in_bloom(search_terms, bloom[i])) {
            results.push(index[i]);
        }
    }

    // Fixed: declared with var (it leaked as an implicit global).
    var results_html;
    if (results.length > 0) {
        results_html = '<ul>';
        for (var i = 0; i < results.length; i++) {
            results_html += '<li>' + results[i] + '</li>';
        }
        results_html += '</ul>';
    }
    else {
        results_html = '<p>Aucun résultat.</p>';
    }
    document.getElementById('results').innerHTML = results_html;
}
|
||||
|
||||
|
||||
/* App */
|
||||
|
||||
// Get the words index (a.k.a. Bloom Filter)
var oReq = new XMLHttpRequest();
oReq.open("GET", "data/filters", true);
oReq.responseType = "arraybuffer";
oReq.onload = function (oEvent) {
    var array_buffer = oReq.response;
    if (array_buffer) {
        var byte_array = new Int32Array(array_buffer);

        // First element is the number of bloom filters in the binary file
        var nb_bloom_filters = byte_array[0];
        // nb_bloom_filters next elements are the lengths of the arrays
        var lengths = Array();
        for (var i = 0; i < nb_bloom_filters; i++) {
            lengths.push(byte_array[1 + i]);
        }
        // Then, builds Bloom filters from consecutive slices of the buffer.
        // Fixed: the offset must ACCUMULATE the lengths; the original
        // overwrote it ("var l = lengths[i]") so every filter after the
        // second was sliced at the wrong position. Debug console.log
        // leftovers removed.
        var offset = 0, tmp_array;
        for (var i = 0; i < nb_bloom_filters; i++) {
            tmp_array = byte_array.subarray(1 + nb_bloom_filters + offset, 1 + nb_bloom_filters + offset + lengths[i]);
            bloom.push(new BloomFilter(tmp_array, error_rate));
            offset += lengths[i];
        }

        callback();
    }
    else {
        document.getElementById('error').innerHTML = 'Unable to load the bloom filters.';
    }
};
oReq.send(null);
|
||||
|
||||
// Get the pages index
|
||||
var req = new XMLHttpRequest();
|
||||
req.open('GET', 'data/pages.json', true);
|
||||
req.onreadystatechange = function () {
|
||||
if (req.readyState == 4) {
|
||||
if (req.status == 200) {
|
||||
var tmp = JSON.parse(req.responseText);
|
||||
index = tmp['index'];
|
||||
|
||||
callback();
|
||||
}
|
||||
else {
|
||||
document.getElementById('error').innerHTML = 'Unable to load the index.';
|
||||
}
|
||||
}
|
||||
};
|
||||
req.send(null);
|
@ -22,23 +22,26 @@
|
||||
function BloomFilter(capacity, error_rate) {
|
||||
// *m* is the number of bits. Note that *m* is rounded up to
|
||||
// the nearest multiple of 32. *k* specifies the number of hashing functions.
|
||||
if (error_rate < 0 || error_rate > 1 || (typeof(capacity) === "number" && capacity < 0)) {
|
||||
return false;
|
||||
}
|
||||
var a, i = -1;
|
||||
// Number of slices, k
|
||||
var k = Math.ceil(- Math.log(error_rate) / Math.log(2));
|
||||
// Total number of bits, m
|
||||
// Size of the UInt32 table, n
|
||||
// Size of the Int32 table, n
|
||||
var m, n;
|
||||
if (typeof capacity !== "number") {
|
||||
a = capacity;
|
||||
// Total number of bits, m
|
||||
m = a.length * 32;
|
||||
// Size of the UInt32 table, n
|
||||
// Size of the Int32 table, n
|
||||
n = a.length;
|
||||
}
|
||||
else {
|
||||
// Total number of bits, m
|
||||
m = Math.ceil(capacity * Math.abs(Math.log(error_rate)) / (k * Math.pow(Math.log(2), 2))) * k;
|
||||
// Size of the UInt32 table, n
|
||||
// Size of the Int32 table, n
|
||||
n = Math.ceil(m / 32);
|
||||
// Round total number of bits to closest multiple of 32
|
||||
m = n * 32;
|
||||
|
186
js/stemmer.js
Normal file
186
js/stemmer.js
Normal file
@ -0,0 +1,186 @@
|
||||
// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
|
||||
// paper, in
|
||||
//
|
||||
// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
||||
// no. 3, pp 130-137,
|
||||
//
|
||||
// see also http://www.tartarus.org/~martin/PorterStemmer
|
||||
|
||||
// Release 1 be 'andargor', Jul 2004
|
||||
// Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
|
||||
|
||||
var stemmer = (function(){
|
||||
var step2list = {
|
||||
"ational" : "ate",
|
||||
"tional" : "tion",
|
||||
"enci" : "ence",
|
||||
"anci" : "ance",
|
||||
"izer" : "ize",
|
||||
"bli" : "ble",
|
||||
"alli" : "al",
|
||||
"entli" : "ent",
|
||||
"eli" : "e",
|
||||
"ousli" : "ous",
|
||||
"ization" : "ize",
|
||||
"ation" : "ate",
|
||||
"ator" : "ate",
|
||||
"alism" : "al",
|
||||
"iveness" : "ive",
|
||||
"fulness" : "ful",
|
||||
"ousness" : "ous",
|
||||
"aliti" : "al",
|
||||
"iviti" : "ive",
|
||||
"biliti" : "ble",
|
||||
"logi" : "log"
|
||||
},
|
||||
|
||||
step3list = {
|
||||
"icate" : "ic",
|
||||
"ative" : "",
|
||||
"alize" : "al",
|
||||
"iciti" : "ic",
|
||||
"ical" : "ic",
|
||||
"ful" : "",
|
||||
"ness" : ""
|
||||
},
|
||||
|
||||
c = "[^aeiou]", // consonant
|
||||
v = "[aeiouy]", // vowel
|
||||
C = c + "[^aeiouy]*", // consonant sequence
|
||||
V = v + "[aeiou]*", // vowel sequence
|
||||
|
||||
mgr0 = "^(" + C + ")?" + V + C, // [C]VC... is m>0
|
||||
meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1
|
||||
mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1
|
||||
s_v = "^(" + C + ")?" + v; // vowel in stem
|
||||
|
||||
return function (w) {
|
||||
var stem,
|
||||
suffix,
|
||||
firstch,
|
||||
re,
|
||||
re2,
|
||||
re3,
|
||||
re4,
|
||||
origword = w;
|
||||
|
||||
if (w.length < 3) { return w; }
|
||||
|
||||
firstch = w.substr(0,1);
|
||||
if (firstch == "y") {
|
||||
w = firstch.toUpperCase() + w.substr(1);
|
||||
}
|
||||
|
||||
// Step 1a
|
||||
re = /^(.+?)(ss|i)es$/;
|
||||
re2 = /^(.+?)([^s])s$/;
|
||||
|
||||
if (re.test(w)) { w = w.replace(re,"$1$2"); }
|
||||
else if (re2.test(w)) { w = w.replace(re2,"$1$2"); }
|
||||
|
||||
// Step 1b
|
||||
re = /^(.+?)eed$/;
|
||||
re2 = /^(.+?)(ed|ing)$/;
|
||||
if (re.test(w)) {
|
||||
var fp = re.exec(w);
|
||||
re = new RegExp(mgr0);
|
||||
if (re.test(fp[1])) {
|
||||
re = /.$/;
|
||||
w = w.replace(re,"");
|
||||
}
|
||||
} else if (re2.test(w)) {
|
||||
var fp = re2.exec(w);
|
||||
stem = fp[1];
|
||||
re2 = new RegExp(s_v);
|
||||
if (re2.test(stem)) {
|
||||
w = stem;
|
||||
re2 = /(at|bl|iz)$/;
|
||||
re3 = new RegExp("([^aeiouylsz])\\1$");
|
||||
re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
|
||||
if (re2.test(w)) { w = w + "e"; }
|
||||
else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); }
|
||||
else if (re4.test(w)) { w = w + "e"; }
|
||||
}
|
||||
}
|
||||
|
||||
// Step 1c
|
||||
re = /^(.+?)y$/;
|
||||
if (re.test(w)) {
|
||||
var fp = re.exec(w);
|
||||
stem = fp[1];
|
||||
re = new RegExp(s_v);
|
||||
if (re.test(stem)) { w = stem + "i"; }
|
||||
}
|
||||
|
||||
// Step 2
|
||||
re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
|
||||
if (re.test(w)) {
|
||||
var fp = re.exec(w);
|
||||
stem = fp[1];
|
||||
suffix = fp[2];
|
||||
re = new RegExp(mgr0);
|
||||
if (re.test(stem)) {
|
||||
w = stem + step2list[suffix];
|
||||
}
|
||||
}
|
||||
|
||||
// Step 3
|
||||
re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
|
||||
if (re.test(w)) {
|
||||
var fp = re.exec(w);
|
||||
stem = fp[1];
|
||||
suffix = fp[2];
|
||||
re = new RegExp(mgr0);
|
||||
if (re.test(stem)) {
|
||||
w = stem + step3list[suffix];
|
||||
}
|
||||
}
|
||||
|
||||
// Step 4
|
||||
re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
|
||||
re2 = /^(.+?)(s|t)(ion)$/;
|
||||
if (re.test(w)) {
|
||||
var fp = re.exec(w);
|
||||
stem = fp[1];
|
||||
re = new RegExp(mgr1);
|
||||
if (re.test(stem)) {
|
||||
w = stem;
|
||||
}
|
||||
} else if (re2.test(w)) {
|
||||
var fp = re2.exec(w);
|
||||
stem = fp[1] + fp[2];
|
||||
re2 = new RegExp(mgr1);
|
||||
if (re2.test(stem)) {
|
||||
w = stem;
|
||||
}
|
||||
}
|
||||
|
||||
// Step 5
|
||||
re = /^(.+?)e$/;
|
||||
if (re.test(w)) {
|
||||
var fp = re.exec(w);
|
||||
stem = fp[1];
|
||||
re = new RegExp(mgr1);
|
||||
re2 = new RegExp(meq1);
|
||||
re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
|
||||
if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {
|
||||
w = stem;
|
||||
}
|
||||
}
|
||||
|
||||
re = /ll$/;
|
||||
re2 = new RegExp(mgr1);
|
||||
if (re.test(w) && re2.test(w)) {
|
||||
re = /.$/;
|
||||
w = w.replace(re,"");
|
||||
}
|
||||
|
||||
// and turn initial Y back to y
|
||||
|
||||
if (firstch == "y") {
|
||||
w = firstch.toLowerCase() + w.substr(1);
|
||||
}
|
||||
|
||||
return w;
|
||||
}
|
||||
})();
|
@ -1,7 +1,6 @@
|
||||
/* These are some basic unit-tests for the bloom.js module */
|
||||
|
||||
var bloom = new BloomFilter(4, 0.1);
|
||||
console.log(bloom);
|
||||
|
||||
// Add some elements to the filter.
|
||||
bloom.add("foo");
|
||||
@ -22,6 +21,7 @@ var array = [].slice.call(bloom.buckets),
|
||||
json = JSON.stringify(array);
|
||||
|
||||
console.log(array);
|
||||
console.log(json);
|
||||
|
||||
// Deserialisation. Note that the any array-like object is supported, but
|
||||
// this will be used directly, so you may wish to use a typed array for
|
||||
|
25
js/test2.js
Normal file
25
js/test2.js
Normal file
@ -0,0 +1,25 @@
|
||||
/* These are some basic unit-tests for the bloom.js module */
|
||||
|
||||
var words = JSON.parse('["solut", "devic", "cryptkey2", "contain", "chang", "thi", "conf", "ckeyfiin", "support", "load", "here", "laptop", "file", "exampl", "paramet", "cryptsetup", "when", "proce", "line", "cryptkei", "wiki", "edit", "present", "describ", "ckei", "grub", "first", "warn", "mkinitcpio", "with", "updat", "mount", "manual", "ckeybyif", "least", "need", "multipl", "also", "found", "arch", "then", "us", "encrypt", "packag", "that", "over", "someth", "hook", "doesn", "avail", "avoid", "work", "which", "provid", "order", "initcpio", "anoth", "setup", "mean", "necessari", "default", "disk", "best", "linemkdir", "luk", "system", "unlock", "occurr", "requir", "command", "abl", "cryptdevice2", "encrypt2", "instal", "multi", "last", "extend", "obsolet", "boot", "your", "achiev", "second", "mkdir", "stuff", "final", "displai", "concern", "ad", "cryptdevic", "more", "copi"]');
|
||||
|
||||
var bloom2 = new BloomFilter(words.length, 0.1);
|
||||
console.log(bloom2);
|
||||
|
||||
// Add some elements to the filter.
|
||||
for (var i = 0; i < words.length; i++) {
|
||||
bloom2.add(words[i]);
|
||||
}
|
||||
|
||||
// Test if an item is in our filter.
|
||||
// Returns true if an item is probably in the set,
|
||||
// or false if an item is definitely not in the set.
|
||||
for (var i = 0; i < words.length; i++) {
|
||||
console.log(words[i] + " : " + bloom2.test(words[i]));
|
||||
}
|
||||
|
||||
// Serialisation. Note that bloom.buckets may be a typed array,
|
||||
// so we convert to a normal array first.
|
||||
var array = [].slice.call(bloom2.buckets),
|
||||
json = JSON.stringify(array);
|
||||
|
||||
console.log(bloom2.buckets);
|
13
test.html
Normal file
13
test.html
Normal file
@ -0,0 +1,13 @@
|
||||
<!doctype html>
|
||||
<html lang="fr">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>BloomJS demo</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Bloom.JS demo</h1>
|
||||
<p>This page runs the <code>bloom.js</code> library unit-tests. Look at your console output for <code>assert</code> error and verbose debugging.
|
||||
<script type="text/javascript" src="js/bloom.js"></script>
|
||||
<script type="text/javascript" src="js/test2.js"></script>
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in New Issue
Block a user