Commit before debugging bloom.py
This commit is contained in:
parent
5b4cc421df
commit
496e2f823f
BIN
data/filters
Normal file
BIN
data/filters
Normal file
Binary file not shown.
1
data/pages.json
Normal file
1
data/pages.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"index": ["../samples/cryptdevice_multi.html", "../samples/highmon_weechat.html"]}
|
BIN
data/words
Normal file
BIN
data/words
Normal file
Binary file not shown.
17
index.html
17
index.html
@ -3,17 +3,22 @@
|
|||||||
<head>
|
<head>
|
||||||
<meta charset="utf-8">
|
<meta charset="utf-8">
|
||||||
<title>BloomJS demo</title>
|
<title>BloomJS demo</title>
|
||||||
|
<style type="text/css">
|
||||||
|
#error {
|
||||||
|
font-weight: bold;
|
||||||
|
color: red;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<h1>Bloom.JS demo</h1>
|
<h1>Bloom.JS demo</h1>
|
||||||
<p id="loading"></p>
|
<p id="error"></p>
|
||||||
<form id="search_form">
|
<div id="main">
|
||||||
<p>
|
<p>Loading…</p>
|
||||||
<input type="text" id="search" name="search" value="Search for articles..."/>
|
</div>
|
||||||
</p>
|
|
||||||
</form>
|
|
||||||
<div id="results"></div>
|
<div id="results"></div>
|
||||||
<script type="text/javascript" src="js/bloom.js"></script>
|
<script type="text/javascript" src="js/bloom.js"></script>
|
||||||
|
<script type="text/javascript" src="js/stemmer.js"></script>
|
||||||
<script type="text/javascript" src="js/app.js"></script>
|
<script type="text/javascript" src="js/app.js"></script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
162
index_generation/bloom.py
Normal file
162
index_generation/bloom.py
Normal file
@ -0,0 +1,162 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
This is a translation of the bloom.js script (originally from
|
||||||
|
https://github.com/jasondavies/bloomfilter.js) in Python.
|
||||||
|
|
||||||
|
Due to its status of translation of the previously mentioned JS code, you
|
||||||
|
should refer to this one for any particular doc that should be missing in this
|
||||||
|
implementation.
|
||||||
|
|
||||||
|
Needs the numpy python module to work.
|
||||||
|
|
||||||
|
Note : Depending on your use case, the pybloom module available on Pypi may
|
||||||
|
better suit your needs. I reimplemented the above mentioned JS script in
|
||||||
|
Python mostly because I had to for this script, as the pybloom module uses
|
||||||
|
advanced hashing techniques, difficult to implement in JS.
|
||||||
|
|
||||||
|
This script has been written by Phyks and is in the public domain (or whatever
|
||||||
|
is closer to public domain in your country).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import math
|
||||||
|
|
||||||
|
try:
|
||||||
|
import numpy as np
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError('This script requires numpy')
|
||||||
|
|
||||||
|
|
||||||
|
class BloomFilter():
    """
    Bloom filter translated from the bloom.js script.

    The hashing code reproduces the signed 32-bit integer behaviour of
    JavaScript's bitwise operators, so that the bit positions computed here
    are the ones the JS implementation computes for the same string.

    Attributes set by the constructor:
        m         total number of bits (always a multiple of 32)
        k         number of bit positions set/tested per element
        buckets   numpy int32 array of m / 32 words holding the bits
    """

    def __init__(self, capacity, error_rate=0.1):
        """
        Implements a space-efficient probabilistic data structure.

        capacity
            This is the capacity of the BloomFilter. So to speak, it should be
            able to store at least *capacity* elements. Must be a strictly
            positive int.
        error_rate
            the error rate of the filter returning false positives. This
            determines the filters capacity. Inserting more than capacity
            elements greatly increases the chance of false positive.

        Raises ValueError if error_rate is not strictly between 0 and 1, or
        if capacity is not a strictly positive int.
        """
        if not (0 < error_rate < 1):
            raise ValueError("Error_Rate must be between 0 and 1.")
        # Check the type first: comparing a non-number with 0 would raise a
        # TypeError before the intended ValueError could be reported.
        if (isinstance(capacity, bool) or not isinstance(capacity, int)
                or capacity <= 0):
            raise ValueError("Capacity must be > 0")

        # Same calculation as in the js file, see it for reference purpose
        # Basically determines the number of bits and slices from the capacity
        # and error_rate.
        k = math.ceil(- math.log(error_rate, 2))
        m = math.ceil(capacity * abs(math.log(error_rate)) /
                      (k * (math.log(2) ** 2))) * k
        # Round the number of bits up to a whole number of 32-bit words.
        n = math.ceil(m / 32)
        m = n * 32
        self.m = m
        self.k = k

        self.buckets = np.zeros(n, dtype=np.int32)
        # Scratch buffer reused by locations(); pick the narrowest unsigned
        # type able to hold any bit index in [0, m).
        if m <= 1 << 8:
            loc_type = np.uint8
        elif m <= 1 << 16:
            loc_type = np.uint16
        else:
            loc_type = np.uint32
        self._locations = np.zeros(k, dtype=loc_type)

    @staticmethod
    def _to_int32(x):
        """Wrap an int to the signed 32-bit range, like JavaScript bit ops."""
        x &= 0xffffffff
        return x - 0x100000000 if x & 0x80000000 else x

    @classmethod
    def _fnv_round(cls, a):
        """One FNV multiplication step, wrapped to signed 32 bits."""
        # a + (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
        # is a multiplication by the 32-bit FNV prime 16777619.
        return cls._to_int32(a * 16777619)

    @classmethod
    def _avalanche(cls, a):
        """Final bit mixing shared by fnv_1a and fnv_1a_b."""
        # From http://home.comcast.net/~bretm/hash/6.html
        # Each addition is wrapped so that the following right shift is an
        # arithmetic shift on a signed 32-bit value.
        a = cls._to_int32(a + (a << 13))
        a ^= a >> 7
        a = cls._to_int32(a + (a << 3))
        a ^= a >> 17
        return cls._to_int32(a + (a << 5))

    def locations(self, v):
        """
        Return the k bit indexes (each in [0, m)) associated with string v.

        The returned array is the internal scratch buffer: it is overwritten
        by the next call to locations(), add() or test().
        """
        r = self._locations
        a = self.fnv_1a(v)
        b = self.fnv_1a_b(a)
        # a and b may be negative; Python's % with a positive modulus already
        # yields a value in [0, m), so no sign fix-up is needed.
        x = a % self.m
        for i in range(self.k):
            r[i] = x
            x = (x + b) % self.m
        return r

    def add(self, v):
        """Add v (converted to a string) to the filter."""
        buckets = self.buckets
        for loc in self.locations(str(v)):
            loc = int(loc)
            word = int(buckets[loc >> 5]) | (1 << (loc & 31))
            # Setting bit 31 makes the word negative once stored as int32.
            buckets[loc >> 5] = self._to_int32(word)

    def test(self, v):
        """
        Return False if v is definitely not in the filter, True if it
        probably is.
        """
        buckets = self.buckets
        for loc in self.locations(str(v)):
            loc = int(loc)
            if (int(buckets[loc >> 5]) & (1 << (loc & 31))) == 0:
                return False
        return True

    def size(self):
        """
        Estimated cardinality

        Returns math.inf when every bit of the filter is set.
        """
        bits = sum(self.popcnt(word) for word in self.buckets)
        if bits >= self.m:
            return math.inf
        return -self.m * math.log(1 - bits / self.m) / self.k

    def popcnt(self, v):
        """
        Number of bits set in the low 32 bits of v (negative values are
        read as their 32-bit two's complement pattern).
        """
        return bin(int(v) & 0xffffffff).count("1")

    def fnv_1a(self, v):
        """
        Fowler/Noll/Vo hashing.

        Returns a signed 32-bit integer. The string is hashed one UTF-16
        code unit at a time, high byte first when it is not zero.
        """
        a = self._to_int32(2166136261)
        raw = v.encode("utf-16-le", "surrogatepass")
        for low, high in zip(raw[0::2], raw[1::2]):
            if high:
                a = self._fnv_round(a ^ high)
            a = self._fnv_round(a ^ low)
        return self._avalanche(a)

    def fnv_1a_b(self, a):
        """
        One additional iteration of FNV, given a hash.

        Returns a signed 32-bit integer.
        """
        return self._avalanche(self._fnv_round(self._to_int32(a)))
|
@ -1,48 +1,44 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
Inspired by
|
||||||
|
http://www.stavros.io/posts/bloom-filter-search-engine/?print
|
||||||
|
|
||||||
|
|
||||||
|
You have to install the numpy python module for bloom to work.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import bloom
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
from lxml import html
|
|
||||||
import re
|
import re
|
||||||
import stemmer
|
import stemmer
|
||||||
import json
|
import struct
|
||||||
from bitarray import bitarray
|
import sys
|
||||||
from pybloom import BloomFilter
|
|
||||||
|
from lxml import html
|
||||||
|
|
||||||
|
|
||||||
# List all files in path directory
|
|
||||||
def list_directory(path):
|
def list_directory(path):
|
||||||
fichier = []
|
"""Recursively list all files in a given directory."""
|
||||||
|
files_list = []
|
||||||
for root, dirs, files in os.walk(path):
|
for root, dirs, files in os.walk(path):
|
||||||
for i in files:
|
for i in files:
|
||||||
fichier.append(os.path.join(root, i))
|
files_list.append(os.path.join(root, i))
|
||||||
return fichier
|
return files_list
|
||||||
|
|
||||||
|
|
||||||
def remove_common_words(words):
|
def remove_common_words(words):
|
||||||
|
"""Removes all words that are 3 characters long or shorter."""
|
||||||
returned = [word for word in words if len(word) > 3]
|
returned = [word for word in words if len(word) > 3]
|
||||||
return returned
|
return returned
|
||||||
|
|
||||||
def padding_16(x):
|
|
||||||
if x < 256:
|
|
||||||
return bytes([0,x])
|
|
||||||
else:
|
|
||||||
return bytes([int(x/256), x%256])
|
|
||||||
|
|
||||||
# =============================================================================
|
if __name__ == "__main__":
|
||||||
|
error_rate = 0.1
|
||||||
samples = list_directory("../samples/")
|
samples = list_directory("../samples/")
|
||||||
filters = {}
|
filters = []
|
||||||
p = stemmer.PorterStemmer()
|
p = stemmer.PorterStemmer()
|
||||||
write_little = bitarray(endian="little")
|
|
||||||
write_big = bitarray(endian="big")
|
|
||||||
|
|
||||||
write_little.frombytes(padding_16(len(samples)))
|
|
||||||
write_big.frombytes(padding_16(len(samples)))
|
|
||||||
|
|
||||||
if len(samples) > 65535:
|
|
||||||
sys.exit("[ERROR] Too many articles to index. You will have to change the "
|
|
||||||
"way data is stored in the binary file to handle such amount of "
|
|
||||||
"files.")
|
|
||||||
|
|
||||||
for sample in samples:
|
for sample in samples:
|
||||||
with open(sample, 'r') as sample_fh:
|
with open(sample, 'r') as sample_fh:
|
||||||
@ -51,42 +47,37 @@ for sample in samples:
|
|||||||
# Get text from HTML content
|
# Get text from HTML content
|
||||||
words = html.fromstring(content).text_content().replace("\n", "")
|
words = html.fromstring(content).text_content().replace("\n", "")
|
||||||
words = re.findall(r"[\w]+", words)
|
words = re.findall(r"[\w]+", words)
|
||||||
# Remove all punctuation etc., convert words to lower and delete duplicates
|
# Remove all punctuation etc., convert words to lower and delete
|
||||||
|
# duplicates
|
||||||
words = list(set([word.lower() for word in words]))
|
words = list(set([word.lower() for word in words]))
|
||||||
|
|
||||||
# Remove common words
|
# Remove common words
|
||||||
words = remove_common_words(words)
|
words = remove_common_words(words)
|
||||||
# Stemming to reduce the number of words
|
# Stemming to reduce the number of words
|
||||||
words = [p.stem(word, 0, len(word)-1) for word in words]
|
words = list(set([p.stem(word, 0, len(word)-1) for word in words]))
|
||||||
|
|
||||||
filters[sample] = BloomFilter(capacity=len(words), error_rate=0.1)
|
tmp_filter = bloom.BloomFilter(capacity=len(words),
|
||||||
|
error_rate=error_rate)
|
||||||
|
words = json.loads('["solut", "devic", "cryptkey2", "contain", "chang", "thi", "conf", "ckeyfiin", "support", "load", "here", "laptop", "file", "exampl", "paramet", "cryptsetup", "when", "proce", "line", "cryptkei", "wiki", "edit", "present", "describ", "ckei", "grub", "first", "warn", "mkinitcpio", "with", "updat", "mount", "manual", "ckeybyif", "least", "need", "multipl", "also", "found", "arch", "then", "us", "encrypt", "packag", "that", "over", "someth", "hook", "doesn", "avail", "avoid", "work", "which", "provid", "order", "initcpio", "anoth", "setup", "mean", "necessari", "default", "disk", "best", "linemkdir", "luk", "system", "unlock", "occurr", "requir", "command", "abl", "cryptdevice2", "encrypt2", "instal", "multi", "last", "extend", "obsolet", "boot", "your", "achiev", "second", "mkdir", "stuff", "final", "displai", "concern", "ad", "cryptdevic", "more", "copi"]')
|
||||||
for word in words:
|
for word in words:
|
||||||
filters[sample].add(word)
|
tmp_filter.add(word)
|
||||||
|
|
||||||
if filters[sample].bitarray.length() > 65535:
|
filters.append(tmp_filter.buckets)
|
||||||
sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". You "
|
print(tmp_filter.buckets)
|
||||||
"will have to change the way data is stored in the binary "
|
sys.exit()
|
||||||
"file to handle such amount of text.")
|
|
||||||
|
|
||||||
tmp = bitarray(endian="little")
|
# First Int32 is length
|
||||||
tmp.frombytes(padding_16(filters[sample].bitarray.length()))
|
filters_to_write = struct.pack("<i", len(filters))
|
||||||
write_little.extend(tmp)
|
# Then comes the length of each filter
|
||||||
write_little.extend(filters[sample].bitarray)
|
for i in filters:
|
||||||
write_little.extend([0 for i in range(filters[sample].bitarray.length() %
|
filters_to_write += struct.pack("<i", len(i))
|
||||||
8)])
|
# Finally comes the filters themselves
|
||||||
tmp = bitarray(endian="big")
|
for i in filters:
|
||||||
tmp.frombytes(padding_16(filters[sample].bitarray.length()))
|
filters_to_write += struct.pack("<%di" % len(i), *i)
|
||||||
write_big.extend(tmp)
|
|
||||||
write_big.extend(filters[sample].bitarray)
|
|
||||||
write_big.extend([0 for i in range(filters[sample].bitarray.length() %
|
|
||||||
8)])
|
|
||||||
|
|
||||||
with open('../data/search_index_little', 'wb') as index_fh:
|
# Write everything
|
||||||
print(write_little)
|
with open("../data/filters", "wb") as index_fh:
|
||||||
write_little.tofile(index_fh)
|
index_fh.write(filters_to_write)
|
||||||
with open('../data/search_index_big', 'wb') as index_fh:
|
|
||||||
print(write_big)
|
|
||||||
write_big.tofile(index_fh)
|
|
||||||
|
|
||||||
with open('../data/pages_index.json', 'w') as pages_fh:
|
with open("../data/pages.json", "w") as pages_fh:
|
||||||
pages_fh.write(json.dumps(samples))
|
pages_fh.write(json.dumps({"index": samples}))
|
||||||
|
@ -1,277 +0,0 @@
|
|||||||
import math
|
|
||||||
import hashlib
|
|
||||||
from struct import unpack, pack, calcsize
|
|
||||||
|
|
||||||
try:
|
|
||||||
import bitarray
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError('pybloom requires bitarray >= 0.3.4')
|
|
||||||
|
|
||||||
__version__ = '2.0'
|
|
||||||
__author__ = "Jay Baird <jay.baird@me.com>, Bob Ippolito <bob@redivi.com>,\
|
|
||||||
Marius Eriksen <marius@monkey.org>,\
|
|
||||||
Alex Brasetvik <alex@brasetvik.com>"
|
|
||||||
|
|
||||||
def make_hashfuncs(num_slices, num_bits):
|
|
||||||
if num_bits >= (1 << 31):
|
|
||||||
fmt_code, chunk_size = 'Q', 8
|
|
||||||
elif num_bits >= (1 << 15):
|
|
||||||
fmt_code, chunk_size = 'I', 4
|
|
||||||
else:
|
|
||||||
fmt_code, chunk_size = 'H', 2
|
|
||||||
total_hash_bits = 8 * num_slices * chunk_size
|
|
||||||
if total_hash_bits > 384:
|
|
||||||
hashfn = hashlib.sha512
|
|
||||||
elif total_hash_bits > 256:
|
|
||||||
hashfn = hashlib.sha384
|
|
||||||
elif total_hash_bits > 160:
|
|
||||||
hashfn = hashlib.sha256
|
|
||||||
elif total_hash_bits > 128:
|
|
||||||
hashfn = hashlib.sha1
|
|
||||||
else:
|
|
||||||
hashfn = hashlib.md5
|
|
||||||
fmt = fmt_code * (hashfn().digest_size // chunk_size)
|
|
||||||
num_salts, extra = divmod(num_slices, len(fmt))
|
|
||||||
if extra:
|
|
||||||
num_salts += 1
|
|
||||||
salts = [hashfn(hashfn(pack('I', i)).digest()) for i in range(num_salts)]
|
|
||||||
def _make_hashfuncs(key):
|
|
||||||
key = str(key)
|
|
||||||
rval = []
|
|
||||||
for salt in salts:
|
|
||||||
h = salt.copy()
|
|
||||||
h.update(key.encode('utf-8'))
|
|
||||||
rval.extend(uint % num_bits for uint in unpack(fmt, h.digest()))
|
|
||||||
del rval[num_slices:]
|
|
||||||
return rval
|
|
||||||
return _make_hashfuncs
|
|
||||||
|
|
||||||
|
|
||||||
class BloomFilter(object):
|
|
||||||
FILE_FMT = '<dQQQQ'
|
|
||||||
|
|
||||||
def __init__(self, capacity, error_rate=0.001):
|
|
||||||
if not (0 < error_rate < 1):
|
|
||||||
raise ValueError("Error_Rate must be between 0 and 1.")
|
|
||||||
if not capacity > 0:
|
|
||||||
raise ValueError("Capacity must be > 0")
|
|
||||||
num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
|
|
||||||
bits_per_slice = int(math.ceil(
|
|
||||||
(capacity * abs(math.log(error_rate))) /
|
|
||||||
(num_slices * (math.log(2) ** 2))))
|
|
||||||
self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
|
|
||||||
self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
|
|
||||||
self.bitarray.setall(False)
|
|
||||||
|
|
||||||
def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
|
|
||||||
self.error_rate = error_rate
|
|
||||||
self.num_slices = num_slices
|
|
||||||
self.bits_per_slice = bits_per_slice
|
|
||||||
self.capacity = capacity
|
|
||||||
self.num_bits = num_slices * bits_per_slice
|
|
||||||
self.count = count
|
|
||||||
self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
|
|
||||||
|
|
||||||
def __contains__(self, key):
|
|
||||||
bits_per_slice = self.bits_per_slice
|
|
||||||
bitarray = self.bitarray
|
|
||||||
if not isinstance(key, list):
|
|
||||||
hashes = self.make_hashes(key)
|
|
||||||
else:
|
|
||||||
hashes = key
|
|
||||||
offset = 0
|
|
||||||
for k in hashes:
|
|
||||||
if not bitarray[offset + k]:
|
|
||||||
return False
|
|
||||||
offset += bits_per_slice
|
|
||||||
return True
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
"""Return the number of keys stored by this bloom filter."""
|
|
||||||
return self.count
|
|
||||||
|
|
||||||
def add(self, key, skip_check=False):
|
|
||||||
bitarray = self.bitarray
|
|
||||||
bits_per_slice = self.bits_per_slice
|
|
||||||
hashes = self.make_hashes(key)
|
|
||||||
if not skip_check and hashes in self:
|
|
||||||
return True
|
|
||||||
if self.count > self.capacity:
|
|
||||||
raise IndexError("BloomFilter is at capacity")
|
|
||||||
offset = 0
|
|
||||||
for k in hashes:
|
|
||||||
self.bitarray[offset + k] = True
|
|
||||||
offset += bits_per_slice
|
|
||||||
self.count += 1
|
|
||||||
return False
|
|
||||||
|
|
||||||
def copy(self):
|
|
||||||
"""Return a copy of this bloom filter.
|
|
||||||
"""
|
|
||||||
new_filter = BloomFilter(self.capacity, self.error_rate)
|
|
||||||
new_filter.bitarray = self.bitarray.copy()
|
|
||||||
return new_filter
|
|
||||||
|
|
||||||
def union(self, other):
|
|
||||||
""" Calculates the union of the two underlying bitarrays and returns
|
|
||||||
a new bloom filter object."""
|
|
||||||
if self.capacity != other.capacity or \
|
|
||||||
self.error_rate != other.error_rate:
|
|
||||||
raise ValueError("Unioning filters requires both filters to have \
|
|
||||||
both the same capacity and error rate")
|
|
||||||
new_bloom = self.copy()
|
|
||||||
new_bloom.bitarray = new_bloom.bitarray | other.bitarray
|
|
||||||
return new_bloom
|
|
||||||
|
|
||||||
def __or__(self, other):
|
|
||||||
return self.union(other)
|
|
||||||
|
|
||||||
def intersection(self, other):
|
|
||||||
""" Calculates the intersection of the two underlying bitarrays and returns
|
|
||||||
a new bloom filter object."""
|
|
||||||
if self.capacity != other.capacity or \
|
|
||||||
self.error_rate != other.error_rate:
|
|
||||||
raise ValueError("Intersecting filters requires both filters to \
|
|
||||||
have equal capacity and error rate")
|
|
||||||
new_bloom = self.copy()
|
|
||||||
new_bloom.bitarray = new_bloom.bitarray & other.bitarray
|
|
||||||
return new_bloom
|
|
||||||
|
|
||||||
def __and__(self, other):
|
|
||||||
return self.intersection(other)
|
|
||||||
|
|
||||||
def tofile(self, f):
|
|
||||||
"""Write the bloom filter to file object `f'. Underlying bits
|
|
||||||
are written as machine values. This is much more space
|
|
||||||
efficient than pickling the object."""
|
|
||||||
f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
|
|
||||||
self.bits_per_slice, self.capacity, self.count))
|
|
||||||
self.bitarray.tofile(f)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def fromfile(cls, f, n=-1):
|
|
||||||
"""Read a bloom filter from file-object `f' serialized with
|
|
||||||
``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
|
|
||||||
headerlen = calcsize(cls.FILE_FMT)
|
|
||||||
|
|
||||||
if 0 < n < headerlen:
|
|
||||||
raise ValueError('n too small!')
|
|
||||||
|
|
||||||
filter = cls(1) # Bogus instantiation, we will `_setup'.
|
|
||||||
filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
|
|
||||||
filter.bitarray = bitarray.bitarray(endian='little')
|
|
||||||
if n > 0:
|
|
||||||
filter.bitarray.fromfile(f, n - headerlen)
|
|
||||||
else:
|
|
||||||
filter.bitarray.fromfile(f)
|
|
||||||
if filter.num_bits != filter.bitarray.length() and \
|
|
||||||
(filter.num_bits + (8 - filter.num_bits % 8)
|
|
||||||
!= filter.bitarray.length()):
|
|
||||||
raise ValueError('Bit length mismatch!')
|
|
||||||
|
|
||||||
return filter
|
|
||||||
|
|
||||||
def __getstate__(self):
|
|
||||||
d = self.__dict__.copy()
|
|
||||||
del d['make_hashes']
|
|
||||||
return d
|
|
||||||
|
|
||||||
def __setstate__(self, d):
|
|
||||||
self.__dict__.update(d)
|
|
||||||
self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
|
|
||||||
|
|
||||||
class ScalableBloomFilter(object):
|
|
||||||
SMALL_SET_GROWTH = 2 # slower, but takes up less memory
|
|
||||||
LARGE_SET_GROWTH = 4 # faster, but takes up more memory faster
|
|
||||||
FILE_FMT = '<idQd'
|
|
||||||
|
|
||||||
def __init__(self, initial_capacity=100, error_rate=0.001,
|
|
||||||
mode=SMALL_SET_GROWTH):
|
|
||||||
if not error_rate or error_rate < 0:
|
|
||||||
raise ValueError("Error_Rate must be a decimal less than 0.")
|
|
||||||
self._setup(mode, 0.9, initial_capacity, error_rate)
|
|
||||||
self.filters = []
|
|
||||||
|
|
||||||
def _setup(self, mode, ratio, initial_capacity, error_rate):
|
|
||||||
self.scale = mode
|
|
||||||
self.ratio = ratio
|
|
||||||
self.initial_capacity = initial_capacity
|
|
||||||
self.error_rate = error_rate
|
|
||||||
|
|
||||||
def __contains__(self, key):
|
|
||||||
for f in reversed(self.filters):
|
|
||||||
if key in f:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def add(self, key):
|
|
||||||
if key in self:
|
|
||||||
return True
|
|
||||||
if not self.filters:
|
|
||||||
filter = BloomFilter(
|
|
||||||
capacity=self.initial_capacity,
|
|
||||||
error_rate=self.error_rate * (1.0 - self.ratio))
|
|
||||||
self.filters.append(filter)
|
|
||||||
else:
|
|
||||||
filter = self.filters[-1]
|
|
||||||
if filter.count >= filter.capacity:
|
|
||||||
filter = BloomFilter(
|
|
||||||
capacity=filter.capacity * self.scale,
|
|
||||||
error_rate=filter.error_rate * self.ratio)
|
|
||||||
self.filters.append(filter)
|
|
||||||
filter.add(key, skip_check=True)
|
|
||||||
return False
|
|
||||||
|
|
||||||
@property
|
|
||||||
def capacity(self):
|
|
||||||
"""Returns the total capacity for all filters in this SBF"""
|
|
||||||
return sum([f.capacity for f in self.filters])
|
|
||||||
|
|
||||||
@property
|
|
||||||
def count(self):
|
|
||||||
return len(self)
|
|
||||||
|
|
||||||
def tofile(self, f):
|
|
||||||
"""Serialize this ScalableBloomFilter into the file-object
|
|
||||||
`f'."""
|
|
||||||
f.write(pack(self.FILE_FMT, self.scale, self.ratio,
|
|
||||||
self.initial_capacity, self.error_rate))
|
|
||||||
|
|
||||||
# Write #-of-filters
|
|
||||||
f.write(pack('<l', len(self.filters)))
|
|
||||||
|
|
||||||
if len(self.filters) > 0:
|
|
||||||
# Then each filter directly, with a header describing
|
|
||||||
# their lengths.
|
|
||||||
headerpos = f.tell()
|
|
||||||
headerfmt = '<' + 'Q'*(len(self.filters))
|
|
||||||
f.write('.' * calcsize(headerfmt))
|
|
||||||
filter_sizes = []
|
|
||||||
for filter in self.filters:
|
|
||||||
begin = f.tell()
|
|
||||||
filter.tofile(f)
|
|
||||||
filter_sizes.append(f.tell() - begin)
|
|
||||||
|
|
||||||
f.seek(headerpos)
|
|
||||||
f.write(pack(headerfmt, *filter_sizes))
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def fromfile(cls, f):
|
|
||||||
"""Deserialize the ScalableBloomFilter in file object `f'."""
|
|
||||||
filter = cls()
|
|
||||||
filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
|
|
||||||
nfilters, = unpack('<l', f.read(calcsize('<l')))
|
|
||||||
if nfilters > 0:
|
|
||||||
header_fmt = '<' + 'Q'*nfilters
|
|
||||||
bytes = f.read(calcsize(header_fmt))
|
|
||||||
filter_lengths = unpack(header_fmt, bytes)
|
|
||||||
for fl in filter_lengths:
|
|
||||||
filter.filters.append(BloomFilter.fromfile(f, fl))
|
|
||||||
else:
|
|
||||||
filter.filters = []
|
|
||||||
|
|
||||||
return filter
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
"""Returns the total number of elements stored in this SBF"""
|
|
||||||
return sum([f.count for f in self.filters])
|
|
121
js/app.js
Normal file
121
js/app.js
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
/* Params */
|
||||||
|
var error_rate = 0.1;
|
||||||
|
|
||||||
|
|
||||||
|
/* Vars */
|
||||||
|
var bloom = Array(), index;
|
||||||
|
var ready = false;
|
||||||
|
|
||||||
|
|
||||||
|
/* Functions */
|
||||||
|
// Called after each of the two downloads completes. Does nothing until both
// the pages index and at least one Bloom filter are available, then replaces
// the placeholder in #main with the search form and wires its input handler.
function callback() {
    var index_loaded = typeof(index) !== 'undefined';
    var filters_loaded = bloom.length != 0;
    if (!(index_loaded && filters_loaded)) {
        return;
    }

    // Sets up the page, that is now ready
    ready = true;
    var main_box = document.getElementById('main');
    main_box.innerHTML = '<form id="search_form"><p><input type="text" id="search" name="search" placeholder="Search for articles..."/></p></form>';

    // Re-run the search on every change of the input field
    var search_box = document.getElementById('search');
    search_box.oninput = function (event) {
        if (ready) {
            filter_results(event.target.value);
        }
    };
}
|
||||||
|
|
||||||
|
// Returns true iff every term of the array passes b.test(), b being a
// Bloom filter. An empty array of terms yields true.
function terms_in_bloom(terms, b) {
    return terms.every(function (term) {
        return b.test(term);
    });
}
|
||||||
|
|
||||||
|
// Filter the results to match the query.
//
// query is the raw text typed by the user. It is trimmed, lowercased (the
// index is built from lowercased words), split on whitespace and stemmed;
// every page whose Bloom filter matches all the resulting terms is listed in
// the #results element. An empty query simply clears the results.
function filter_results(query) {
    var results_box = document.getElementById('results');
    var trimmed_query = query.trim();
    if (trimmed_query === "") {
        results_box.innerHTML = "";
        // Nothing to search for: do not fall through to the lookup below
        return;
    }

    // Split on runs of whitespace so that repeated spaces do not produce
    // empty terms, which would never match any filter
    var search_terms = trimmed_query.toLowerCase().split(/\s+/).map(function (term) {
        return stemmer(term);
    });

    // bloom[i] is the filter of the page index[i]
    var results = Array();
    for (var i = 0; i < index.length; i++) {
        if (terms_in_bloom(search_terms, bloom[i])) {
            results.push(index[i]);
        }
    }

    // Declared locally so that it does not leak as a global variable
    var results_html;
    if (results.length > 0) {
        results_html = '<ul>';
        for (var j = 0; j < results.length; j++) {
            results_html += '<li>' + results[j] + '</li>';
        }
        results_html += '</ul>';
    }
    else {
        results_html = '<p>Aucun résultat.</p>';
    }
    results_box.innerHTML = results_html;
}
|
||||||
|
|
||||||
|
|
||||||
|
/* App */
|
||||||
|
|
||||||
|
// Get the words index (a.k.a. Bloom Filter)
//
// The file is read as a sequence of 32-bit integers:
//   [number of filters][length of each filter][buckets of each filter...]
// NOTE(review): Int32Array uses the byte order of the machine, while the
// generation script packs little-endian values - confirm if big-endian
// clients must be supported.
var oReq = new XMLHttpRequest();
oReq.open("GET", "data/filters", true);
oReq.responseType = "arraybuffer";
oReq.onload = function (oEvent) {
    var array_buffer = oReq.response;
    if (array_buffer) {
        var byte_array = new Int32Array(array_buffer);

        // First element is the number of bloom filters in the binary file
        var nb_bloom_filters = byte_array[0];
        // nb_bloom_filters next elements are the lengths of the arrays
        var lengths = Array();
        for (var i = 0; i < nb_bloom_filters; i++) {
            lengths.push(byte_array[1 + i]);
        }
        // Then, builds Bloom filters. offset is the position of the first
        // bucket of the current filter; it must advance by the length of
        // every filter already read, not only the previous one.
        var offset = 1 + nb_bloom_filters, tmp_array;
        for (var j = 0; j < nb_bloom_filters; j++) {
            tmp_array = byte_array.subarray(offset, offset + lengths[j]);
            offset += lengths[j];
            bloom.push(new BloomFilter(tmp_array, error_rate));
        }

        callback();
    }
    else {
        document.getElementById('error').innerHTML = 'Unable to load the bloom filters.';
    }
};
// Network-level failures never reach onload
oReq.onerror = function () {
    document.getElementById('error').innerHTML = 'Unable to load the bloom filters.';
};
oReq.send(null);
|
||||||
|
|
||||||
|
// Get the pages index: data/pages.json holds an object whose "index" entry
// is stored in the global `index` once the download succeeds.
var req = new XMLHttpRequest();
req.open('GET', 'data/pages.json', true);
req.onreadystatechange = function () {
    // Wait until the request is complete
    if (req.readyState != 4) {
        return;
    }
    if (req.status != 200) {
        document.getElementById('error').innerHTML = 'Unable to load the index.';
        return;
    }
    var pages = JSON.parse(req.responseText);
    index = pages['index'];

    callback();
};
req.send(null);
|
@ -22,23 +22,26 @@
|
|||||||
function BloomFilter(capacity, error_rate) {
|
function BloomFilter(capacity, error_rate) {
|
||||||
// *m* is the number of bits. Note that *m* is rounded up to
|
// *m* is the number of bits. Note that *m* is rounded up to
|
||||||
// the nearest multiple of 32. *k* specifies the number of hashing functions.
|
// the nearest multiple of 32. *k* specifies the number of hashing functions.
|
||||||
|
if (error_rate < 0 || error_rate > 1 || (typeof(capacity) === "number" && capacity < 0)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
var a, i = -1;
|
var a, i = -1;
|
||||||
// Number of slices, k
|
// Number of slices, k
|
||||||
var k = Math.ceil(- Math.log(error_rate) / Math.log(2));
|
var k = Math.ceil(- Math.log(error_rate) / Math.log(2));
|
||||||
// Total number of bits, m
|
// Total number of bits, m
|
||||||
// Size of the UInt32 table, n
|
// Size of the Int32 table, n
|
||||||
var m, n;
|
var m, n;
|
||||||
if (typeof capacity !== "number") {
|
if (typeof capacity !== "number") {
|
||||||
a = capacity;
|
a = capacity;
|
||||||
// Total number of bits, m
|
// Total number of bits, m
|
||||||
m = a.length * 32;
|
m = a.length * 32;
|
||||||
// Size of the UInt32 table, n
|
// Size of the Int32 table, n
|
||||||
n = a.length;
|
n = a.length;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// Total number of bits, m
|
// Total number of bits, m
|
||||||
m = Math.ceil(capacity * Math.abs(Math.log(error_rate)) / (k * Math.pow(Math.log(2), 2))) * k;
|
m = Math.ceil(capacity * Math.abs(Math.log(error_rate)) / (k * Math.pow(Math.log(2), 2))) * k;
|
||||||
// Size of the UInt32 table, n
|
// Size of the Int32 table, n
|
||||||
n = Math.ceil(m / 32);
|
n = Math.ceil(m / 32);
|
||||||
// Round total number of bits to closest multiple of 32
|
// Round total number of bits to closest multiple of 32
|
||||||
m = n * 32;
|
m = n * 32;
|
||||||
|
186
js/stemmer.js
Normal file
186
js/stemmer.js
Normal file
@ -0,0 +1,186 @@
|
|||||||
|
// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
|
||||||
|
// paper, in
|
||||||
|
//
|
||||||
|
// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
||||||
|
// no. 3, pp 130-137,
|
||||||
|
//
|
||||||
|
// see also http://www.tartarus.org/~martin/PorterStemmer
|
||||||
|
|
||||||
|
// Release 1 be 'andargor', Jul 2004
|
||||||
|
// Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
|
||||||
|
|
||||||
|
/**
 * Porter stemmer.
 *
 * `stemmer(w)` takes a single word (a string) and returns its stem, applying
 * steps 1a to 5 of the suffix-stripping rules in order.  Words shorter than
 * three characters are returned unchanged.
 *
 * NOTE(review): every pattern below only lists lowercase letters, so callers
 * presumably pass already-lowercased words -- confirm against the call sites.
 *
 * All regular expressions are built once, when this closure is created,
 * instead of on every call.
 */
var stemmer = (function () {
    // Step 2 suffix -> replacement.
    var step2list = {
            "ational" : "ate",
            "tional" : "tion",
            "enci" : "ence",
            "anci" : "ance",
            "izer" : "ize",
            "bli" : "ble",
            "alli" : "al",
            "entli" : "ent",
            "eli" : "e",
            "ousli" : "ous",
            "ization" : "ize",
            "ation" : "ate",
            "ator" : "ate",
            "alism" : "al",
            "iveness" : "ive",
            "fulness" : "ful",
            "ousness" : "ous",
            "aliti" : "al",
            "iviti" : "ive",
            "biliti" : "ble",
            "logi" : "log"
        },

        // Step 3 suffix -> replacement.
        step3list = {
            "icate" : "ic",
            "ative" : "",
            "alize" : "al",
            "iciti" : "ic",
            "ical" : "ic",
            "ful" : "",
            "ness" : ""
        },

        c = "[^aeiou]",          // consonant
        v = "[aeiouy]",          // vowel
        C = c + "[^aeiouy]*",    // consonant sequence
        V = v + "[aeiou]*",      // vowel sequence

        // Measure (m) tests on a stem.
        reMgr0 = new RegExp("^(" + C + ")?" + V + C),                      // [C]VC... is m>0
        reMeq1 = new RegExp("^(" + C + ")?" + V + C + "(" + V + ")?$"),    // [C]VC[V] is m=1
        reMgr1 = new RegExp("^(" + C + ")?" + V + C + V + C),              // [C]VCVC... is m>1
        reHasVowel = new RegExp("^(" + C + ")?" + v),                      // vowel in stem

        // Stem ends with a doubled consonant other than l, s or z.
        reDoubleCons = new RegExp("([^aeiouylsz])\\1$"),
        // Whole stem is consonant-sequence + vowel + consonant (not w, x, y).
        reCvc = new RegExp("^" + C + v + "[^aeiouwxy]$"),

        reLastChar = /.$/,

        // Suffix patterns, one per rule.
        re1aEs = /^(.+?)(ss|i)es$/,
        re1aS = /^(.+?)([^s])s$/,
        re1bEed = /^(.+?)eed$/,
        re1bEdIng = /^(.+?)(ed|ing)$/,
        re1bAddE = /(at|bl|iz)$/,
        re1cY = /^(.+?)y$/,
        re2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/,
        re3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/,
        re4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/,
        re4Ion = /^(.+?)(s|t)(ion)$/,
        re5E = /^(.+?)e$/,
        re5Ll = /ll$/;

    return function (w) {
        var fp,
            stem,
            firstch;

        if (w.length < 3) { return w; }

        // An initial "y" is uppercased for the duration of the algorithm so
        // that it does not match the lowercase-only vowel class `v`.
        firstch = w.substr(0, 1);
        if (firstch === "y") {
            w = firstch.toUpperCase() + w.substr(1);
        }

        // Step 1a
        if (re1aEs.test(w)) {
            w = w.replace(re1aEs, "$1$2");
        } else if (re1aS.test(w)) {
            w = w.replace(re1aS, "$1$2");
        }

        // Step 1b
        fp = re1bEed.exec(w);
        if (fp) {
            // "...eed" -> "...ee" when the part before "eed" has m>0.
            if (reMgr0.test(fp[1])) {
                w = w.replace(reLastChar, "");
            }
        } else {
            fp = re1bEdIng.exec(w);
            if (fp) {
                stem = fp[1];
                if (reHasVowel.test(stem)) {
                    w = stem;
                    if (re1bAddE.test(w)) {
                        w = w + "e";
                    } else if (reDoubleCons.test(w)) {
                        w = w.replace(reLastChar, "");
                    } else if (reCvc.test(w)) {
                        w = w + "e";
                    }
                }
            }
        }

        // Step 1c
        fp = re1cY.exec(w);
        if (fp) {
            stem = fp[1];
            if (reHasVowel.test(stem)) { w = stem + "i"; }
        }

        // Step 2
        fp = re2.exec(w);
        if (fp) {
            stem = fp[1];
            if (reMgr0.test(stem)) {
                w = stem + step2list[fp[2]];
            }
        }

        // Step 3
        fp = re3.exec(w);
        if (fp) {
            stem = fp[1];
            if (reMgr0.test(stem)) {
                w = stem + step3list[fp[2]];
            }
        }

        // Step 4 -- the "(s|t)ion" rule is only tried when none of the plain
        // suffixes matched at all, even if that match left the word unchanged.
        fp = re4.exec(w);
        if (fp) {
            stem = fp[1];
            if (reMgr1.test(stem)) {
                w = stem;
            }
        } else {
            fp = re4Ion.exec(w);
            if (fp) {
                stem = fp[1] + fp[2];
                if (reMgr1.test(stem)) {
                    w = stem;
                }
            }
        }

        // Step 5
        fp = re5E.exec(w);
        if (fp) {
            stem = fp[1];
            if (reMgr1.test(stem) || (reMeq1.test(stem) && !reCvc.test(stem))) {
                w = stem;
            }
        }

        if (re5Ll.test(w) && reMgr1.test(w)) {
            w = w.replace(reLastChar, "");
        }

        // and turn initial Y back to y
        if (firstch === "y") {
            w = firstch.toLowerCase() + w.substr(1);
        }

        return w;
    };
})();
|
@ -1,7 +1,6 @@
|
|||||||
/* These are some basic unit-tests for the bloom.js module */
|
/* These are some basic unit-tests for the bloom.js module */
|
||||||
|
|
||||||
var bloom = new BloomFilter(4, 0.1);
|
var bloom = new BloomFilter(4, 0.1);
|
||||||
console.log(bloom);
|
|
||||||
|
|
||||||
// Add some elements to the filter.
|
// Add some elements to the filter.
|
||||||
bloom.add("foo");
|
bloom.add("foo");
|
||||||
@ -22,6 +21,7 @@ var array = [].slice.call(bloom.buckets),
|
|||||||
json = JSON.stringify(array);
|
json = JSON.stringify(array);
|
||||||
|
|
||||||
console.log(array);
|
console.log(array);
|
||||||
|
console.log(json);
|
||||||
|
|
||||||
// Deserialisation. Note that any array-like object is supported, but
|
// Deserialisation. Note that any array-like object is supported, but
|
||||||
// this will be used directly, so you may wish to use a typed array for
|
// this will be used directly, so you may wish to use a typed array for
|
||||||
|
25
js/test2.js
Normal file
25
js/test2.js
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
/* These are some basic unit-tests for the bloom.js module */
|
||||||
|
|
||||||
|
// Word list used to populate the filter (decoded from its JSON form).
var words = JSON.parse('["solut", "devic", "cryptkey2", "contain", "chang", "thi", "conf", "ckeyfiin", "support", "load", "here", "laptop", "file", "exampl", "paramet", "cryptsetup", "when", "proce", "line", "cryptkei", "wiki", "edit", "present", "describ", "ckei", "grub", "first", "warn", "mkinitcpio", "with", "updat", "mount", "manual", "ckeybyif", "least", "need", "multipl", "also", "found", "arch", "then", "us", "encrypt", "packag", "that", "over", "someth", "hook", "doesn", "avail", "avoid", "work", "which", "provid", "order", "initcpio", "anoth", "setup", "mean", "necessari", "default", "disk", "best", "linemkdir", "luk", "system", "unlock", "occurr", "requir", "command", "abl", "cryptdevice2", "encrypt2", "instal", "multi", "last", "extend", "obsolet", "boot", "your", "achiev", "second", "mkdir", "stuff", "final", "displai", "concern", "ad", "cryptdevic", "more", "copi"]');

// Filter with one slot of capacity per word and a 0.1 error rate.
var bloom2 = new BloomFilter(words.length, 0.1);
console.log(bloom2);

// Insert every word into the filter.
words.forEach(function (word) {
    bloom2.add(word);
});

// Query every inserted word and log the result next to the word.
// test() answers true for "probably in the set" and false for
// "definitely not in the set".
words.forEach(function (word) {
    console.log(word + " : " + bloom2.test(word));
});

// Serialisation. bloom2.buckets may be a typed array, so copy it into a
// plain array before turning it into JSON (the JSON text is not logged here).
var array = Array.prototype.slice.call(bloom2.buckets);
var json = JSON.stringify(array);

console.log(bloom2.buckets);
|
13
test.html
Normal file
13
test.html
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="fr">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<title>BloomJS demo</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>Bloom.JS demo</h1>
|
||||||
|
<p>This page runs the <code>bloom.js</code> library unit tests. Look at your console output for <code>assert</code> errors and verbose debugging output.</p>
|
||||||
|
<script type="text/javascript" src="js/bloom.js"></script>
|
||||||
|
<script type="text/javascript" src="js/test2.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
Loading…
Reference in New Issue
Block a user