Commit before debugging bloom.py

Phyks 2014-10-28 02:36:28 +01:00
parent 5b4cc421df
commit 496e2f823f
14 changed files with 584 additions and 354 deletions

BIN
data/filters Normal file

Binary file not shown.

1
data/pages.json Normal file

@@ -0,0 +1 @@
{"index": ["../samples/cryptdevice_multi.html", "../samples/highmon_weechat.html"]}

BIN
data/test Normal file

Binary file not shown.

BIN
data/words Normal file

Binary file not shown.


@@ -3,17 +3,22 @@
     <head>
         <meta charset="utf-8">
         <title>BloomJS demo</title>
+        <style type="text/css">
+            #error {
+                font-weight: bold;
+                color: red;
+            }
+        </style>
     </head>
     <body>
         <h1>Bloom.JS demo</h1>
-        <p id="loading"></p>
-        <form id="search_form">
-            <p>
-                <input type="text" id="search" name="search" value="Search for articles..."/>
-            </p>
-        </form>
+        <p id="error"></p>
+        <div id="main">
+            <p>Loading…</p>
+        </div>
         <div id="results"></div>
         <script type="text/javascript" src="js/bloom.js"></script>
+        <script type="text/javascript" src="js/stemmer.js"></script>
         <script type="text/javascript" src="js/app.js"></script>
     </body>
 </html>

162
index_generation/bloom.py Normal file

@@ -0,0 +1,162 @@
#!/usr/bin/env python3
"""
This is a translation of the bloom.js script (originally from
https://github.com/jasondavies/bloomfilter.js) in Python.

Since this is a translation of the above-mentioned JS code, refer to it for
any documentation that is missing from this implementation.

Needs the numpy python module to work.

Note: Depending on your use case, the pybloom module available on PyPI may
better suit your needs. I reimplemented the above-mentioned JS script in
Python mostly because I had to for this script, as the pybloom module uses
advanced hashing techniques that are difficult to implement in JS.

This script has been written by Phyks and is in the public domain (or whatever
is closest to the public domain in your country).
"""
import math

try:
    import numpy as np
except ImportError:
    raise ImportError('This script requires numpy')


class BloomFilter():
    def __init__(self, capacity, error_rate=0.1):
        """
        Implements a space-efficient probabilistic data structure.

        capacity
            The capacity of the BloomFilter: it should be able to store at
            least *capacity* elements.
        error_rate
            The rate at which the filter returns false positives. This
            determines the size of the filter. Inserting more than *capacity*
            elements greatly increases the chance of false positives.
        """
        if not (0 < error_rate < 1):
            raise ValueError("error_rate must be between 0 and 1.")
        if not isinstance(capacity, int) or capacity <= 0:
            raise ValueError("capacity must be a positive integer.")

        # Same calculation as in the JS file, see it for reference.
        # Basically determines the number of bits and slices from the capacity
        # and error_rate.
        k = math.ceil(- math.log(error_rate, 2))
        m = math.ceil(capacity * abs(math.log(error_rate)) /
                      (k * (math.log(2) ** 2))) * k
        n = math.ceil(m / 32)
        m = n * 32
        self.m = m
        self.k = k

        # Smallest power-of-two number of bytes able to hold a bit index in
        # [0, m).
        kbytes = 1 << math.ceil(math.log(math.ceil(math.log(m, 2) / 8), 2))
        self.buckets = np.zeros(n, dtype=np.int32)
        if kbytes == 1:
            loc_type = np.uint8
        elif kbytes == 2:
            loc_type = np.uint16
        else:
            loc_type = np.int32
        self._locations = np.zeros(k, dtype=loc_type)

    def locations(self, v):
        r = self._locations
        a = self.fnv_1a(v)
        b = self.fnv_1a_b(a)
        print(b)  # debug output
        i = 0
        x = a % self.m
        while i < self.k:
            r[i] = (x + self.m) if x < 0 else x
            x = (x + b) % self.m
            i += 1
        return r

    def add(self, v):
        l = self.locations(v + "")
        i = 0
        buckets = self.buckets
        while i < self.k:
            buckets[math.floor(l[i] / 32)] |= 1 << int(l[i] % 32)
            i += 1

    def test(self, v):
        l = self.locations(v + "")
        i = 0
        buckets = self.buckets
        while i < self.k:
            b = l[i]
            if buckets[math.floor(b / 32)] & (1 << int(b % 32)) == 0:
                return False
            i += 1
        return True

    def size(self):
        """
        Estimated cardinality: -(m / k) * ln(1 - X / m), where X is the
        number of set bits.
        """
        bits = 0
        buckets = self.buckets
        for i in range(0, len(buckets)):
            bits += self.popcnt(buckets[i])
        return -self.m * math.log(1 - bits / self.m) / self.k

    def popcnt(self, v):
        """
        Number of set bits in v (e.g. popcnt(0b1011) == 3), see
        http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
        """
        v -= (v >> 1) & 0x55555555
        v = (v & 0x33333333) + ((v >> 2) & 0x33333333)
        return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24

    def fnv_1a(self, v):
        """
        Fowler/Noll/Vo (FNV-1a) hashing.
        """
        n = len(v)
        a = 2166136261
        i = 0
        while i < n:
            c = ord(v[i])
            d = c & 0xff000000
            if d:
                a ^= d >> 24
                a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
            d = c & 0xff0000
            if d:
                a ^= d >> 16
                a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
            d = c & 0xff00
            if d:
                a ^= d >> 8
                a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
            a ^= c & 0xff
            print(a ^ (c & 0xff))  # debug output
            a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
            i += 1
        # From http://home.comcast.net/~bretm/hash/6.html
        a += a << 13
        a ^= a >> 7
        a += a << 3
        a ^= a >> 17
        a += a << 5
        return a & 0xffffffff

    def fnv_1a_b(self, a):
        """
        One additional iteration of FNV, given a hash.
        """
        a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
        a += a << 13
        a ^= a >> 7
        a += a << 3
        a ^= a >> 17
        a += a << 5
        print(a)  # debug output
        return a & 0xffffffff
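
A minimal usage sketch of the class above (an illustration, not part of the commit): it assumes the file is importable as bloom and that numpy is installed. Since this commit is a pre-debugging snapshot, the class's debug print() calls will fire as well.

# Hypothetical usage sketch for the BloomFilter class above (not in the commit).
from bloom import BloomFilter

bf = BloomFilter(capacity=3, error_rate=0.1)
for word in ("foo", "bar", "baz"):
    bf.add(word)

print(bf.test("foo"))   # Expected True: "foo" was inserted
print(bf.test("quux"))  # False means definitely absent; True would be a false positive
print(bf.size())        # Estimated cardinality, around 3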


@@ -1,48 +1,44 @@
 #!/usr/bin/env python3
+"""
+Inspired by
+http://www.stavros.io/posts/bloom-filter-search-engine/?print
+
+You have to install the numpy python module for bloom to work.
+"""
+import bloom
+import json
 import os
-import sys
-from lxml import html
 import re
 import stemmer
-import json
-from bitarray import bitarray
-from pybloom import BloomFilter
+import struct
+import sys
+from lxml import html
 
-# List all files in path directory
 def list_directory(path):
-    fichier = []
+    """Recursively list all files in a given directory."""
+    files_list = []
     for root, dirs, files in os.walk(path):
         for i in files:
-            fichier.append(os.path.join(root, i))
-    return fichier
+            files_list.append(os.path.join(root, i))
+    return files_list
 
 def remove_common_words(words):
+    """Removes all words that are 3 characters long or shorter."""
     returned = [word for word in words if len(word) > 3]
     return returned
 
-def padding_16(x):
-    if x < 256:
-        return bytes([0, x])
-    else:
-        return bytes([int(x / 256), x % 256])
-
-# =============================================================================
-samples = list_directory("../samples/")
-filters = {}
-p = stemmer.PorterStemmer()
-write_little = bitarray(endian="little")
-write_big = bitarray(endian="big")
-write_little.frombytes(padding_16(len(samples)))
-write_big.frombytes(padding_16(len(samples)))
-
-if len(samples) > 65535:
-    sys.exit("[ERROR] Too many articles to index. You will have to change the "
-             "way data is stored in the binary file to handle such amount of "
-             "files.")
+if __name__ == "__main__":
+    error_rate = 0.1
+    samples = list_directory("../samples/")
+    filters = []
+    p = stemmer.PorterStemmer()
 
-for sample in samples:
-    with open(sample, 'r') as sample_fh:
+    for sample in samples:
+        with open(sample, 'r') as sample_fh:
@@ -51,42 +47,37 @@ for sample in samples:
             # Get text from HTML content
             words = html.fromstring(content).text_content().replace("\n", "")
             words = re.findall(r"[\w]+", words)
-            # Remove all punctuation etc., convert words to lower and delete duplicates
+            # Remove all punctuation etc., convert words to lower and delete
+            # duplicates
             words = list(set([word.lower() for word in words]))
             # Remove common words
             words = remove_common_words(words)
             # Stemming to reduce the number of words
-            words = [p.stem(word, 0, len(word)-1) for word in words]
-            filters[sample] = BloomFilter(capacity=len(words), error_rate=0.1)
+            words = list(set([p.stem(word, 0, len(word)-1) for word in words]))
+            tmp_filter = bloom.BloomFilter(capacity=len(words),
+                                           error_rate=error_rate)
+            words = json.loads('["solut", "devic", "cryptkey2", "contain", "chang", "thi", "conf", "ckeyfiin", "support", "load", "here", "laptop", "file", "exampl", "paramet", "cryptsetup", "when", "proce", "line", "cryptkei", "wiki", "edit", "present", "describ", "ckei", "grub", "first", "warn", "mkinitcpio", "with", "updat", "mount", "manual", "ckeybyif", "least", "need", "multipl", "also", "found", "arch", "then", "us", "encrypt", "packag", "that", "over", "someth", "hook", "doesn", "avail", "avoid", "work", "which", "provid", "order", "initcpio", "anoth", "setup", "mean", "necessari", "default", "disk", "best", "linemkdir", "luk", "system", "unlock", "occurr", "requir", "command", "abl", "cryptdevice2", "encrypt2", "instal", "multi", "last", "extend", "obsolet", "boot", "your", "achiev", "second", "mkdir", "stuff", "final", "displai", "concern", "ad", "cryptdevic", "more", "copi"]')
             for word in words:
-                filters[sample].add(word)
-            if filters[sample].bitarray.length() > 65535:
-                sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". You "
-                         "will have to change the way data is stored in the binary "
-                         "file to handle such amount of text.")
-            tmp = bitarray(endian="little")
-            tmp.frombytes(padding_16(filters[sample].bitarray.length()))
-            write_little.extend(tmp)
-            write_little.extend(filters[sample].bitarray)
-            write_little.extend([0 for i in range(filters[sample].bitarray.length() %
-                                                  8)])
-            tmp = bitarray(endian="big")
-            tmp.frombytes(padding_16(filters[sample].bitarray.length()))
-            write_big.extend(tmp)
-            write_big.extend(filters[sample].bitarray)
-            write_big.extend([0 for i in range(filters[sample].bitarray.length() %
-                                               8)])
+                tmp_filter.add(word)
+            filters.append(tmp_filter.buckets)
+            print(tmp_filter.buckets)
+            sys.exit()
 
-with open('../data/search_index_little', 'wb') as index_fh:
-    print(write_little)
-    write_little.tofile(index_fh)
-with open('../data/search_index_big', 'wb') as index_fh:
-    print(write_big)
-    write_big.tofile(index_fh)
+    # First Int32 is length
+    filters_to_write = struct.pack("<i", len(filters))
+    # Then comes the length of each filter
+    for i in filters:
+        filters_to_write += struct.pack("<i", len(i))
+    # Finally comes the filters themselves
+    for i in filters:
+        filters_to_write += struct.pack("<%di" % len(i), *i)
 
-with open('../data/pages_index.json', 'w') as pages_fh:
-    pages_fh.write(json.dumps(samples))
+    # Write everything
+    with open("../data/filters", "wb") as index_fh:
+        index_fh.write(filters_to_write)
+    with open("../data/pages.json", "w") as pages_fh:
+        pages_fh.write(json.dumps({"index": samples}))
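
The struct-packed layout written above is therefore: one little-endian Int32 holding the number of filters, then one Int32 per filter with its length in Int32 words, then the buckets of each filter back to back. A sketch of a reader for that layout (a hypothetical check script, not part of the commit):

# Sketch: read back the binary index written above.
import struct

with open("../data/filters", "rb") as fh:
    data = fh.read()

(count,) = struct.unpack_from("<i", data, 0)           # number of filters
lengths = struct.unpack_from("<%di" % count, data, 4)  # length of each one
offset = 4 + 4 * count
filters = []
for length in lengths:
    filters.append(struct.unpack_from("<%di" % length, data, offset))
    offset += 4 * length
print(count, [len(f) for f in filters])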


@@ -1,277 +0,0 @@
import math
import hashlib
from struct import unpack, pack, calcsize
try:
import bitarray
except ImportError:
raise ImportError('pybloom requires bitarray >= 0.3.4')
__version__ = '2.0'
__author__ = "Jay Baird <jay.baird@me.com>, Bob Ippolito <bob@redivi.com>,\
Marius Eriksen <marius@monkey.org>,\
Alex Brasetvik <alex@brasetvik.com>"
def make_hashfuncs(num_slices, num_bits):
if num_bits >= (1 << 31):
fmt_code, chunk_size = 'Q', 8
elif num_bits >= (1 << 15):
fmt_code, chunk_size = 'I', 4
else:
fmt_code, chunk_size = 'H', 2
total_hash_bits = 8 * num_slices * chunk_size
if total_hash_bits > 384:
hashfn = hashlib.sha512
elif total_hash_bits > 256:
hashfn = hashlib.sha384
elif total_hash_bits > 160:
hashfn = hashlib.sha256
elif total_hash_bits > 128:
hashfn = hashlib.sha1
else:
hashfn = hashlib.md5
fmt = fmt_code * (hashfn().digest_size // chunk_size)
num_salts, extra = divmod(num_slices, len(fmt))
if extra:
num_salts += 1
salts = [hashfn(hashfn(pack('I', i)).digest()) for i in range(num_salts)]
def _make_hashfuncs(key):
key = str(key)
rval = []
for salt in salts:
h = salt.copy()
h.update(key.encode('utf-8'))
rval.extend(uint % num_bits for uint in unpack(fmt, h.digest()))
del rval[num_slices:]
return rval
return _make_hashfuncs
class BloomFilter(object):
FILE_FMT = '<dQQQQ'
def __init__(self, capacity, error_rate=0.001):
if not (0 < error_rate < 1):
raise ValueError("Error_Rate must be between 0 and 1.")
if not capacity > 0:
raise ValueError("Capacity must be > 0")
num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
bits_per_slice = int(math.ceil(
(capacity * abs(math.log(error_rate))) /
(num_slices * (math.log(2) ** 2))))
self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
self.bitarray.setall(False)
def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
self.error_rate = error_rate
self.num_slices = num_slices
self.bits_per_slice = bits_per_slice
self.capacity = capacity
self.num_bits = num_slices * bits_per_slice
self.count = count
self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
def __contains__(self, key):
bits_per_slice = self.bits_per_slice
bitarray = self.bitarray
if not isinstance(key, list):
hashes = self.make_hashes(key)
else:
hashes = key
offset = 0
for k in hashes:
if not bitarray[offset + k]:
return False
offset += bits_per_slice
return True
def __len__(self):
"""Return the number of keys stored by this bloom filter."""
return self.count
def add(self, key, skip_check=False):
bitarray = self.bitarray
bits_per_slice = self.bits_per_slice
hashes = self.make_hashes(key)
if not skip_check and hashes in self:
return True
if self.count > self.capacity:
raise IndexError("BloomFilter is at capacity")
offset = 0
for k in hashes:
self.bitarray[offset + k] = True
offset += bits_per_slice
self.count += 1
return False
def copy(self):
"""Return a copy of this bloom filter.
"""
new_filter = BloomFilter(self.capacity, self.error_rate)
new_filter.bitarray = self.bitarray.copy()
return new_filter
def union(self, other):
""" Calculates the union of the two underlying bitarrays and returns
a new bloom filter object."""
if self.capacity != other.capacity or \
self.error_rate != other.error_rate:
raise ValueError("Unioning filters requires both filters to have \
both the same capacity and error rate")
new_bloom = self.copy()
new_bloom.bitarray = new_bloom.bitarray | other.bitarray
return new_bloom
def __or__(self, other):
return self.union(other)
def intersection(self, other):
""" Calculates the intersection of the two underlying bitarrays and returns
a new bloom filter object."""
if self.capacity != other.capacity or \
self.error_rate != other.error_rate:
raise ValueError("Intersecting filters requires both filters to \
have equal capacity and error rate")
new_bloom = self.copy()
new_bloom.bitarray = new_bloom.bitarray & other.bitarray
return new_bloom
def __and__(self, other):
return self.intersection(other)
def tofile(self, f):
"""Write the bloom filter to file object `f'. Underlying bits
are written as machine values. This is much more space
efficient than pickling the object."""
f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
self.bits_per_slice, self.capacity, self.count))
self.bitarray.tofile(f)
@classmethod
def fromfile(cls, f, n=-1):
"""Read a bloom filter from file-object `f' serialized with
``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
headerlen = calcsize(cls.FILE_FMT)
if 0 < n < headerlen:
raise ValueError('n too small!')
filter = cls(1) # Bogus instantiation, we will `_setup'.
filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
filter.bitarray = bitarray.bitarray(endian='little')
if n > 0:
filter.bitarray.fromfile(f, n - headerlen)
else:
filter.bitarray.fromfile(f)
if filter.num_bits != filter.bitarray.length() and \
(filter.num_bits + (8 - filter.num_bits % 8)
!= filter.bitarray.length()):
raise ValueError('Bit length mismatch!')
return filter
def __getstate__(self):
d = self.__dict__.copy()
del d['make_hashes']
return d
def __setstate__(self, d):
self.__dict__.update(d)
self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
class ScalableBloomFilter(object):
SMALL_SET_GROWTH = 2 # slower, but takes up less memory
LARGE_SET_GROWTH = 4 # faster, but takes up more memory faster
FILE_FMT = '<idQd'
def __init__(self, initial_capacity=100, error_rate=0.001,
mode=SMALL_SET_GROWTH):
        if not (0 < error_rate < 1):
            raise ValueError("error_rate must be a decimal between 0 and 1.")
self._setup(mode, 0.9, initial_capacity, error_rate)
self.filters = []
def _setup(self, mode, ratio, initial_capacity, error_rate):
self.scale = mode
self.ratio = ratio
self.initial_capacity = initial_capacity
self.error_rate = error_rate
def __contains__(self, key):
for f in reversed(self.filters):
if key in f:
return True
return False
def add(self, key):
if key in self:
return True
if not self.filters:
filter = BloomFilter(
capacity=self.initial_capacity,
error_rate=self.error_rate * (1.0 - self.ratio))
self.filters.append(filter)
else:
filter = self.filters[-1]
if filter.count >= filter.capacity:
filter = BloomFilter(
capacity=filter.capacity * self.scale,
error_rate=filter.error_rate * self.ratio)
self.filters.append(filter)
filter.add(key, skip_check=True)
return False
@property
def capacity(self):
"""Returns the total capacity for all filters in this SBF"""
return sum([f.capacity for f in self.filters])
@property
def count(self):
return len(self)
def tofile(self, f):
"""Serialize this ScalableBloomFilter into the file-object
`f'."""
f.write(pack(self.FILE_FMT, self.scale, self.ratio,
self.initial_capacity, self.error_rate))
# Write #-of-filters
f.write(pack('<l', len(self.filters)))
if len(self.filters) > 0:
# Then each filter directly, with a header describing
# their lengths.
headerpos = f.tell()
headerfmt = '<' + 'Q'*(len(self.filters))
f.write('.' * calcsize(headerfmt))
filter_sizes = []
for filter in self.filters:
begin = f.tell()
filter.tofile(f)
filter_sizes.append(f.tell() - begin)
f.seek(headerpos)
f.write(pack(headerfmt, *filter_sizes))
@classmethod
def fromfile(cls, f):
"""Deserialize the ScalableBloomFilter in file object `f'."""
filter = cls()
filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
nfilters, = unpack('<l', f.read(calcsize('<l')))
if nfilters > 0:
header_fmt = '<' + 'Q'*nfilters
bytes = f.read(calcsize(header_fmt))
filter_lengths = unpack(header_fmt, bytes)
for fl in filter_lengths:
filter.filters.append(BloomFilter.fromfile(f, fl))
else:
filter.filters = []
return filter
def __len__(self):
"""Returns the total number of elements stored in this SBF"""
return sum([f.count for f in self.filters])

121
js/app.js Normal file

@@ -0,0 +1,121 @@
/* Params */
var error_rate = 0.1;

/* Vars */
var bloom = Array(), index;
var ready = false;

/* Functions */
function callback() {
    if (typeof(index) === 'undefined' || bloom.length == 0) {
        return;
    }
    // Sets up the page, which is now ready
    ready = true;
    document.getElementById('main').innerHTML = '<form id="search_form"><p><input type="text" id="search" name="search" placeholder="Search for articles..."/></p></form>';

    // Handle onchange actions
    document.getElementById('search').oninput = function (e) {
        if (!ready) {
            return;
        }
        filter_results(e.target.value);
    };
}

// Returns true iff all the terms in the array are in the bloom filter b
function terms_in_bloom(terms, b) {
    for (var i = 0; i < terms.length; i++) {
        if (!b.test(terms[i])) {
            return false;
        }
    }
    return true;
}

// Filter the results to match the query
function filter_results(query) {
    var search_terms = query.trim();
    if (search_terms === "") {
        document.getElementById('results').innerHTML = "";
        return;  // nothing to search for
    }
    search_terms = query.split(" ").map(stemmer);

    var results = Array();
    for (var i = 0; i < index.length; i++) {
        if (terms_in_bloom(search_terms, bloom[i])) {
            results.push(index[i]);
        }
    }

    var results_html;
    if (results.length > 0) {
        results_html = '<ul>';
        for (var i = 0; i < results.length; i++) {
            results_html += '<li>' + results[i] + '</li>';
        }
        results_html += '</ul>';
    }
    else {
        results_html = '<p>No results.</p>';
    }
    document.getElementById('results').innerHTML = results_html;
}

/* App */
// Get the words index (a.k.a. Bloom filters)
var oReq = new XMLHttpRequest();
oReq.open("GET", "data/filters", true);
oReq.responseType = "arraybuffer";

oReq.onload = function (oEvent) {
    var array_buffer = oReq.response;
    if (array_buffer) {
        var byte_array = new Int32Array(array_buffer);
        // First element is the number of bloom filters in the binary file
        var nb_bloom_filters = byte_array[0];
        // The nb_bloom_filters next elements are the lengths of the arrays
        var lengths = Array();
        for (var i = 0; i < nb_bloom_filters; i++) {
            lengths.push(byte_array[1 + i]);
        }
        // Then build the Bloom filters themselves
        var l = 0, tmp_array;
        for (var i = 0; i < nb_bloom_filters; i++) {
            tmp_array = byte_array.subarray(1 + nb_bloom_filters + l, 1 + nb_bloom_filters + l + lengths[i]);
            l += lengths[i];  // advance the offset past this filter
            bloom.push(new BloomFilter(tmp_array, error_rate));
            console.log(tmp_array);  // debug output
            console.log(bloom[0].test("concern"));  // debug output
        }
        callback();
    }
    else {
        document.getElementById('error').innerHTML = 'Unable to load the bloom filters.';
    }
};
oReq.send(null);

// Get the pages index
var req = new XMLHttpRequest();
req.open('GET', 'data/pages.json', true);
req.onreadystatechange = function () {
    if (req.readyState == 4) {
        if (req.status == 200) {
            var tmp = JSON.parse(req.responseText);
            index = tmp['index'];
            callback();
        }
        else {
            document.getElementById('error').innerHTML = 'Unable to load the index.';
        }
    }
};
req.send(null);


@@ -22,23 +22,26 @@
 function BloomFilter(capacity, error_rate) {
   // *m* is the number of bits. Note that *m* is rounded up to
   // the nearest multiple of 32. *k* specifies the number of hashing functions.
+  if (error_rate < 0 || error_rate > 1 || (typeof(capacity) === "number" && capacity < 0)) {
+    return false;
+  }
   var a, i = -1;
   // Number of slices, k
   var k = Math.ceil(- Math.log(error_rate) / Math.log(2));
   // Total number of bits, m
-  // Size of the UInt32 table, n
+  // Size of the Int32 table, n
   var m, n;
   if (typeof capacity !== "number") {
     a = capacity;
     // Total number of bits, m
     m = a.length * 32;
-    // Size of the UInt32 table, n
+    // Size of the Int32 table, n
     n = a.length;
   }
   else {
     // Total number of bits, m
     m = Math.ceil(capacity * Math.abs(Math.log(error_rate)) / (k * Math.pow(Math.log(2), 2))) * k;
-    // Size of the UInt32 table, n
+    // Size of the Int32 table, n
     n = Math.ceil(m / 32);
     // Round total number of bits to closest multiple of 32
     m = n * 32;
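
As a worked example of the sizing math in this constructor (same formulas transcribed to Python; the numbers are illustrative, not from the commit): for capacity = 100 and error_rate = 0.1 this gives k = 4 slices, m = 480 bits and n = 15 Int32 words.

# Worked example of the constructor's sizing formulas.
import math

capacity, error_rate = 100, 0.1
k = math.ceil(-math.log(error_rate) / math.log(2))  # number of slices: 4
m = math.ceil(capacity * abs(math.log(error_rate))
              / (k * math.log(2) ** 2)) * k         # total bits: 480
n = math.ceil(m / 32)                               # Int32 words: 15
m = n * 32                                          # rounded to a multiple of 32: 480
print(k, m, n)  # 4 480 15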

186
js/stemmer.js Normal file

@@ -0,0 +1,186 @@
// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
// paper, in
//
// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
// no. 3, pp 130-137,
//
// see also http://www.tartarus.org/~martin/PorterStemmer
// Release 1 be 'andargor', Jul 2004
// Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
var stemmer = (function(){
var step2list = {
"ational" : "ate",
"tional" : "tion",
"enci" : "ence",
"anci" : "ance",
"izer" : "ize",
"bli" : "ble",
"alli" : "al",
"entli" : "ent",
"eli" : "e",
"ousli" : "ous",
"ization" : "ize",
"ation" : "ate",
"ator" : "ate",
"alism" : "al",
"iveness" : "ive",
"fulness" : "ful",
"ousness" : "ous",
"aliti" : "al",
"iviti" : "ive",
"biliti" : "ble",
"logi" : "log"
},
step3list = {
"icate" : "ic",
"ative" : "",
"alize" : "al",
"iciti" : "ic",
"ical" : "ic",
"ful" : "",
"ness" : ""
},
c = "[^aeiou]", // consonant
v = "[aeiouy]", // vowel
C = c + "[^aeiouy]*", // consonant sequence
V = v + "[aeiou]*", // vowel sequence
mgr0 = "^(" + C + ")?" + V + C, // [C]VC... is m>0
meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1
mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1
s_v = "^(" + C + ")?" + v; // vowel in stem
return function (w) {
var stem,
suffix,
firstch,
re,
re2,
re3,
re4,
origword = w;
if (w.length < 3) { return w; }
firstch = w.substr(0,1);
if (firstch == "y") {
w = firstch.toUpperCase() + w.substr(1);
}
// Step 1a
re = /^(.+?)(ss|i)es$/;
re2 = /^(.+?)([^s])s$/;
if (re.test(w)) { w = w.replace(re,"$1$2"); }
else if (re2.test(w)) { w = w.replace(re2,"$1$2"); }
// Step 1b
re = /^(.+?)eed$/;
re2 = /^(.+?)(ed|ing)$/;
if (re.test(w)) {
var fp = re.exec(w);
re = new RegExp(mgr0);
if (re.test(fp[1])) {
re = /.$/;
w = w.replace(re,"");
}
} else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1];
re2 = new RegExp(s_v);
if (re2.test(stem)) {
w = stem;
re2 = /(at|bl|iz)$/;
re3 = new RegExp("([^aeiouylsz])\\1$");
re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re2.test(w)) { w = w + "e"; }
else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); }
else if (re4.test(w)) { w = w + "e"; }
}
}
// Step 1c
re = /^(.+?)y$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(s_v);
if (re.test(stem)) { w = stem + "i"; }
}
// Step 2
re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem)) {
w = stem + step2list[suffix];
}
}
// Step 3
re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem)) {
w = stem + step3list[suffix];
}
}
// Step 4
re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
re2 = /^(.+?)(s|t)(ion)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
if (re.test(stem)) {
w = stem;
}
} else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1] + fp[2];
re2 = new RegExp(mgr1);
if (re2.test(stem)) {
w = stem;
}
}
// Step 5
re = /^(.+?)e$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
re2 = new RegExp(meq1);
re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {
w = stem;
}
}
re = /ll$/;
re2 = new RegExp(mgr1);
if (re.test(w) && re2.test(w)) {
re = /.$/;
w = w.replace(re,"");
}
// and turn initial Y back to y
if (firstch == "y") {
w = firstch.toLowerCase() + w.substr(1);
}
return w;
}
})();


@@ -1,7 +1,6 @@
 /* These are some basic unit-tests for the bloom.js module */
 
 var bloom = new BloomFilter(4, 0.1);
-console.log(bloom);
 
 // Add some elements to the filter.
 bloom.add("foo");
@@ -22,6 +21,7 @@ var array = [].slice.call(bloom.buckets),
     json = JSON.stringify(array);
 console.log(array);
+console.log(json);
 
 // Deserialisation. Note that any array-like object is supported, but
 // this will be used directly, so you may wish to use a typed array for

25
js/test2.js Normal file

@@ -0,0 +1,25 @@
/* These are some basic unit-tests for the bloom.js module */
var words = JSON.parse('["solut", "devic", "cryptkey2", "contain", "chang", "thi", "conf", "ckeyfiin", "support", "load", "here", "laptop", "file", "exampl", "paramet", "cryptsetup", "when", "proce", "line", "cryptkei", "wiki", "edit", "present", "describ", "ckei", "grub", "first", "warn", "mkinitcpio", "with", "updat", "mount", "manual", "ckeybyif", "least", "need", "multipl", "also", "found", "arch", "then", "us", "encrypt", "packag", "that", "over", "someth", "hook", "doesn", "avail", "avoid", "work", "which", "provid", "order", "initcpio", "anoth", "setup", "mean", "necessari", "default", "disk", "best", "linemkdir", "luk", "system", "unlock", "occurr", "requir", "command", "abl", "cryptdevice2", "encrypt2", "instal", "multi", "last", "extend", "obsolet", "boot", "your", "achiev", "second", "mkdir", "stuff", "final", "displai", "concern", "ad", "cryptdevic", "more", "copi"]');
var bloom2 = new BloomFilter(words.length, 0.1);
console.log(bloom2);
// Add some elements to the filter.
for (var i = 0; i < words.length; i++) {
bloom2.add(words[i]);
}
// Test if an item is in our filter.
// Returns true if an item is probably in the set,
// or false if an item is definitely not in the set.
for (var i = 0; i < words.length; i++) {
console.log(words[i] + " : " + bloom2.test(words[i]));
}
// Serialisation. Note that bloom.buckets may be a typed array,
// so we convert to a normal array first.
var array = [].slice.call(bloom2.buckets),
json = JSON.stringify(array);
console.log(bloom2.buckets);

13
test.html Normal file

@@ -0,0 +1,13 @@
<!doctype html>
<html lang="fr">
<head>
<meta charset="utf-8">
<title>BloomJS demo</title>
</head>
<body>
<h1>Bloom.JS demo</h1>
<p>This page runs the <code>bloom.js</code> library unit-tests. Look at your console output for <code>assert</code> errors and verbose debugging.</p>
<script type="text/javascript" src="js/bloom.js"></script>
<script type="text/javascript" src="js/test2.js"></script>
</body>
</html>