From 496e2f823f07d90219593d0f3a4e125c29213b38 Mon Sep 17 00:00:00 2001 From: Phyks Date: Tue, 28 Oct 2014 02:36:28 +0100 Subject: [PATCH] Commit before debugging bloom.py --- data/filters | Bin 0 -> 180 bytes data/pages.json | 1 + data/test | Bin 0 -> 12 bytes data/words | Bin 0 -> 248 bytes index.html | 17 +- index_generation/bloom.py | 162 +++++++++++++++++ index_generation/generate_index.py | 125 ++++++------- index_generation/pybloom.py | 277 ----------------------------- js/app.js | 121 +++++++++++++ js/bloom.js | 9 +- js/stemmer.js | 186 +++++++++++++++++++ js/test.js | 2 +- js/test2.js | 25 +++ test.html | 13 ++ 14 files changed, 584 insertions(+), 354 deletions(-) create mode 100644 data/filters create mode 100644 data/pages.json create mode 100644 data/test create mode 100644 data/words create mode 100644 index_generation/bloom.py delete mode 100644 index_generation/pybloom.py create mode 100644 js/app.js create mode 100644 js/stemmer.js create mode 100644 js/test2.js create mode 100644 test.html diff --git a/data/filters b/data/filters new file mode 100644 index 0000000000000000000000000000000000000000..22b4dacd3de6294e95c847b195f8e1dba76e5f39 GIT binary patch literal 180 zcmV;l089S@0000E0000S0002h-?ls^>GtZSiYJ^c7Eg4b2qE!*W?h(*lUCnt==<_~ z1Zo9$-Cf^A{3mh_{@<&DQ5z{1L^bv9>d+LmtU>8928MbAg%lAbw{QgPri`$gn zy84RRC~Zym`PpTYZ8j(}(l(kZpT=T$Aj0%>c|0EC&8|o_`X`2|Smj^1XIb~^BkUem iQ1a`?MA3=6eFI}w{OErQHC=k^-v;m1!k?Mlp)c=5bzHsx literal 0 HcmV?d00001 diff --git a/data/pages.json b/data/pages.json new file mode 100644 index 0000000..a860202 --- /dev/null +++ b/data/pages.json @@ -0,0 +1 @@ +{"index": ["../samples/cryptdevice_multi.html", "../samples/highmon_weechat.html"]} \ No newline at end of file diff --git a/data/test b/data/test new file mode 100644 index 0000000000000000000000000000000000000000..658b8a864a8e88fcc42a08633e6ef57591378a8f GIT binary patch literal 12 RcmZQ%U|?Vb;*N#|O#lUg0xbXl literal 0 HcmV?d00001 diff --git a/data/words b/data/words new file mode 100644 index 0000000000000000000000000000000000000000..3058f6fd2ec64b78fc5423eb67378d3e4852f2f2 GIT binary patch literal 248 zcmeytz`($&r>9?>m|KvOTCAU3R9R4xl3JFToEo27np2Xgmr;_N!(a|n#<0i4Zr2In z-tMW_UH`sw=Q>$4KO*aUua{i?&xe!Oyg#bF{MsJZip3uLrdKq+`SWjc?W{W&C(qcn z3fcIK%=C=h{Ji+`)YRmR#1e?%c_72@JXz~yf4Hw+Wq#a`pJ&*AUEQ`ova+gt#lG|@ zg3hN;#((Uc8|Z)B$mX`}+m%Px8~c6P{4jq8cg^AX@2t<16)S4H#mtv|(7erV*D>yd pnz^?378@7W{jOR%DfCv_Cb@m-udBaqkJ38*onryhhL!u+832V>b_)Oi literal 0 HcmV?d00001 diff --git a/index.html b/index.html index c348094..20aff01 100644 --- a/index.html +++ b/index.html @@ -3,17 +3,22 @@ BloomJS demo +

<script type="text/javascript" src="js/bloom.js"></script>
+        <script type="text/javascript" src="js/stemmer.js"></script>
+        <script type="text/javascript" src="js/app.js"></script>
     </head>
     <body>
-        <h1>BloomJS demo</h1>
+        <h1>Bloom.JS demo</h1>
+        <p id="error"></p>
+        <div id="main">
+            Loading…
+        </div>
     </body>
 </html>
diff --git a/index_generation/bloom.py b/index_generation/bloom.py
new file mode 100644
index 0000000..562c2f1
--- /dev/null
+++ b/index_generation/bloom.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+
+"""
+This is a translation in Python of the bloom.js script (originally from
+https://github.com/jasondavies/bloomfilter.js).
+
+Since it is a translation of the JS code mentioned above, refer to that
+script for any documentation that may be missing from this implementation.
+
+Needs the numpy python module to work.
+
+Note: Depending on your use case, the pybloom module available on PyPI may
+suit your needs better. I reimplemented the above-mentioned JS script in
+Python mostly because I had to for this script, as the pybloom module uses
+advanced hashing techniques that are difficult to implement in JS.
+
+This script has been written by Phyks and is in the public domain (or
+whatever is closest to the public domain in your country).
+"""
+
+import math
+
+try:
+    import numpy as np
+except ImportError:
+    raise ImportError('This script requires numpy')
+
+
+class BloomFilter():
+    def __init__(self, capacity, error_rate=0.1):
+        """
+        Implements a space-efficient probabilistic data structure.
+
+        capacity
+            The capacity of the BloomFilter: it should be able to store
+            at least *capacity* elements.
+        error_rate
+            The rate of false positives the filter may return. Together
+            with *capacity*, this determines the size of the filter.
+            Inserting more than *capacity* elements greatly increases the
+            chance of false positives.
+        """
+        if not (0 < error_rate < 1):
+            raise ValueError("error_rate must be between 0 and 1.")
+        if not capacity > 0 or type(capacity) != int:
+            raise ValueError("capacity must be a positive integer.")
+
+        # Same calculation as in the JS file, see it for reference.
+        # Basically determines the number of bits and slices from the
+        # capacity and error_rate.
+        k = math.ceil(- math.log(error_rate, 2))
+        m = math.ceil(capacity * abs(math.log(error_rate)) /
+                      (k * (math.log(2) ** 2))) * k
+        n = math.ceil(m / 32)
+        m = n * 32
+        self.m = m
+        self.k = k
+
+        kbytes = 1 << math.ceil(math.log(math.ceil(math.log(m, 2) / 8), 2))
+        self.buckets = np.zeros(n, dtype=np.int32)
+        if kbytes == 1:
+            loc_type = np.uint8
+        elif kbytes == 2:
+            loc_type = np.uint16
+        else:
+            loc_type = np.int32
+        self._locations = np.zeros(k, dtype=loc_type)
+
+    def locations(self, v):
+        r = self._locations
+        a = self.fnv_1a(v)
+        b = self.fnv_1a_b(a)
+        i = 0
+        x = a % self.m
+        while i < self.k:
+            r[i] = (x + self.m) if x < 0 else x
+            x = (x + b) % self.m
+            i += 1
+        return r
+
+    def add(self, v):
+        l = self.locations(v + "")
+        i = 0
+        buckets = self.buckets
+        while i < self.k:
+            buckets[math.floor(l[i] / 32)] |= 1 << int(l[i] % 32)
+            i += 1
+
+    def test(self, v):
+        l = self.locations(v + "")
+        i = 0
+        buckets = self.buckets
+        while i < self.k:
+            b = l[i]
+            if buckets[math.floor(b / 32)] & (1 << int(b % 32)) == 0:
+                return False
+            i += 1
+        return True
+
+    def size(self):
+        """
+        Estimated cardinality.
+        """
+        bits = 0
+        buckets = self.buckets
+        for i in range(0, len(buckets)):
+            bits += self.popcnt(buckets[i])
+        return -self.m * math.log(1 - bits / self.m) / self.k
+
+    def popcnt(self, v):
+        """
+        http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+        """
+        v -= (v >> 1) & 0x55555555
+        v = (v & 0x33333333) + ((v >> 2) & 0x33333333)
+        return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24
+
+    def fnv_1a(self, v):
+        """
+        Fowler/Noll/Vo hashing.
+        """
+        n = len(v)
+        a = 2166136261
+        i = 0
+        while i < n:
+            c = ord(v[i])
+            d = c & 0xff000000
+            if d:
+                a ^= d >> 24
+                a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
+                # JS bitwise operators work on 32 bits: emulate the wrapping
+                a &= 0xffffffff
+            d = c & 0xff0000
+            if d:
+                a ^= d >> 16
+                a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
+                a &= 0xffffffff
+            d = c & 0xff00
+            if d:
+                a ^= d >> 8
+                a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
+                a &= 0xffffffff
+            a ^= c & 0xff
+            a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
+            a &= 0xffffffff
+            i += 1
+        # From http://home.comcast.net/~bretm/hash/6.html
+        a += a << 13
+        a &= 0xffffffff
+        a ^= a >> 7
+        a += a << 3
+        a &= 0xffffffff
+        a ^= a >> 17
+        a += a << 5
+        return a & 0xffffffff
+
+    def fnv_1a_b(self, a):
+        """
+        One additional iteration of FNV, given a hash.
+        """
+        a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24)
+        a &= 0xffffffff
+        a += a << 13
+        a &= 0xffffffff
+        a ^= a >> 7
+        a += a << 3
+        a &= 0xffffffff
+        a ^= a >> 17
+        a += a << 5
+        return a & 0xffffffff
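A minimal sketch of how the class above is meant to be used (illustrative values only, assuming bloom.py is importable from the current directory):

    import bloom

    f = bloom.BloomFilter(capacity=4, error_rate=0.1)
    for word in ("foo", "bar", "baz"):
        f.add(word)

    print(f.test("foo"))   # True: "foo" is probably in the set
    print(f.test("quux"))  # False, most likely: a never-added word only
                           # tests True at roughly the error rate
    print(f.buckets)       # the np.int32 buckets that generate_index.py
                           # packs into data/filters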
+ """ + n = len(v) + a = 2166136261 + i = 0 + while i < n: + c = ord(v[i]) + d = c & 0xff000000 + if d: + a ^= d >> 24 + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) + d = c & 0xff0000 + if d: + a ^= d >> 16 + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) + d = c & 0xff00 + if d: + a ^= d >> 8 + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) + a ^= c & 0xff + print(a ^ (c & 0xff)) + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) + i += 1 + # From http://home.comcast.net/~bretm/hash/6.html + a += a << 13 + a ^= a >> 7 + a += a << 3 + a ^= a >> 17 + a += a << 5 + return a & 0xffffffff + + def fnv_1a_b(self, a): + """ + One additional iteration of FNV, given a hash. + """ + a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24) + a += a << 13 + a ^= a >> 7 + a += a << 3 + a ^= a >> 17 + a += a << 5 + print(a) + return a & 0xffffffff diff --git a/index_generation/generate_index.py b/index_generation/generate_index.py index 3e5ae55..d399244 100755 --- a/index_generation/generate_index.py +++ b/index_generation/generate_index.py @@ -1,92 +1,83 @@ #!/usr/bin/env python3 +""" +Inspired by +http://www.stavros.io/posts/bloom-filter-search-engine/?print + + +You have to install the numpy python module for bloom to work. +""" + +import bloom +import json import os -import sys -from lxml import html import re import stemmer -import json -from bitarray import bitarray -from pybloom import BloomFilter +import struct +import sys + +from lxml import html -# List all files in path directory def list_directory(path): - fichier = [] + """Recursively list all files in a given directory.""" + files_list = [] for root, dirs, files in os.walk(path): for i in files: - fichier.append(os.path.join(root, i)) - return fichier + files_list.append(os.path.join(root, i)) + return files_list def remove_common_words(words): + """Removes all words that are less than 3 characters long.""" returned = [word for word in words if len(word) > 3] return returned -def padding_16(x): - if x < 256: - return bytes([0,x]) - else: - return bytes([int(x/256), x%256]) -# ============================================================================= -samples = list_directory("../samples/") -filters = {} -p = stemmer.PorterStemmer() -write_little = bitarray(endian="little") -write_big = bitarray(endian="big") +if __name__ == "__main__": + error_rate = 0.1 + samples = list_directory("../samples/") + filters = [] + p = stemmer.PorterStemmer() -write_little.frombytes(padding_16(len(samples))) -write_big.frombytes(padding_16(len(samples))) + for sample in samples: + with open(sample, 'r') as sample_fh: + content = sample_fh.read() -if len(samples) > 65535: - sys.exit("[ERROR] Too many articles to index. 
-def padding_16(x):
-    if x < 256:
-        return bytes([0,x])
-    else:
-        return bytes([int(x/256), x%256])
-# =============================================================================
-samples = list_directory("../samples/")
-filters = {}
-p = stemmer.PorterStemmer()
-write_little = bitarray(endian="little")
-write_big = bitarray(endian="big")
+if __name__ == "__main__":
+    error_rate = 0.1
+    samples = list_directory("../samples/")
+    filters = []
+    p = stemmer.PorterStemmer()
 
-write_little.frombytes(padding_16(len(samples)))
-write_big.frombytes(padding_16(len(samples)))
+    for sample in samples:
+        with open(sample, 'r') as sample_fh:
+            content = sample_fh.read()
 
-if len(samples) > 65535:
-    sys.exit("[ERROR] Too many articles to index. You will have to change the "
-             "way data is stored in the binary file to handle such amount of "
-             "files.")
+        # Get text from HTML content
+        words = html.fromstring(content).text_content().replace("\n", "")
+        words = re.findall(r"[\w]+", words)
+        # Remove all punctuation etc., convert words to lower case and
+        # delete duplicates
+        words = list(set([word.lower() for word in words]))
 
-for sample in samples:
-    with open(sample, 'r') as sample_fh:
-        content = sample_fh.read()
+        # Remove common words
+        words = remove_common_words(words)
+        # Stemming to reduce the number of words
+        words = list(set([p.stem(word, 0, len(word)-1) for word in words]))
 
-    # Get text from HTML content
-    words = html.fromstring(content).text_content().replace("\n", "")
-    words = re.findall(r"[\w]+", words)
-    # Remove all punctuation etc., convert words to lower and delete duplicates
-    words = list(set([word.lower() for word in words]))
+        tmp_filter = bloom.BloomFilter(capacity=len(words),
+                                       error_rate=error_rate)
+        for word in words:
+            tmp_filter.add(word)
 
-    # Remove common words
-    words = remove_common_words(words)
-    # Stemming to reduce the number of words
-    words = [p.stem(word, 0, len(word)-1) for word in words]
+        filters.append(tmp_filter.buckets)
 
-    filters[sample] = BloomFilter(capacity=len(words), error_rate=0.1)
-    for word in words:
-        filters[sample].add(word)
+    # First Int32 is the number of filters
+    filters_to_write = struct.pack("<i", len(filters))
+    # Then, one Int32 per filter, giving the length of its buckets array
+    for buckets in filters:
+        filters_to_write += struct.pack("<i", len(buckets))
+    # Then, the buckets themselves, as consecutive Int32
+    for buckets in filters:
+        filters_to_write += struct.pack("<%di" % len(buckets), *buckets)
 
-    if filters[sample].bitarray.length() > 65535:
-        sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". 
You " - "will have to change the way data is stored in the binary " - "file to handle such amount of text.") + # Write everything + with open("../data/filters", "wb") as index_fh: + index_fh.write(filters_to_write) - tmp = bitarray(endian="little") - tmp.frombytes(padding_16(filters[sample].bitarray.length())) - write_little.extend(tmp) - write_little.extend(filters[sample].bitarray) - write_little.extend([0 for i in range(filters[sample].bitarray.length() % - 8)]) - tmp = bitarray(endian="big") - tmp.frombytes(padding_16(filters[sample].bitarray.length())) - write_big.extend(tmp) - write_big.extend(filters[sample].bitarray) - write_big.extend([0 for i in range(filters[sample].bitarray.length() % - 8)]) - -with open('../data/search_index_little', 'wb') as index_fh: - print(write_little) - write_little.tofile(index_fh) -with open('../data/search_index_big', 'wb') as index_fh: - print(write_big) - write_big.tofile(index_fh) - -with open('../data/pages_index.json', 'w') as pages_fh: - pages_fh.write(json.dumps(samples)) + with open("../data/pages.json", "w") as pages_fh: + pages_fh.write(json.dumps({"index": samples})) diff --git a/index_generation/pybloom.py b/index_generation/pybloom.py deleted file mode 100644 index ce3d54a..0000000 --- a/index_generation/pybloom.py +++ /dev/null @@ -1,277 +0,0 @@ -import math -import hashlib -from struct import unpack, pack, calcsize - -try: - import bitarray -except ImportError: - raise ImportError('pybloom requires bitarray >= 0.3.4') - -__version__ = '2.0' -__author__ = "Jay Baird , Bob Ippolito ,\ -Marius Eriksen ,\ -Alex Brasetvik " - -def make_hashfuncs(num_slices, num_bits): - if num_bits >= (1 << 31): - fmt_code, chunk_size = 'Q', 8 - elif num_bits >= (1 << 15): - fmt_code, chunk_size = 'I', 4 - else: - fmt_code, chunk_size = 'H', 2 - total_hash_bits = 8 * num_slices * chunk_size - if total_hash_bits > 384: - hashfn = hashlib.sha512 - elif total_hash_bits > 256: - hashfn = hashlib.sha384 - elif total_hash_bits > 160: - hashfn = hashlib.sha256 - elif total_hash_bits > 128: - hashfn = hashlib.sha1 - else: - hashfn = hashlib.md5 - fmt = fmt_code * (hashfn().digest_size // chunk_size) - num_salts, extra = divmod(num_slices, len(fmt)) - if extra: - num_salts += 1 - salts = [hashfn(hashfn(pack('I', i)).digest()) for i in range(num_salts)] - def _make_hashfuncs(key): - key = str(key) - rval = [] - for salt in salts: - h = salt.copy() - h.update(key.encode('utf-8')) - rval.extend(uint % num_bits for uint in unpack(fmt, h.digest())) - del rval[num_slices:] - return rval - return _make_hashfuncs - - -class BloomFilter(object): - FILE_FMT = ' 0: - raise ValueError("Capacity must be > 0") - num_slices = int(math.ceil(math.log(1.0 / error_rate, 2))) - bits_per_slice = int(math.ceil( - (capacity * abs(math.log(error_rate))) / - (num_slices * (math.log(2) ** 2)))) - self._setup(error_rate, num_slices, bits_per_slice, capacity, 0) - self.bitarray = bitarray.bitarray(self.num_bits, endian='little') - self.bitarray.setall(False) - - def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count): - self.error_rate = error_rate - self.num_slices = num_slices - self.bits_per_slice = bits_per_slice - self.capacity = capacity - self.num_bits = num_slices * bits_per_slice - self.count = count - self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice) - - def __contains__(self, key): - bits_per_slice = self.bits_per_slice - bitarray = self.bitarray - if not isinstance(key, list): - hashes = self.make_hashes(key) - else: - hashes = key - offset = 
0 - for k in hashes: - if not bitarray[offset + k]: - return False - offset += bits_per_slice - return True - - def __len__(self): - """Return the number of keys stored by this bloom filter.""" - return self.count - - def add(self, key, skip_check=False): - bitarray = self.bitarray - bits_per_slice = self.bits_per_slice - hashes = self.make_hashes(key) - if not skip_check and hashes in self: - return True - if self.count > self.capacity: - raise IndexError("BloomFilter is at capacity") - offset = 0 - for k in hashes: - self.bitarray[offset + k] = True - offset += bits_per_slice - self.count += 1 - return False - - def copy(self): - """Return a copy of this bloom filter. -""" - new_filter = BloomFilter(self.capacity, self.error_rate) - new_filter.bitarray = self.bitarray.copy() - return new_filter - - def union(self, other): - """ Calculates the union of the two underlying bitarrays and returns -a new bloom filter object.""" - if self.capacity != other.capacity or \ - self.error_rate != other.error_rate: - raise ValueError("Unioning filters requires both filters to have \ -both the same capacity and error rate") - new_bloom = self.copy() - new_bloom.bitarray = new_bloom.bitarray | other.bitarray - return new_bloom - - def __or__(self, other): - return self.union(other) - - def intersection(self, other): - """ Calculates the intersection of the two underlying bitarrays and returns -a new bloom filter object.""" - if self.capacity != other.capacity or \ - self.error_rate != other.error_rate: - raise ValueError("Intersecting filters requires both filters to \ -have equal capacity and error rate") - new_bloom = self.copy() - new_bloom.bitarray = new_bloom.bitarray & other.bitarray - return new_bloom - - def __and__(self, other): - return self.intersection(other) - - def tofile(self, f): - """Write the bloom filter to file object `f'. Underlying bits -are written as machine values. This is much more space -efficient than pickling the object.""" - f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices, - self.bits_per_slice, self.capacity, self.count)) - self.bitarray.tofile(f) - - @classmethod - def fromfile(cls, f, n=-1): - """Read a bloom filter from file-object `f' serialized with -``BloomFilter.tofile''. If `n' > 0 read only so many bytes.""" - headerlen = calcsize(cls.FILE_FMT) - - if 0 < n < headerlen: - raise ValueError('n too small!') - - filter = cls(1) # Bogus instantiation, we will `_setup'. 
- filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen))) - filter.bitarray = bitarray.bitarray(endian='little') - if n > 0: - filter.bitarray.fromfile(f, n - headerlen) - else: - filter.bitarray.fromfile(f) - if filter.num_bits != filter.bitarray.length() and \ - (filter.num_bits + (8 - filter.num_bits % 8) - != filter.bitarray.length()): - raise ValueError('Bit length mismatch!') - - return filter - - def __getstate__(self): - d = self.__dict__.copy() - del d['make_hashes'] - return d - - def __setstate__(self, d): - self.__dict__.update(d) - self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice) - -class ScalableBloomFilter(object): - SMALL_SET_GROWTH = 2 # slower, but takes up less memory - LARGE_SET_GROWTH = 4 # faster, but takes up more memory faster - FILE_FMT = '= filter.capacity: - filter = BloomFilter( - capacity=filter.capacity * self.scale, - error_rate=filter.error_rate * self.ratio) - self.filters.append(filter) - filter.add(key, skip_check=True) - return False - - @property - def capacity(self): - """Returns the total capacity for all filters in this SBF""" - return sum([f.capacity for f in self.filters]) - - @property - def count(self): - return len(self) - - def tofile(self, f): - """Serialize this ScalableBloomFilter into the file-object -`f'.""" - f.write(pack(self.FILE_FMT, self.scale, self.ratio, - self.initial_capacity, self.error_rate)) - - # Write #-of-filters - f.write(pack(' 0: - # Then each filter directly, with a header describing - # their lengths. - headerpos = f.tell() - headerfmt = '<' + 'Q'*(len(self.filters)) - f.write('.' * calcsize(headerfmt)) - filter_sizes = [] - for filter in self.filters: - begin = f.tell() - filter.tofile(f) - filter_sizes.append(f.tell() - begin) - - f.seek(headerpos) - f.write(pack(headerfmt, *filter_sizes)) - - @classmethod - def fromfile(cls, f): - """Deserialize the ScalableBloomFilter in file object `f'.""" - filter = cls() - filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT)))) - nfilters, = unpack(' 0: - header_fmt = '<' + 'Q'*nfilters - bytes = f.read(calcsize(header_fmt)) - filter_lengths = unpack(header_fmt, bytes) - for fl in filter_lengths: - filter.filters.append(BloomFilter.fromfile(f, fl)) - else: - filter.filters = [] - - return filter - - def __len__(self): - """Returns the total number of elements stored in this SBF""" - return sum([f.count for f in self.filters]) diff --git a/js/app.js b/js/app.js new file mode 100644 index 0000000..e326a57 --- /dev/null +++ b/js/app.js @@ -0,0 +1,121 @@ +/* Params */ +var error_rate = 0.1; + + +/* Vars */ +var bloom = Array(), index; +var ready = false; + + +/* Functions */ +function callback() { + if (typeof(index) === 'undefined' || bloom.length == 0) { + return; + } + + // Sets up the page, that is now ready + ready = true; + document.getElementById('main').innerHTML = '

<input type="text" id="search" /><div id="results"></div>';
+
+    // Handle oninput actions
+    document.getElementById('search').oninput = function (e) {
+        if (!ready) {
+            return;
+        }
+
+        filter_results(e.target.value);
+    }
+}
+
+// Returns true iff all the terms in the array are in the bloom filter b
+function terms_in_bloom(terms, b) {
+    for (var i = 0; i < terms.length; i++) {
+        if (!b.test(terms[i])) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Filter the results to match the query
+function filter_results(query) {
+    var search_terms = query.trim();
+    if (search_terms === "") {
+        document.getElementById('results').innerHTML = "";
+        return;
+    }
+    search_terms = search_terms.split(" ").map(stemmer);
+
+    var results = Array();
+    for (var i = 0; i < index.length; i++) {
+        if (terms_in_bloom(search_terms, bloom[i])) {
+            results.push(index[i]);
+        }
+    }
+
+    var results_html;
+    if (results.length > 0) {
+        results_html = '<ul>';
+        for (var i = 0; i < results.length; i++) {
+            results_html += '<li>' + results[i] + '</li>';
+        }
+        results_html += '</ul>';
+    }
+    else {
+        results_html = '<p>No results.</p>';
+    }
+    document.getElementById('results').innerHTML = results_html;
+}
+
+
+/* App */
+
+// Get the words index (a.k.a. the Bloom filters)
+var oReq = new XMLHttpRequest();
+oReq.open("GET", "data/filters", true);
+oReq.responseType = "arraybuffer";
+oReq.onload = function (oEvent) {
+    var array_buffer = oReq.response;
+    if (array_buffer) {
+        var byte_array = new Int32Array(array_buffer);
+
+        // First element is the number of bloom filters in the binary file
+        var nb_bloom_filters = byte_array[0];
+        // The nb_bloom_filters next elements are the lengths of the arrays
+        var lengths = Array();
+        for (var i = 0; i < nb_bloom_filters; i++) {
+            lengths.push(byte_array[1 + i]);
+        }
+        // Then, build the Bloom filters
+        var l = 0, tmp_array;
+        for (var i = 0; i < nb_bloom_filters; i++) {
+            tmp_array = byte_array.subarray(1 + nb_bloom_filters + l, 1 + nb_bloom_filters + l + lengths[i]);
+            l += lengths[i];
+            bloom.push(new BloomFilter(tmp_array, error_rate));
+        }
+
+        callback();
+    }
+    else {
+        document.getElementById('error').innerHTML = 'Unable to load the bloom filters.';
+    }
+};
+oReq.send(null);
+
+// Get the pages index
+var req = new XMLHttpRequest();
+req.open('GET', 'data/pages.json', true);
+req.onreadystatechange = function () {
+    if (req.readyState == 4) {
+        if (req.status == 200) {
+            var tmp = JSON.parse(req.responseText);
+            index = tmp['index'];
+
+            callback();
+        }
+        else {
+            document.getElementById('error').innerHTML = 'Unable to load the index.';
+        }
+    }
+};
+req.send(null);
diff --git a/js/bloom.js b/js/bloom.js
index 4bf381a..e0cf72e 100644
--- a/js/bloom.js
+++ b/js/bloom.js
@@ -22,23 +22,26 @@
 function BloomFilter(capacity, error_rate) {
   // *m* is the number of bits. Note that *m* is rounded up to
   // the nearest multiple of 32. *k* specifies the number of hashing functions.
+  if (error_rate < 0 || error_rate > 1 || (typeof(capacity) === "number" && capacity < 0)) {
+    return false;
+  }
   var a, i = -1;
   // Number of slices, k
   var k = Math.ceil(- Math.log(error_rate) / Math.log(2));
   // Total number of bits, m
-  // Size of the UInt32 table, n
+  // Size of the Int32 table, n
   var m, n;
   if (typeof capacity !== "number") {
     a = capacity;
     // Total number of bits, m
     m = a.length * 32;
-    // Size of the UInt32 table, n
+    // Size of the Int32 table, n
     n = a.length;
   } else {
     // Total number of bits, m
     m = Math.ceil(capacity * Math.abs(Math.log(error_rate)) / (k * Math.pow(Math.log(2), 2))) * k;
-    // Size of the UInt32 table, n
+    // Size of the Int32 table, n
     n = Math.ceil(m / 32);
     // Round total number of bits to closest multiple of 32
     m = n * 32;
diff --git a/js/stemmer.js b/js/stemmer.js
new file mode 100644
index 0000000..ceade1a
--- /dev/null
+++ b/js/stemmer.js
@@ -0,0 +1,186 @@
+// Porter stemmer in Javascript. Few comments, but it's easy to follow
+// against the rules in the original paper, in
+//
+// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+// no.
3, pp 130-137, +// +// see also http://www.tartarus.org/~martin/PorterStemmer + +// Release 1 be 'andargor', Jul 2004 +// Release 2 (substantially revised) by Christopher McKenzie, Aug 2009 + +var stemmer = (function(){ + var step2list = { + "ational" : "ate", + "tional" : "tion", + "enci" : "ence", + "anci" : "ance", + "izer" : "ize", + "bli" : "ble", + "alli" : "al", + "entli" : "ent", + "eli" : "e", + "ousli" : "ous", + "ization" : "ize", + "ation" : "ate", + "ator" : "ate", + "alism" : "al", + "iveness" : "ive", + "fulness" : "ful", + "ousness" : "ous", + "aliti" : "al", + "iviti" : "ive", + "biliti" : "ble", + "logi" : "log" + }, + + step3list = { + "icate" : "ic", + "ative" : "", + "alize" : "al", + "iciti" : "ic", + "ical" : "ic", + "ful" : "", + "ness" : "" + }, + + c = "[^aeiou]", // consonant + v = "[aeiouy]", // vowel + C = c + "[^aeiouy]*", // consonant sequence + V = v + "[aeiou]*", // vowel sequence + + mgr0 = "^(" + C + ")?" + V + C, // [C]VC... is m>0 + meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1 + mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1 + s_v = "^(" + C + ")?" + v; // vowel in stem + + return function (w) { + var stem, + suffix, + firstch, + re, + re2, + re3, + re4, + origword = w; + + if (w.length < 3) { return w; } + + firstch = w.substr(0,1); + if (firstch == "y") { + w = firstch.toUpperCase() + w.substr(1); + } + + // Step 1a + re = /^(.+?)(ss|i)es$/; + re2 = /^(.+?)([^s])s$/; + + if (re.test(w)) { w = w.replace(re,"$1$2"); } + else if (re2.test(w)) { w = w.replace(re2,"$1$2"); } + + // Step 1b + re = /^(.+?)eed$/; + re2 = /^(.+?)(ed|ing)$/; + if (re.test(w)) { + var fp = re.exec(w); + re = new RegExp(mgr0); + if (re.test(fp[1])) { + re = /.$/; + w = w.replace(re,""); + } + } else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1]; + re2 = new RegExp(s_v); + if (re2.test(stem)) { + w = stem; + re2 = /(at|bl|iz)$/; + re3 = new RegExp("([^aeiouylsz])\\1$"); + re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re2.test(w)) { w = w + "e"; } + else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); } + else if (re4.test(w)) { w = w + "e"; } + } + } + + // Step 1c + re = /^(.+?)y$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(s_v); + if (re.test(stem)) { w = stem + "i"; } + } + + // Step 2 + re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) { + w = stem + step2list[suffix]; + } + } + + // Step 3 + re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) { + w = stem + step3list[suffix]; + } + } + + // Step 4 + re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; + re2 = /^(.+?)(s|t)(ion)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + if (re.test(stem)) { + w = stem; + } + } else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1] + fp[2]; + re2 = new RegExp(mgr1); + if (re2.test(stem)) { + w = stem; + } + } + + // Step 5 + re = /^(.+?)e$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + re2 = new RegExp(meq1); + re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) { + w = 
stem; + } + } + + re = /ll$/; + re2 = new RegExp(mgr1); + if (re.test(w) && re2.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + + // and turn initial Y back to y + + if (firstch == "y") { + w = firstch.toLowerCase() + w.substr(1); + } + + return w; + } +})(); diff --git a/js/test.js b/js/test.js index 9441115..c83bc6b 100644 --- a/js/test.js +++ b/js/test.js @@ -1,7 +1,6 @@ /* These are some basic unit-tests for the bloom.js module */ var bloom = new BloomFilter(4, 0.1); -console.log(bloom); // Add some elements to the filter. bloom.add("foo"); @@ -22,6 +21,7 @@ var array = [].slice.call(bloom.buckets), json = JSON.stringify(array); console.log(array); +console.log(json); // Deserialisation. Note that the any array-like object is supported, but // this will be used directly, so you may wish to use a typed array for diff --git a/js/test2.js b/js/test2.js new file mode 100644 index 0000000..8a12f76 --- /dev/null +++ b/js/test2.js @@ -0,0 +1,25 @@ +/* These are some basic unit-tests for the bloom.js module */ + +var words = JSON.parse('["solut", "devic", "cryptkey2", "contain", "chang", "thi", "conf", "ckeyfiin", "support", "load", "here", "laptop", "file", "exampl", "paramet", "cryptsetup", "when", "proce", "line", "cryptkei", "wiki", "edit", "present", "describ", "ckei", "grub", "first", "warn", "mkinitcpio", "with", "updat", "mount", "manual", "ckeybyif", "least", "need", "multipl", "also", "found", "arch", "then", "us", "encrypt", "packag", "that", "over", "someth", "hook", "doesn", "avail", "avoid", "work", "which", "provid", "order", "initcpio", "anoth", "setup", "mean", "necessari", "default", "disk", "best", "linemkdir", "luk", "system", "unlock", "occurr", "requir", "command", "abl", "cryptdevice2", "encrypt2", "instal", "multi", "last", "extend", "obsolet", "boot", "your", "achiev", "second", "mkdir", "stuff", "final", "displai", "concern", "ad", "cryptdevic", "more", "copi"]'); + +var bloom2 = new BloomFilter(words.length, 0.1); +console.log(bloom2); + +// Add some elements to the filter. +for (var i = 0; i < words.length; i++) { + bloom2.add(words[i]); +} + +// Test if an item is in our filter. +// Returns true if an item is probably in the set, +// or false if an item is definitely not in the set. +for (var i = 0; i < words.length; i++) { + console.log(words[i] + " : " + bloom2.test(words[i])); +} + +// Serialisation. Note that bloom.buckets may be a typed array, +// so we convert to a normal array first. +var array = [].slice.call(bloom2.buckets), + json = JSON.stringify(array); + +console.log(bloom2.buckets); diff --git a/test.html b/test.html new file mode 100644 index 0000000..09db9d2 --- /dev/null +++ b/test.html @@ -0,0 +1,13 @@ + + + + + BloomJS demo + + +

+<h1>Bloom.JS demo</h1>
+<p>This page runs the bloom.js library unit-tests. Look at your console
+output for assert errors and verbose debugging.</p>
+<script type="text/javascript" src="js/bloom.js"></script>
+<script type="text/javascript" src="js/test2.js"></script>
+</body>
+</html>
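For reference, a self-contained sketch of the data/filters layout as js/app.js reads it: one little-endian Int32 giving the number of filters, then one Int32 length per filter, then each filter's Int32 buckets back to back. The writer loop in generate_index.py above is partly reconstructed, so treat the exact packing as an assumption; the values below are made up.

    import struct

    # Hypothetical bucket arrays for two filters (not real index data).
    filters = [[3, 0, 7], [1, 2]]

    # Write: [count][len_0 .. len_n-1][buckets_0 .. buckets_n-1], little-endian.
    blob = struct.pack("<i", len(filters))
    for buckets in filters:
        blob += struct.pack("<i", len(buckets))
    for buckets in filters:
        blob += struct.pack("<%di" % len(buckets), *buckets)

    # Read it back, mirroring the Int32Array walk in js/app.js.
    count = struct.unpack_from("<i", blob, 0)[0]
    lengths = struct.unpack_from("<%di" % count, blob, 4)
    offset = 4 + 4 * count
    for n in lengths:
        print(list(struct.unpack_from("<%di" % n, blob, offset)))
        offset += 4 * n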