From d759e7c8aba2fc91f6f2c5ba6e8c68db63af2b26 Mon Sep 17 00:00:00 2001 From: Phyks Date: Thu, 2 Jan 2014 21:24:22 +0100 Subject: [PATCH] Clean + switch to bloom filters and bitarrays * Refactor of the repo structure, for better usability. * README.md refactored. * Switch to BloomFilters in python script, to decrease the index file. TODO: * Handle binary files in JS to pass the BloomFilters from python to JS. Note: Current implementations of BloomFilters differ in JS and Python lib. --- README.md | 30 +- index.json | 1 - .../generate_index.py | 16 +- index_generation/pybloom.py | 277 +++++++++++++ index_generation/stemmer.py | 367 ++++++++++++++++++ bloom.js => js/bloom.js | 3 + bloomfilter.js => js/bloomfilter.js | 0 jquery-2.0.3.min.js => js/jquery-2.0.3.min.js | 0 8 files changed, 680 insertions(+), 14 deletions(-) delete mode 100644 index.json rename generate_index.py => index_generation/generate_index.py (74%) create mode 100644 index_generation/pybloom.py create mode 100644 index_generation/stemmer.py rename bloom.js => js/bloom.js (92%) rename bloomfilter.js => js/bloomfilter.js (100%) rename jquery-2.0.3.min.js => js/jquery-2.0.3.min.js (100%) diff --git a/README.md b/README.md index c03c6ec..e820858 100644 --- a/README.md +++ b/README.md @@ -10,13 +10,31 @@ An index is generated by a Python script, upon generation of the pages, and is d ## Files -* `generate_index.py` : Python script to generate the index (runs only at page generation) in a nice format for Javascript -* `samples/` : samples for testing purpose (taken from my blog articles) +### Index generation (`index_generation/` folder) + +* `generate_index.py`: Python script to generate the index (runs only at page generation) in a nice format for Javascript +* `pybloom.py`: Library to handle bloom filters in Python +* `stemmer.py`: Implementation of Porter Stemming algorithm in Python, from Vivake Gupta. 
+ +### Example html search form + +* `index.html` +* `js/bloom.js`: main JS code +* `js/bloomfilter.js`: JS library to use BloomFilters +* `js/jquery-2.0.3.min.js`: jQuery to have convenient functions, will most likely be dropped in the future. + +### Examples + +* `samples/`: samples for testing purposes (taken from my blog articles) ## Notes -I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print) found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for code doing what I wanted and found these ones : +* I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print) found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for code doing what I wanted and found these ones: -* https://github.com/olivernn/lunr.js -* https://github.com/reyesr/fullproof + * https://github.com/olivernn/lunr.js + * https://github.com/reyesr/fullproof -But I wasn't fully satisfied by the first one, and I found the second one too heavy and complicated for my purpose, so I ended up coding this. + But I wasn't fully satisfied by the first one, and I found the second one too heavy and complicated for my purpose, so I ended up coding this. + +* This code is mainly a proof of concept. As such, it is not fully optimized (actually, I just tweaked until the resulting files and calculations could be considered "acceptable"). For those looking for more effective solutions, here are a few things I found while looking for information on the web: + + * The stemming algorithm used may not be the most efficient one. People wanting to work with non-English languages or to optimize the overall computation of the index can easily move to a more effective algorithm. See [Wikipedia](http://en.wikipedia.org/wiki/Stemming) and [the stemming library in Python](https://pypi.python.org/pypi/stemming/1.0) which has C wrappers for best performance. 
diff --git a/index.json b/index.json deleted file mode 100644 index 0fc14dc..0000000 --- a/index.json +++ /dev/null @@ -1 +0,0 @@ -{"samples/cryptdevice_multi.html11": ["ckeybyif", "cryptkey", "doesn", "adding", "occurrence", "described", "then", "finally", "last", "here", "that", "change", "mount", "default", "something", "need", "stuff", "best", "setup", "multiple", "package", "solution", "ckey", "cryptdevice2", "necessary", "encrypt2", "updates", "achieve", "over", "display", "grub", "disks", "provided", "extended", "boot", "multi", "another", "concern", "cryptsetup", "used", "parameters", "required", "initcpio", "container", "example", "when", "load", "order", "conf", "needed", "manually", "hooks", "available", "devices", "requires", "your", "least", "edit", "copy", "command", "second", "obsolete", "means", "avoid", "install", "ckeyfiin", "update", "with", "works", "this", "which", "able", "mkinitcpio", "support", "cryptdevice", "laptop", "unlock", "warning", "linemkdir", "unlocking", "arch", "presented", "device", "also", "proceed", "line", "file", "hook", "first", "installed", "mkdir", "more", "found", "encrypted", "system", "encrypt", "wiki", "luks", "cryptkey2"], "samples/cryptdevice_multi.html10": ["ckeybyif", "cryptkey", "doesn", "adding", "occurrence", "described", "then", "finally", "last", "here", "that", "change", "mount", "default", "something", "need", "stuff", "best", "setup", "multiple", "package", "solution", "ckey", "cryptdevice2", "necessary", "encrypt2", "updates", "achieve", "over", "display", "grub", "disks", "provided", "extended", "boot", "multi", "another", "concern", "cryptsetup", "used", "parameters", "required", "initcpio", "container", "example", "when", "load", "order", "conf", "needed", "manually", "hooks", "available", "devices", "requires", "your", "least", "edit", "copy", "command", "second", "obsolete", "means", "avoid", "install", "ckeyfiin", "update", "with", "works", "this", "which", "able", "mkinitcpio", "support", 
"cryptdevice", "laptop", "unlock", "warning", "linemkdir", "unlocking", "arch", "presented", "device", "also", "proceed", "line", "file", "hook", "first", "installed", "mkdir", "more", "found", "encrypted", "system", "encrypt", "wiki", "luks", "cryptkey2"], "samples/highmon_weechat.html": ["clean", "assume", "stored", "what", "highmon", "size", "above", "very", "interfaces", "tell", "list", "away", "alias", "following", "extensively", "then", "make", "scroll_highmon_up", "same", "hilight_monitor_title_textand", "last", "almost", "extensions", "created", "think", "aliases", "there", "weechat", "opened", "speed", "hilight_monitor_title_text", "default", "sure", "while", "something", "though", "which", "select", "irssi", "hack", "latency", "after", "setup", "clear", "highmon_title", "dedicating", "client", "solution", "buffers", "bind", "documented", "preferences", "documentation", "screen", "newly", "done", "instead", "link", "poitras", "size_max", "display", "used", "recently", "type", "plugins", "just", "another", "good", "options", "future", "doesn", "text_item", "silverd", "having", "plugin", "start", "priority", "local", "excellent", "text_effects", "output", "note", "planning", "when", "sometimes", "wrong", "mute", "internet", "contrary", "articles", "first", "configuration", "clear_highmon", "definitely", "sessioni", "items", "nice", "barhighmon", "scroll", "250to", "have", "efficient", "pascal", "well", "should", "determine", "position", "detach", "without", "some", "important", "useless", "working", "hide", "next", "instance", "want", "colors", "your", "hidous", "typing", "bar_lines", "text", "edit", "property", "easily", "bars", "that", "website", "avoid", "install", "python", "even", "messages", "wasn", "found", "title", "this", "could", "connecting", "although", "lacks", "configured", "accidentally", "able", "screen_away", "look", "moved", "including", "check", "freshly", "base", "scroll_highmon_down", "achieved", "means", "quite", "other", "hilight", 
"window", "auto", "connection", "pleased", "know", "will", "main", "lines", "number", "opinion", "like", "monitor", "reference", "decoration", "play", "thing", "perl", "automatically", "server", "using", "buffer", "installed", "step", "positioned", "point", "always", "usable", "with", "standard", "iset", "inline", "bold", "from", "such", "perfect"], "samples/cryptdevice_multi.html3": ["ckeybyif", "cryptkey", "doesn", "adding", "occurrence", "described", "then", "finally", "last", "here", "that", "change", "mount", "default", "something", "need", "stuff", "best", "setup", "multiple", "package", "solution", "ckey", "cryptdevice2", "necessary", "encrypt2", "updates", "achieve", "over", "display", "grub", "disks", "provided", "extended", "boot", "multi", "another", "concern", "cryptsetup", "used", "parameters", "required", "initcpio", "container", "example", "when", "load", "order", "conf", "needed", "manually", "hooks", "available", "devices", "requires", "your", "least", "edit", "copy", "command", "second", "obsolete", "means", "avoid", "install", "ckeyfiin", "update", "with", "works", "this", "which", "able", "mkinitcpio", "support", "cryptdevice", "laptop", "unlock", "warning", "linemkdir", "unlocking", "arch", "presented", "device", "also", "proceed", "line", "file", "hook", "first", "installed", "mkdir", "more", "found", "encrypted", "system", "encrypt", "wiki", "luks", "cryptkey2"], "samples/cryptdevice_multi.html9": ["ckeybyif", "cryptkey", "doesn", "adding", "occurrence", "described", "then", "finally", "last", "here", "that", "change", "mount", "default", "something", "need", "stuff", "best", "setup", "multiple", "package", "solution", "ckey", "cryptdevice2", "necessary", "encrypt2", "updates", "achieve", "over", "display", "grub", "disks", "provided", "extended", "boot", "multi", "another", "concern", "cryptsetup", "used", "parameters", "required", "initcpio", "container", "example", "when", "load", "order", "conf", "needed", "manually", "hooks", 
"available", "devices", "requires", "your", "least", "edit", "copy", "command", "second", "obsolete", "means", "avoid", "install", "ckeyfiin", "update", "with", "works", "this", "which", "able", "mkinitcpio", "support", "cryptdevice", "laptop", "unlock", "warning", "linemkdir", "unlocking", "arch", "presented", "device", "also", "proceed", "line", "file", "hook", "first", "installed", "mkdir", "more", "found", "encrypted", "system", "encrypt", "wiki", "luks", "cryptkey2"], "samples/cryptdevice_multi.html8": ["ckeybyif", "cryptkey", "doesn", "adding", "occurrence", "described", "then", "finally", "last", "here", "that", "change", "mount", "default", "something", "need", "stuff", "best", "setup", "multiple", "package", "solution", "ckey", "cryptdevice2", "necessary", "encrypt2", "updates", "achieve", "over", "display", "grub", "disks", "provided", "extended", "boot", "multi", "another", "concern", "cryptsetup", "used", "parameters", "required", "initcpio", "container", "example", "when", "load", "order", "conf", "needed", "manually", "hooks", "available", "devices", "requires", "your", "least", "edit", "copy", "command", "second", "obsolete", "means", "avoid", "install", "ckeyfiin", "update", "with", "works", "this", "which", "able", "mkinitcpio", "support", "cryptdevice", "laptop", "unlock", "warning", "linemkdir", "unlocking", "arch", "presented", "device", "also", "proceed", "line", "file", "hook", "first", "installed", "mkdir", "more", "found", "encrypted", "system", "encrypt", "wiki", "luks", "cryptkey2"], "samples/cryptdevice_multi.html5": ["ckeybyif", "cryptkey", "doesn", "adding", "occurrence", "described", "then", "finally", "last", "here", "that", "change", "mount", "default", "something", "need", "stuff", "best", "setup", "multiple", "package", "solution", "ckey", "cryptdevice2", "necessary", "encrypt2", "updates", "achieve", "over", "display", "grub", "disks", "provided", "extended", "boot", "multi", "another", "concern", "cryptsetup", "used", 
"parameters", "required", "initcpio", "container", "example", "when", "load", "order", "conf", "needed", "manually", "hooks", "available", "devices", "requires", "your", "least", "edit", "copy", "command", "second", "obsolete", "means", "avoid", "install", "ckeyfiin", "update", "with", "works", "this", "which", "able", "mkinitcpio", "support", "cryptdevice", "laptop", "unlock", "warning", "linemkdir", "unlocking", "arch", "presented", "device", "also", "proceed", "line", "file", "hook", "first", "installed", "mkdir", "more", "found", "encrypted", "system", "encrypt", "wiki", "luks", "cryptkey2"], "samples/cryptdevice_multi.html4": ["ckeybyif", "cryptkey", "doesn", "adding", "occurrence", "described", "then", "finally", "last", "here", "that", "change", "mount", "default", "something", "need", "stuff", "best", "setup", "multiple", "package", "solution", "ckey", "cryptdevice2", "necessary", "encrypt2", "updates", "achieve", "over", "display", "grub", "disks", "provided", "extended", "boot", "multi", "another", "concern", "cryptsetup", "used", "parameters", "required", "initcpio", "container", "example", "when", "load", "order", "conf", "needed", "manually", "hooks", "available", "devices", "requires", "your", "least", "edit", "copy", "command", "second", "obsolete", "means", "avoid", "install", "ckeyfiin", "update", "with", "works", "this", "which", "able", "mkinitcpio", "support", "cryptdevice", "laptop", "unlock", "warning", "linemkdir", "unlocking", "arch", "presented", "device", "also", "proceed", "line", "file", "hook", "first", "installed", "mkdir", "more", "found", "encrypted", "system", "encrypt", "wiki", "luks", "cryptkey2"], "samples/cryptdevice_multi.html7": ["ckeybyif", "cryptkey", "doesn", "adding", "occurrence", "described", "then", "finally", "last", "here", "that", "change", "mount", "default", "something", "need", "stuff", "best", "setup", "multiple", "package", "solution", "ckey", "cryptdevice2", "necessary", "encrypt2", "updates", "achieve", 
"over", "display", "grub", "disks", "provided", "extended", "boot", "multi", "another", "concern", "cryptsetup", "used", "parameters", "required", "initcpio", "container", "example", "when", "load", "order", "conf", "needed", "manually", "hooks", "available", "devices", "requires", "your", "least", "edit", "copy", "command", "second", "obsolete", "means", "avoid", "install", "ckeyfiin", "update", "with", "works", "this", "which", "able", "mkinitcpio", "support", "cryptdevice", "laptop", "unlock", "warning", "linemkdir", "unlocking", "arch", "presented", "device", "also", "proceed", "line", "file", "hook", "first", "installed", "mkdir", "more", "found", "encrypted", "system", "encrypt", "wiki", "luks", "cryptkey2"], "samples/cryptdevice_multi.html6": ["ckeybyif", "cryptkey", "doesn", "adding", "occurrence", "described", "then", "finally", "last", "here", "that", "change", "mount", "default", "something", "need", "stuff", "best", "setup", "multiple", "package", "solution", "ckey", "cryptdevice2", "necessary", "encrypt2", "updates", "achieve", "over", "display", "grub", "disks", "provided", "extended", "boot", "multi", "another", "concern", "cryptsetup", "used", "parameters", "required", "initcpio", "container", "example", "when", "load", "order", "conf", "needed", "manually", "hooks", "available", "devices", "requires", "your", "least", "edit", "copy", "command", "second", "obsolete", "means", "avoid", "install", "ckeyfiin", "update", "with", "works", "this", "which", "able", "mkinitcpio", "support", "cryptdevice", "laptop", "unlock", "warning", "linemkdir", "unlocking", "arch", "presented", "device", "also", "proceed", "line", "file", "hook", "first", "installed", "mkdir", "more", "found", "encrypted", "system", "encrypt", "wiki", "luks", "cryptkey2"], "samples/cryptdevice_multi.html1": ["ckeybyif", "cryptkey", "doesn", "adding", "occurrence", "described", "then", "finally", "last", "here", "that", "change", "mount", "default", "something", "need", "stuff", "best", 
"setup", "multiple", "package", "solution", "ckey", "cryptdevice2", "necessary", "encrypt2", "updates", "achieve", "over", "display", "grub", "disks", "provided", "extended", "boot", "multi", "another", "concern", "cryptsetup", "used", "parameters", "required", "initcpio", "container", "example", "when", "load", "order", "conf", "needed", "manually", "hooks", "available", "devices", "requires", "your", "least", "edit", "copy", "command", "second", "obsolete", "means", "avoid", "install", "ckeyfiin", "update", "with", "works", "this", "which", "able", "mkinitcpio", "support", "cryptdevice", "laptop", "unlock", "warning", "linemkdir", "unlocking", "arch", "presented", "device", "also", "proceed", "line", "file", "hook", "first", "installed", "mkdir", "more", "found", "encrypted", "system", "encrypt", "wiki", "luks", "cryptkey2"], "samples/cryptdevice_multi.html": ["ckeybyif", "cryptkey", "doesn", "adding", "occurrence", "described", "then", "finally", "last", "here", "that", "change", "mount", "default", "something", "need", "stuff", "best", "setup", "multiple", "package", "solution", "ckey", "cryptdevice2", "necessary", "encrypt2", "updates", "achieve", "over", "display", "grub", "disks", "provided", "extended", "boot", "multi", "another", "concern", "cryptsetup", "used", "parameters", "required", "initcpio", "container", "example", "when", "load", "order", "conf", "needed", "manually", "hooks", "available", "devices", "requires", "your", "least", "edit", "copy", "command", "second", "obsolete", "means", "avoid", "install", "ckeyfiin", "update", "with", "works", "this", "which", "able", "mkinitcpio", "support", "cryptdevice", "laptop", "unlock", "warning", "linemkdir", "unlocking", "arch", "presented", "device", "also", "proceed", "line", "file", "hook", "first", "installed", "mkdir", "more", "found", "encrypted", "system", "encrypt", "wiki", "luks", "cryptkey2"], "samples/cryptdevice_multi.html2": ["ckeybyif", "cryptkey", "doesn", "adding", "occurrence", 
"described", "then", "finally", "last", "here", "that", "change", "mount", "default", "something", "need", "stuff", "best", "setup", "multiple", "package", "solution", "ckey", "cryptdevice2", "necessary", "encrypt2", "updates", "achieve", "over", "display", "grub", "disks", "provided", "extended", "boot", "multi", "another", "concern", "cryptsetup", "used", "parameters", "required", "initcpio", "container", "example", "when", "load", "order", "conf", "needed", "manually", "hooks", "available", "devices", "requires", "your", "least", "edit", "copy", "command", "second", "obsolete", "means", "avoid", "install", "ckeyfiin", "update", "with", "works", "this", "which", "able", "mkinitcpio", "support", "cryptdevice", "laptop", "unlock", "warning", "linemkdir", "unlocking", "arch", "presented", "device", "also", "proceed", "line", "file", "hook", "first", "installed", "mkdir", "more", "found", "encrypted", "system", "encrypt", "wiki", "luks", "cryptkey2"]} \ No newline at end of file diff --git a/generate_index.py b/index_generation/generate_index.py similarity index 74% rename from generate_index.py rename to index_generation/generate_index.py index 0fb7740..34f4795 100755 --- a/generate_index.py +++ b/index_generation/generate_index.py @@ -3,8 +3,8 @@ import os from lxml import html import re -import json -from collections import defaultdict +import stemmer +from pybloom import BloomFilter # List all files in path directory @@ -22,7 +22,8 @@ def remove_common_words(words): # ============================================================================= samples = list_directory("samples/") -index = defaultdict(list) +filters = {} +p = stemmer.PorterStemmer() for sample in samples: with open(sample, 'r') as sample_fh: @@ -37,10 +38,11 @@ for sample in samples: # Remove common words words = remove_common_words(words) # Stemming to reduce the number of words - # TODO : Could use http://tartarus.org/martin/PorterStemmer/ + words = [p.stem(word, 0, len(word)-1) for word in 
words] + filters[sample] = BloomFilter(capacity=len(words), error_rate=0.1) for word in words: - index[sample].append(word) + filters[sample].add(word) -with open("index.json", 'w') as index_fh: - index_fh.write(json.dumps(index)) +print(sum(len(filter.bitarray.tobytes()) for filter in filters.values()) / + len(filters)) diff --git a/index_generation/pybloom.py b/index_generation/pybloom.py new file mode 100644 index 0000000..ce3d54a --- /dev/null +++ b/index_generation/pybloom.py @@ -0,0 +1,277 @@ +import math +import hashlib +from struct import unpack, pack, calcsize + +try: + import bitarray +except ImportError: + raise ImportError('pybloom requires bitarray >= 0.3.4') + +__version__ = '2.0' +__author__ = "Jay Baird , Bob Ippolito ,\ +Marius Eriksen ,\ +Alex Brasetvik " + +def make_hashfuncs(num_slices, num_bits): + if num_bits >= (1 << 31): + fmt_code, chunk_size = 'Q', 8 + elif num_bits >= (1 << 15): + fmt_code, chunk_size = 'I', 4 + else: + fmt_code, chunk_size = 'H', 2 + total_hash_bits = 8 * num_slices * chunk_size + if total_hash_bits > 384: + hashfn = hashlib.sha512 + elif total_hash_bits > 256: + hashfn = hashlib.sha384 + elif total_hash_bits > 160: + hashfn = hashlib.sha256 + elif total_hash_bits > 128: + hashfn = hashlib.sha1 + else: + hashfn = hashlib.md5 + fmt = fmt_code * (hashfn().digest_size // chunk_size) + num_salts, extra = divmod(num_slices, len(fmt)) + if extra: + num_salts += 1 + salts = [hashfn(hashfn(pack('I', i)).digest()) for i in range(num_salts)] + def _make_hashfuncs(key): + key = str(key) + rval = [] + for salt in salts: + h = salt.copy() + h.update(key.encode('utf-8')) + rval.extend(uint % num_bits for uint in unpack(fmt, h.digest())) + del rval[num_slices:] + return rval + return _make_hashfuncs + + +class BloomFilter(object): + FILE_FMT = ' 0: + raise ValueError("Capacity must be > 0") + num_slices = int(math.ceil(math.log(1.0 / error_rate, 2))) + bits_per_slice = int(math.ceil( + (capacity * abs(math.log(error_rate))) / + 
(num_slices * (math.log(2) ** 2)))) + self._setup(error_rate, num_slices, bits_per_slice, capacity, 0) + self.bitarray = bitarray.bitarray(self.num_bits, endian='little') + self.bitarray.setall(False) + + def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count): + self.error_rate = error_rate + self.num_slices = num_slices + self.bits_per_slice = bits_per_slice + self.capacity = capacity + self.num_bits = num_slices * bits_per_slice + self.count = count + self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice) + + def __contains__(self, key): + bits_per_slice = self.bits_per_slice + bitarray = self.bitarray + if not isinstance(key, list): + hashes = self.make_hashes(key) + else: + hashes = key + offset = 0 + for k in hashes: + if not bitarray[offset + k]: + return False + offset += bits_per_slice + return True + + def __len__(self): + """Return the number of keys stored by this bloom filter.""" + return self.count + + def add(self, key, skip_check=False): + bitarray = self.bitarray + bits_per_slice = self.bits_per_slice + hashes = self.make_hashes(key) + if not skip_check and hashes in self: + return True + if self.count > self.capacity: + raise IndexError("BloomFilter is at capacity") + offset = 0 + for k in hashes: + self.bitarray[offset + k] = True + offset += bits_per_slice + self.count += 1 + return False + + def copy(self): + """Return a copy of this bloom filter. 
+""" + new_filter = BloomFilter(self.capacity, self.error_rate) + new_filter.bitarray = self.bitarray.copy() + return new_filter + + def union(self, other): + """ Calculates the union of the two underlying bitarrays and returns +a new bloom filter object.""" + if self.capacity != other.capacity or \ + self.error_rate != other.error_rate: + raise ValueError("Unioning filters requires both filters to have \ +both the same capacity and error rate") + new_bloom = self.copy() + new_bloom.bitarray = new_bloom.bitarray | other.bitarray + return new_bloom + + def __or__(self, other): + return self.union(other) + + def intersection(self, other): + """ Calculates the intersection of the two underlying bitarrays and returns +a new bloom filter object.""" + if self.capacity != other.capacity or \ + self.error_rate != other.error_rate: + raise ValueError("Intersecting filters requires both filters to \ +have equal capacity and error rate") + new_bloom = self.copy() + new_bloom.bitarray = new_bloom.bitarray & other.bitarray + return new_bloom + + def __and__(self, other): + return self.intersection(other) + + def tofile(self, f): + """Write the bloom filter to file object `f'. Underlying bits +are written as machine values. This is much more space +efficient than pickling the object.""" + f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices, + self.bits_per_slice, self.capacity, self.count)) + self.bitarray.tofile(f) + + @classmethod + def fromfile(cls, f, n=-1): + """Read a bloom filter from file-object `f' serialized with +``BloomFilter.tofile''. If `n' > 0 read only so many bytes.""" + headerlen = calcsize(cls.FILE_FMT) + + if 0 < n < headerlen: + raise ValueError('n too small!') + + filter = cls(1) # Bogus instantiation, we will `_setup'. 
+ filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen))) + filter.bitarray = bitarray.bitarray(endian='little') + if n > 0: + filter.bitarray.fromfile(f, n - headerlen) + else: + filter.bitarray.fromfile(f) + if filter.num_bits != filter.bitarray.length() and \ + (filter.num_bits + (8 - filter.num_bits % 8) + != filter.bitarray.length()): + raise ValueError('Bit length mismatch!') + + return filter + + def __getstate__(self): + d = self.__dict__.copy() + del d['make_hashes'] + return d + + def __setstate__(self, d): + self.__dict__.update(d) + self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice) + +class ScalableBloomFilter(object): + SMALL_SET_GROWTH = 2 # slower, but takes up less memory + LARGE_SET_GROWTH = 4 # faster, but takes up more memory faster + FILE_FMT = '= filter.capacity: + filter = BloomFilter( + capacity=filter.capacity * self.scale, + error_rate=filter.error_rate * self.ratio) + self.filters.append(filter) + filter.add(key, skip_check=True) + return False + + @property + def capacity(self): + """Returns the total capacity for all filters in this SBF""" + return sum([f.capacity for f in self.filters]) + + @property + def count(self): + return len(self) + + def tofile(self, f): + """Serialize this ScalableBloomFilter into the file-object +`f'.""" + f.write(pack(self.FILE_FMT, self.scale, self.ratio, + self.initial_capacity, self.error_rate)) + + # Write #-of-filters + f.write(pack(' 0: + # Then each filter directly, with a header describing + # their lengths. + headerpos = f.tell() + headerfmt = '<' + 'Q'*(len(self.filters)) + f.write('.' 
* calcsize(headerfmt)) + filter_sizes = [] + for filter in self.filters: + begin = f.tell() + filter.tofile(f) + filter_sizes.append(f.tell() - begin) + + f.seek(headerpos) + f.write(pack(headerfmt, *filter_sizes)) + + @classmethod + def fromfile(cls, f): + """Deserialize the ScalableBloomFilter in file object `f'.""" + filter = cls() + filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT)))) + nfilters, = unpack(' 0: + header_fmt = '<' + 'Q'*nfilters + bytes = f.read(calcsize(header_fmt)) + filter_lengths = unpack(header_fmt, bytes) + for fl in filter_lengths: + filter.filters.append(BloomFilter.fromfile(f, fl)) + else: + filter.filters = [] + + return filter + + def __len__(self): + """Returns the total number of elements stored in this SBF""" + return sum([f.count for f in self.filters]) diff --git a/index_generation/stemmer.py b/index_generation/stemmer.py new file mode 100644 index 0000000..4b31d7f --- /dev/null +++ b/index_generation/stemmer.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 + +"""Porter Stemming Algorithm +This is the Porter stemming algorithm, ported to Python from the +version coded up in ANSI C by the author. It may be be regarded +as canonical, in that it follows the algorithm presented in + +Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, +no. 3, pp 130-137, + +only differing from it at the points maked --DEPARTURE-- below. + +See also http://www.tartarus.org/~martin/PorterStemmer + +The algorithm as described in the paper could be exactly replicated +by adjusting the points of DEPARTURE, but this is barely necessary, +because (a) the points of DEPARTURE are definitely improvements, and +(b) no encoding of the Porter stemmer I have seen is anything like +as exact as this version, even with the points of DEPARTURE! 
+ +Vivake Gupta (v@nano.com) + +Release 1: January 2001 + +Further adjustments by Santiago Bruno (bananabruno@gmail.com) +to allow word input not restricted to one word per line, leading +to: + +release 2: July 2008 +""" + +import sys + +class PorterStemmer: + + def __init__(self): + """The main part of the stemming algorithm starts here. + b is a buffer holding a word to be stemmed. The letters are in b[k0], + b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is + readjusted downwards as the stemming progresses. Zero termination is + not in fact used in the algorithm. + + Note that only lower case sequences are stemmed. Forcing to lower case + should be done before stem(...) is called. + """ + + self.b = "" # buffer for word to be stemmed + self.k = 0 + self.k0 = 0 + self.j = 0 # j is a general offset into the string + + def cons(self, i): + """cons(i) is TRUE <=> b[i] is a consonant.""" + if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u': + return 0 + if self.b[i] == 'y': + if i == self.k0: + return 1 + else: + return (not self.cons(i - 1)) + return 1 + + def m(self): + """m() measures the number of consonant sequences between k0 and j. + if c is a consonant sequence and v a vowel sequence, and <..> + indicates arbitrary presence, + + gives 0 + vc gives 1 + vcvc gives 2 + vcvcvc gives 3 + .... 
+ """ + n = 0 + i = self.k0 + while 1: + if i > self.j: + return n + if not self.cons(i): + break + i = i + 1 + i = i + 1 + while 1: + while 1: + if i > self.j: + return n + if self.cons(i): + break + i = i + 1 + i = i + 1 + n = n + 1 + while 1: + if i > self.j: + return n + if not self.cons(i): + break + i = i + 1 + i = i + 1 + + def vowelinstem(self): + """vowelinstem() is TRUE <=> k0,...j contains a vowel""" + for i in range(self.k0, self.j + 1): + if not self.cons(i): + return 1 + return 0 + + def doublec(self, j): + """doublec(j) is TRUE <=> j,(j-1) contain a double consonant.""" + if j < (self.k0 + 1): + return 0 + if (self.b[j] != self.b[j-1]): + return 0 + return self.cons(j) + + def cvc(self, i): + """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant + and also if the second c is not w,x or y. this is used when trying to + restore an e at the end of a short e.g. + + cav(e), lov(e), hop(e), crim(e), but + snow, box, tray. + """ + if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2): + return 0 + ch = self.b[i] + if ch == 'w' or ch == 'x' or ch == 'y': + return 0 + return 1 + + def ends(self, s): + """ends(s) is TRUE <=> k0,...k ends with the string s.""" + length = len(s) + if s[length - 1] != self.b[self.k]: # tiny speed-up + return 0 + if length > (self.k - self.k0 + 1): + return 0 + if self.b[self.k-length+1:self.k+1] != s: + return 0 + self.j = self.k - length + return 1 + + def setto(self, s): + """setto(s) sets (j+1),...k to the characters in the string s, readjusting k.""" + length = len(s) + self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:] + self.k = self.j + length + + def r(self, s): + """r(s) is used further down.""" + if self.m() > 0: + self.setto(s) + + def step1ab(self): + """step1ab() gets rid of plurals and -ed or -ing. e.g. 
+ + caresses -> caress + ponies -> poni + ties -> ti + caress -> caress + cats -> cat + + feed -> feed + agreed -> agree + disabled -> disable + + matting -> mat + mating -> mate + meeting -> meet + milling -> mill + messing -> mess + + meetings -> meet + """ + if self.b[self.k] == 's': + if self.ends("sses"): + self.k = self.k - 2 + elif self.ends("ies"): + self.setto("i") + elif self.b[self.k - 1] != 's': + self.k = self.k - 1 + if self.ends("eed"): + if self.m() > 0: + self.k = self.k - 1 + elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem(): + self.k = self.j + if self.ends("at"): self.setto("ate") + elif self.ends("bl"): self.setto("ble") + elif self.ends("iz"): self.setto("ize") + elif self.doublec(self.k): + self.k = self.k - 1 + ch = self.b[self.k] + if ch == 'l' or ch == 's' or ch == 'z': + self.k = self.k + 1 + elif (self.m() == 1 and self.cvc(self.k)): + self.setto("e") + + def step1c(self): + """step1c() turns terminal y to i when there is another vowel in the stem.""" + if (self.ends("y") and self.vowelinstem()): + self.b = self.b[:self.k] + 'i' + self.b[self.k+1:] + + def step2(self): + """step2() maps double suffices to single ones. + so -ization ( = -ize plus -ation) maps to -ize etc. note that the + string before the suffix must give m() > 0. 
+ """ + if self.b[self.k - 1] == 'a': + if self.ends("ational"): self.r("ate") + elif self.ends("tional"): self.r("tion") + elif self.b[self.k - 1] == 'c': + if self.ends("enci"): self.r("ence") + elif self.ends("anci"): self.r("ance") + elif self.b[self.k - 1] == 'e': + if self.ends("izer"): self.r("ize") + elif self.b[self.k - 1] == 'l': + if self.ends("bli"): self.r("ble") # --DEPARTURE-- + # To match the published algorithm, replace this phrase with + # if self.ends("abli"): self.r("able") + elif self.ends("alli"): self.r("al") + elif self.ends("entli"): self.r("ent") + elif self.ends("eli"): self.r("e") + elif self.ends("ousli"): self.r("ous") + elif self.b[self.k - 1] == 'o': + if self.ends("ization"): self.r("ize") + elif self.ends("ation"): self.r("ate") + elif self.ends("ator"): self.r("ate") + elif self.b[self.k - 1] == 's': + if self.ends("alism"): self.r("al") + elif self.ends("iveness"): self.r("ive") + elif self.ends("fulness"): self.r("ful") + elif self.ends("ousness"): self.r("ous") + elif self.b[self.k - 1] == 't': + if self.ends("aliti"): self.r("al") + elif self.ends("iviti"): self.r("ive") + elif self.ends("biliti"): self.r("ble") + elif self.b[self.k - 1] == 'g': # --DEPARTURE-- + if self.ends("logi"): self.r("log") + # To match the published algorithm, delete this phrase + + def step3(self): + """step3() deals with -ic-, -full, -ness etc. 
similar strategy to step2.""" + if self.b[self.k] == 'e': + if self.ends("icate"): self.r("ic") + elif self.ends("ative"): self.r("") + elif self.ends("alize"): self.r("al") + elif self.b[self.k] == 'i': + if self.ends("iciti"): self.r("ic") + elif self.b[self.k] == 'l': + if self.ends("ical"): self.r("ic") + elif self.ends("ful"): self.r("") + elif self.b[self.k] == 's': + if self.ends("ness"): self.r("") + + def step4(self): + """step4() takes off -ant, -ence etc., in context vcvc.""" + if self.b[self.k - 1] == 'a': + if self.ends("al"): pass + else: return + elif self.b[self.k - 1] == 'c': + if self.ends("ance"): pass + elif self.ends("ence"): pass + else: return + elif self.b[self.k - 1] == 'e': + if self.ends("er"): pass + else: return + elif self.b[self.k - 1] == 'i': + if self.ends("ic"): pass + else: return + elif self.b[self.k - 1] == 'l': + if self.ends("able"): pass + elif self.ends("ible"): pass + else: return + elif self.b[self.k - 1] == 'n': + if self.ends("ant"): pass + elif self.ends("ement"): pass + elif self.ends("ment"): pass + elif self.ends("ent"): pass + else: return + elif self.b[self.k - 1] == 'o': + if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass + elif self.ends("ou"): pass + # takes care of -ous + else: return + elif self.b[self.k - 1] == 's': + if self.ends("ism"): pass + else: return + elif self.b[self.k - 1] == 't': + if self.ends("ate"): pass + elif self.ends("iti"): pass + else: return + elif self.b[self.k - 1] == 'u': + if self.ends("ous"): pass + else: return + elif self.b[self.k - 1] == 'v': + if self.ends("ive"): pass + else: return + elif self.b[self.k - 1] == 'z': + if self.ends("ize"): pass + else: return + else: + return + if self.m() > 1: + self.k = self.j + + def step5(self): + """step5() removes a final -e if m() > 1, and changes -ll to -l if + m() > 1. 
+ """ + self.j = self.k + if self.b[self.k] == 'e': + a = self.m() + if a > 1 or (a == 1 and not self.cvc(self.k-1)): + self.k = self.k - 1 + if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1: + self.k = self.k -1 + + def stem(self, p, i, j): + """In stem(p,i,j), p is a char pointer, and the string to be stemmed + is from p[i] to p[j] inclusive. Typically i is zero and j is the + offset to the last character of a string, (p[j+1] == '\0'). The + stemmer adjusts the characters p[i] ... p[j] and returns the new + end-point of the string, k. Stemming never increases word length, so + i <= k <= j. To turn the stemmer into a module, declare 'stem' as + extern, and delete the remainder of this file. + """ + # copy the parameters into statics + self.b = p + self.k = j + self.k0 = i + if self.k <= self.k0 + 1: + return self.b # --DEPARTURE-- + + # With this line, strings of length 1 or 2 don't go through the + # stemming process, although no mention is made of this in the + # published algorithm. Remove the line to match the published + # algorithm. + + self.step1ab() + self.step1c() + self.step2() + self.step3() + self.step4() + self.step5() + return self.b[self.k0:self.k+1] + + +if __name__ == '__main__': + p = PorterStemmer() + if len(sys.argv) > 1: + for f in sys.argv[1:]: + infile = open(f, 'r') + while 1: + output = '' + word = '' + line = infile.readline() + if line == '': + break + for c in line: + if c.isalpha(): + word += c.lower() + else: + if word: + output += p.stem(word, 0,len(word)-1) + word = '' + output += c.lower() + print(output) + infile.close() diff --git a/bloom.js b/js/bloom.js similarity index 92% rename from bloom.js rename to js/bloom.js index 382e526..aaae35e 100644 --- a/bloom.js +++ b/js/bloom.js @@ -44,6 +44,9 @@ function callback_change() { $("#results").append("

"+key+"

"); } } + if(!$("#results p").length) { + $("#results").append("

No results...

"); + } } $("#search").on('input', callback_change); diff --git a/bloomfilter.js b/js/bloomfilter.js similarity index 100% rename from bloomfilter.js rename to js/bloomfilter.js diff --git a/jquery-2.0.3.min.js b/js/jquery-2.0.3.min.js similarity index 100% rename from jquery-2.0.3.min.js rename to js/jquery-2.0.3.min.js