Clean + switch to bloom filters and bitarrays
* Refactor of the repo structure, for better usability.
* README.md refactored.
* Switch to BloomFilters in the Python script, to decrease the index file size.

TODO:
* Handle binary files in JS, to pass the BloomFilters from Python to JS.

Note: current implementations of BloomFilters differ between the JS and Python libraries.
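The TODO above is about shipping the Python-built filters to the browser. A minimal sketch of one possible approach, assuming the `filters` dict built by `generate_index.py` and pybloom's `tofile()`; the `index.bloom` filename is a hypothetical choice, and the JS side would still need a matching binary reader:

# Hedged sketch: concatenate each per-sample filter into one binary blob
# that the JS code could fetch and deserialize. Assumes `filters` as built
# by generate_index.py; "index.bloom" is a hypothetical output path.
with open("index.bloom", "wb") as index_fh:
    for sample, bloom in sorted(filters.items()):
        bloom.tofile(index_fh)  # pybloom header + raw bitarray bytes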
parent 26d95b4cc3
commit d759e7c8ab
README.md (20 lines changed)
@@ -10,13 +10,31 @@ An index is generated by a Python script, upon generation of the pages, and is d
 
 ## Files
 
+### Index generation (`index_generation/` folder)
+
 * `generate_index.py`: Python script to generate the index (runs only at page generation) in a format convenient for Javascript
+* `pybloom.py`: Library to handle bloom filters in Python
+* `stemmer.py`: Implementation of the Porter Stemming algorithm in Python, from Vivake Gupta.
+
+### Example html search form
+
+* `index.html`
+* `js/bloom.js`: main JS code
+* `js/bloomfilters.js`: JS library to use BloomFilters
+* `js/jquery-2.0.3.min.js`: jQuery, for convenience functions; it will mostly be dropped in the future.
+
+### Examples
 
 * `samples/`: samples for testing purposes (taken from my blog articles)
 
 ## Notes
 
-I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print) found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for code doing what I wanted and found these ones:
+* I got the idea while reading [this page](http://www.stavros.io/posts/bloom-filter-search-engine/?print), found on [Sebsauvage's shaarli](http://sebsauvage.net/links/). I searched a bit for existing code doing what I wanted and found these:
 
 * https://github.com/olivernn/lunr.js
 * https://github.com/reyesr/fullproof
 
 But I wasn't fully satisfied by the first one, and I found the second one too heavy and complicated for my purpose, so I ended up coding this.
+
+* This code is mainly a proof of concept. As such, it is not fully optimized (actually, I just tweaked it until the resulting files and computations could be considered "acceptable"). For those looking for more effective solutions, here are a few things I found while looking for information on the web:
+
+* The stemming algorithm used may not be the most efficient one. People wanting to work with non-English languages or to optimize the overall computation of the index can easily move to a more effective algorithm. See [Wikipedia](http://en.wikipedia.org/wiki/Stemming) and [the stemming library in Python](https://pypi.python.org/pypi/stemming/1.0), which has C wrappers for best performance.
File diff suppressed because one or more lines are too long
index_generation/generate_index.py

@@ -3,8 +3,8 @@
 import os
 from lxml import html
 import re
-import json
-from collections import defaultdict
+import stemmer
+from pybloom import BloomFilter
 
 
 # List all files in path directory
@@ -22,7 +22,8 @@ def remove_common_words(words):
 
 # =============================================================================
 samples = list_directory("samples/")
-index = defaultdict(list)
+filters = {}
+p = stemmer.PorterStemmer()
 
 for sample in samples:
     with open(sample, 'r') as sample_fh:
@@ -37,10 +38,11 @@ for sample in samples:
         # Remove common words
         words = remove_common_words(words)
         # Stemming to reduce the number of words
-        # TODO : Could use http://tartarus.org/martin/PorterStemmer/
+        words = [p.stem(word, 0, len(word)-1) for word in words]
 
+        filters[sample] = BloomFilter(capacity=len(words), error_rate=0.1)
        for word in words:
-            index[sample].append(word)
+            filters[sample].add(word)
 
-with open("index.json", 'w') as index_fh:
-    index_fh.write(json.dumps(index))
+print(sum(len(filter.bitarray.tobytes()) for filter in filters.values()) /
+      len(filters))
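On the query side, the per-sample filters replace the old JSON index lookup: a page matches when its filter claims to contain every stemmed query word. A minimal sketch under that assumption, reusing the `filters` dict and the `p` stemmer instance built above; `search` is a hypothetical helper, not part of the committed script:

def search(query, filters, p):
    # A Bloom filter can return false positives but never false negatives,
    # so this may list a few extra samples, but never misses a match.
    words = [p.stem(w, 0, len(w) - 1) for w in query.lower().split()]
    return [sample for sample, bloom in filters.items()
            if all(word in bloom for word in words)]

# e.g. search("bloom filter", filters, p)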
index_generation/pybloom.py (new file, 277 lines)
@@ -0,0 +1,277 @@
import math
import hashlib
from struct import unpack, pack, calcsize

try:
    import bitarray
except ImportError:
    raise ImportError('pybloom requires bitarray >= 0.3.4')

__version__ = '2.0'
__author__ = "Jay Baird <jay.baird@me.com>, Bob Ippolito <bob@redivi.com>,\
              Marius Eriksen <marius@monkey.org>,\
              Alex Brasetvik <alex@brasetvik.com>"


def make_hashfuncs(num_slices, num_bits):
    if num_bits >= (1 << 31):
        fmt_code, chunk_size = 'Q', 8
    elif num_bits >= (1 << 15):
        fmt_code, chunk_size = 'I', 4
    else:
        fmt_code, chunk_size = 'H', 2
    total_hash_bits = 8 * num_slices * chunk_size
    if total_hash_bits > 384:
        hashfn = hashlib.sha512
    elif total_hash_bits > 256:
        hashfn = hashlib.sha384
    elif total_hash_bits > 160:
        hashfn = hashlib.sha256
    elif total_hash_bits > 128:
        hashfn = hashlib.sha1
    else:
        hashfn = hashlib.md5
    fmt = fmt_code * (hashfn().digest_size // chunk_size)
    num_salts, extra = divmod(num_slices, len(fmt))
    if extra:
        num_salts += 1
    salts = [hashfn(hashfn(pack('I', i)).digest()) for i in range(num_salts)]

    def _make_hashfuncs(key):
        key = str(key)
        rval = []
        for salt in salts:
            h = salt.copy()
            h.update(key.encode('utf-8'))
            rval.extend(uint % num_bits for uint in unpack(fmt, h.digest()))
        del rval[num_slices:]
        return rval
    return _make_hashfuncs


class BloomFilter(object):
    FILE_FMT = '<dQQQQ'

    def __init__(self, capacity, error_rate=0.001):
        if not (0 < error_rate < 1):
            raise ValueError("Error_Rate must be between 0 and 1.")
        if not capacity > 0:
            raise ValueError("Capacity must be > 0")
        num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
        bits_per_slice = int(math.ceil(
            (capacity * abs(math.log(error_rate))) /
            (num_slices * (math.log(2) ** 2))))
        self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
        self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
        self.bitarray.setall(False)

    def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
        self.error_rate = error_rate
        self.num_slices = num_slices
        self.bits_per_slice = bits_per_slice
        self.capacity = capacity
        self.num_bits = num_slices * bits_per_slice
        self.count = count
        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)

    def __contains__(self, key):
        bits_per_slice = self.bits_per_slice
        bitarray = self.bitarray
        if not isinstance(key, list):
            hashes = self.make_hashes(key)
        else:
            hashes = key
        offset = 0
        for k in hashes:
            if not bitarray[offset + k]:
                return False
            offset += bits_per_slice
        return True

    def __len__(self):
        """Return the number of keys stored by this bloom filter."""
        return self.count

    def add(self, key, skip_check=False):
        bitarray = self.bitarray
        bits_per_slice = self.bits_per_slice
        hashes = self.make_hashes(key)
        if not skip_check and hashes in self:
            return True
        if self.count > self.capacity:
            raise IndexError("BloomFilter is at capacity")
        offset = 0
        for k in hashes:
            self.bitarray[offset + k] = True
            offset += bits_per_slice
        self.count += 1
        return False

    def copy(self):
        """Return a copy of this bloom filter."""
        new_filter = BloomFilter(self.capacity, self.error_rate)
        new_filter.bitarray = self.bitarray.copy()
        return new_filter

    def union(self, other):
        """Calculates the union of the two underlying bitarrays and returns
        a new bloom filter object."""
        if self.capacity != other.capacity or \
                self.error_rate != other.error_rate:
            raise ValueError("Unioning filters requires both filters to have \
both the same capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray | other.bitarray
        return new_bloom

    def __or__(self, other):
        return self.union(other)

    def intersection(self, other):
        """Calculates the intersection of the two underlying bitarrays and
        returns a new bloom filter object."""
        if self.capacity != other.capacity or \
                self.error_rate != other.error_rate:
            raise ValueError("Intersecting filters requires both filters to \
have equal capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray & other.bitarray
        return new_bloom

    def __and__(self, other):
        return self.intersection(other)

    def tofile(self, f):
        """Write the bloom filter to file object `f'. Underlying bits
        are written as machine values. This is much more space
        efficient than pickling the object."""
        f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
                     self.bits_per_slice, self.capacity, self.count))
        self.bitarray.tofile(f)

    @classmethod
    def fromfile(cls, f, n=-1):
        """Read a bloom filter from file-object `f' serialized with
        ``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
        headerlen = calcsize(cls.FILE_FMT)

        if 0 < n < headerlen:
            raise ValueError('n too small!')

        filter = cls(1)  # Bogus instantiation, we will `_setup'.
        filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
        filter.bitarray = bitarray.bitarray(endian='little')
        if n > 0:
            filter.bitarray.fromfile(f, n - headerlen)
        else:
            filter.bitarray.fromfile(f)
        if filter.num_bits != filter.bitarray.length() and \
                (filter.num_bits + (8 - filter.num_bits % 8)
                 != filter.bitarray.length()):
            raise ValueError('Bit length mismatch!')

        return filter

    def __getstate__(self):
        d = self.__dict__.copy()
        del d['make_hashes']
        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)


class ScalableBloomFilter(object):
    SMALL_SET_GROWTH = 2  # slower, but takes up less memory
    LARGE_SET_GROWTH = 4  # faster, but takes up more memory faster
    FILE_FMT = '<idQd'

    def __init__(self, initial_capacity=100, error_rate=0.001,
                 mode=SMALL_SET_GROWTH):
        if not error_rate or error_rate < 0:
            raise ValueError("Error_Rate must be a decimal greater than 0.")
        self._setup(mode, 0.9, initial_capacity, error_rate)
        self.filters = []

    def _setup(self, mode, ratio, initial_capacity, error_rate):
        self.scale = mode
        self.ratio = ratio
        self.initial_capacity = initial_capacity
        self.error_rate = error_rate

    def __contains__(self, key):
        for f in reversed(self.filters):
            if key in f:
                return True
        return False

    def add(self, key):
        if key in self:
            return True
        if not self.filters:
            filter = BloomFilter(
                capacity=self.initial_capacity,
                error_rate=self.error_rate * (1.0 - self.ratio))
            self.filters.append(filter)
        else:
            filter = self.filters[-1]
            if filter.count >= filter.capacity:
                filter = BloomFilter(
                    capacity=filter.capacity * self.scale,
                    error_rate=filter.error_rate * self.ratio)
                self.filters.append(filter)
        filter.add(key, skip_check=True)
        return False

    @property
    def capacity(self):
        """Returns the total capacity for all filters in this SBF"""
        return sum([f.capacity for f in self.filters])

    @property
    def count(self):
        return len(self)

    def tofile(self, f):
        """Serialize this ScalableBloomFilter into the file-object `f'."""
        f.write(pack(self.FILE_FMT, self.scale, self.ratio,
                     self.initial_capacity, self.error_rate))

        # Write #-of-filters
        f.write(pack('<l', len(self.filters)))

        if len(self.filters) > 0:
            # Then each filter directly, with a header describing
            # their lengths.
            headerpos = f.tell()
            headerfmt = '<' + 'Q' * (len(self.filters))
            f.write(b'.' * calcsize(headerfmt))  # placeholder, rewritten below
            filter_sizes = []
            for filter in self.filters:
                begin = f.tell()
                filter.tofile(f)
                filter_sizes.append(f.tell() - begin)

            f.seek(headerpos)
            f.write(pack(headerfmt, *filter_sizes))

    @classmethod
    def fromfile(cls, f):
        """Deserialize the ScalableBloomFilter in file object `f'."""
        filter = cls()
        filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
        nfilters, = unpack('<l', f.read(calcsize('<l')))
        if nfilters > 0:
            header_fmt = '<' + 'Q' * nfilters
            bytes = f.read(calcsize(header_fmt))
            filter_lengths = unpack(header_fmt, bytes)
            for fl in filter_lengths:
                filter.filters.append(BloomFilter.fromfile(f, fl))
        else:
            filter.filters = []

        return filter

    def __len__(self):
        """Returns the total number of elements stored in this SBF"""
        return sum([f.count for f in self.filters])
index_generation/stemmer.py (new file, 367 lines)
@@ -0,0 +1,367 @@
#!/usr/bin/env python3

"""Porter Stemming Algorithm
This is the Porter stemming algorithm, ported to Python from the
version coded up in ANSI C by the author. It may be regarded
as canonical, in that it follows the algorithm presented in

Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
no. 3, pp 130-137,

only differing from it at the points marked --DEPARTURE-- below.

See also http://www.tartarus.org/~martin/PorterStemmer

The algorithm as described in the paper could be exactly replicated
by adjusting the points of DEPARTURE, but this is barely necessary,
because (a) the points of DEPARTURE are definitely improvements, and
(b) no encoding of the Porter stemmer I have seen is anything like
as exact as this version, even with the points of DEPARTURE!

Vivake Gupta (v@nano.com)

Release 1: January 2001

Further adjustments by Santiago Bruno (bananabruno@gmail.com)
to allow word input not restricted to one word per line, leading
to:

release 2: July 2008
"""

import sys


class PorterStemmer:

    def __init__(self):
        """The main part of the stemming algorithm starts here.
        b is a buffer holding a word to be stemmed. The letters are in b[k0],
        b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
        readjusted downwards as the stemming progresses. Zero termination is
        not in fact used in the algorithm.

        Note that only lower case sequences are stemmed. Forcing to lower case
        should be done before stem(...) is called.
        """

        self.b = ""  # buffer for word to be stemmed
        self.k = 0
        self.k0 = 0
        self.j = 0   # j is a general offset into the string

    def cons(self, i):
        """cons(i) is TRUE <=> b[i] is a consonant."""
        if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u':
            return 0
        if self.b[i] == 'y':
            if i == self.k0:
                return 1
            else:
                return (not self.cons(i - 1))
        return 1

    def m(self):
        """m() measures the number of consonant sequences between k0 and j.
        if c is a consonant sequence and v a vowel sequence, and <..>
        indicates arbitrary presence,

           <c><v>       gives 0
           <c>vc<v>     gives 1
           <c>vcvc<v>   gives 2
           <c>vcvcvc<v> gives 3
           ....
        """
        n = 0
        i = self.k0
        while 1:
            if i > self.j:
                return n
            if not self.cons(i):
                break
            i = i + 1
        i = i + 1
        while 1:
            while 1:
                if i > self.j:
                    return n
                if self.cons(i):
                    break
                i = i + 1
            i = i + 1
            n = n + 1
            while 1:
                if i > self.j:
                    return n
                if not self.cons(i):
                    break
                i = i + 1
            i = i + 1

    def vowelinstem(self):
        """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
        for i in range(self.k0, self.j + 1):
            if not self.cons(i):
                return 1
        return 0

    def doublec(self, j):
        """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
        if j < (self.k0 + 1):
            return 0
        if (self.b[j] != self.b[j-1]):
            return 0
        return self.cons(j)

    def cvc(self, i):
        """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
        and also if the second c is not w,x or y. this is used when trying to
        restore an e at the end of a short word, e.g.

           cav(e), lov(e), hop(e), crim(e), but
           snow, box, tray.
        """
        if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2):
            return 0
        ch = self.b[i]
        if ch == 'w' or ch == 'x' or ch == 'y':
            return 0
        return 1

    def ends(self, s):
        """ends(s) is TRUE <=> k0,...k ends with the string s."""
        length = len(s)
        if s[length - 1] != self.b[self.k]:  # tiny speed-up
            return 0
        if length > (self.k - self.k0 + 1):
            return 0
        if self.b[self.k-length+1:self.k+1] != s:
            return 0
        self.j = self.k - length
        return 1

    def setto(self, s):
        """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
        length = len(s)
        self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
        self.k = self.j + length

    def r(self, s):
        """r(s) is used further down."""
        if self.m() > 0:
            self.setto(s)

    def step1ab(self):
        """step1ab() gets rid of plurals and -ed or -ing. e.g.

           caresses  ->  caress
           ponies    ->  poni
           ties      ->  ti
           caress    ->  caress
           cats      ->  cat

           feed      ->  feed
           agreed    ->  agree
           disabled  ->  disable

           matting   ->  mat
           mating    ->  mate
           meeting   ->  meet
           milling   ->  mill
           messing   ->  mess

           meetings  ->  meet
        """
        if self.b[self.k] == 's':
            if self.ends("sses"):
                self.k = self.k - 2
            elif self.ends("ies"):
                self.setto("i")
            elif self.b[self.k - 1] != 's':
                self.k = self.k - 1
        if self.ends("eed"):
            if self.m() > 0:
                self.k = self.k - 1
        elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
            self.k = self.j
            if self.ends("at"):   self.setto("ate")
            elif self.ends("bl"): self.setto("ble")
            elif self.ends("iz"): self.setto("ize")
            elif self.doublec(self.k):
                self.k = self.k - 1
                ch = self.b[self.k]
                if ch == 'l' or ch == 's' or ch == 'z':
                    self.k = self.k + 1
            elif (self.m() == 1 and self.cvc(self.k)):
                self.setto("e")

    def step1c(self):
        """step1c() turns terminal y to i when there is another vowel in the stem."""
        if (self.ends("y") and self.vowelinstem()):
            self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]

    def step2(self):
        """step2() maps double suffices to single ones.
        so -ization ( = -ize plus -ation) maps to -ize etc. note that the
        string before the suffix must give m() > 0.
        """
        if self.b[self.k - 1] == 'a':
            if self.ends("ational"):   self.r("ate")
            elif self.ends("tional"):  self.r("tion")
        elif self.b[self.k - 1] == 'c':
            if self.ends("enci"):      self.r("ence")
            elif self.ends("anci"):    self.r("ance")
        elif self.b[self.k - 1] == 'e':
            if self.ends("izer"):      self.r("ize")
        elif self.b[self.k - 1] == 'l':
            if self.ends("bli"):       self.r("ble")  # --DEPARTURE--
            # To match the published algorithm, replace this phrase with
            #   if self.ends("abli"):      self.r("able")
            elif self.ends("alli"):    self.r("al")
            elif self.ends("entli"):   self.r("ent")
            elif self.ends("eli"):     self.r("e")
            elif self.ends("ousli"):   self.r("ous")
        elif self.b[self.k - 1] == 'o':
            if self.ends("ization"):   self.r("ize")
            elif self.ends("ation"):   self.r("ate")
            elif self.ends("ator"):    self.r("ate")
        elif self.b[self.k - 1] == 's':
            if self.ends("alism"):     self.r("al")
            elif self.ends("iveness"): self.r("ive")
            elif self.ends("fulness"): self.r("ful")
            elif self.ends("ousness"): self.r("ous")
        elif self.b[self.k - 1] == 't':
            if self.ends("aliti"):     self.r("al")
            elif self.ends("iviti"):   self.r("ive")
            elif self.ends("biliti"):  self.r("ble")
        elif self.b[self.k - 1] == 'g':  # --DEPARTURE--
            if self.ends("logi"):      self.r("log")
            # To match the published algorithm, delete this phrase

    def step3(self):
        """step3() deals with -ic-, -full, -ness etc. similar strategy to step2."""
        if self.b[self.k] == 'e':
            if self.ends("icate"):     self.r("ic")
            elif self.ends("ative"):   self.r("")
            elif self.ends("alize"):   self.r("al")
        elif self.b[self.k] == 'i':
            if self.ends("iciti"):     self.r("ic")
        elif self.b[self.k] == 'l':
            if self.ends("ical"):      self.r("ic")
            elif self.ends("ful"):     self.r("")
        elif self.b[self.k] == 's':
            if self.ends("ness"):      self.r("")

    def step4(self):
        """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
        if self.b[self.k - 1] == 'a':
            if self.ends("al"): pass
            else: return
        elif self.b[self.k - 1] == 'c':
            if self.ends("ance"): pass
            elif self.ends("ence"): pass
            else: return
        elif self.b[self.k - 1] == 'e':
            if self.ends("er"): pass
            else: return
        elif self.b[self.k - 1] == 'i':
            if self.ends("ic"): pass
            else: return
        elif self.b[self.k - 1] == 'l':
            if self.ends("able"): pass
            elif self.ends("ible"): pass
            else: return
        elif self.b[self.k - 1] == 'n':
            if self.ends("ant"): pass
            elif self.ends("ement"): pass
            elif self.ends("ment"): pass
            elif self.ends("ent"): pass
            else: return
        elif self.b[self.k - 1] == 'o':
            if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass
            elif self.ends("ou"): pass
            # takes care of -ous
            else: return
        elif self.b[self.k - 1] == 's':
            if self.ends("ism"): pass
            else: return
        elif self.b[self.k - 1] == 't':
            if self.ends("ate"): pass
            elif self.ends("iti"): pass
            else: return
        elif self.b[self.k - 1] == 'u':
            if self.ends("ous"): pass
            else: return
        elif self.b[self.k - 1] == 'v':
            if self.ends("ive"): pass
            else: return
        elif self.b[self.k - 1] == 'z':
            if self.ends("ize"): pass
            else: return
        else:
            return
        if self.m() > 1:
            self.k = self.j

    def step5(self):
        """step5() removes a final -e if m() > 1, and changes -ll to -l if
        m() > 1.
        """
        self.j = self.k
        if self.b[self.k] == 'e':
            a = self.m()
            if a > 1 or (a == 1 and not self.cvc(self.k-1)):
                self.k = self.k - 1
        if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
            self.k = self.k - 1

    def stem(self, p, i, j):
        """In stem(p,i,j), p is a char pointer, and the string to be stemmed
        is from p[i] to p[j] inclusive. Typically i is zero and j is the
        offset to the last character of a string, (p[j+1] == '\0'). The
        stemmer adjusts the characters p[i] ... p[j] and returns the new
        end-point of the string, k. Stemming never increases word length, so
        i <= k <= j. To turn the stemmer into a module, declare 'stem' as
        extern, and delete the remainder of this file.
        """
        # copy the parameters into statics
        self.b = p
        self.k = j
        self.k0 = i
        if self.k <= self.k0 + 1:
            return self.b  # --DEPARTURE--

        # With this line, strings of length 1 or 2 don't go through the
        # stemming process, although no mention is made of this in the
        # published algorithm. Remove the line to match the published
        # algorithm.

        self.step1ab()
        self.step1c()
        self.step2()
        self.step3()
        self.step4()
        self.step5()
        return self.b[self.k0:self.k+1]


if __name__ == '__main__':
    p = PorterStemmer()
    if len(sys.argv) > 1:
        for f in sys.argv[1:]:
            infile = open(f, 'r')
            while 1:
                output = ''
                word = ''
                line = infile.readline()
                if line == '':
                    break
                for c in line:
                    if c.isalpha():
                        word += c.lower()
                    else:
                        if word:
                            output += p.stem(word, 0, len(word)-1)
                            word = ''
                        output += c.lower()
                print(output)
            infile.close()
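A short usage sketch for the stemmer above; `stem(p, i, j)` works on the inclusive slice p[i..j] and expects lower-case input:

from stemmer import PorterStemmer

p = PorterStemmer()
for word in ["running", "ponies", "relational", "hopeful"]:
    # stem the whole word: i = 0, j = index of the last character
    print(word, "->", p.stem(word, 0, len(word) - 1))
# running -> run, ponies -> poni, relational -> relat, hopeful -> hope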
js/bloom.js

@@ -44,6 +44,9 @@ function callback_change() {
             $("#results").append("<p>"+key+"</p>");
         }
     }
+    if(!$("#results p").length) {
+        $("#results").append("<p>No results...</p>");
+    }
 }
 
 $("#search").on('input', callback_change);