bloomysearch/index_generation/generate_index.py

87 lines
2.4 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
2014-10-28 02:36:28 +01:00
"""
Inspired by
http://www.stavros.io/posts/bloom-filter-search-engine/?print
You have to install the numpy python module for bloom to work.
"""
import bloom
import json
import os
import re
import stemmer
2014-10-28 02:36:28 +01:00
import struct
import sys
from lxml import html
def list_directory(path):
    """Recursively list all files in a given directory.

    Args:
        path: Directory to walk. Every returned entry starts with this
            value exactly as given (it is not normalised or made absolute).

    Returns:
        A list with one path per regular directory entry found below
        ``path``, sorted lexicographically. ``os.walk`` yields entries in
        arbitrary filesystem order, so sorting is what makes the result
        (and anything generated from it) reproducible across runs and
        machines. A missing or empty directory gives an empty list.
    """
    files_list = [
        os.path.join(root, name)
        # The sub-directory list is unused: os.walk descends on its own.
        for root, _dirs, names in os.walk(path)
        for name in names
    ]
    files_list.sort()
    return files_list
def remove_common_words(words, min_length=4):
    """Drop words that are too short to be worth indexing.

    Word length is used as a cheap stand-in for a stop-word list.

    Args:
        words: Iterable of strings.
        min_length: Smallest length a word must have to be kept. The
            default of 4 means words of 3 characters or fewer are removed.

    Returns:
        A new list with the surviving words, in the order they were
        iterated. The input is not modified.
    """
    return [word for word in words if len(word) >= min_length]
2014-10-28 02:36:28 +01:00
if __name__ == "__main__":
    # Error rate handed to every per-page Bloom filter.
    error_rate = 0.1

    # All paths below are relative to the directory holding this script.
    # abspath() matters: when the script is started as
    # "python generate_index.py", dirname(sys.argv[0]) is "" and
    # os.chdir("") raises.
    os.chdir(os.path.dirname(os.path.abspath(sys.argv[0])))

    samples = list_directory("../samples/")
    pages = []
    filters = []
    p = stemmer.PorterStemmer()
    title_re = re.compile(r"@title=(.*)\n")
    word_re = re.compile(r"[\w]+")

    for sample in samples:
        with open(sample, 'r') as sample_fh:
            content = sample_fh.read()

        # Every page must carry an "@title=..." line; fail with the file
        # name instead of an AttributeError on None.
        title_match = title_re.search(content)
        if title_match is None:
            sys.exit("%s: no '@title=...' line found, cannot index this page"
                     % sample)

        # Get text from HTML content. Line breaks are kept on purpose: the
        # word regex already treats them as separators, whereas deleting
        # them would glue the last word of one line onto the first word of
        # the next.
        text = html.fromstring(content).text_content()
        # Remove all punctuation etc., convert words to lower and delete
        # duplicates
        words = {word.lower() for word in word_re.findall(text)}
        # Remove common words
        words = remove_common_words(words)
        # Stemming to reduce the number of words
        stems = {p.stem(word, 0, len(word) - 1) for word in words}

        # NOTE(review): a page without any indexable word gives capacity=0;
        # confirm bloom.BloomFilter accepts that.
        tmp_filter = bloom.BloomFilter(capacity=len(stems),
                                       error_rate=error_rate)
        for stem in stems:
            tmp_filter.add(stem)

        # filters[i] and pages[i] describe the same sample.
        filters.append(tmp_filter.buckets)
        # sample[3:] drops the leading "../" of the "../samples/" prefix.
        pages.append({"title": title_match.group(1), "url": sample[3:]})

    # Binary layout, all values little-endian 32-bit signed ints:
    #   1. the number of filters,
    #   2. the length of each filter,
    #   3. the buckets of each filter, back to back.
    # Pieces are collected and joined once instead of growing a bytes
    # object with "+=", which copies the whole buffer on every step.
    chunks = [struct.pack("<i", len(filters))]
    chunks.extend(struct.pack("<i", len(buckets)) for buckets in filters)
    chunks.extend(struct.pack("<%di" % len(buckets), *buckets)
                  for buckets in filters)

    # Write everything
    with open("../data/filters", "wb") as index_fh:
        index_fh.write(b"".join(chunks))

    with open("../data/pages.json", "w") as pages_fh:
        pages_fh.write(json.dumps({"index": pages}))