bloomysearch/index_generation/generate_index.py
2014-10-28 20:08:47 +01:00

82 lines
3.1 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Generate the Bloom filter search index for the sample pages.

Inspired by
http://www.stavros.io/posts/bloom-filter-search-engine/?print

The numpy Python module must be installed for the bloom module to work.
"""
import bloom
import json
import os
import re
import stemmer
import struct
import sys
from lxml import html
def list_directory(path):
    """Recursively list all files in a given directory.

    Args:
        path: Directory to walk.

    Returns:
        A sorted list with one entry per file found under ``path``; each
        entry is the walked directory joined with the file name. The list
        is empty when no files are found.
    """
    # os.walk yields entries in a filesystem-dependent order; sorting makes
    # the resulting index reproducible across runs and machines.
    return sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(path)
        for name in names
    )
def remove_common_words(words, min_length=4):
    """Drop words that are too short to be worth indexing.

    Args:
        words: Iterable of words to filter.
        min_length: Minimum number of characters a word must have to be
            kept. Defaults to 4, i.e. words of three characters or fewer
            are removed.

    Returns:
        A new list containing the kept words, in iteration order.
    """
    return [word for word in words if len(word) >= min_length]
if __name__ == "__main__":
    # Error rate handed to every per-page Bloom filter.
    error_rate = 0.1
    samples = list_directory("../samples/")
    filters = []
    p = stemmer.PorterStemmer()
    for sample in samples:
        with open(sample, 'r') as sample_fh:
            content = sample_fh.read()

        # Get text from HTML content. Newlines are left in place on
        # purpose: stripping them would glue the last word of a line to the
        # first word of the next one, and the regex below already treats
        # any whitespace as a separator.
        text = html.fromstring(content).text_content()
        # Remove all punctuation etc., convert words to lower and delete
        # duplicates
        words = {word.lower() for word in re.findall(r"[\w]+", text)}
        # Remove common words
        words = remove_common_words(words)
        # Stemming to reduce the number of words (different words may share
        # a stem, hence the second de-duplication)
        words = {p.stem(word, 0, len(word) - 1) for word in words}

        # One filter per page, sized for and filled with that page's own
        # stemmed words.
        tmp_filter = bloom.BloomFilter(capacity=len(words),
                                       error_rate=error_rate)
        for word in words:
            tmp_filter.add(word)
        filters.append(tmp_filter.buckets)

    # Binary layout, little-endian int32 throughout:
    #   1. number of filters
    #   2. length of each filter
    #   3. the buckets of each filter, in the same order
    # Chunks are collected and joined once instead of growing a bytes
    # object with repeated concatenation.
    chunks = [struct.pack("<i", len(filters))]
    chunks.extend(struct.pack("<i", len(i)) for i in filters)
    chunks.extend(struct.pack("<%di" % len(i), *i) for i in filters)

    # Write everything
    with open("../data/filters", "wb") as index_fh:
        index_fh.write(b"".join(chunks))

    # The position of a page in this list matches the position of its
    # filter in the binary file above.
    with open("../data/pages.json", "w") as pages_fh:
        pages_fh.write(json.dumps({"index": samples}))