bloomysearch/index_generation/generate_index.py

#!/usr/bin/env python3

import os
import sys
from lxml import html
import re
import stemmer
import json
from bitarray import bitarray
from pybloom import BloomFilter


# Recursively list all files found under the `path` directory
def list_directory(path):
    """Return the paths of every file located under ``path``.

    The tree is walked recursively with ``os.walk``; each returned entry is
    the containing directory joined with the file name, so entries keep the
    ``path`` prefix they were found under. Directories themselves are not
    listed. Ordering is whatever ``os.walk`` yields.
    """
    found = []
    for parent, _subdirs, names in os.walk(path):
        found.extend(os.path.join(parent, name) for name in names)
    return found


def remove_common_words(words):
    """Return a new list holding only the words of at least four characters.

    Shorter words are treated as too common to be worth indexing. The
    relative order of the kept words is preserved and ``words`` itself is
    left untouched.
    """
    min_length = 4
    return [token for token in words if len(token) >= min_length]


def bitfield(n, fill):
    """Return the binary digits of ``n`` as a list of 0/1 integers.

    The most significant bit comes first and the result is left-padded with
    zeros up to ``fill`` entries. When ``n`` needs more than ``fill`` bits the
    list is simply longer than ``fill``; nothing is truncated.
    """
    binary_digits = bin(n)[2:].zfill(fill)
    return [int(char == '1') for char in binary_digits]

# =============================================================================
# Every file below ../samples/ gets indexed. The position of a file in this
# list is both its position in the binary index and in pages_index.json.
samples = list_directory("../samples/")
filters = {}
p = stemmer.PorterStemmer()

# The number of articles is stored on 16 bits, so make sure it fits *before*
# encoding it.
if len(samples) > 65535:
    sys.exit("[ERROR] Too many articles to index. You will have to change the "
             "way data is stored in the binary file to handle such amount of "
             "files.")

# Layout of the binary index, packed back to back at bit granularity:
#   - 16 bits: number of indexed articles
#   - for each article, in `samples` order:
#       - 16 bits: length (in bits) of its Bloom filter
#       - the Bloom filter bits themselves
write = bitarray(bitfield(len(samples), 16))

for sample in samples:
    with open(sample, 'r') as sample_fh:
        content = sample_fh.read()

    # Get text from HTML content. Newlines are deliberately kept: they act as
    # word separators for the regex below, and stripping them would glue the
    # last word of a line to the first word of the next one.
    text = html.fromstring(content).text_content()
    # Remove all punctuation etc., convert words to lower and delete duplicates
    words = list({word.lower() for word in re.findall(r"[\w]+", text)})

    # Remove common words
    words = remove_common_words(words)
    # Stemming to reduce the number of words
    words = [p.stem(word, 0, len(word)-1) for word in words]

    # A Bloom filter cannot be created with a capacity of 0, which would
    # happen for a page without any indexable word; fall back to a minimal
    # (empty) filter so that the page still gets an entry in the index.
    bloom = BloomFilter(capacity=max(len(words), 1), error_rate=0.1)
    for word in words:
        bloom.add(word)
    filters[sample] = bloom

    # len() is used instead of the deprecated bitarray.length() method.
    filter_length = len(bloom.bitarray)
    if filter_length > 65535:
        sys.exit("[ERROR] Bloomfilter is too long for file "+sample+". You "
                 "will have to change the way data is stored in the binary "
                 "file to handle such amount of text.")

    write.extend(bitfield(filter_length, 16))
    write.extend(bloom.bitarray)

with open('../data/search_index', 'wb') as index_fh:
    index_fh.write(write.tobytes())

# The list of indexed files, in the same order as the filters written above.
with open('../data/pages_index.json', 'w') as pages_fh:
    pages_fh.write(json.dumps(samples))