#!/usr/bin/env python3
"""Build a client-side search index from a directory of HTML articles.

For every file under ../samples/ this script extracts the text, normalizes
it (lower-case, deduplicate, drop short words, Porter-stem), inserts the
resulting words into a per-file Bloom filter, and serializes everything into
../data/search_index as:

    [16-bit article count] then, per article,
    [16-bit filter bit-length][raw filter bits]

The ordered list of file paths is written to ../data/pages_index.json so
filter N can be mapped back to its article.
"""

import json
import os
import re
import sys

# Third-party / project-local dependencies (not stdlib).
from bitarray import bitarray
from lxml import html
from pybloom import BloomFilter

import stemmer


def list_directory(path):
    """Return the paths of all files found recursively under *path*."""
    found = []
    for root, _dirs, files in os.walk(path):
        for name in files:
            found.append(os.path.join(root, name))
    return found


def remove_common_words(words):
    """Drop words of 3 characters or fewer — a crude stop-word filter."""
    return [word for word in words if len(word) > 3]


def bitfield(n, fill):
    """Return integer *n* as a list of 0/1 bits, left-padded to *fill* bits.

    NOTE: if n needs more than *fill* bits the result is LONGER than *fill*;
    callers must range-check n beforehand.
    """
    return [1 if digit == '1' else 0 for digit in bin(n)[2:].zfill(fill)]


# =============================================================================

samples = list_directory("../samples/")
filters = {}
p = stemmer.PorterStemmer()

# The index header stores the article count in 16 bits, so refuse anything
# larger BEFORE building the header (bitfield() would silently overflow it).
if len(samples) > 65535:
    sys.exit("[ERROR] Too many articles to index. You will have to change the "
             "way data is stored in the binary file to handle such amount of "
             "files.")

# Index header: number of articles, 16 bits.
write = bitarray(bitfield(len(samples), 16))

for sample in samples:
    with open(sample, 'r') as sample_fh:
        content = sample_fh.read()

    # Strip HTML markup, keep only visible text.
    words = html.fromstring(content).text_content().replace("\n", "")
    # Split on word characters, drop punctuation.
    words = re.findall(r"[\w]+", words)
    # Lower-case and deduplicate.
    words = list({word.lower() for word in words})
    words = remove_common_words(words)
    # Porter-stem to collapse inflected forms onto one index entry.
    words = [p.stem(word, 0, len(word) - 1) for word in words]

    # capacity must be >= 1: pybloom rejects capacity=0, which would occur
    # for a document whose every word was filtered out above.
    filters[sample] = BloomFilter(capacity=max(1, len(words)), error_rate=0.1)
    for word in words:
        filters[sample].add(word)

    # Each filter's bit-length is stored in 16 bits, so it must fit.
    if filters[sample].bitarray.length() > 65535:
        sys.exit("[ERROR] Bloomfilter is too long for file " + sample + ". You "
                 "will have to change the way data is stored in the binary "
                 "file to handle such amount of text.")

    # Per-article record: 16-bit filter length followed by the raw bits.
    write.extend(bitfield(filters[sample].bitarray.length(), 16))
    write.extend(filters[sample].bitarray)

with open('../data/search_index', 'wb') as index_fh:
    index_fh.write(write.tobytes())

# Ordered list of article paths; position N corresponds to filter N.
with open('../data/pages_index.json', 'w') as pages_fh:
    pages_fh.write(json.dumps(samples))