#!/usr/bin/env python3
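"""
Build a client-side search index: one Bloom filter per HTML page found under
../samples/, serialized in both little and big endian bit order, plus a JSON
list of the indexed file paths.
"""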
import os
import sys
from lxml import html
import re
import stemmer
import json
from bitarray import bitarray
from pybloom import BloomFilter


# List all files under the path directory, recursively
def list_directory(path):
    fichier = []
    for root, dirs, files in os.walk(path):
        for i in files:
            fichier.append(os.path.join(root, i))
    return fichier


# Naive stop-word filtering: keep only words longer than 3 characters
def remove_common_words(words):
    return [word for word in words if len(word) > 3]
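# For example, remove_common_words(["the", "bloom", "a", "filter"]) returns
# ["bloom", "filter"].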


# Encode x as a 16 bit big-endian unsigned integer (two bytes)
def padding_16(x):
    if x < 256:
        return bytes([0, x])
    else:
        return bytes([x // 256, x % 256])
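# For example, padding_16(12) == bytes([0, 12]) and
# padding_16(300) == bytes([1, 44]), since 300 == 1 * 256 + 44.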
# =============================================================================

samples = list_directory("../samples/")
filters = {}
p = stemmer.PorterStemmer()
write_little = bitarray(endian="little")
write_big = bitarray(endian="big")

# Header: the number of indexed pages, as a 16 bit integer
write_little.frombytes(padding_16(len(samples)))
write_big.frombytes(padding_16(len(samples)))
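
# Binary layout of the index files: [number of pages, 16 bits] then, for each
# page, [filter length in bits, 16 bits][raw filter bits].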

if len(samples) > 65535:
    sys.exit("[ERROR] Too many articles to index. You will have to change "
             "the way data is stored in the binary file to handle so many "
             "files.")

for sample in samples:
    with open(sample, 'r') as sample_fh:
        content = sample_fh.read()

    # Get text from the HTML content
    words = html.fromstring(content).text_content().replace("\n", "")
    words = re.findall(r"[\w]+", words)
    # Drop punctuation, convert words to lowercase and delete duplicates
    words = list(set([word.lower() for word in words]))

    # Remove common words
    words = remove_common_words(words)
    # Stemming to reduce the number of words
    words = [p.stem(word, 0, len(word) - 1) for word in words]

    # One Bloom filter per page, sized to the page's vocabulary
    filters[sample] = BloomFilter(capacity=len(words), error_rate=0.1)
    for word in words:
        filters[sample].add(word)
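    # With error_rate=0.1, roughly one membership query in ten for a word
    # that is NOT in the page will still report a false positive match.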

    if filters[sample].bitarray.length() > 65535:
        sys.exit("[ERROR] Bloom filter is too long for file " + sample + "."
                 " You will have to change the way data is stored in the"
                 " binary file to handle that much text.")

    # Append this page's filter: its length as a 16 bit integer, then the
    # raw filter bits (was commented out, calling a nonexistent bitfield();
    # padding_16 provides the same 16 bit length encoding)
    write_little.frombytes(padding_16(filters[sample].bitarray.length()))
    write_little.extend(filters[sample].bitarray)
    write_big.frombytes(padding_16(filters[sample].bitarray.length()))
    write_big.extend(filters[sample].bitarray)

with open('../data/search_index_little', 'wb') as index_fh:
    print(write_little)
    write_little.tofile(index_fh)

with open('../data/search_index_big', 'wb') as index_fh:
    print(write_big)
    write_big.tofile(index_fh)

# Save the ordered list of indexed files so that filter positions in the
# binary index can be mapped back to page paths
with open('../data/pages_index.json', 'w') as pages_fh:
    pages_fh.write(json.dumps(samples))
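
# A minimal sketch of how a lookup against the in-memory filters would work
# (hypothetical; the real client is expected to read the binary files
# instead):
#
#     query = p.stem("searching", 0, len("searching") - 1)
#     matches = [page for page, bf in filters.items() if query in bf]
#
# Pages in matches probably contain the word; the 0.1 error rate means about
# 10% of non-matching pages may appear as false positives.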