bloomysearch/index_generation/generate_index.py

#!/usr/bin/env python3

"""
Inspired by
http://www.stavros.io/posts/bloom-filter-search-engine/?print


You have to install the numpy python module for bloom to work.
"""

import bloom
import json
import os
import re
import stemmer
import struct
import sys

from lxml import html


def list_directory(path):
    """Return the path of every file under `path`, including subdirectories.

    Each returned entry is the directory reported by os.walk joined with
    the file name, so entries keep `path` as their prefix.
    """
    return [
        os.path.join(parent, filename)
        for parent, _subdirs, filenames in os.walk(path)
        for filename in filenames
    ]


def remove_common_words(words):
    """Keep only the words that are strictly longer than 3 characters.

    Words of 3 characters or fewer are dropped; the relative order of the
    remaining words is preserved.
    """
    kept = []
    for candidate in words:
        if len(candidate) <= 3:
            continue
        kept.append(candidate)
    return kept


if __name__ == "__main__":
    # Value passed as `error_rate` to every per-page bloom.BloomFilter.
    error_rate = 0.1

    # Work from the script's own directory so the relative "../samples/" and
    # "../data/" paths below resolve whatever the caller's cwd is.
    # abspath() is needed: when launched as "python generate_index.py",
    # dirname(sys.argv[0]) is "" and os.chdir("") raises FileNotFoundError.
    os.chdir(os.path.dirname(os.path.abspath(sys.argv[0])))
    samples = list_directory("../samples/")
    pages = []
    filters = []
    p = stemmer.PorterStemmer()

    for sample in samples:
        with open(sample, 'r') as sample_fh:
            content = sample_fh.read()

        # Every sample must carry an "@title=..." line; fail with the file
        # name instead of an AttributeError on None.
        title_match = re.search(r"@title=(.*)\n", content)
        if title_match is None:
            sys.exit("No '@title=' line found in %s" % sample)

        # Get text from HTML content. Newlines become spaces (not "") so the
        # last word of a line is not glued to the first word of the next.
        text = html.fromstring(content).text_content().replace("\n", " ")
        # Split on non-word characters (drops punctuation), lowercase and
        # delete duplicates
        words = {word.lower() for word in re.findall(r"[\w]+", text)}

        # Remove common words
        words = remove_common_words(words)
        # Stemming to reduce the number of words
        words = {p.stem(word, 0, len(word) - 1) for word in words}

        tmp_filter = bloom.BloomFilter(capacity=len(words),
                                       error_rate=error_rate)
        for word in words:
            tmp_filter.add(word)

        filters.append(tmp_filter.buckets)

        # sample[3:] strips the leading "../" of the path built above.
        pages.append({"title": title_match.group(1),
                      "url": sample[3:]})

    # Binary layout, all little-endian Int32:
    # first the number of filters...
    chunks = [struct.pack("<i", len(filters))]
    # ...then the length of each filter...
    chunks.extend(struct.pack("<i", len(buckets)) for buckets in filters)
    # ...and finally the filters themselves.
    chunks.extend(struct.pack("<%di" % len(buckets), *buckets)
                  for buckets in filters)
    # Join once instead of growing a bytes object with repeated "+=".
    filters_to_write = b"".join(chunks)

    # Write everything
    with open("../data/filters", "wb") as index_fh:
        index_fh.write(filters_to_write)

    with open("../data/pages.json", "w") as pages_fh:
        pages_fh.write(json.dumps({"index": pages}))
Initial commit Python script generates the index correctly, but not optimized at all... 2013-12-26 17:16:12 +01:00			`#!/usr/bin/env python3`

Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`"""`
			`Inspired by`
			`http://www.stavros.io/posts/bloom-filter-search-engine/?print`


			`You have to install the numpy python module for bloom to work.`
			`"""`

			`import bloom`
			`import json`
Initial commit Python script generates the index correctly, but not optimized at all... 2013-12-26 17:16:12 +01:00			`import os`
			`import re`
Clean + switch to bloom filters and bitarrays * Refactor of the repo structure, for better usability. * README.md refactored. * Switch to BloomFilters in python script, to decrease the index file. TODO: * Handle binary files in JS to pass the BloomFilters from python to JS. Note: Current implementations of BloomFilters differ in JS and Python lib. 2014-01-02 21:24:22 +01:00			`import stemmer`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`import struct`
			`import sys`

			`from lxml import html`
Initial commit Python script generates the index correctly, but not optimized at all... 2013-12-26 17:16:12 +01:00

			`def list_directory(path):`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`"""Recursively list all files in a given directory."""`
			`files_list = []`
Initial commit Python script generates the index correctly, but not optimized at all... 2013-12-26 17:16:12 +01:00			`for root, dirs, files in os.walk(path):`
			`for i in files:`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`files_list.append(os.path.join(root, i))`
			`return files_list`
Initial commit Python script generates the index correctly, but not optimized at all... 2013-12-26 17:16:12 +01:00

			`def remove_common_words(words):`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`"""Removes all words that are less than 3 characters long."""`
Initial commit Python script generates the index correctly, but not optimized at all... 2013-12-26 17:16:12 +01:00			`returned = [word for word in words if len(word) > 3]`
			`return returned`

Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00
			`if __name__ == "__main__":`
			`error_rate = 0.1`
Cleaner proto 2014-10-28 22:30:24 +01:00
			`os.chdir(os.path.dirname(sys.argv[0]))`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`samples = list_directory("../samples/")`
Cleaner proto 2014-10-28 22:30:24 +01:00			`pages = []`
Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`filters = []`
			`p = stemmer.PorterStemmer()`

			`for sample in samples:`
			`with open(sample, 'r') as sample_fh:`
			`content = sample_fh.read()`

			`# Get text from HTML content`
			`words = html.fromstring(content).text_content().replace("\n", "")`
			`words = re.findall(r"[\w]+", words)`
			`# Remove all punctuation etc., convert words to lower and delete`
			`# duplicates`
			`words = list(set([word.lower() for word in words]))`

			`# Remove common words`
			`words = remove_common_words(words)`
			`# Stemming to reduce the number of words`
			`words = list(set([p.stem(word, 0, len(word)-1) for word in words]))`

			`tmp_filter = bloom.BloomFilter(capacity=len(words),`
			`error_rate=error_rate)`
			`for word in words:`
			`tmp_filter.add(word)`

			`filters.append(tmp_filter.buckets)`

Cleaner proto 2014-10-28 22:30:24 +01:00			`pages.append({"title": re.search(r"@title=(.*)\n", content).group(1),`
			`"url": sample[3:]})`

Commit before debugging bloom.py 2014-10-28 02:36:28 +01:00			`# First Int32 is length`
			`filters_to_write = struct.pack("<i", len(filters))`
			`# Then comes the length of each filter`
			`for i in filters:`
			`filters_to_write += struct.pack("<i", len(i))`
			`# Finally comes the filters themselves`
			`for i in filters:`
			`filters_to_write += struct.pack("<%di" % len(i), *i)`

			`# Write everything`
			`with open("../data/filters", "wb") as index_fh:`
			`index_fh.write(filters_to_write)`

			`with open("../data/pages.json", "w") as pages_fh:`
Cleaner proto 2014-10-28 22:30:24 +01:00			`pages_fh.write(json.dumps({"index": pages}))`