bloomysearch/generate_index.py

#!/usr/bin/env python3

import os
from lxml import html
import re
import json
from collections import defaultdict


# List all files in path directory
def list_directory(path):
    fichier = []
    for root, dirs, files in os.walk(path):
        for i in files:
            fichier.append(os.path.join(root, i))
    return fichier


def remove_common_words(words):
    returned = [word for word in words if len(word) > 3]
    return returned

# =============================================================================
samples = list_directory("samples/")
index = defaultdict(list)

for sample in samples:
    with open(sample, 'r') as sample_fh:
        content = sample_fh.read()

    # Get text from HTML content
    words = html.fromstring(content).text_content().replace("\n", "")
    words = re.findall(r"[\w]+", words)
    # Remove all punctuation etc., convert words to lower and delete duplicates
    words = list(set([word.lower() for word in words]))

    # Remove common words
    words = remove_common_words(words)
    # Stemming to reduce the number of words
    # TODO : Could use http://tartarus.org/martin/PorterStemmer/

    for word in words:
        index[sample].append(word)

with open("index.json", 'w') as index_fh:
    index_fh.write(json.dumps(index))
Initial commit Python script generates the index correctly, but not optimized at all... 2013-12-26 17:16:12 +01:00			`#!/usr/bin/env python3`

			`import os`
			`from lxml import html`
			`import re`
First working, clearly not optimized, version 2013-12-28 20:42:06 +01:00			`import json`
Initial commit Python script generates the index correctly, but not optimized at all... 2013-12-26 17:16:12 +01:00			`from collections import defaultdict`


			`# List all files in path directory`
			`def list_directory(path):`
			`fichier = []`
			`for root, dirs, files in os.walk(path):`
			`for i in files:`
			`fichier.append(os.path.join(root, i))`
			`return fichier`


			`def remove_common_words(words):`
			`returned = [word for word in words if len(word) > 3]`
			`return returned`

			`# =============================================================================`
			`samples = list_directory("samples/")`
			`index = defaultdict(list)`

			`for sample in samples:`
			`with open(sample, 'r') as sample_fh:`
			`content = sample_fh.read()`

			`# Get text from HTML content`
			`words = html.fromstring(content).text_content().replace("\n", "")`
			`words = re.findall(r"[\w]+", words)`
			`# Remove all punctuation etc., convert words to lower and delete duplicates`
			`words = list(set([word.lower() for word in words]))`

			`# Remove common words`
			`words = remove_common_words(words)`
			`# Stemming to reduce the number of words`
			`# TODO : Could use http://tartarus.org/martin/PorterStemmer/`

			`for word in words:`
First working, clearly not optimized, version 2013-12-28 20:42:06 +01:00			`index[sample].append(word)`
Initial commit Python script generates the index correctly, but not optimized at all... 2013-12-26 17:16:12 +01:00
First working, clearly not optimized, version 2013-12-28 20:42:06 +01:00			`with open("index.json", 'w') as index_fh:`
			`index_fh.write(json.dumps(index))`