Towards a more modular system for data files

Also use `lru_cache` to memoize the data file loading function, which
speeds everything up a bit.
This commit is contained in:
Lucas Verney 2017-06-14 15:29:33 +02:00
parent b3ae71a8be
commit 3469035f4a
2 changed files with 49 additions and 26 deletions

View File

@ -8,6 +8,8 @@ import argparse
import logging import logging
import sys import sys
logging.basicConfig()
import flatisfy.config import flatisfy.config
from flatisfy import cmds from flatisfy import cmds
from flatisfy import data from flatisfy import data
@ -118,14 +120,14 @@ def main():
# Set logger # Set logger
if args.vv: if args.vv:
logging.basicConfig(level=logging.DEBUG) logging.getLogger('').setLevel(logging.DEBUG)
logging.getLogger('sqlalchemy.engine').setLevel(logging.DEBUG) logging.getLogger('sqlalchemy.engine').setLevel(logging.DEBUG)
elif args.verbose: elif args.verbose:
logging.basicConfig(level=logging.INFO) logging.getLogger('').setLevel(logging.INFO)
# sqlalchemy INFO level is way too loud, just stick with WARNING # sqlalchemy INFO level is way too loud, just stick with WARNING
logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING) logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)
else: else:
logging.basicConfig(level=logging.WARNING) logging.getLogger('').setLevel(logging.WARNING)
logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING) logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)
# Init-config command # Init-config command

View File

@ -10,12 +10,27 @@ import json
import logging import logging
import os import os
import flatisfy.exceptions import flatisfy.exceptions
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
# Try to load lru_cache: first from the standard library, then from the
# `functools32` backport. If neither is importable, fall back to a no-op
# decorator factory so that `@lru_cache(...)` still works, just without any
# caching.
# NOTE: the real implementations require every argument of the decorated
# function to be hashable.
try:
    from functools import lru_cache
except ImportError:
    try:
        from functools32 import lru_cache
    except ImportError:
        def lru_cache(maxsize=None, typed=False):
            """
            No-op stand-in for ``functools.lru_cache``.

            :param maxsize: Ignored, accepted for signature compatibility.
            :param typed: Ignored, accepted for signature compatibility.
            :return: A decorator returning the decorated function unchanged.
            """
            def _identity(func):
                return func
            return _identity

        LOGGER.warning(
            "`functools.lru_cache` is not available on your system. Consider "
            "installing `functools32` Python module if using Python2 for "
            "better performances."
        )
def _preprocess_ratp(output_dir): def _preprocess_ratp(output_dir):
""" """
@ -98,6 +113,18 @@ def _preprocess_laposte(output_dir):
return True return True
# Registry of the data sources to build. For each entry:
#   * the key is the identifier reported in log and error messages,
#   * "preprocess" is the function called with the opendata directory to
#     build the files; a falsy return value is treated as a build failure,
#   * "output" lists the file names that must all exist in the opendata
#     directory for the source to be considered already built.
DATA_FILES = {
    "ratp.json": {
        "preprocess": _preprocess_ratp,
        "output": [
            "ratp.json",
        ],
    },
    "laposte.json": {
        "preprocess": _preprocess_laposte,
        "output": [
            "cities.json",
            "postal_codes.json",
        ],
    },
}
def preprocess_data(config, force=False): def preprocess_data(config, force=False):
""" """
Ensures that all the necessary data files have been built from the raw Ensures that all the necessary data files have been built from the raw
@ -115,38 +142,32 @@ def preprocess_data(config, force=False):
except OSError: except OSError:
LOGGER.debug("Opendata directory already existed, doing nothing.") LOGGER.debug("Opendata directory already existed, doing nothing.")
is_built_ratp = os.path.isfile( # Build all the necessary data files
os.path.join(opendata_directory, "ratp.json") for data_file in DATA_FILES:
# Check if already built
is_built = all(
os.path.isfile(
os.path.join(opendata_directory, output)
) for output in DATA_FILES[data_file]["output"]
) )
if not is_built_ratp or force: if not is_built or force:
LOGGER.info("Building from RATP data.") # Build if needed
if not _preprocess_ratp(opendata_directory): LOGGER.info("Building from {} data.".format(data_file))
raise flatisfy.exceptions.DataBuildError("Error with RATP data.") if not DATA_FILES[data_file]["preprocess"](opendata_directory):
is_built_laposte = (
os.path.isfile(os.path.join(opendata_directory, "cities.json")) and
os.path.isfile(os.path.join(opendata_directory, "postal_codes.json"))
)
if not is_built_laposte or force:
LOGGER.info("Building from LaPoste data.")
if not _preprocess_laposte(opendata_directory):
raise flatisfy.exceptions.DataBuildError( raise flatisfy.exceptions.DataBuildError(
"Error with LaPoste data." "Error with {} data.".format(data_file)
) )
@lru_cache(maxsize=5)
def load_data(data_type, config): def load_data(data_type, config):
""" """
Load a given built data file. Load a given built data file. This function is memoized.
:param data_type: A valid data identifier. :param data_type: A valid data identifier.
:param config: A config dictionary. :param config: A config dictionary.
:return: The loaded data. ``None`` if the query is incorrect. :return: The loaded data. ``None`` if the query is incorrect.
""" """
if data_type not in ["postal_codes", "cities", "ratp"]:
LOGGER.error("Invalid request. No %s data file.", data_type)
return None
opendata_directory = os.path.join(config["data_directory"], "opendata") opendata_directory = os.path.join(config["data_directory"], "opendata")
datafile_path = os.path.join(opendata_directory, "%s.json" % data_type) datafile_path = os.path.join(opendata_directory, "%s.json" % data_type)
data = {} data = {}