2018-01-19 11:50:11 +01:00
|
|
|
# coding: utf-8
|
2017-06-15 15:48:16 +02:00
|
|
|
"""
|
|
|
|
Preprocessing functions to convert input opendata files into SQLAlchemy objects
|
|
|
|
ready to be stored in the database.
|
|
|
|
"""
|
2018-01-19 11:50:11 +01:00
|
|
|
from __future__ import absolute_import, print_function, unicode_literals
|
2017-12-03 19:36:00 +01:00
|
|
|
import io
|
2017-06-15 15:48:16 +02:00
|
|
|
import json
|
|
|
|
import logging
|
|
|
|
import os
|
2018-01-18 13:58:12 +01:00
|
|
|
import sys
|
2017-06-15 15:48:16 +02:00
|
|
|
|
2018-01-19 11:50:11 +01:00
|
|
|
import titlecase
|
|
|
|
|
2018-01-18 14:48:28 +01:00
|
|
|
from flatisfy.models.postal_code import PostalCode
|
|
|
|
from flatisfy.models.public_transport import PublicTransport
|
2018-01-19 11:50:11 +01:00
|
|
|
from flatisfy.tools import normalize_string
|
2018-01-18 14:48:28 +01:00
|
|
|
|
2021-01-03 11:31:43 +01:00
|
|
|
import csv
|
2017-12-03 19:36:00 +01:00
|
|
|
|
2017-06-15 15:48:16 +02:00
|
|
|
|
|
|
|
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
# Directory containing this module (symlinks resolved). The opendata files
# read by the preprocessing functions are looked up relative to it.
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))

# Extend the small-word pattern used by ``titlecase`` with French small words.
# NOTE(review): ``titlecase.SMALL`` is presumably the library's built-in
# alternation of English small words -- confirm against the titlecase docs.
titlecase.set_small_word_list(
    # Add French small words
    r"l|d|un|une|et|à|a|sur|ou|le|la|de|lès|les|"
    + titlecase.SMALL
)

# Public transport stops file (relative to ``MODULE_DIR``) for each area.
# The keys are the same quarter identifiers as the ones returned by
# ``french_postal_codes_to_quarter``.
TRANSPORT_DATA_FILES = {
    "FR-IDF": "stops_fr-idf.txt",
    "FR-NW": "stops_fr-nw.txt",
    "FR-NE": "stops_fr-ne.txt",
    "FR-SW": "stops_fr-sw.txt",
    "FR-SE": "stops_fr-se.txt",
}
|
|
|
|
|
2017-06-15 15:48:16 +02:00
|
|
|
|
2017-12-03 19:36:00 +01:00
|
|
|
# Mapping between areas (main subdivisions in France, ISO 3166-2) and the
# two-digit postal code prefixes of the French departements they contain.
# Taken from Wikipedia data. Each departement is listed exactly once
# (Loire-Atlantique, "44", administratively belongs to Pays de la Loire).
_SUBDIVISION_DEPARTMENTS = {
    "FR-ARA": ["01", "03", "07", "15", "26", "38", "42", "43", "63", "69", "73", "74"],
    "FR-BFC": ["21", "25", "39", "58", "70", "71", "89", "90"],
    "FR-BRE": ["22", "29", "35", "56"],
    "FR-CVL": ["18", "28", "36", "37", "41", "45"],
    "FR-COR": ["20"],
    "FR-GES": ["08", "10", "51", "52", "54", "55", "57", "67", "68", "88"],
    "FR-HDF": ["02", "59", "60", "62", "80"],
    "FR-IDF": ["75", "77", "78", "91", "92", "93", "94", "95"],
    "FR-NOR": ["14", "27", "50", "61", "76"],
    "FR-NAQ": ["16", "17", "19", "23", "24", "33", "40", "47", "64", "79", "86", "87"],
    "FR-OCC": ["09", "11", "12", "30", "31", "32", "34", "46", "48", "65", "66", "81", "82"],
    "FR-PDL": ["44", "49", "53", "72", "85"],
    "FR-PAC": ["04", "05", "06", "13", "83", "84"],
}

# Grouping of the subdivisions above into the five "quarters" of France.
_QUARTER_SUBDIVISIONS = {
    "FR-IDF": ["FR-IDF"],
    "FR-NW": ["FR-BRE", "FR-CVL", "FR-NOR", "FR-PDL"],
    "FR-NE": ["FR-BFC", "FR-GES", "FR-HDF"],
    "FR-SE": ["FR-ARA", "FR-COR", "FR-PAC", "FR-OCC"],
    "FR-SW": ["FR-NAQ"],
}

# Flattened departement -> quarter lookup, computed once at import time so
# that each conversion is a single dictionary access instead of two scans.
_DEPARTMENT_TO_QUARTER = {
    department: quarter
    for quarter, subdivisions in _QUARTER_SUBDIVISIONS.items()
    for subdivision in subdivisions
    for department in _SUBDIVISION_DEPARTMENTS[subdivision]
}


def french_postal_codes_to_quarter(postal_code):
    """
    Convert a French postal code to the main quarter in France this postal
    code belongs to.

    The quarter is derived from the first two characters of the postal code
    (the departement prefix). Prefixes that are not listed in the mapping
    (overseas territories for instance) have no matching quarter.

    :param postal_code: The postal code to convert, as a string.
    :returns: The quarter of France (one of ``FR-IDF``, ``FR-NW``, ``FR-NE``,
        ``FR-SE``, ``FR-SW``) or ``None`` if the postal code is empty,
        ``None`` or does not match any known departement.
    """
    if not postal_code:
        # Missing or empty postal code: nothing to look up.
        return None

    return _DEPARTMENT_TO_QUARTER.get(postal_code[:2])
|
|
|
|
|
|
|
|
|
|
|
|
def _preprocess_laposte():
    """
    Build SQLAlchemy objects from the postal codes data.

    Records are skipped (with a debug log) when their postal code does not
    belong to any known area or when a required field is missing. Only the
    first complete record is kept for each ``(postal code, name)`` pair.

    :return: A list of ``PostalCode`` objects to be inserted in database. The
        list is empty if the opendata file cannot be read or parsed.
    """
    data_file = "laposte.json"
    LOGGER.info("Building from %s data.", data_file)

    # Load opendata file
    try:
        with io.open(os.path.join(MODULE_DIR, data_file), "r", encoding="utf-8") as fh:
            raw_laposte_data = json.load(fh)
    except (IOError, ValueError):
        LOGGER.error("Invalid raw LaPoste opendata file.")
        return []

    # Build postal codes to other infos file
    postal_codes_data = []
    # Keep track of seen (postal_codes, names) to avoid inserting useless
    # duplicates (already in the OpenData file). A set keeps the membership
    # test constant-time whatever the size of the file.
    seen_postal_codes = set()
    for item in raw_laposte_data:
        # A record without "fields" is handled as a record missing all data.
        fields = item.get("fields") or {}
        try:
            postal_code = fields["code_postal"]
            area = french_postal_codes_to_quarter(postal_code)
            if area is None:
                LOGGER.debug(
                    "No matching area found for postal code %s, skipping it.",
                    postal_code,
                )
                continue

            name = normalize_string(titlecase.titlecase(fields["nom_de_la_commune"]), lowercase=False)

            key = (postal_code, name)
            if key in seen_postal_codes:
                continue

            entry = PostalCode(
                area=area,
                postal_code=postal_code,
                insee_code=fields["code_commune_insee"],
                name=name,
                lat=fields["coordonnees_gps"][0],
                lng=fields["coordonnees_gps"][1],
            )
        except KeyError:
            # ``code_postal`` itself may be the missing key, hence ``.get``.
            LOGGER.debug("Missing data for postal code %s, skipping it.", fields.get("code_postal"))
            continue

        # Only mark the pair as seen once the object has been built, so that
        # an incomplete record does not shadow a later complete duplicate.
        seen_postal_codes.add(key)
        postal_codes_data.append(entry)

    return postal_codes_data
|
|
|
|
|
|
|
|
|
2017-12-03 19:36:00 +01:00
|
|
|
def _preprocess_public_transport():
    """
    Build SQLAlchemy objects from the Navitia public transport data.

    Every file listed in ``TRANSPORT_DATA_FILES`` is read as CSV. The first
    row (headers) and blank rows are ignored; for the other rows, the third
    column is used as the stop name and the fourth and fifth columns as its
    latitude and longitude.

    :return: A list of ``PublicTransport`` objects to be inserted in database.
        The list is empty as soon as one of the files cannot be read or
        contains a malformed row.
    """
    public_transport_data = []
    # Load opendata file
    for area, data_file in TRANSPORT_DATA_FILES.items():
        LOGGER.info("Building from public transport data %s.", data_file)
        try:
            # ``newline=""`` lets the csv module handle line endings itself,
            # as required for newlines embedded in quoted fields.
            with io.open(os.path.join(MODULE_DIR, data_file), "r", encoding="utf-8", newline="") as fh:
                filereader = csv.reader(fh)
                next(filereader, None)  # Skip first row (headers)
                for row in filereader:
                    if not row:
                        # csv.reader yields an empty list for a blank line;
                        # this is not a malformed record, just skip it.
                        continue
                    public_transport_data.append(PublicTransport(name=row[2], area=area, lat=row[3], lng=row[4]))
        except (IOError, IndexError, csv.Error):
            LOGGER.error("Invalid raw opendata file: %s.", data_file)
            return []

    return public_transport_data
|
2017-06-15 15:48:16 +02:00
|
|
|
|
|
|
|
|
|
|
|
# List of all the available preprocessing functions. Order can be important.
# Each entry is a callable taking no argument and returning a list of
# SQLAlchemy objects to be inserted in database.
PREPROCESSING_FUNCTIONS = [_preprocess_laposte, _preprocess_public_transport]
|