Use a single common data source for public transport stops

Now makes use of Navitia opendata dumps to cover France. Fixes #65.
This commit is contained in:
Lucas Verney 2017-12-03 19:36:00 +01:00
parent 376b327379
commit aa0e1fd965
12 changed files with 167976 additions and 4682 deletions

View File

@ -74,8 +74,7 @@ which covers Paris. If you want to run the script using some other location,
you might have to change these files by matching datasets.
* [LaPoste Hexasmal](https://datanova.legroupe.laposte.fr/explore/dataset/laposte_hexasmal/?disjunctive.code_commune_insee&disjunctive.nom_de_la_commune&disjunctive.code_postal&disjunctive.libell_d_acheminement&disjunctive.ligne_5) for the list of cities and postal codes in France.
* [RATP (Paris) stations](https://data.ratp.fr/explore/dataset/positions-geographiques-des-stations-du-reseau-ratp/table/?disjunctive.stop_name&disjunctive.code_postal&disjunctive.departement) for the list of subway/tram/bus stations with their positions in Paris and nearby areas.
* [Tcl (Lyon) stations](https://download.data.grandlyon.com/wfs/rdata?SERVICE=WFS&VERSION=2.0.0&outputformat=GEOJSON&maxfeatures=4601&request=GetFeature&typename=tcl_sytral.tclarret&SRSNAME=urn:ogc:def:crs:EPSG::4326) for the list of subway/tram/bus stations with their positions in Lyon and nearby areas.
* [Navitia public transport datasets](https://navitia.opendatasoft.com/explore/?sort=modified&refine.geographicarea=France) for the list of subway/tram/bus stations with their positions in France. These are the `stops_fr-*.txt` files, extracted from the `NTFS` datasets for each region.
Both datasets are licensed under the Open Data Commons Open Database License
(ODbL): https://opendatacommons.org/licenses/odbl/.

View File

@ -164,7 +164,7 @@ def main():
if args.cmd == "build-data":
force = True
data.preprocess_data(config, force=force)
if data.preprocess_data(config, force=force):
LOGGER.info("Done building data!")
if args.cmd == "build-data":

View File

@ -43,6 +43,7 @@ def preprocess_data(config, force=False):
:param config: A config dictionary.
:param force: Whether to force rebuild or not.
:return bool: Whether data have been built or not.
"""
# Check if a build is required
get_session = database.init_db(config["database"], config["search_index"])
@ -53,7 +54,7 @@ def preprocess_data(config, force=False):
)
if is_built and not force:
# No need to rebuild the database, skip
return
return False
# Otherwise, purge all existing data
session.query(PublicTransport).delete()
session.query(PostalCode).delete()
@ -67,6 +68,7 @@ def preprocess_data(config, force=False):
)
with get_session() as session:
session.add_all(data_objects)
return True
@hash_dict
@ -88,7 +90,7 @@ def load_data(model, constraint, config):
areas = []
# Get areas to fetch from, using postal codes
for postal_code in constraint["postal_codes"]:
areas.append(data_files.french_postal_codes_to_iso_3166(postal_code))
areas.append(data_files.french_postal_codes_to_quarter(postal_code))
# Load data for each area
areas = list(set(areas))
for area in areas:

View File

@ -3,10 +3,14 @@
Preprocessing functions to convert input opendata files into SQLAlchemy objects
ready to be stored in the database.
"""
import csv
import io
import json
import logging
import os
from backports import csv
from flatisfy.models.postal_code import PostalCode
from flatisfy.models.public_transport import PublicTransport
@ -15,18 +19,20 @@ LOGGER = logging.getLogger(__name__)
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
def french_postal_codes_to_iso_3166(postal_code):
def french_postal_codes_to_quarter(postal_code):
"""
Convert a French postal code to the main subdivision in French this postal
code belongs to (ISO 3166-2 code).
Convert a French postal code to the main quarter in France this postal
code belongs to.
:param postal_code: The postal code to convert.
:returns: The ISO 3166-2 code of the subdivision or ``None``.
:returns: The quarter of France or ``None``.
"""
departement = postal_code[:2]
# Mapping between areas (main subdivisions in France, ISO 3166-2) and
# French departements
# Taken from Wikipedia data.
area_to_departement = {
department_to_subdivision = {
"FR-ARA": ["01", "03", "07", "15", "26", "38", "42", "43", "63", "69",
"73", "74"],
"FR-BFC": ["21", "25", "39", "58", "70", "71", "89", "90"],
@ -44,17 +50,30 @@ def french_postal_codes_to_iso_3166(postal_code):
"FR-PDL": ["44", "49", "53", "72", "85"],
"FR-PAC": ["04", "05", "06", "13", "83", "84"]
}
subdivision_to_quarters = {
'FR-IDF': ['FR-IDF'],
'FR-NW': ['FR-BRE', 'FR-CVL', 'FR-NOR', 'FR-PDL'],
'FR-NE': ['FR-BFC', 'FR-GES', 'FR-HDF'],
'FR-SE': ['FR-ARA', 'FR-COR', 'FR-PAC', 'FR-OCC'],
'FR-SW': ['FR-NAQ']
}
departement = postal_code[:2]
return next(
subdivision = next(
(
i
for i in area_to_departement
if departement in area_to_departement[i]
for i, departments in department_to_subdivision.items()
if departement in departments
),
None
)
return next(
(
i
for i, subdivisions in subdivision_to_quarters.items()
if subdivision in subdivisions
),
None
)
def _preprocess_laposte():
@ -69,8 +88,8 @@ def _preprocess_laposte():
raw_laposte_data = []
# Load opendata file
try:
with open(
os.path.join(MODULE_DIR, data_file), "r"
with io.open(
os.path.join(MODULE_DIR, data_file), "r", encoding='utf-8'
) as fh:
raw_laposte_data = json.load(fh)
except (IOError, ValueError):
@ -82,7 +101,7 @@ def _preprocess_laposte():
for item in raw_laposte_data:
fields = item["fields"]
try:
area = french_postal_codes_to_iso_3166(fields["code_postal"])
area = french_postal_codes_to_quarter(fields["code_postal"])
if area is None:
LOGGER.info(
"No matching area found for postal code %s, skipping it.",
@ -104,72 +123,45 @@ def _preprocess_laposte():
return postal_codes_data
def _preprocess_ratp():
def _preprocess_public_transport():
"""
Build SQLAlchemy objects from the RATP data (public transport in Paris,
France).
Build SQLAlchemy objects from the Navitia public transport data.
:return: A list of ``PublicTransport`` objects to be inserted in database.
"""
data_file = "ratp.json"
LOGGER.info("Building from %s data.", data_file)
DATA_FILES = {
"FR-IDF": "stops_fr-idf.txt",
"FR-NW": "stops_fr-nw.txt",
"FR-NE": "stops_fr-ne.txt",
"FR-SW": "stops_fr-sw.txt",
"FR-SE": "stops_fr-se.txt"
}
ratp_data_raw = []
public_transport_data = []
# Load opendata file
for area, data_file in DATA_FILES.items():
LOGGER.info("Building from public transport data %s." % data_file)
try:
with open(os.path.join(MODULE_DIR, data_file), "r") as fh:
ratp_data_raw = json.load(fh)
except (IOError, ValueError):
LOGGER.error("Invalid raw RATP opendata file.")
with io.open(os.path.join(MODULE_DIR, data_file), "r",
encoding='utf-8') as fh:
filereader = csv.reader(fh)
next(filereader, None) # Skip first row (headers)
for row in filereader:
public_transport_data.append(PublicTransport(
name=row[2],
area=area,
lat=row[3],
lng=row[4]
))
except (IOError, IndexError):
LOGGER.error("Invalid raw opendata file: %s." % data_file)
return []
# Process it
ratp_data = []
for item in ratp_data_raw:
fields = item["fields"]
ratp_data.append(PublicTransport(
name=fields["stop_name"],
area="FR-IDF",
lat=fields["coord"][0],
lng=fields["coord"][1]
))
return ratp_data
def _preprocess_tcl():
"""
Build SQLAlchemy objects from the Tcl data (public transport in Lyon,
France).
:return: A list of ``PublicTransport`` objects to be inserted in database.
"""
data_file = "tcl.json"
LOGGER.info("Building from %s data.", data_file)
tcl_data_raw = []
# Load opendata file
try:
with open(os.path.join(MODULE_DIR, data_file), "r") as fh:
tcl_data_raw = json.load(fh)
except (IOError, ValueError):
LOGGER.error("Invalid raw Tcl opendata file.")
return []
# Process it
tcl_data = []
for item in tcl_data_raw["features"]:
tcl_data.append(PublicTransport(
name=item["properties"]["nom"],
area="FR-ARA",
lat=item["geometry"]["coordinates"][1],
lng=item["geometry"]["coordinates"][0]
))
return tcl_data
return public_transport_data
# List of all the available preprocessing functions. Order can be important.
PREPROCESSING_FUNCTIONS = [
_preprocess_laposte,
_preprocess_ratp,
_preprocess_tcl
_preprocess_public_transport
]

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,6 @@
appdirs
arrow
backports.csv
bottle
bottle-sqlalchemy
canister