Use a single common data source for public transports stops

Now makes use of Navitia opendata dumps to cover France. Fixes #65.
This commit is contained in:
Lucas Verney 2017-12-03 19:36:00 +01:00
parent 376b327379
commit aa0e1fd965
12 changed files with 167976 additions and 4682 deletions

View File

@ -74,8 +74,7 @@ which covers Paris. If you want to run the script using some other location,
you might have to change these files by matching datasets. you might have to change these files by matching datasets.
* [LaPoste Hexasmal](https://datanova.legroupe.laposte.fr/explore/dataset/laposte_hexasmal/?disjunctive.code_commune_insee&disjunctive.nom_de_la_commune&disjunctive.code_postal&disjunctive.libell_d_acheminement&disjunctive.ligne_5) for the list of cities and postal codes in France. * [LaPoste Hexasmal](https://datanova.legroupe.laposte.fr/explore/dataset/laposte_hexasmal/?disjunctive.code_commune_insee&disjunctive.nom_de_la_commune&disjunctive.code_postal&disjunctive.libell_d_acheminement&disjunctive.ligne_5) for the list of cities and postal codes in France.
* [RATP (Paris) stations](https://data.ratp.fr/explore/dataset/positions-geographiques-des-stations-du-reseau-ratp/table/?disjunctive.stop_name&disjunctive.code_postal&disjunctive.departement) for the list of subway/tram/bus stations with their positions in Paris and nearby areas. * [Navitia public transport datasets](https://navitia.opendatasoft.com/explore/?sort=modified&refine.geographicarea=France) for the list of subway/tram/bus stations with their positions in France. These are the `stops_fr-*.txt` files, extracted from the `NTFS` datasets for each region.
* [Tcl (Lyon) stations](https://download.data.grandlyon.com/wfs/rdata?SERVICE=WFS&VERSION=2.0.0&outputformat=GEOJSON&maxfeatures=4601&request=GetFeature&typename=tcl_sytral.tclarret&SRSNAME=urn:ogc:def:crs:EPSG::4326) for the list of subway/tram/bus stations with their positions in Lyon and nearby areas.
Both datasets are licensed under the Open Data Commons Open Database License Both datasets are licensed under the Open Data Commons Open Database License
(ODbL): https://opendatacommons.org/licenses/odbl/. (ODbL): https://opendatacommons.org/licenses/odbl/.

View File

@ -164,8 +164,8 @@ def main():
if args.cmd == "build-data": if args.cmd == "build-data":
force = True force = True
data.preprocess_data(config, force=force) if data.preprocess_data(config, force=force):
LOGGER.info("Done building data!") LOGGER.info("Done building data!")
if args.cmd == "build-data": if args.cmd == "build-data":
sys.exit(0) sys.exit(0)

View File

@ -43,6 +43,7 @@ def preprocess_data(config, force=False):
:params config: A config dictionary. :params config: A config dictionary.
:params force: Whether to force rebuild or not. :params force: Whether to force rebuild or not.
:return bool: Whether data have been built or not.
""" """
# Check if a build is required # Check if a build is required
get_session = database.init_db(config["database"], config["search_index"]) get_session = database.init_db(config["database"], config["search_index"])
@ -53,7 +54,7 @@ def preprocess_data(config, force=False):
) )
if is_built and not force: if is_built and not force:
# No need to rebuild the database, skip # No need to rebuild the database, skip
return return False
# Otherwise, purge all existing data # Otherwise, purge all existing data
session.query(PublicTransport).delete() session.query(PublicTransport).delete()
session.query(PostalCode).delete() session.query(PostalCode).delete()
@ -67,6 +68,7 @@ def preprocess_data(config, force=False):
) )
with get_session() as session: with get_session() as session:
session.add_all(data_objects) session.add_all(data_objects)
return True
@hash_dict @hash_dict
@ -88,7 +90,7 @@ def load_data(model, constraint, config):
areas = [] areas = []
# Get areas to fetch from, using postal codes # Get areas to fetch from, using postal codes
for postal_code in constraint["postal_codes"]: for postal_code in constraint["postal_codes"]:
areas.append(data_files.french_postal_codes_to_iso_3166(postal_code)) areas.append(data_files.french_postal_codes_to_quarter(postal_code))
# Load data for each area # Load data for each area
areas = list(set(areas)) areas = list(set(areas))
for area in areas: for area in areas:

View File

@ -3,10 +3,14 @@
Preprocessing functions to convert input opendata files into SQLAlchemy objects Preprocessing functions to convert input opendata files into SQLAlchemy objects
ready to be stored in the database. ready to be stored in the database.
""" """
import csv
import io
import json import json
import logging import logging
import os import os
from backports import csv
from flatisfy.models.postal_code import PostalCode from flatisfy.models.postal_code import PostalCode
from flatisfy.models.public_transport import PublicTransport from flatisfy.models.public_transport import PublicTransport
@ -15,18 +19,20 @@ LOGGER = logging.getLogger(__name__)
MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
def french_postal_codes_to_iso_3166(postal_code): def french_postal_codes_to_quarter(postal_code):
""" """
Convert a French postal code to the main subdivision in France this postal Convert a French postal code to the main quarter in France this postal
code belongs to (ISO 3166-2 code). code belongs to.
:param postal_code: The postal code to convert. :param postal_code: The postal code to convert.
:returns: The ISO 3166-2 code of the subdivision or ``None``. :returns: The quarter of France or ``None``.
""" """
departement = postal_code[:2]
# Mapping between areas (main subdivisions in French, ISO 3166-2) and # Mapping between areas (main subdivisions in French, ISO 3166-2) and
# French departements # French departements
# Taken from Wikipedia data. # Taken from Wikipedia data.
area_to_departement = { department_to_subdivision = {
"FR-ARA": ["01", "03", "07", "15", "26", "38", "42", "43", "63", "69", "FR-ARA": ["01", "03", "07", "15", "26", "38", "42", "43", "63", "69",
"73", "74"], "73", "74"],
"FR-BFC": ["21", "25", "39", "58", "70", "71", "89", "90"], "FR-BFC": ["21", "25", "39", "58", "70", "71", "89", "90"],
@ -44,17 +50,30 @@ def french_postal_codes_to_iso_3166(postal_code):
"FR-PDL": ["44", "49", "53", "72", "85"], "FR-PDL": ["44", "49", "53", "72", "85"],
"FR-PAC": ["04", "05", "06", "13", "83", "84"] "FR-PAC": ["04", "05", "06", "13", "83", "84"]
} }
subdivision_to_quarters = {
'FR-IDF': ['FR-IDF'],
'FR-NW': ['FR-BRE', 'FR-CVL', 'FR-NOR', 'FR-PDL'],
'FR-NE': ['FR-BFC', 'FR-GES', 'FR-HDF'],
'FR-SE': ['FR-ARA', 'FR-COR', 'FR-PAC', 'FR-OCC'],
'FR-SW': ['FR-NAQ']
}
departement = postal_code[:2] subdivision = next(
return next(
( (
i i
for i in area_to_departement for i, departments in department_to_subdivision.items()
if departement in area_to_departement[i] if departement in departments
),
None
)
return next(
(
i
for i, subdivisions in subdivision_to_quarters.items()
if subdivision in subdivisions
), ),
None None
) )
def _preprocess_laposte(): def _preprocess_laposte():
@ -69,8 +88,8 @@ def _preprocess_laposte():
raw_laposte_data = [] raw_laposte_data = []
# Load opendata file # Load opendata file
try: try:
with open( with io.open(
os.path.join(MODULE_DIR, data_file), "r" os.path.join(MODULE_DIR, data_file), "r", encoding='utf-8'
) as fh: ) as fh:
raw_laposte_data = json.load(fh) raw_laposte_data = json.load(fh)
except (IOError, ValueError): except (IOError, ValueError):
@ -82,7 +101,7 @@ def _preprocess_laposte():
for item in raw_laposte_data: for item in raw_laposte_data:
fields = item["fields"] fields = item["fields"]
try: try:
area = french_postal_codes_to_iso_3166(fields["code_postal"]) area = french_postal_codes_to_quarter(fields["code_postal"])
if area is None: if area is None:
LOGGER.info( LOGGER.info(
"No matching area found for postal code %s, skipping it.", "No matching area found for postal code %s, skipping it.",
@ -104,72 +123,45 @@ def _preprocess_laposte():
return postal_codes_data return postal_codes_data
def _preprocess_ratp(): def _preprocess_public_transport():
""" """
Build SQLAlchemy objects from the RATP data (public transport in Paris, Build SQLAlchemy objects from the Navitia public transport data.
France).
:return: A list of ``PublicTransport`` objects to be inserted in database. :return: A list of ``PublicTransport`` objects to be inserted in database.
""" """
data_file = "ratp.json" DATA_FILES = {
LOGGER.info("Building from %s data.", data_file) "FR-IDF": "stops_fr-idf.txt",
"FR-NW": "stops_fr-nw.txt",
"FR-NE": "stops_fr-ne.txt",
"FR-SW": "stops_fr-sw.txt",
"FR-SE": "stops_fr-se.txt"
}
ratp_data_raw = [] public_transport_data = []
# Load opendata file # Load opendata file
try: for area, data_file in DATA_FILES.items():
with open(os.path.join(MODULE_DIR, data_file), "r") as fh: LOGGER.info("Building from public transport data %s." % data_file)
ratp_data_raw = json.load(fh) try:
except (IOError, ValueError): with io.open(os.path.join(MODULE_DIR, data_file), "r",
LOGGER.error("Invalid raw RATP opendata file.") encoding='utf-8') as fh:
return [] filereader = csv.reader(fh)
next(filereader, None) # Skip first row (headers)
for row in filereader:
public_transport_data.append(PublicTransport(
name=row[2],
area=area,
lat=row[3],
lng=row[4]
))
except (IOError, IndexError):
LOGGER.error("Invalid raw opendata file: %s." % data_file)
return []
# Process it return public_transport_data
ratp_data = []
for item in ratp_data_raw:
fields = item["fields"]
ratp_data.append(PublicTransport(
name=fields["stop_name"],
area="FR-IDF",
lat=fields["coord"][0],
lng=fields["coord"][1]
))
return ratp_data
def _preprocess_tcl():
"""
Build SQLAlchemy objects from the Tcl data (public transport in Lyon,
France).
:return: A list of ``PublicTransport`` objects to be inserted in database.
"""
data_file = "tcl.json"
LOGGER.info("Building from %s data.", data_file)
tcl_data_raw = []
# Load opendata file
try:
with open(os.path.join(MODULE_DIR, data_file), "r") as fh:
tcl_data_raw = json.load(fh)
except (IOError, ValueError):
LOGGER.error("Invalid raw Tcl opendata file.")
return []
# Process it
tcl_data = []
for item in tcl_data_raw["features"]:
tcl_data.append(PublicTransport(
name=item["properties"]["nom"],
area="FR-ARA",
lat=item["geometry"]["coordinates"][1],
lng=item["geometry"]["coordinates"][0]
))
return tcl_data
# List of all the available preprocessing functions. Order can be important. # List of all the available preprocessing functions. Order can be important.
PREPROCESSING_FUNCTIONS = [ PREPROCESSING_FUNCTIONS = [
_preprocess_laposte, _preprocess_laposte,
_preprocess_ratp, _preprocess_public_transport
_preprocess_tcl
] ]

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,6 @@
appdirs appdirs
arrow arrow
backports.csv
bottle bottle
bottle-sqlalchemy bottle-sqlalchemy
canister canister