Browse Source

Use a single common data source for public transports stops

Now makes use of Navitia opendata dumps to cover France. Fixes #65.
navitia
Lucas Verney 5 years ago
parent
commit
aa0e1fd965
  1. 3
      README.md
  2. 4
      flatisfy/__main__.py
  3. 6
      flatisfy/data.py
  4. 130
      flatisfy/data_files/__init__.py
  5. 1
      flatisfy/data_files/ratp.json
  6. 59362
      flatisfy/data_files/stops_fr-idf.txt
  7. 25211
      flatisfy/data_files/stops_fr-ne.txt
  8. 15287
      flatisfy/data_files/stops_fr-nw.txt
  9. 49703
      flatisfy/data_files/stops_fr-se.txt
  10. 18344
      flatisfy/data_files/stops_fr-sw.txt
  11. 4606
      flatisfy/data_files/tcl.json
  12. 1
      requirements.txt

3
README.md

@@ -74,8 +74,7 @@ which covers Paris. If you want to run the script using some other location,
you might have to change these files by matching datasets.
* [LaPoste Hexasmal](https://datanova.legroupe.laposte.fr/explore/dataset/laposte_hexasmal/?disjunctive.code_commune_insee&disjunctive.nom_de_la_commune&disjunctive.code_postal&disjunctive.libell_d_acheminement&disjunctive.ligne_5) for the list of cities and postal codes in France.
* [RATP (Paris) stations](https://data.ratp.fr/explore/dataset/positions-geographiques-des-stations-du-reseau-ratp/table/?disjunctive.stop_name&disjunctive.code_postal&disjunctive.departement) for the list of subway/tram/bus stations with their positions in Paris and nearby areas.
* [Tcl (Lyon) stations](https://download.data.grandlyon.com/wfs/rdata?SERVICE=WFS&VERSION=2.0.0&outputformat=GEOJSON&maxfeatures=4601&request=GetFeature&typename=tcl_sytral.tclarret&SRSNAME=urn:ogc:def:crs:EPSG::4326) for the list of subway/tram/bus stations with their positions in Lyon and nearby areas.
* [Navitia public transport datasets](https://navitia.opendatasoft.com/explore/?sort=modified&refine.geographicarea=France) for the list of subway/tram/bus stations with their positions in France. These are the `stops_fr-*.txt` files, extracted from the `NTFS` datasets for each region.
Both datasets are licensed under the Open Data Commons Open Database License
(ODbL): https://opendatacommons.org/licenses/odbl/.

4
flatisfy/__main__.py

@@ -164,8 +164,8 @@ def main():
if args.cmd == "build-data":
force = True
data.preprocess_data(config, force=force)
LOGGER.info("Done building data!")
if data.preprocess_data(config, force=force):
LOGGER.info("Done building data!")
if args.cmd == "build-data":
sys.exit(0)

6
flatisfy/data.py

@ -43,6 +43,7 @@ def preprocess_data(config, force=False): @@ -43,6 +43,7 @@ def preprocess_data(config, force=False):
:params config: A config dictionary.
:params force: Whether to force rebuild or not.
:return bool: Whether data have been built or not.
"""
# Check if a build is required
get_session = database.init_db(config["database"], config["search_index"])
@ -53,7 +54,7 @@ def preprocess_data(config, force=False): @@ -53,7 +54,7 @@ def preprocess_data(config, force=False):
)
if is_built and not force:
# No need to rebuild the database, skip
return
return False
# Otherwise, purge all existing data
session.query(PublicTransport).delete()
session.query(PostalCode).delete()
@ -67,6 +68,7 @@ def preprocess_data(config, force=False): @@ -67,6 +68,7 @@ def preprocess_data(config, force=False):
)
with get_session() as session:
session.add_all(data_objects)
return True
@hash_dict
@ -88,7 +90,7 @@ def load_data(model, constraint, config): @@ -88,7 +90,7 @@ def load_data(model, constraint, config):
areas = []
# Get areas to fetch from, using postal codes
for postal_code in constraint["postal_codes"]:
areas.append(data_files.french_postal_codes_to_iso_3166(postal_code))
areas.append(data_files.french_postal_codes_to_quarter(postal_code))
# Load data for each area
areas = list(set(areas))
for area in areas:

130
flatisfy/data_files/__init__.py

@ -3,10 +3,14 @@ @@ -3,10 +3,14 @@
Preprocessing functions to convert input opendata files into SQLAlchemy objects
ready to be stored in the database.
"""
import csv
import io
import json
import logging
import os
from backports import csv
from flatisfy.models.postal_code import PostalCode
from flatisfy.models.public_transport import PublicTransport
@ -15,18 +19,20 @@ LOGGER = logging.getLogger(__name__) @@ -15,18 +19,20 @@ LOGGER = logging.getLogger(__name__)
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
def french_postal_codes_to_iso_3166(postal_code):
def french_postal_codes_to_quarter(postal_code):
"""
Convert a French postal code to the main subdivision in French this postal
code belongs to (ISO 3166-2 code).
Convert a French postal code to the main quarter in France this postal
code belongs to.
:param postal_code: The postal code to convert.
:returns: The ISO 3166-2 code of the subdivision or ``None``.
:returns: The quarter of France or ``None``.
"""
departement = postal_code[:2]
# Mapping between areas (main subdivisions in French, ISO 3166-2) and
# French departements
# Taken from Wikipedia data.
area_to_departement = {
department_to_subdivision = {
"FR-ARA": ["01", "03", "07", "15", "26", "38", "42", "43", "63", "69",
"73", "74"],
"FR-BFC": ["21", "25", "39", "58", "70", "71", "89", "90"],
@ -44,19 +50,32 @@ def french_postal_codes_to_iso_3166(postal_code): @@ -44,19 +50,32 @@ def french_postal_codes_to_iso_3166(postal_code):
"FR-PDL": ["44", "49", "53", "72", "85"],
"FR-PAC": ["04", "05", "06", "13", "83", "84"]
}
subdivision_to_quarters = {
'FR-IDF': ['FR-IDF'],
'FR-NW': ['FR-BRE', 'FR-CVL', 'FR-NOR', 'FR-PDL'],
'FR-NE': ['FR-BFC', 'FR-GES', 'FR-HDF'],
'FR-SE': ['FR-ARA', 'FR-COR', 'FR-PAC', 'FR-OCC'],
'FR-SW': ['FR-NAQ']
}
departement = postal_code[:2]
subdivision = next(
(
i
for i, departments in department_to_subdivision.items()
if departement in departments
),
None
)
return next(
(
i
for i in area_to_departement
if departement in area_to_departement[i]
for i, subdivisions in subdivision_to_quarters.items()
if subdivision in subdivisions
),
None
)
def _preprocess_laposte():
"""
Build SQLAlchemy objects from the postal codes data.
@ -69,8 +88,8 @@ def _preprocess_laposte(): @@ -69,8 +88,8 @@ def _preprocess_laposte():
raw_laposte_data = []
# Load opendata file
try:
with open(
os.path.join(MODULE_DIR, data_file), "r"
with io.open(
os.path.join(MODULE_DIR, data_file), "r", encoding='utf-8'
) as fh:
raw_laposte_data = json.load(fh)
except (IOError, ValueError):
@ -82,7 +101,7 @@ def _preprocess_laposte(): @@ -82,7 +101,7 @@ def _preprocess_laposte():
for item in raw_laposte_data:
fields = item["fields"]
try:
area = french_postal_codes_to_iso_3166(fields["code_postal"])
area = french_postal_codes_to_quarter(fields["code_postal"])
if area is None:
LOGGER.info(
"No matching area found for postal code %s, skipping it.",
@ -104,72 +123,45 @@ def _preprocess_laposte(): @@ -104,72 +123,45 @@ def _preprocess_laposte():
return postal_codes_data
def _preprocess_public_transport():
    """
    Build SQLAlchemy objects from the Navitia public transport data.

    Reads the per-quarter ``stops_fr-*.txt`` CSV files (extracted from the
    Navitia NTFS opendata dumps) and builds one ``PublicTransport`` object
    per stop row, tagged with the quarter of France it belongs to.

    :return: A list of ``PublicTransport`` objects to be inserted in
        database, or an empty list if any data file is invalid.
    """
    # Map each quarter of France to its Navitia stops dump.
    DATA_FILES = {
        "FR-IDF": "stops_fr-idf.txt",
        "FR-NW": "stops_fr-nw.txt",
        "FR-NE": "stops_fr-ne.txt",
        "FR-SW": "stops_fr-sw.txt",
        "FR-SE": "stops_fr-se.txt"
    }

    public_transport_data = []
    for area, data_file in DATA_FILES.items():
        # Lazy %-style logging args instead of eager "%" formatting.
        LOGGER.info("Building from public transport data %s.", data_file)
        try:
            # io.open with an explicit encoding keeps this correct on both
            # Python 2 (with backports.csv) and Python 3.
            with io.open(os.path.join(MODULE_DIR, data_file), "r",
                         encoding='utf-8') as fh:
                filereader = csv.reader(fh)
                next(filereader, None)  # Skip first row (headers)
                for row in filereader:
                    # Column layout: row[2] is the stop name, row[3] the
                    # latitude and row[4] the longitude — assumes the NTFS
                    # stops.txt layout; TODO confirm against the dumps.
                    public_transport_data.append(PublicTransport(
                        name=row[2],
                        area=area,
                        lat=row[3],
                        lng=row[4]
                    ))
        except (IOError, IndexError):
            # One broken file invalidates the whole build: return nothing
            # rather than a partial dataset.
            LOGGER.error("Invalid raw opendata file: %s.", data_file)
            return []

    return public_transport_data
# List of all the available preprocessing functions. Order can be important.
PREPROCESSING_FUNCTIONS = [
    _preprocess_laposte,
    _preprocess_public_transport
]

1
flatisfy/data_files/ratp.json

File diff suppressed because one or more lines are too long

59362
flatisfy/data_files/stops_fr-idf.txt

File diff suppressed because it is too large Load Diff

25211
flatisfy/data_files/stops_fr-ne.txt

File diff suppressed because it is too large Load Diff

15287
flatisfy/data_files/stops_fr-nw.txt

File diff suppressed because it is too large Load Diff

49703
flatisfy/data_files/stops_fr-se.txt

File diff suppressed because it is too large Load Diff

18344
flatisfy/data_files/stops_fr-sw.txt

File diff suppressed because it is too large Load Diff

4606
flatisfy/data_files/tcl.json

File diff suppressed because it is too large Load Diff

1
requirements.txt

@ -1,5 +1,6 @@ @@ -1,5 +1,6 @@
appdirs
arrow
backports.csv
bottle
bottle-sqlalchemy
canister

Loading…
Cancel
Save