From 9f328259a71d5a130bc11345fa7b716b7b2822d5 Mon Sep 17 00:00:00 2001 From: Gautier P Date: Fri, 29 Jan 2021 12:03:50 +0100 Subject: [PATCH] Add INSEE filtering --- flatisfy/config.py | 12 ++++++- flatisfy/data_files/__init__.py | 3 +- flatisfy/filters/__init__.py | 17 +++++++++- flatisfy/filters/metadata.py | 31 ++++++++++++++----- flatisfy/models/postal_code.py | 1 + .../9e58c66f1ac1_add_flat_insee_column.py | 24 ++++++++++++++ 6 files changed, 77 insertions(+), 11 deletions(-) create mode 100644 migrations/versions/9e58c66f1ac1_add_flat_insee_column.py diff --git a/flatisfy/config.py b/flatisfy/config.py index 21776df..285d32f 100644 --- a/flatisfy/config.py +++ b/flatisfy/config.py @@ -32,6 +32,7 @@ DEFAULT_CONFIG = { "house_types": [], # List of house types, must be in APART, HOUSE, # PARKING, LAND, OTHER or UNKNOWN "postal_codes": [], # List of postal codes + "insees": [], # List of INSEE codes "area": (None, None), # (min, max) in m^2 "cost": (None, None), # (min, max) in currency unit "rooms": (None, None), # (min, max) @@ -202,13 +203,22 @@ def validate_config(config, check_with_data): assert "postal_codes" in constraint assert constraint["postal_codes"] assert all(isinstance(x, str) for x in constraint["postal_codes"]) + if "insee_codes" in constraint: + assert constraint["insee_codes"] + assert all(isinstance(x, str) for x in constraint["insee_codes"]) + if check_with_data: # Ensure data is built into db data.preprocess_data(config, force=False) # Check postal codes - opendata_postal_codes = [x.postal_code for x in data.load_data(PostalCode, constraint, config)] + opendata = data.load_data(PostalCode, constraint, config) + opendata_postal_codes = [x.postal_code for x in opendata] + opendata_insee_codes = [x.insee_code for x in opendata] for postal_code in constraint["postal_codes"]: assert postal_code in opendata_postal_codes # noqa: E501 + if "insee_codes" in constraint: + for insee in constraint["insee_codes"]: + assert insee in opendata_insee_codes 
# noqa: E501 assert "area" in constraint _check_constraints_bounds(constraint["area"]) diff --git a/flatisfy/data_files/__init__.py b/flatisfy/data_files/__init__.py index 4064ce4..455f40e 100644 --- a/flatisfy/data_files/__init__.py +++ b/flatisfy/data_files/__init__.py @@ -151,7 +151,7 @@ def _preprocess_laposte(): try: area = french_postal_codes_to_quarter(fields["code_postal"]) if area is None: - LOGGER.info( + LOGGER.debug( "No matching area found for postal code %s, skipping it.", fields["code_postal"], ) @@ -167,6 +167,7 @@ def _preprocess_laposte(): PostalCode( area=area, postal_code=fields["code_postal"], + insee_code=fields["code_commune_insee"], name=name, lat=fields["coordonnees_gps"][0], lng=fields["coordonnees_gps"][1], diff --git a/flatisfy/filters/__init__.py b/flatisfy/filters/__init__.py index 472b56e..2216d04 100644 --- a/flatisfy/filters/__init__.py +++ b/flatisfy/filters/__init__.py @@ -37,7 +37,22 @@ def refine_with_housing_criteria(flats_list, constraint): # Check postal code postal_code = flat["flatisfy"].get("postal_code", None) if postal_code and postal_code not in constraint["postal_codes"]: - LOGGER.info("Postal code %s for flat %s is out of range.", postal_code, flat["id"]) + LOGGER.info( + "Postal code %s for flat %s is out of range (%s).", + postal_code, + flat["id"], + ", ".join(constraint["postal_codes"]), + ) + is_ok[i] = is_ok[i] and False + # Check insee code + insee_code = flat["flatisfy"].get("insee_code", None) + if insee_code and "insee_codes" in constraint and insee_code not in constraint["insee_codes"]: + LOGGER.info( + "insee code %s for flat %s is out of range (%s).", + insee_code, + flat["id"], + ", ".join(constraint["insee_codes"]), + ) is_ok[i] = is_ok[i] and False # Check time_to diff --git a/flatisfy/filters/metadata.py b/flatisfy/filters/metadata.py index 6f14562..57a2baa 100644 --- a/flatisfy/filters/metadata.py +++ b/flatisfy/filters/metadata.py @@ -88,8 +88,8 @@ def fuzzy_match(query, choices, limit=3, 
threshold=75): [('denfert rochereau', 100), ('saint-jacques', 76)] """ # TODO: Is there a better confidence measure? - normalized_query = tools.normalize_string(query) - normalized_choices = [tools.normalize_string(choice) for choice in choices] + normalized_query = tools.normalize_string(query).replace("saint", "st") + normalized_choices = [tools.normalize_string(choice).replace("saint", "st") for choice in choices] # Remove duplicates in the choices list unique_normalized_choices = tools.uniqify(normalized_choices) @@ -116,10 +116,11 @@ def fuzzy_match(query, choices, limit=3, threshold=75): return matches -def guess_location_position(location, cities, constraint): +def guess_location_position(location, cities, constraint, must_match): # try to find a city # Find all fuzzy-matching cities postal_code = None + insee_code = None position = None matched_cities = fuzzy_match(location, [x.name for x in cities], limit=None) @@ -128,6 +129,7 @@ def guess_location_position(location, cities, constraint): matched_postal_codes = [] for matched_city_name, _ in matched_cities: postal_code_objects_for_city = [x for x in cities if x.name == matched_city_name] + insee_code = [pc.insee_code for pc in postal_code_objects_for_city][0] matched_postal_codes.extend(pc.postal_code for pc in postal_code_objects_for_city) # Try to match them with postal codes in config constraint matched_postal_codes_in_config = set(matched_postal_codes) & set(constraint["postal_codes"]) @@ -154,7 +156,15 @@ def guess_location_position(location, cities, constraint): LOGGER.debug(("Found position %s using city %s."), position, matched_city_name) break - return (postal_code, position) + if not postal_code and must_match: + postal_code = cities[0].postal_code + position = { + "lat": cities[0].lat, + "lng": cities[0].lng, + } + insee_code = cities[0].insee_code + + return (postal_code, insee_code, position) def guess_postal_code(flats_list, constraint, config, distance_threshold=20000): @@ -189,6 +199,7 @@ 
def guess_postal_code(flats_list, constraint, config, distance_threshold=20000): continue postal_code = None + insee_code = None position = None # Try to find a postal code directly @@ -209,11 +220,12 @@ def guess_postal_code(flats_list, constraint, config, distance_threshold=20000): postal_code = None # Then fetch position (and postal_code is couldn't be found earlier) + cities = opendata["postal_codes"] if postal_code: - cities = [x for x in opendata["postal_codes"] if x.postal_code == postal_code] - (_, position) = guess_location_position(location, cities, constraint) - else: - (postal_code, position) = guess_location_position(location, opendata["postal_codes"], constraint) + cities = [x for x in cities if x.postal_code == postal_code] + (postal_code, insee_code, position) = guess_location_position( + location, cities, constraint, postal_code is not None + ) # Check that postal code is not too far from the ones listed in config, # limit bad fuzzy matching @@ -257,6 +269,9 @@ def guess_postal_code(flats_list, constraint, config, distance_threshold=20000): else: LOGGER.info("No postal code found for flat %s.", flat["id"]) + if insee_code: + flat["flatisfy"]["insee_code"] = insee_code + if position: flat["flatisfy"]["position"] = position diff --git a/flatisfy/models/postal_code.py b/flatisfy/models/postal_code.py index 7747641..4d2a671 100644 --- a/flatisfy/models/postal_code.py +++ b/flatisfy/models/postal_code.py @@ -27,6 +27,7 @@ class PostalCode(BASE): # following ISO 3166-2. 
area = Column(String, index=True) postal_code = Column(String, index=True) + insee_code = Column(String, index=True) name = Column(String, index=True) lat = Column(Float) lng = Column(Float) diff --git a/migrations/versions/9e58c66f1ac1_add_flat_insee_column.py b/migrations/versions/9e58c66f1ac1_add_flat_insee_column.py new file mode 100644 index 0000000..69bea5c --- /dev/null +++ b/migrations/versions/9e58c66f1ac1_add_flat_insee_column.py @@ -0,0 +1,24 @@ +"""Add INSEE code column to the postal codes table + +Revision ID: 9e58c66f1ac1 +Revises: d21933db9ad8 +Create Date: 2021-02-08 16:31:18.961186 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "9e58c66f1ac1" +down_revision = "d21933db9ad8" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column("postal_codes", sa.Column("insee_code", sa.String())) + + +def downgrade(): + op.drop_column("postal_codes", "insee_code")