Add INSEE filtering

This commit is contained in:
Gautier P 2021-01-29 12:03:50 +01:00
rodič b3e316cf5b
revize 9f328259a7
6 změnil soubory, kde provedl 77 přidání a 11 odebrání

Zobrazit soubor

@ -32,6 +32,7 @@ DEFAULT_CONFIG = {
"house_types": [], # List of house types, must be in APART, HOUSE,
# PARKING, LAND, OTHER or UNKNOWN
"postal_codes": [], # List of postal codes
"insees": [], # List of postal codes
"area": (None, None), # (min, max) in m^2
"cost": (None, None), # (min, max) in currency unit
"rooms": (None, None), # (min, max)
@ -202,13 +203,22 @@ def validate_config(config, check_with_data):
assert "postal_codes" in constraint
assert constraint["postal_codes"]
assert all(isinstance(x, str) for x in constraint["postal_codes"])
if "insee_codes" in constraint:
assert constraint["insee_codes"]
assert all(isinstance(x, str) for x in constraint["insee_codes"])
if check_with_data:
# Ensure data is built into db
data.preprocess_data(config, force=False)
# Check postal codes
opendata_postal_codes = [x.postal_code for x in data.load_data(PostalCode, constraint, config)]
opendata = data.load_data(PostalCode, constraint, config)
opendata_postal_codes = [x.postal_code for x in opendata]
opendata_insee_codes = [x.insee_code for x in opendata]
for postal_code in constraint["postal_codes"]:
assert postal_code in opendata_postal_codes # noqa: E501
if "insee_codes" in constraint:
for insee in constraint["insee_codes"]:
assert insee in opendata_insee_codes # noqa: E501
assert "area" in constraint
_check_constraints_bounds(constraint["area"])

Zobrazit soubor

@ -151,7 +151,7 @@ def _preprocess_laposte():
try:
area = french_postal_codes_to_quarter(fields["code_postal"])
if area is None:
LOGGER.info(
LOGGER.debug(
"No matching area found for postal code %s, skipping it.",
fields["code_postal"],
)
@ -167,6 +167,7 @@ def _preprocess_laposte():
PostalCode(
area=area,
postal_code=fields["code_postal"],
insee_code=fields["code_commune_insee"],
name=name,
lat=fields["coordonnees_gps"][0],
lng=fields["coordonnees_gps"][1],

Zobrazit soubor

@ -37,7 +37,22 @@ def refine_with_housing_criteria(flats_list, constraint):
# Check postal code
postal_code = flat["flatisfy"].get("postal_code", None)
if postal_code and postal_code not in constraint["postal_codes"]:
LOGGER.info("Postal code %s for flat %s is out of range.", postal_code, flat["id"])
LOGGER.info(
"Postal code %s for flat %s is out of range (%s).",
postal_code,
flat["id"],
", ".join(constraint["postal_codes"]),
)
is_ok[i] = is_ok[i] and False
# Check insee code
insee_code = flat["flatisfy"].get("insee_code", None)
if insee_code and "insee_codes" in constraint and insee_code not in constraint["insee_codes"]:
LOGGER.info(
"insee code %s for flat %s is out of range (%s).",
insee_code,
flat["id"],
", ".join(constraint["insee_codes"]),
)
is_ok[i] = is_ok[i] and False
# Check time_to

Zobrazit soubor

@ -88,8 +88,8 @@ def fuzzy_match(query, choices, limit=3, threshold=75):
[('denfert rochereau', 100), ('saint-jacques', 76)]
"""
# TODO: Is there a better confidence measure?
normalized_query = tools.normalize_string(query)
normalized_choices = [tools.normalize_string(choice) for choice in choices]
normalized_query = tools.normalize_string(query).replace("saint", "st")
normalized_choices = [tools.normalize_string(choice).replace("saint", "st") for choice in choices]
# Remove duplicates in the choices list
unique_normalized_choices = tools.uniqify(normalized_choices)
@ -116,10 +116,11 @@ def fuzzy_match(query, choices, limit=3, threshold=75):
return matches
def guess_location_position(location, cities, constraint):
def guess_location_position(location, cities, constraint, must_match):
# try to find a city
# Find all fuzzy-matching cities
postal_code = None
insee_code = None
position = None
matched_cities = fuzzy_match(location, [x.name for x in cities], limit=None)
@ -128,6 +129,7 @@ def guess_location_position(location, cities, constraint):
matched_postal_codes = []
for matched_city_name, _ in matched_cities:
postal_code_objects_for_city = [x for x in cities if x.name == matched_city_name]
insee_code = [pc.insee_code for pc in postal_code_objects_for_city][0]
matched_postal_codes.extend(pc.postal_code for pc in postal_code_objects_for_city)
# Try to match them with postal codes in config constraint
matched_postal_codes_in_config = set(matched_postal_codes) & set(constraint["postal_codes"])
@ -154,7 +156,15 @@ def guess_location_position(location, cities, constraint):
LOGGER.debug(("Found position %s using city %s."), position, matched_city_name)
break
return (postal_code, position)
if not postal_code and must_match:
postal_code = cities[0].postal_code
position = {
"lat": cities[0].lat,
"lng": cities[0].lng,
}
insee_code = cities[0].insee_code
return (postal_code, insee_code, position)
def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
@ -189,6 +199,7 @@ def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
continue
postal_code = None
insee_code = None
position = None
# Try to find a postal code directly
@ -209,11 +220,12 @@ def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
postal_code = None
# Then fetch position (and postal_code is couldn't be found earlier)
cities = opendata["postal_codes"]
if postal_code:
cities = [x for x in opendata["postal_codes"] if x.postal_code == postal_code]
(_, position) = guess_location_position(location, cities, constraint)
else:
(postal_code, position) = guess_location_position(location, opendata["postal_codes"], constraint)
cities = [x for x in cities if x.postal_code == postal_code]
(postal_code, insee_code, position) = guess_location_position(
location, cities, constraint, postal_code is not None
)
# Check that postal code is not too far from the ones listed in config,
# limit bad fuzzy matching
@ -257,6 +269,9 @@ def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
else:
LOGGER.info("No postal code found for flat %s.", flat["id"])
if insee_code:
flat["flatisfy"]["insee_code"] = insee_code
if position:
flat["flatisfy"]["position"] = position

Zobrazit soubor

@ -27,6 +27,7 @@ class PostalCode(BASE):
# following ISO 3166-2.
area = Column(String, index=True)
postal_code = Column(String, index=True)
insee_code = Column(String, index=True)
name = Column(String, index=True)
lat = Column(Float)
lng = Column(Float)

Zobrazit soubor

@ -0,0 +1,24 @@
"""Add flat INSEE column
Revision ID: 9e58c66f1ac1
Revises: d21933db9ad8
Create Date: 2021-02-08 16:31:18.961186
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "9e58c66f1ac1"
down_revision = "d21933db9ad8"
branch_labels = None
depends_on = None
def upgrade():
op.add_column("postal_codes", sa.Column("insee_code", sa.String()))
def downgrade():
op.drop_column("postal_codes", "insee_code")