Better fix for #100

This commit is contained in:
Lucas Verney 2017-12-30 19:30:32 +01:00
parent 67d1299b15
commit 83ff6be409
3 changed files with 22 additions and 7 deletions

View File

@ -124,6 +124,8 @@ List of configuration options:
* `max_distance_housing_station` is the maximum distance (in meters) between
a housing and a public transport station found for this housing (default is
`1500`). This is useful to avoid false positives.
* `duplicate_threshold` is the minimum score in the deep duplicate detection
step to consider two flats as being duplicates (defaults to `15`).
_Note:_ In production, you can either use the `serve` command with a reliable
webserver instead of the default Bottle webserver (specifying a `webserver`

View File

@ -47,6 +47,8 @@ DEFAULT_CONFIG = {
# Max distance between a housing and a found station, to avoid
# false positives
"max_distance_housing_station": 1500,
# Score to consider two flats as being duplicates
"duplicate_threshold": 15,
# Navitia API key
"navitia_api_key": None,
# Number of filtering passes to run
@ -144,6 +146,7 @@ def validate_config(config, check_with_data):
assert isinstance(config["store_personal_data"], bool)
assert isinstance(config["max_distance_housing_station"], (int, float))
assert isinstance(config["duplicate_threshold"], int)
# Ensure constraints are ok
assert config["constraints"]

View File

@ -190,12 +190,6 @@ def deep_detect(flats_list):
if flat2["id"] in matching_flats[flat1["id"]]:
continue
if flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]:
# If the two flats are from the same website, consider they
# cannot be duplicates. See
# https://framagit.org/phyks/Flatisfy/issues/100.
continue
n_common_items = 0
try:
# They should have the same area, up to one unit
@ -266,7 +260,23 @@ def deep_detect(flats_list):
n_common_items += 5 * min(n_common_photos, 3)
# Minimal score to consider they are duplicates
assert n_common_items >= 15
assert n_common_items >= config["duplicate_threshold"]
# If the two flats are from the same website and their areas both have
# a non-zero fractional part, but these fractional parts differ,
# consider they cannot be duplicates. See
# https://framagit.org/phyks/Flatisfy/issues/100.
both_are_from_same_backend = (
flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]
)
both_have_float_part = (
(flat1["area"] % 1) > 0 and (flat2["area"] % 1) > 0
)
both_have_different_float_part = (
(flat1["area"] % 1) != (flat2["area"] % 1)
)
if(both_have_float_part and both_are_from_same_backend and
both_have_different_float_part):
continue
except (AssertionError, TypeError):
# Skip and consider as not duplicates whenever the conditions
# are not met