From 83ff6be4090cf292739a123a270f886faec06f62 Mon Sep 17 00:00:00 2001 From: "Phyks (Lucas Verney)" Date: Sat, 30 Dec 2017 19:30:32 +0100 Subject: [PATCH] Better fix for #100 --- doc/0.getting_started.md | 2 ++ flatisfy/config.py | 3 +++ flatisfy/filters/duplicates.py | 24 +++++++++++++++++------- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/doc/0.getting_started.md b/doc/0.getting_started.md index 36be5af..e80c8dc 100644 --- a/doc/0.getting_started.md +++ b/doc/0.getting_started.md @@ -124,6 +124,8 @@ List of configuration options: * `max_distance_housing_station` is the maximum distance (in meters) between an housing and a public transport station found for this housing (default is `1500`). This is useful to avoid false-positive. +* `duplicate_threshold` is the minimum score in the deep duplicate detection + step to consider two flats as being duplicates (defaults to `15`). _Note:_ In production, you can either use the `serve` command with a reliable webserver instead of the default Bottle webserver (specifying a `webserver` diff --git a/flatisfy/config.py b/flatisfy/config.py index bd5fe52..101ca27 100644 --- a/flatisfy/config.py +++ b/flatisfy/config.py @@ -47,6 +47,8 @@ DEFAULT_CONFIG = { # Max distance between an housing and a found station, to avoid # false-positive "max_distance_housing_station": 1500, + # Score to consider two flats as being duplicates + "duplicate_threshold": 15, # Navitia API key "navitia_api_key": None, # Number of filtering passes to run @@ -144,6 +146,7 @@ def validate_config(config, check_with_data): assert isinstance(config["store_personal_data"], bool) assert isinstance(config["max_distance_housing_station"], (int, float)) + assert isinstance(config["duplicate_threshold"], int) # Ensure constraints are ok assert config["constraints"] diff --git a/flatisfy/filters/duplicates.py b/flatisfy/filters/duplicates.py index e1da485..3016fc7 100644 --- a/flatisfy/filters/duplicates.py +++ b/flatisfy/filters/duplicates.py @@ -190,12 +190,6 @@ def deep_detect(flats_list): if flat2["id"] in matching_flats[flat1["id"]]: continue - if flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]: - # If the two flats are from the same website, consider they - # cannot be duplicates. See - # https://framagit.org/phyks/Flatisfy/issues/100. - continue - n_common_items = 0 try: # They should have the same area, up to one unit @@ -266,7 +260,23 @@ def deep_detect(flats_list): n_common_items += 5 * min(n_common_photos, 3) # Minimal score to consider they are duplicates - assert n_common_items >= 15 + assert n_common_items >= config["duplicate_threshold"] + + # If the two flats are from the same website and have a + # different float part, consider they cannot be duplicates. See + # https://framagit.org/phyks/Flatisfy/issues/100. + both_are_from_same_backend = ( + flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1] + ) + both_have_float_part = ( + (flat1["area"] % 1) > 0 and (flat2["area"] % 1) > 0 + ) + both_have_different_float_part = ( + (flat1["area"] % 1) != (flat2["area"] % 1) + ) + if(both_have_float_part and both_are_from_same_backend and + both_have_different_float_part): + continue except (AssertionError, TypeError): # Skip and consider as not duplicates whenever the conditions # are not met