Better fix for #100
This commit is contained in:
parent
67d1299b15
commit
83ff6be409
@ -124,6 +124,8 @@ List of configuration options:
|
||||
* `max_distance_housing_station` is the maximum distance (in meters) between
|
||||
a housing and a public transport station found for this housing (default is
|
||||
`1500`). This is useful to avoid false positives.
|
||||
* `duplicate_threshold` is the minimum score in the deep duplicate detection
|
||||
step to consider two flats as being duplicates (defaults to `15`).
|
||||
|
||||
_Note:_ In production, you can either use the `serve` command with a reliable
|
||||
webserver instead of the default Bottle webserver (specifying a `webserver`
|
||||
|
@ -47,6 +47,8 @@ DEFAULT_CONFIG = {
|
||||
# Max distance between a housing and a found station, to avoid
|
||||
# false positives
|
||||
"max_distance_housing_station": 1500,
|
||||
# Score to consider two flats as being duplicates
|
||||
"duplicate_threshold": 15,
|
||||
# Navitia API key
|
||||
"navitia_api_key": None,
|
||||
# Number of filtering passes to run
|
||||
@ -144,6 +146,7 @@ def validate_config(config, check_with_data):
|
||||
|
||||
assert isinstance(config["store_personal_data"], bool)
|
||||
assert isinstance(config["max_distance_housing_station"], (int, float))
|
||||
assert isinstance(config["duplicate_threshold"], int)
|
||||
|
||||
# Ensure constraints are ok
|
||||
assert config["constraints"]
|
||||
|
@ -190,12 +190,6 @@ def deep_detect(flats_list):
|
||||
if flat2["id"] in matching_flats[flat1["id"]]:
|
||||
continue
|
||||
|
||||
if flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]:
|
||||
# If the two flats are from the same website, consider they
|
||||
# cannot be duplicates. See
|
||||
# https://framagit.org/phyks/Flatisfy/issues/100.
|
||||
continue
|
||||
|
||||
n_common_items = 0
|
||||
try:
|
||||
# They should have the same area, up to one unit
|
||||
@ -266,7 +260,23 @@ def deep_detect(flats_list):
|
||||
n_common_items += 5 * min(n_common_photos, 3)
|
||||
|
||||
# Minimal score to consider they are duplicates
|
||||
assert n_common_items >= 15
|
||||
assert n_common_items >= config["duplicate_threshold"]
|
||||
|
||||
# If both flats are from the same website and their areas have
|
||||
# distinct non-zero fractional parts, they are not duplicates. See
|
||||
# https://framagit.org/phyks/Flatisfy/issues/100.
|
||||
both_are_from_same_backend = (
|
||||
flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]
|
||||
)
|
||||
both_have_float_part = (
|
||||
(flat1["area"] % 1) > 0 and (flat2["area"] % 1) > 0
|
||||
)
|
||||
both_have_different_float_part = (
|
||||
(flat1["area"] % 1) != (flat2["area"] % 1)
|
||||
)
|
||||
if(both_have_float_part and both_are_from_same_backend and
|
||||
both_have_different_float_part):
|
||||
continue
|
||||
except (AssertionError, TypeError):
|
||||
# Skip and consider as not duplicates whenever the conditions
|
||||
# are not met
|
||||
|
Loading…
Reference in New Issue
Block a user