Better fix for #100
This commit is contained in:
parent
67d1299b15
commit
83ff6be409
@ -124,6 +124,8 @@ List of configuration options:
|
|||||||
* `max_distance_housing_station` is the maximum distance (in meters) between
|
* `max_distance_housing_station` is the maximum distance (in meters) between
|
||||||
a housing and a public transport station found for this housing (default is
|
a housing and a public transport station found for this housing (default is
|
||||||
`1500`). This is useful to avoid false positives.
|
`1500`). This is useful to avoid false positives.
|
||||||
|
* `duplicate_threshold` is the minimum score in the deep duplicate detection
|
||||||
|
step to consider two flats as being duplicates (defaults to `15`).
|
||||||
|
|
||||||
_Note:_ In production, you can either use the `serve` command with a reliable
|
_Note:_ In production, you can either use the `serve` command with a reliable
|
||||||
webserver instead of the default Bottle webserver (specifying a `webserver`
|
webserver instead of the default Bottle webserver (specifying a `webserver`
|
||||||
|
@ -47,6 +47,8 @@ DEFAULT_CONFIG = {
|
|||||||
# Max distance between an housing and a found station, to avoid
|
# Max distance between an housing and a found station, to avoid
|
||||||
# false-positive
|
# false-positive
|
||||||
"max_distance_housing_station": 1500,
|
"max_distance_housing_station": 1500,
|
||||||
|
# Score to consider two flats as being duplicates
|
||||||
|
"duplicate_threshold": 15,
|
||||||
# Navitia API key
|
# Navitia API key
|
||||||
"navitia_api_key": None,
|
"navitia_api_key": None,
|
||||||
# Number of filtering passes to run
|
# Number of filtering passes to run
|
||||||
@ -144,6 +146,7 @@ def validate_config(config, check_with_data):
|
|||||||
|
|
||||||
assert isinstance(config["store_personal_data"], bool)
|
assert isinstance(config["store_personal_data"], bool)
|
||||||
assert isinstance(config["max_distance_housing_station"], (int, float))
|
assert isinstance(config["max_distance_housing_station"], (int, float))
|
||||||
|
assert isinstance(config["duplicate_threshold"], int)
|
||||||
|
|
||||||
# Ensure constraints are ok
|
# Ensure constraints are ok
|
||||||
assert config["constraints"]
|
assert config["constraints"]
|
||||||
|
@ -190,12 +190,6 @@ def deep_detect(flats_list):
|
|||||||
if flat2["id"] in matching_flats[flat1["id"]]:
|
if flat2["id"] in matching_flats[flat1["id"]]:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]:
|
|
||||||
# If the two flats are from the same website, consider they
|
|
||||||
# cannot be duplicates. See
|
|
||||||
# https://framagit.org/phyks/Flatisfy/issues/100.
|
|
||||||
continue
|
|
||||||
|
|
||||||
n_common_items = 0
|
n_common_items = 0
|
||||||
try:
|
try:
|
||||||
# They should have the same area, up to one unit
|
# They should have the same area, up to one unit
|
||||||
@ -266,7 +260,23 @@ def deep_detect(flats_list):
|
|||||||
n_common_items += 5 * min(n_common_photos, 3)
|
n_common_items += 5 * min(n_common_photos, 3)
|
||||||
|
|
||||||
# Minimal score to consider they are duplicates
|
# Minimal score to consider they are duplicates
|
||||||
assert n_common_items >= 15
|
assert n_common_items >= config["duplicate_threshold"]
|
||||||
|
|
||||||
|
# If the two flats are from the same website and have a
|
||||||
|
# different float part, consider they cannot be duplicates. See
|
||||||
|
# https://framagit.org/phyks/Flatisfy/issues/100.
|
||||||
|
both_are_from_same_backend = (
|
||||||
|
flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]
|
||||||
|
)
|
||||||
|
both_have_float_part = (
|
||||||
|
(flat1["area"] % 1) > 0 and (flat2["area"] % 1) > 0
|
||||||
|
)
|
||||||
|
both_have_different_float_part = (
|
||||||
|
(flat1["area"] % 1) != (flat2["area"] % 1)
|
||||||
|
)
|
||||||
|
if(both_have_float_part and both_are_from_same_backend and
|
||||||
|
both_have_different_float_part):
|
||||||
|
continue
|
||||||
except (AssertionError, TypeError):
|
except (AssertionError, TypeError):
|
||||||
# Skip and consider as not duplicates whenever the conditions
|
# Skip and consider as not duplicates whenever the conditions
|
||||||
# are not met
|
# are not met
|
||||||
|
Loading…
Reference in New Issue
Block a user