flatisfy/flatisfy/filters/duplicates.py
# coding: utf-8
"""
Filtering functions to detect and merge duplicates.
"""
from __future__ import absolute_import, print_function, unicode_literals
import collections
import itertools
import logging
import os
import re
import imagehash
import requests

from flatisfy import tools
from flatisfy.constants import BACKENDS_BY_PRECEDENCE
from flatisfy.filters.cache import ImageCache

LOGGER = logging.getLogger(__name__)

def homogeneize_phone_number(numbers):
"""
Homogeneize the phone numbers, by stripping any space, dash or dot as well
as the international prefix. Assumes it is dealing with French phone
numbers (starting with a zero and having 10 characters).
:param numbers: The phone number string to homogeneize (can contain
multiple phone numbers).
:return: The cleaned phone number. ``None`` if the number is not valid.
"""
if not numbers:
return None
clean_numbers = []
for number in numbers.split(","):
number = number.strip()
number = number.replace(".", "")
number = number.replace(" ", "")
number = number.replace("-", "")
number = number.replace("(", "")
number = number.replace(")", "")
        # Strip the international prefix (e.g. "+33")
        number = re.sub(r"^\+\d\d", "", number)
if not number.startswith("0"):
number = "0" + number
if len(number) == 10:
clean_numbers.append(number)
if not clean_numbers:
return None
return ", ".join(clean_numbers)
def get_or_compute_photo_hash(photo, photo_cache):
"""
Get the computed hash from the photo dict or compute it if not found.
:param photo: A photo, as a ``dict`` with (at least) a ``url`` key.
:param photo_cache: An instance of ``ImageCache`` to use to cache images.
"""
try:
# Try to get the computed hash from the photo dict
return photo["hash"]
except KeyError:
# Otherwise, get the image and compute the hash
image = photo_cache.get(photo["url"])
if not image:
return None
photo["hash"] = imagehash.average_hash(image)
return photo["hash"]
def compare_photos(photo1, photo2, photo_cache, hash_threshold):
    """
    Compare two photos using the average hash method.

    :param photo1: First photo, as a ``dict`` with (at least) a ``url`` key.
    :param photo2: Second photo, as a ``dict`` with (at least) a ``url`` key.
    :param photo_cache: An instance of ``ImageCache`` to use to cache images.
    :param hash_threshold: The hash difference threshold between two images.
        Two different photos usually have a hash difference of around 30.
    :return: ``True`` if the photos are identical, else ``False``.
    """
try:
hash1 = get_or_compute_photo_hash(photo1, photo_cache)
hash2 = get_or_compute_photo_hash(photo2, photo_cache)
return hash1 - hash2 < hash_threshold
    except (IOError, requests.exceptions.RequestException, TypeError):
        # TypeError occurs when one of the hashes is None, i.e. when the
        # image could not be fetched.
        return False
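
# Illustrative usage of compare_photos (URLs are made up; photo_cache is an
# ``ImageCache`` instance):
#     photo1 = {"url": "https://example.com/1.jpg"}
#     photo2 = {"url": "https://example.com/2.jpg"}
#     compare_photos(photo1, photo2, photo_cache, hash_threshold=10)
#     # -> True whenever the hash difference between the images is below 10.
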
def find_number_common_photos(flat1_photos, flat2_photos, photo_cache, hash_threshold):
"""
Compute the number of common photos between the two lists of photos for the
flats.
2018-01-19 13:53:53 +01:00
Fetch the photos and compare them with average hash method.
:param flat1_photos: First list of flat photos. Each photo should be a
``dict`` with (at least) a ``url`` key.
2018-01-19 13:53:53 +01:00
:param flat2_photos: Second list of flat photos. Each photo should be a
``dict`` with (at least) a ``url`` key.
:param photo_cache: An instance of ``ImageCache`` to use to cache images.
2018-01-21 11:52:52 +01:00
:param hash_threshold: The hash threshold between two images.
:return: The found number of common photos.
"""
n_common_photos = 0
for photo1, photo2 in itertools.product(flat1_photos, flat2_photos):
if compare_photos(photo1, photo2, photo_cache, hash_threshold):
n_common_photos += 1
return n_common_photos
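
# Illustrative usage of find_number_common_photos (URLs are made up):
#     photos1 = [{"url": "https://example.com/a.jpg"}]
#     photos2 = [{"url": "https://example.com/a_copy.jpg"},
#                {"url": "https://example.com/b.jpg"}]
#     find_number_common_photos(photos1, photos2, photo_cache, 10)
#     # -> 1 if only the first pair matches. Every matching (photo1, photo2)
#     # pair counts, as the comparison runs over the cartesian product of
#     # the two lists.
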
def detect(flats_list, key="id", merge=True, should_intersect=False):
    """
    Detect obvious duplicates within a given list of flats.

    There may be duplicates found, as some queries could overlap (especially
    since, when asked for a given place, websites tend to return housings in
    nearby locations as well). We need to handle them, by either deleting the
    duplicates (``merge=False``) or merging them together into a single flat
    object (``merge=True``).

    :param flats_list: A list of flats dicts.
    :param key: The flat dicts key on which the duplicate detection should be
        done.
    :param merge: Whether the found duplicates should be merged together, or
        whether we should only keep one of them.
    :param should_intersect: Set to ``True`` if the values in the flat dicts
        are lists and you want to deduplicate on non-empty intersection
        (typically if they have a common url).
    :return: A tuple of the deduplicated list of flat dicts and the list of
        all the flat objects that should be removed and considered as
        duplicates (they were already merged).
    """
    # ``seen`` is a dict aggregating the flats by the deduplication key
    # values. We basically make buckets of flats for every key value. Flats
    # in the same bucket should be merged together afterwards.
seen = collections.defaultdict(list)
for flat in flats_list:
if should_intersect:
# We add each value separately. We will add some flats multiple
# times, but we deduplicate again on id below to compensate.
for value in flat.get(key, []):
seen[value].append(flat)
else:
seen[flat.get(key, None)].append(flat)
# Generate the unique flats list based on these buckets
unique_flats_list = []
# Keep track of all the flats that were removed by deduplication
duplicate_flats = []
for flat_key, matching_flats in seen.items():
if flat_key is None:
            # If the key is None, it means Weboob could not load the data. In
            # this case, we consider every matching item as being independent
            # of the others, to avoid over-deduplication.
unique_flats_list.extend(matching_flats)
else:
# Sort matching flats by backend precedence
            matching_flats.sort(
                key=lambda flat: next(
                    i
                    for (i, backend) in enumerate(BACKENDS_BY_PRECEDENCE)
                    if flat["id"].endswith(backend)
                ),
                reverse=True,
            )
if len(matching_flats) > 1:
                LOGGER.info(
                    'Found duplicates using key "%s": %s.',
                    key,
                    [flat["id"] for flat in matching_flats],
                )
            # Check the merge policy
if merge:
# If a merge is requested, do the merge
unique_flats_list.append(tools.merge_dicts(*matching_flats))
else:
# Otherwise, just keep the most important of them
unique_flats_list.append(matching_flats[-1])
# The ID of the added merged flat will be the one of the last item
# in ``matching_flats``. Then, any flat object that was before in
# the ``matching_flats`` list is to be considered as a duplicate
# and should have a ``duplicate`` status.
duplicate_flats.extend(matching_flats[:-1])
if should_intersect:
# We added some flats twice with the above method, let's deduplicate on
# id.
unique_flats_list, _ = detect(
unique_flats_list, key="id", merge=True, should_intersect=False
)
return unique_flats_list, duplicate_flats
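
# Illustrative usage of detect (flats and backend suffixes are made up):
#     flats = [
#         {"id": "1@seloger", "urls": ["https://example.com/ad"]},
#         {"id": "2@pap", "urls": ["https://example.com/ad"]},
#     ]
#     unique, duplicates = detect(flats, key="urls", merge=True,
#                                 should_intersect=True)
#     # -> ``unique`` holds a single merged flat and ``duplicates`` holds
#     # the other one.
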
def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold):
"""
Compute the duplicate score between two flats. The higher the score, the
more likely the two flats to be duplicates.
:param flat1: First flat dict.
:param flat2: Second flat dict.
:param photo_cache: An instance of ``ImageCache`` to use to cache images.
2018-01-21 11:52:52 +01:00
:param hash_threshold: The hash threshold between two images.
:return: The duplicate score as ``int``.
"""
n_common_items = 0
try:
# They should have the same area, up to one unit
assert abs(flat1["area"] - flat2["area"]) < 1
n_common_items += 1
# They should be at the same price, up to one unit
assert abs(flat1["cost"] - flat2["cost"]) < 1
n_common_items += 1
# They should have the same number of bedrooms if this was
# fetched for both
if flat1["bedrooms"] and flat2["bedrooms"]:
assert flat1["bedrooms"] == flat2["bedrooms"]
n_common_items += 1
# They should have the same utilities (included or excluded for
# both of them), if this was fetched for both
if flat1["utilities"] and flat2["utilities"]:
assert flat1["utilities"] == flat2["utilities"]
n_common_items += 1
# They should have the same number of rooms if it was fetched
# for both of them
if flat1["rooms"] and flat2["rooms"]:
assert flat1["rooms"] == flat2["rooms"]
n_common_items += 1
# They should have the same postal code, if available
        if (
            "flatisfy" in flat1
            and "flatisfy" in flat2
            and flat1["flatisfy"].get("postal_code", None)
            and flat2["flatisfy"].get("postal_code", None)
        ):
            assert flat1["flatisfy"]["postal_code"] == flat2["flatisfy"]["postal_code"]
            n_common_items += 1
# TODO: Better text comparison (one included in the other, fuzzymatch)
flat1_text = tools.normalize_string(flat1.get("text", ""))
flat2_text = tools.normalize_string(flat2.get("text", ""))
if flat1_text and flat2_text and flat1_text == flat2_text:
n_common_items += 1
# They should have the same phone number if it was fetched for
# both
flat1_phone = homogeneize_phone_number(flat1["phone"])
flat2_phone = homogeneize_phone_number(flat2["phone"])
if flat1_phone and flat2_phone:
# Use an "in" test as there could be multiple phone numbers
# returned by a weboob module
if flat1_phone in flat2_phone or flat2_phone in flat1_phone:
n_common_items += 4 # Counts much more than the rest
        # If the two flats are from the same website and have a different
        # float part in their area, consider that they cannot be duplicates.
        # See https://framagit.org/phyks/Flatisfy/issues/100.
        both_are_from_same_backend = (
            flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]
        )
        both_have_float_part = (flat1["area"] % 1) > 0 and (flat2["area"] % 1) > 0
        both_have_equal_float_part = (flat1["area"] % 1) == (flat2["area"] % 1)
        if both_have_float_part and both_are_from_same_backend:
            assert both_have_equal_float_part
if flat1.get("photos", []) and flat2.get("photos", []):
n_common_photos = find_number_common_photos(
2021-01-26 14:39:52 +01:00
flat1["photos"], flat2["photos"], photo_cache, hash_threshold
2018-01-14 11:53:59 +01:00
)
2021-01-26 14:39:52 +01:00
min_number_photos = min(len(flat1["photos"]), len(flat2["photos"]))
2018-01-14 11:53:59 +01:00
# Either all the photos are the same, or there are at least
# three common photos.
if n_common_photos == min_number_photos:
n_common_items += 15
else:
n_common_items += 5 * min(n_common_photos, 3)
    except (AssertionError, TypeError):
        # Skip and consider the flats as not duplicates whenever the
        # conditions are not met. TypeError occurs when an area or a cost is
        # None, in which case the flats should not be considered duplicates.
        n_common_items = 0
return n_common_items
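
# Illustrative scoring (hypothetical flats; the keys shown are the ones this
# function expects):
#     flat1 = {"id": "1@seloger", "area": 50.0, "cost": 800, "bedrooms": 2,
#              "utilities": "included", "rooms": 3, "phone": "0612345678",
#              "photos": []}
#     flat2 = dict(flat1, id="2@pap")
#     get_duplicate_score(flat1, flat2, photo_cache, 10)
#     # -> 9: +1 each for matching area, cost, bedrooms, utilities and
#     # rooms, and +4 for the matching phone number.
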
def deep_detect(flats_list, config):
"""
Deeper detection of duplicates based on any available data.
:param flats_list: A list of flats dicts.
2018-01-05 19:01:08 +01:00
:param config: A config dict.
:return: A tuple of the deduplicated list of flat dicts and the list of all
2017-12-05 14:56:08 +01:00
the flats objects that should be removed and considered as duplicates
(they were already merged).
"""
if config["serve_images_locally"]:
storage_dir = os.path.join(config["data_directory"], "images")
else:
storage_dir = None
photo_cache = ImageCache(storage_dir=storage_dir)
LOGGER.info("Running deep duplicates detection.")
matching_flats = collections.defaultdict(list)
for i, flat1 in enumerate(flats_list):
matching_flats[flat1["id"]].append(flat1["id"])
for j, flat2 in enumerate(flats_list):
if i <= j:
continue
if flat2["id"] in matching_flats[flat1["id"]]:
continue
            n_common_items = get_duplicate_score(
                flat1, flat2, photo_cache, config["duplicate_image_hash_threshold"]
            )
# Minimal score to consider they are duplicates
if n_common_items >= config["duplicate_threshold"]:
                # Mark flats as duplicates
                LOGGER.info(
                    (
                        "Found duplicates using deep detection: (%s, %s). "
                        "Score is %d."
                    ),
                    flat1["id"],
                    flat2["id"],
                    n_common_items,
                )
matching_flats[flat1["id"]].append(flat2["id"])
matching_flats[flat2["id"]].append(flat1["id"])
if photo_cache.total():
LOGGER.debug(
"Photo cache: hits: %d%% / misses: %d%%.",
photo_cache.hit_rate(),
photo_cache.miss_rate(),
)
seen_ids = []
duplicate_flats = []
unique_flats_list = []
for flat_id in [flat["id"] for flat in flats_list]:
if flat_id in seen_ids:
continue
seen_ids.extend(matching_flats[flat_id])
        to_merge = sorted(
            [flat for flat in flats_list if flat["id"] in matching_flats[flat_id]],
            key=lambda flat: next(
                i
                for (i, backend) in enumerate(BACKENDS_BY_PRECEDENCE)
                if flat["id"].endswith(backend)
            ),
            reverse=True,
        )
unique_flats_list.append(tools.merge_dicts(*to_merge))
        # The ID of the added merged flat will be the one of the last item
        # in ``to_merge``. Then, any flat object that was before in the
        # ``to_merge`` list is to be considered as a duplicate and should
        # have a ``duplicate`` status.
        duplicate_flats.extend(to_merge[:-1])
return unique_flats_list, duplicate_flats
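
# Illustrative usage of deep_detect (config values are made up; the keys shown
# are the ones this function reads):
#     config = {
#         "serve_images_locally": False,
#         "data_directory": "/tmp/flatisfy",
#         "duplicate_image_hash_threshold": 10,
#         "duplicate_threshold": 15,
#     }
#     unique, duplicates = deep_detect(flats_list, config)
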