Better deduplication

Perform deeper deduplication based on all the available data, trying to
match common photos between posts.
Lucas Verney 2017-04-28 20:59:46 +02:00
parent 2af742b764
commit 589bfdfb13
6 changed files with 166 additions and 28 deletions

View File

@@ -60,7 +60,10 @@ def filter_flats(config, flats_list, fetch_details=True):
     return {
         "new": second_pass_result["new"],
-        "duplicate": first_pass_result["duplicate"],
+        "duplicate": (
+            first_pass_result["duplicate"] +
+            second_pass_result["duplicate"]
+        ),
         "ignored": (
             first_pass_result["ignored"] + second_pass_result["ignored"]
         )

View File

@@ -54,6 +54,8 @@ class WeboobProxy(object):
                 flat[field] = float(flat[field])
             except (TypeError, ValueError):
                 flat[field] = None
+            except KeyError:
+                pass
         return flat

     def __init__(self, config):
@@ -193,15 +195,23 @@ class WeboobProxy(object):
        (ID@BACKEND)
        :return: The details in JSON.
        """
-       housing = {}
        flat_id, backend_name = full_flat_id.rsplit("@", 1)
-       backend = next(
-           backend
-           for backend in self.backends
-           if backend.name == backend_name
-       )
+       try:
+           backend = next(
+               backend
+               for backend in self.backends
+               if backend.name == backend_name
+           )
+       except StopIteration:
+           LOGGER.error("Backend %s is not available.", backend_name)
+           return "{}"
+
        try:
            housing = backend.get_housing(flat_id)
+           # Otherwise, we miss the @backend afterwards
+           housing.id = full_flat_id
+           return json.dumps(housing, cls=WeboobEncoder)
        except CallErrors as exc:
            # If an error occured, just log it
            LOGGER.error(
@@ -210,9 +220,6 @@ class WeboobProxy(object):
                str(exc)
            )

-       housing.id = full_flat_id  # Otherwise, we miss the @backend afterwards
-       return json.dumps(housing, cls=WeboobEncoder)


 def fetch_flats_list(config):
     """

View File

@@ -142,18 +142,20 @@ def second_pass(flats_list, config):
     # Confirm postal code
     flats_list = metadata.guess_postal_code(flats_list, config)

-    # TODO: Guess the address
-
     # Better match with stations (confirm and check better)
     flats_list = metadata.guess_stations(flats_list, config)

     # Compute travel time to specified points
     flats_list = metadata.compute_travel_times(flats_list, config)

+    # Deduplicate the list using all the available data
+    flats_list, duplicate_flats = duplicates.deep_detect(flats_list)
+
     # Remove returned housing posts that do not match criteria
     flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)

     return {
         "new": flats_list,
-        "ignored": ignored_list
+        "ignored": ignored_list,
+        "duplicate": duplicate_flats
     }

View File

@@ -5,7 +5,15 @@ Filtering functions to detect and merge duplicates.
 from __future__ import absolute_import, print_function, unicode_literals

 import collections
+import itertools
 import logging
+import re
+from io import BytesIO
+
+import imagehash
+import PIL.Image
+import requests

 from flatisfy import tools
@ -23,6 +31,64 @@ BACKENDS_PRECEDENCE = [
] ]
+def homogeneize_phone_number(number):
+    """
+    Homogeneize the phone numbers, by stripping any space, dash or dot as well
+    as the international prefix. Assumes it is dealing with French phone
+    numbers (starting with a zero and having 10 characters).
+
+    :param number: The phone number to homogeneize.
+
+    :return: The cleaned phone number. ``None`` if the number is not valid.
+    """
+    if not number:
+        return None
+
+    number = number.replace(".", "")
+    number = number.replace(" ", "")
+    number = number.replace("-", "")
+    number = number.replace("(", "")
+    number = number.replace(")", "")
+    number = re.sub(r'^\+\d\d', "", number)
+    if not number.startswith("0"):
+        number = "0" + number
+
+    if len(number) != 10:
+        return None
+
+    return number
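For reference, this is how the new helper behaves on a few sample inputs (outputs traced by hand from the code above, not taken from the project's tests):

    print(homogeneize_phone_number("+33 6 12 34 56 78"))  # "0612345678"
    print(homogeneize_phone_number("06.12.34.56.78"))     # "0612345678"
    print(homogeneize_phone_number("(0)612345678"))       # "0612345678"
    print(homogeneize_phone_number("12 34"))              # None, not 10 digits
    print(homogeneize_phone_number(""))                   # None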
+def find_number_common_photos(flat1_photos, flat2_photos):
+    """
+    Compute the number of common photos between the two lists of photos for
+    the flats.
+
+    Fetch the photos and compare them with the average hash method.
+
+    :param flat1_photos: First list of flat photos. Each photo should be a
+    ``dict`` with a ``url`` key.
+
+    :param flat2_photos: Second list of flat photos. Each photo should be a
+    ``dict`` with a ``url`` key.
+
+    :return: The found number of common photos.
+    """
+    n_common_photos = 0
+
+    for photo1, photo2 in itertools.product(flat1_photos, flat2_photos):
+        try:
+            req1 = requests.get(photo1["url"])
+            im1 = PIL.Image.open(BytesIO(req1.content))
+            hash1 = imagehash.average_hash(im1)
+
+            req2 = requests.get(photo2["url"])
+            im2 = PIL.Image.open(BytesIO(req2.content))
+            hash2 = imagehash.average_hash(im2)
+
+            if hash1 - hash2 == 0:
+                n_common_photos += 1
+        except (IOError, requests.exceptions.RequestException):
+            pass
+
+    return n_common_photos
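Note that subtracting two ``imagehash`` hashes yields their Hamming distance, so requiring ``hash1 - hash2 == 0`` only matches photos whose 64-bit average hashes are identical. A sketch of a more tolerant comparison; the 4-bit threshold is an illustrative guess, not something this commit uses:

    from io import BytesIO

    import imagehash
    import PIL.Image
    import requests

    MAX_HASH_DISTANCE = 4  # illustrative threshold, not part of this commit

    def photos_likely_same(url1, url2):
        # Fetch both photos and compare their average hashes, tolerating a
        # few differing bits (recompression or resizing artifacts).
        im1 = PIL.Image.open(BytesIO(requests.get(url1).content))
        im2 = PIL.Image.open(BytesIO(requests.get(url2).content))
        distance = imagehash.average_hash(im1) - imagehash.average_hash(im2)
        return distance <= MAX_HASH_DISTANCE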
 def detect(flats_list, key="id", merge=True, should_intersect=False):
     """
     Detect obvious duplicates within a given list of flats.
@@ -111,11 +177,21 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):
 def deep_detect(flats_list):
     """
-    TODO
+    Deeper detection of duplicates based on any available data.
+
+    :param flats_list: A list of flats dicts.
+
+    :return: A tuple of the deduplicated list of flat dicts and the list of
+    all the flats objects that should be removed and considered as duplicates
+    (they were already merged).
     """
+    matching_flats = collections.defaultdict(list)
     for i, flat1 in enumerate(flats_list):
+        matching_flats[flat1["id"]].append(flat1["id"])
         for j, flat2 in enumerate(flats_list):
-            if i < j:
+            if i <= j:
+                continue
+
+            if flat2["id"] in matching_flats[flat1["id"]]:
                 continue

             n_common_items = 0
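Side note: the ``if i <= j: continue`` guard makes the nested loops visit each unordered pair of flats exactly once, and never compares a flat with itself. An equivalent, purely illustrative formulation:

    import itertools

    flats_list = [{"id": "a@x"}, {"id": "b@y"}, {"id": "c@z"}]

    # Same pair enumeration as the guarded i/j loops above.
    for flat2, flat1 in itertools.combinations(flats_list, 2):
        print(flat1["id"], "vs", flat2["id"])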
@@ -157,26 +233,75 @@ def deep_detect(flats_list):
                 )
                 n_common_items += 1

-                # TODO: Compare texts (one is included in another? fuzzymatch?)
-
                 # They should have the same phone number if it was fetched for
                 # both
-                if flat1["phone"] and flat2["phone"]:
-                    homogeneize_phone_number = lambda number: (
-                        number.replace(".", "").replace(" ", "")
-                    )
-                    pass  # TODO: Homogeneize phone numbers
-
-                # TODO: Compare texts (one is included in another? fuzzymatch?)
+                flat1_phone = homogeneize_phone_number(flat1["phone"])
+                flat2_phone = homogeneize_phone_number(flat2["phone"])
+                if flat1_phone and flat2_phone:
+                    assert flat1_phone == flat2_phone
+                    n_common_items += 10  # Counts much more than the rest
+
+                # They should have at least one photo in common if there
+                # are some photos
+                if flat1["photos"] and flat2["photos"]:
+                    max_number_photos = max(len(flat1["photos"]),
+                                            len(flat2["photos"]))
+                    n_common_photos = find_number_common_photos(
+                        flat1["photos"],
+                        flat2["photos"]
+                    )
+                    assert n_common_photos > 1
+                    n_common_items += int(
+                        20 * n_common_photos / max_number_photos
+                    )
+
+                # Minimal score to consider they are duplicates
+                assert n_common_items >= 15
-            except AssertionError:
+            except (AssertionError, TypeError):
                 # Skip and consider as not duplicates whenever the conditions
                 # are not met
-                continue
-            except TypeError:
                 # TypeError occurs when an area or a cost is None, which should
                 # not be considered as duplicates
                 continue

-            # TODO: Check the number of common items
+            # Mark flats as duplicates
+            LOGGER.info(
+                ("Found duplicates using deep detection: (%s, %s). "
+                 "Score is %d."),
+                flat1["id"],
+                flat2["id"],
+                n_common_items
+            )
+            matching_flats[flat1["id"]].append(flat2["id"])
+            matching_flats[flat2["id"]].append(flat1["id"])

-            # TODO: Merge flats
-            # TODO: Compare photos
+    seen_ids = []
+    duplicate_flats = []
+    unique_flats_list = []
+    for flat_id in [flat["id"] for flat in flats_list]:
+        if flat_id in seen_ids:
+            continue
+
+        seen_ids.extend(matching_flats[flat_id])
+        to_merge = sorted(
+            [
+                flat
+                for flat in flats_list
+                if flat["id"] in matching_flats[flat_id]
+            ],
+            key=lambda flat: next(
+                i for (i, backend) in enumerate(BACKENDS_PRECEDENCE)
+                if flat["id"].endswith(backend)
+            ),
+            reverse=True
+        )
+        unique_flats_list.append(tools.merge_dicts(*to_merge))
+        # The ID of the added merged flat will be the one of the last item
+        # in ``matching_flats``. Then, any flat object that was before in
+        # the ``matching_flats`` list is to be considered as a duplicate
+        # and should have a ``duplicate`` status.
+        duplicate_flats.extend(to_merge[:-1])
+
+    return unique_flats_list, duplicate_flats
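A minimal usage sketch of the new function; the field set and values below are hypothetical, and real flat dicts carry many more keys:

    from flatisfy.filters import duplicates

    # Two posts describing the same flat on two different backends.
    flats = [
        {"id": "12345@seloger", "phone": "06 12 34 56 78", "area": 42.0,
         "cost": 1300, "photos": [{"url": "http://example.com/1.jpg"}]},
        {"id": "67890@leboncoin", "phone": "0612345678", "area": 42.0,
         "cost": 1300, "photos": [{"url": "http://example.com/1.jpg"}]},
    ]

    unique_flats_list, duplicate_flats = duplicates.deep_detect(flats)
    # ``unique_flats_list`` keeps one merged dict per physical flat,
    # preferring data from the backend ranked highest in
    # BACKENDS_PRECEDENCE; ``duplicate_flats`` lists the posts that were
    # merged away and should get a ``duplicate`` status.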

View File

@@ -236,7 +236,6 @@ def guess_stations(flats_list, config, distance_threshold=1500):
     for flat in flats_list:
         flat_station = flat.get("station", None)
-        # TODO: Use flat location field as well?

         if not flat_station:
             # Skip everything if empty station

View File

@@ -5,6 +5,8 @@ bottle-sqlalchemy
 canister
 enum34
 future
+imagehash
+pillow
 request
 sqlalchemy
 unidecode