Better deduplication

Perform deeper deduplication based on all the available data, trying to
match common photos between posts.
Lucas Verney 2017-04-28 20:59:46 +02:00
parent 2af742b764
commit 589bfdfb13
6 changed files with 166 additions and 28 deletions

View File

@@ -60,7 +60,10 @@ def filter_flats(config, flats_list, fetch_details=True):
     return {
         "new": second_pass_result["new"],
-        "duplicate": first_pass_result["duplicate"],
+        "duplicate": (
+            first_pass_result["duplicate"] +
+            second_pass_result["duplicate"]
+        ),
         "ignored": (
             first_pass_result["ignored"] + second_pass_result["ignored"]
         )

View File

@@ -54,6 +54,8 @@ class WeboobProxy(object):
                 flat[field] = float(flat[field])
             except (TypeError, ValueError):
                 flat[field] = None
+            except KeyError:
+                pass
         return flat

     def __init__(self, config):
@@ -193,15 +195,23 @@ class WeboobProxy(object):
        (ID@BACKEND)
        :return: The details in JSON.
        """
-       housing = {}
        flat_id, backend_name = full_flat_id.rsplit("@", 1)
-       backend = next(
-           backend
-           for backend in self.backends
-           if backend.name == backend_name
-       )
+       try:
+           backend = next(
+               backend
+               for backend in self.backends
+               if backend.name == backend_name
+           )
+       except StopIteration:
+           LOGGER.error("Backend %s is not available.", backend_name)
+           return "{}"
+
        try:
            housing = backend.get_housing(flat_id)
+           # Otherwise, we miss the @backend afterwards
+           housing.id = full_flat_id
+           return json.dumps(housing, cls=WeboobEncoder)
        except CallErrors as exc:
            # If an error occured, just log it
            LOGGER.error(
@@ -210,9 +220,6 @@ class WeboobProxy(object):
                str(exc)
            )

-       housing.id = full_flat_id  # Otherwise, we miss the @backend afterwards
-       return json.dumps(housing, cls=WeboobEncoder)


 def fetch_flats_list(config):
     """

View File

@@ -142,18 +142,20 @@ def second_pass(flats_list, config):
     # Confirm postal code
     flats_list = metadata.guess_postal_code(flats_list, config)

-    # TODO: Guess the address
-
     # Better match with stations (confirm and check better)
     flats_list = metadata.guess_stations(flats_list, config)

     # Compute travel time to specified points
     flats_list = metadata.compute_travel_times(flats_list, config)

+    # Deduplicate the list using all the available data
+    flats_list, duplicate_flats = duplicates.deep_detect(flats_list)
+
     # Remove returned housing posts that do not match criteria
     flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)

     return {
         "new": flats_list,
-        "ignored": ignored_list
+        "ignored": ignored_list,
+        "duplicate": duplicate_flats
     }

View File

@@ -5,7 +5,15 @@ Filtering functions to detect and merge duplicates.
 from __future__ import absolute_import, print_function, unicode_literals

 import collections
+import itertools
 import logging
+import re
+from io import BytesIO
+
+import imagehash
+import PIL.Image
+import requests

 from flatisfy import tools
@ -23,6 +31,64 @@ BACKENDS_PRECEDENCE = [
] ]
+def homogeneize_phone_number(number):
+    """
+    Homogeneize the phone numbers, by stripping any space, dash or dot as well
+    as the international prefix. Assumes it is dealing with French phone
+    numbers (starting with a zero and having 10 characters).
+
+    :param number: The phone number to homogeneize.
+
+    :return: The cleaned phone number. ``None`` if the number is not valid.
+    """
+    if not number:
+        return None
+
+    number = number.replace(".", "")
+    number = number.replace(" ", "")
+    number = number.replace("-", "")
+    number = number.replace("(", "")
+    number = number.replace(")", "")
+    number = re.sub(r'^\+\d\d', "", number)
+    if not number.startswith("0"):
+        number = "0" + number
+
+    if len(number) != 10:
+        return None
+
+    return number
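For reference, this is how the new helper behaves on a few sample inputs (outputs traced by hand from the code above, not taken from the project's tests):

    print(homogeneize_phone_number("+33 6 12 34 56 78"))  # "0612345678"
    print(homogeneize_phone_number("06.12.34.56.78"))     # "0612345678"
    print(homogeneize_phone_number("(0)612345678"))       # "0612345678"
    print(homogeneize_phone_number("12 34"))              # None, not 10 digits
    print(homogeneize_phone_number(""))                   # None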
+def find_number_common_photos(flat1_photos, flat2_photos):
+    """
+    Compute the number of common photos between the two lists of photos for
+    the flats.
+
+    Fetch the photos and compare them with the average hash method.
+
+    :param flat1_photos: First list of flat photos. Each photo should be a
+    ``dict`` with a ``url`` key.
+
+    :param flat2_photos: Second list of flat photos. Each photo should be a
+    ``dict`` with a ``url`` key.
+
+    :return: The found number of common photos.
+    """
+    n_common_photos = 0
+
+    for photo1, photo2 in itertools.product(flat1_photos, flat2_photos):
+        try:
+            req1 = requests.get(photo1["url"])
+            im1 = PIL.Image.open(BytesIO(req1.content))
+            hash1 = imagehash.average_hash(im1)
+
+            req2 = requests.get(photo2["url"])
+            im2 = PIL.Image.open(BytesIO(req2.content))
+            hash2 = imagehash.average_hash(im2)
+
+            if hash1 - hash2 == 0:
+                n_common_photos += 1
+        except (IOError, requests.exceptions.RequestException):
+            pass
+
+    return n_common_photos
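Note that subtracting two ``imagehash`` hashes yields their Hamming distance, so requiring ``hash1 - hash2 == 0`` only matches photos whose 64-bit average hashes are identical. A sketch of a more tolerant comparison; the 4-bit threshold is an illustrative guess, not something this commit uses:

    from io import BytesIO

    import imagehash
    import PIL.Image
    import requests

    MAX_HASH_DISTANCE = 4  # illustrative threshold, not part of this commit

    def photos_likely_same(url1, url2):
        # Fetch both photos and compare their average hashes, tolerating a
        # few differing bits (recompression or resizing artifacts).
        im1 = PIL.Image.open(BytesIO(requests.get(url1).content))
        im2 = PIL.Image.open(BytesIO(requests.get(url2).content))
        distance = imagehash.average_hash(im1) - imagehash.average_hash(im2)
        return distance <= MAX_HASH_DISTANCE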
 def detect(flats_list, key="id", merge=True, should_intersect=False):
     """
     Detect obvious duplicates within a given list of flats.
@@ -111,11 +177,21 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):
 def deep_detect(flats_list):
     """
-    TODO
+    Deeper detection of duplicates based on any available data.
+
+    :param flats_list: A list of flats dicts.
+
+    :return: A tuple of the deduplicated list of flat dicts and the list of
+    all the flats objects that should be removed and considered as duplicates
+    (they were already merged).
     """
+    matching_flats = collections.defaultdict(list)
     for i, flat1 in enumerate(flats_list):
+        matching_flats[flat1["id"]].append(flat1["id"])
         for j, flat2 in enumerate(flats_list):
-            if i < j:
+            if i <= j:
+                continue
+
+            if flat2["id"] in matching_flats[flat1["id"]]:
                 continue

             n_common_items = 0
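Side note: the ``if i <= j: continue`` guard makes the nested loops visit each unordered pair of flats exactly once, and never compares a flat with itself. An equivalent, purely illustrative formulation:

    import itertools

    flats_list = [{"id": "a@x"}, {"id": "b@y"}, {"id": "c@z"}]

    # Same pair enumeration as the guarded i/j loops above.
    for flat2, flat1 in itertools.combinations(flats_list, 2):
        print(flat1["id"], "vs", flat2["id"])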
@@ -157,26 +233,75 @@ def deep_detect(flats_list):
                 )
                 n_common_items += 1

-                # TODO: Compare texts (one is included in another? fuzzymatch?)
-
                 # They should have the same phone number if it was fetched for
                 # both
-                if flat1["phone"] and flat2["phone"]:
-                    homogeneize_phone_number = lambda number: (
-                        number.replace(".", "").replace(" ", "")
-                    )
-                    pass  # TODO: Homogeneize phone numbers
-
-                # TODO: Compare texts (one is included in another? fuzzymatch?)
+                flat1_phone = homogeneize_phone_number(flat1["phone"])
+                flat2_phone = homogeneize_phone_number(flat2["phone"])
+                if flat1_phone and flat2_phone:
+                    assert flat1_phone == flat2_phone
+                    n_common_items += 10  # Counts much more than the rest
+
+                # They should have at least one photo in common if there
+                # are some photos
+                if flat1["photos"] and flat2["photos"]:
+                    max_number_photos = max(len(flat1["photos"]),
+                                            len(flat2["photos"]))
+                    n_common_photos = find_number_common_photos(
+                        flat1["photos"],
+                        flat2["photos"]
+                    )
+                    assert n_common_photos > 1
+                    n_common_items += int(
+                        20 * n_common_photos / max_number_photos
+                    )
+
+                # Minimal score to consider they are duplicates
+                assert n_common_items >= 15
-            except AssertionError:
+            except (AssertionError, TypeError):
                 # Skip and consider as not duplicates whenever the conditions
                 # are not met
-                continue
-            except TypeError:
                 # TypeError occurs when an area or a cost is None, which should
                 # not be considered as duplicates
                 continue

-            # TODO: Check the number of common items
+            # Mark flats as duplicates
+            LOGGER.info(
+                ("Found duplicates using deep detection: (%s, %s). "
+                 "Score is %d."),
+                flat1["id"],
+                flat2["id"],
+                n_common_items
+            )
+            matching_flats[flat1["id"]].append(flat2["id"])
+            matching_flats[flat2["id"]].append(flat1["id"])

-            # TODO: Merge flats
-            # TODO: Compare photos
+    seen_ids = []
+    duplicate_flats = []
+    unique_flats_list = []
+    for flat_id in [flat["id"] for flat in flats_list]:
+        if flat_id in seen_ids:
+            continue
+
+        seen_ids.extend(matching_flats[flat_id])
+        to_merge = sorted(
+            [
+                flat
+                for flat in flats_list
+                if flat["id"] in matching_flats[flat_id]
+            ],
+            key=lambda flat: next(
+                i for (i, backend) in enumerate(BACKENDS_PRECEDENCE)
+                if flat["id"].endswith(backend)
+            ),
+            reverse=True
+        )
+        unique_flats_list.append(tools.merge_dicts(*to_merge))
+        # The ID of the added merged flat will be the one of the last item
+        # in ``matching_flats``. Then, any flat object that was before in
+        # the ``matching_flats`` list is to be considered as a duplicate
+        # and should have a ``duplicate`` status.
+        duplicate_flats.extend(to_merge[:-1])
+
+    return unique_flats_list, duplicate_flats
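A minimal usage sketch of the new function; the field set and values below are hypothetical, and real flat dicts carry many more keys:

    from flatisfy.filters import duplicates

    # Two posts describing the same flat on two different backends.
    flats = [
        {"id": "12345@seloger", "phone": "06 12 34 56 78", "area": 42.0,
         "cost": 1300, "photos": [{"url": "http://example.com/1.jpg"}]},
        {"id": "67890@leboncoin", "phone": "0612345678", "area": 42.0,
         "cost": 1300, "photos": [{"url": "http://example.com/1.jpg"}]},
    ]

    unique_flats_list, duplicate_flats = duplicates.deep_detect(flats)
    # ``unique_flats_list`` keeps one merged dict per physical flat,
    # preferring data from the backend ranked highest in
    # BACKENDS_PRECEDENCE; ``duplicate_flats`` lists the posts that were
    # merged away and should get a ``duplicate`` status.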

View File

@@ -236,7 +236,6 @@ def guess_stations(flats_list, config, distance_threshold=1500):
     for flat in flats_list:
         flat_station = flat.get("station", None)
-        # TODO: Use flat location field as well?

         if not flat_station:
             # Skip everything if empty station

View File

@@ -5,6 +5,8 @@ bottle-sqlalchemy
 canister
 enum34
 future
+imagehash
+pillow
 request
 sqlalchemy
 unidecode