Better deduplication

Perform deeper deduplication, based on all the available data, and
try to match common photos.
This commit is contained in:
Lucas Verney 2017-04-28 20:59:46 +02:00
parent 2af742b764
commit 589bfdfb13
6 changed files with 166 additions and 28 deletions

View File

@ -60,7 +60,10 @@ def filter_flats(config, flats_list, fetch_details=True):
return {
"new": second_pass_result["new"],
"duplicate": first_pass_result["duplicate"],
"duplicate": (
first_pass_result["duplicate"] +
second_pass_result["duplicate"]
),
"ignored": (
first_pass_result["ignored"] + second_pass_result["ignored"]
)

View File

@ -54,6 +54,8 @@ class WeboobProxy(object):
flat[field] = float(flat[field])
except (TypeError, ValueError):
flat[field] = None
except KeyError:
pass
return flat
def __init__(self, config):
@ -193,15 +195,23 @@ class WeboobProxy(object):
(ID@BACKEND)
:return: The details in JSON.
"""
housing = {}
flat_id, backend_name = full_flat_id.rsplit("@", 1)
try:
backend = next(
backend
for backend in self.backends
if backend.name == backend_name
)
except StopIteration:
LOGGER.error("Backend %s is not available.", backend_name)
return "{}"
try:
housing = backend.get_housing(flat_id)
# Otherwise, we miss the @backend afterwards
housing.id = full_flat_id
return json.dumps(housing, cls=WeboobEncoder)
except CallErrors as exc:
# If an error occurred, just log it
LOGGER.error(
@ -210,9 +220,6 @@ class WeboobProxy(object):
str(exc)
)
housing.id = full_flat_id # Otherwise, we miss the @backend afterwards
return json.dumps(housing, cls=WeboobEncoder)
def fetch_flats_list(config):
"""

View File

@ -142,18 +142,20 @@ def second_pass(flats_list, config):
# Confirm postal code
flats_list = metadata.guess_postal_code(flats_list, config)
# TODO: Guess the address
# Better match with stations (confirm and check better)
flats_list = metadata.guess_stations(flats_list, config)
# Compute travel time to specified points
flats_list = metadata.compute_travel_times(flats_list, config)
# Deduplicate the list using every available data
flats_list, duplicate_flats = duplicates.deep_detect(flats_list)
# Remove returned housing posts that do not match criteria
flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
return {
"new": flats_list,
"ignored": ignored_list
"ignored": ignored_list,
"duplicate": duplicate_flats
}

View File

@ -5,7 +5,15 @@ Filtering functions to detect and merge duplicates.
from __future__ import absolute_import, print_function, unicode_literals
import collections
import itertools
import logging
import re
from io import BytesIO
import imagehash
import PIL.Image
import requests
from flatisfy import tools
@ -23,6 +31,64 @@ BACKENDS_PRECEDENCE = [
]
def homogeneize_phone_number(number):
    """
    Homogeneize the phone numbers, by stripping any space, dash, dot or
    parenthesis as well as the international prefix. Assumes it is dealing
    with French phone numbers (starting with a zero and having 10 digits).

    :param number: The phone number to homogeneize.
    :return: The cleaned phone number. ``None`` if the number is empty or is
        not made of exactly ten digits once cleaned.
    """
    if not number:
        return None

    # Drop the usual separators found in human-formatted phone numbers.
    number = re.sub(r"[. \-()]", "", number)
    # Drop a leading two-digit international prefix such as ``+33``.
    number = re.sub(r"^\+\d\d", "", number)

    # National numbers start with a zero, which is dropped when an
    # international prefix is used.
    if not number.startswith("0"):
        number = "0" + number

    # A length check alone would accept any ten-character string; require
    # ten ASCII digits so that garbage is reported as invalid.
    if not re.match(r"[0-9]{10}\Z", number):
        return None

    return number
def _fetch_photo_hash(photo, timeout=10):
    """
    Download a photo and compute its average hash.

    :param photo: A ``dict`` with a ``url`` key.
    :param timeout: Timeout (in seconds) passed to the HTTP request, so that
        an unresponsive server cannot block forever.
    :return: The average hash of the photo, or ``None`` if the photo could not
        be fetched or opened as an image.
    """
    try:
        req = requests.get(photo["url"], timeout=timeout)
        image = PIL.Image.open(BytesIO(req.content))
        return imagehash.average_hash(image)
    except (IOError, requests.exceptions.RequestException):
        return None


def find_number_common_photos(flat1_photos, flat2_photos):
    """
    Compute the number of common photos between the two lists of photos for the
    flats.

    Fetch the photos and compare them with the average hash method. Each photo
    is fetched and hashed only once. Photos which cannot be fetched or opened
    are ignored.

    :param flat1_photos: First list of flat photos. Each photo should be a
        ``dict`` with a ``url`` key.
    :param flat2_photos: Second list of flat photos. Each photo should be a
        ``dict`` with a ``url`` key.
    :return: The found number of common photos, that is the number of pairs
        (one photo from each list) having identical hashes.
    """
    # Nothing to compare, avoid any useless network request.
    if not flat1_photos or not flat2_photos:
        return 0

    flat1_hashes = [
        photo_hash
        for photo_hash in (_fetch_photo_hash(photo) for photo in flat1_photos)
        if photo_hash is not None
    ]
    if not flat1_hashes:
        # No usable photo on one side, no pair can match.
        return 0

    flat2_hashes = [
        photo_hash
        for photo_hash in (_fetch_photo_hash(photo) for photo in flat2_photos)
        if photo_hash is not None
    ]

    # Subtracting two hashes gives their Hamming distance, zero meaning the
    # hashes are identical.
    return sum(
        1
        for hash1, hash2 in itertools.product(flat1_hashes, flat2_hashes)
        if hash1 - hash2 == 0
    )
def detect(flats_list, key="id", merge=True, should_intersect=False):
"""
Detect obvious duplicates within a given list of flats.
@ -111,11 +177,21 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):
def deep_detect(flats_list):
"""
TODO
Deeper detection of duplicates based on any available data.
:param flats_list: A list of flats dicts.
:return: A tuple of the deduplicated list of flat dicts and the list of all
the flats objects that should be removed and considered as duplicates (they
were already merged).
"""
matching_flats = collections.defaultdict(list)
for i, flat1 in enumerate(flats_list):
matching_flats[flat1["id"]].append(flat1["id"])
for j, flat2 in enumerate(flats_list):
if i < j:
if i <= j:
continue
if flat2["id"] in matching_flats[flat1["id"]]:
continue
n_common_items = 0
@ -157,26 +233,75 @@ def deep_detect(flats_list):
)
n_common_items += 1
# TODO: Compare texts (one is included in another? fuzzymatch?)
# They should have the same phone number if it was fetched for
# both
if flat1["phone"] and flat2["phone"]:
homogeneize_phone_number = lambda number: (
number.replace(".", "").replace(" ", "")
)
pass # TODO: Homogeneize phone numbers
flat1_phone = homogeneize_phone_number(flat1["phone"])
flat2_phone = homogeneize_phone_number(flat2["phone"])
if flat1_phone and flat2_phone:
assert flat1_phone == flat2_phone
n_common_items += 10 # Counts much more that the rest
# TODO: Compare texts (one is included in another? fuzzymatch?)
except AssertionError:
# They should have at least one photo in common if there
# are some photos
if flat1["photos"] and flat2["photos"]:
max_number_photos = max(len(flat1["photos"]),
len(flat2["photos"]))
n_common_photos = find_number_common_photos(
flat1["photos"],
flat2["photos"]
)
assert n_common_photos > 1
n_common_items += int(
20 * n_common_photos / max_number_photos
)
# Minimal score to consider they are duplicates
assert n_common_items >= 15
except (AssertionError, TypeError):
# Skip and consider as not duplicates whenever the conditions
# are not met
continue
except TypeError:
# TypeError occurs when an area or a cost is None, which should
# not be considered as duplicates
continue
# TODO: Check the number of common items
# Mark flats as duplicates
LOGGER.info(
("Found duplicates using deep detection: (%s, %s). "
"Score is %d."),
flat1["id"],
flat2["id"],
n_common_items
)
matching_flats[flat1["id"]].append(flat2["id"])
matching_flats[flat2["id"]].append(flat1["id"])
# TODO: Merge flats
seen_ids = []
duplicate_flats = []
unique_flats_list = []
for flat_id in [flat["id"] for flat in flats_list]:
if flat_id in seen_ids:
continue
# TODO: Compare photos
seen_ids.extend(matching_flats[flat_id])
to_merge = sorted(
[
flat
for flat in flats_list
if flat["id"] in matching_flats[flat_id]
],
key=lambda flat: next(
i for (i, backend) in enumerate(BACKENDS_PRECEDENCE)
if flat["id"].endswith(backend)
),
reverse=True
)
unique_flats_list.append(tools.merge_dicts(*to_merge))
# The ID of the added merged flat will be the one of the last item
# in ``matching_flats``. Then, any flat object that was before in
# the ``matching_flats`` list is to be considered as a duplicate
# and should have a ``duplicate`` status.
duplicate_flats.extend(to_merge[:-1])
return unique_flats_list, duplicate_flats

View File

@ -236,7 +236,6 @@ def guess_stations(flats_list, config, distance_threshold=1500):
for flat in flats_list:
flat_station = flat.get("station", None)
# TODO: Use flat location field as well?
if not flat_station:
# Skip everything if empty station

View File

@ -5,6 +5,8 @@ bottle-sqlalchemy
canister
enum34
future
imagehash
pillow
requests
sqlalchemy
unidecode