Better deduplication
Perform deeper deduplication based on all the available data, including matching common photos.
parent 2af742b764
commit 589bfdfb13
@@ -60,7 +60,10 @@ def filter_flats(config, flats_list, fetch_details=True):
     return {
         "new": second_pass_result["new"],
-        "duplicate": first_pass_result["duplicate"],
+        "duplicate": (
+            first_pass_result["duplicate"] +
+            second_pass_result["duplicate"]
+        ),
         "ignored": (
             first_pass_result["ignored"] + second_pass_result["ignored"]
         )
     }

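Since the deep second pass can now flag duplicates too, the summary concatenates both passes' duplicate lists. A minimal sketch of the resulting shape, with hypothetical pass results (only the dict keys come from the diff):

    # Hypothetical pass results; only the dict shape mirrors the diff.
    first_pass_result = {
        "new": [],
        "duplicate": [{"id": "1@seloger"}],
        "ignored": []
    }
    second_pass_result = {
        "new": [{"id": "2@pap"}],
        "duplicate": [{"id": "3@leboncoin"}],
        "ignored": []
    }

    result = {
        "new": second_pass_result["new"],
        # Duplicates can now surface in either pass, so keep both lists
        "duplicate": (
            first_pass_result["duplicate"] +
            second_pass_result["duplicate"]
        ),
        "ignored": (
            first_pass_result["ignored"] + second_pass_result["ignored"]
        )
    }
    assert [f["id"] for f in result["duplicate"]] == ["1@seloger", "3@leboncoin"]
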
@@ -54,6 +54,8 @@ class WeboobProxy(object):
                 flat[field] = float(flat[field])
             except (TypeError, ValueError):
                 flat[field] = None
+            except KeyError:
+                pass
         return flat

     def __init__(self, config):

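The new except KeyError branch matters because a flat dict may lack a field entirely, not just hold a non-numeric value. A hedged sketch of the coercion pattern, with a made-up flat:

    flat = {"area": "42.5", "cost": None}  # no "rooms" key at all
    for field in ["area", "cost", "rooms"]:
        try:
            flat[field] = float(flat[field])
        except (TypeError, ValueError):
            flat[field] = None  # present but not convertible
        except KeyError:
            pass  # absent: leave the dict untouched
    assert flat == {"area": 42.5, "cost": None}
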
@@ -193,15 +195,23 @@ class WeboobProxy(object):
             (ID@BACKEND)
         :return: The details in JSON.
         """
-        housing = {}
         flat_id, backend_name = full_flat_id.rsplit("@", 1)
-        backend = next(
-            backend
-            for backend in self.backends
-            if backend.name == backend_name
-        )
+        try:
+            backend = next(
+                backend
+                for backend in self.backends
+                if backend.name == backend_name
+            )
+        except StopIteration:
+            LOGGER.error("Backend %s is not available.", backend_name)
+            return "{}"
+
         try:
             housing = backend.get_housing(flat_id)
+            # Otherwise, we miss the @backend afterwards
+            housing.id = full_flat_id
+
+            return json.dumps(housing, cls=WeboobEncoder)
         except CallErrors as exc:
             # If an error occurred, just log it
             LOGGER.error(
@@ -210,9 +220,6 @@ class WeboobProxy(object):
                 str(exc)
             )

-        housing.id = full_flat_id  # Otherwise, we miss the @backend afterwards
-        return json.dumps(housing, cls=WeboobEncoder)
-

 def fetch_flats_list(config):
     """

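The try/except added around the backend lookup works because next() on a generator expression raises StopIteration when no element matches, so a missing backend now produces a logged error and an empty JSON object instead of a crash. A standalone sketch, with a hypothetical Backend class standing in for the Weboob backends:

    class Backend(object):
        def __init__(self, name):
            self.name = name

    backends = [Backend("seloger"), Backend("pap")]
    backend_name = "leboncoin"  # this backend is not loaded

    try:
        backend = next(
            backend
            for backend in backends
            if backend.name == backend_name
        )
    except StopIteration:
        backend = None  # the diff logs an error and returns "{}" here
    assert backend is None
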
@@ -142,18 +142,20 @@ def second_pass(flats_list, config):
     # Confirm postal code
     flats_list = metadata.guess_postal_code(flats_list, config)

     # TODO: Guess the address

     # Better match with stations (confirm and check better)
     flats_list = metadata.guess_stations(flats_list, config)

     # Compute travel time to specified points
     flats_list = metadata.compute_travel_times(flats_list, config)

+    # Deduplicate the list using every available data
+    flats_list, duplicate_flats = duplicates.deep_detect(flats_list)
+
     # Remove returned housing posts that do not match criteria
     flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)

     return {
         "new": flats_list,
-        "ignored": ignored_list
+        "ignored": ignored_list,
+        "duplicate": duplicate_flats
     }

@@ -5,7 +5,15 @@ Filtering functions to detect and merge duplicates.
 from __future__ import absolute_import, print_function, unicode_literals

 import collections
 import itertools
 import logging
+import re
+
+from io import BytesIO
+
+import imagehash
+import PIL.Image
+import requests

 from flatisfy import tools

@@ -23,6 +31,64 @@ BACKENDS_PRECEDENCE = [
 ]


+def homogeneize_phone_number(number):
+    """
+    Homogeneize the phone numbers, by stripping any space, dash or dot as well
+    as the international prefix. Assumes it is dealing with French phone
+    numbers (starting with a zero and having 10 characters).
+
+    :param number: The phone number to homogeneize.
+    :return: The cleaned phone number. ``None`` if the number is not valid.
+    """
+    if not number:
+        return None
+
+    number = number.replace(".", "")
+    number = number.replace(" ", "")
+    number = number.replace("-", "")
+    number = number.replace("(", "")
+    number = number.replace(")", "")
+    number = re.sub(r'^\+\d\d', "", number)
+
+    if not number.startswith("0"):
+        number = "0" + number
+
+    if len(number) != 10:
+        return None
+
+    return number
+
+
+def find_number_common_photos(flat1_photos, flat2_photos):
+    """
+    Compute the number of common photos between the two lists of photos for the
+    flats.
+
+    Fetch the photos and compare them with dHash method.
+
+    :param flat1_photos: First list of flat photos. Each photo should be a
+        ``dict`` with a ``url`` key.
+    :param flat2_photos: Second list of flat photos. Each photo should be a
+        ``dict`` with a ``url`` key.
+    :return: The found number of common photos.
+    """
+    n_common_photos = 0
+    for photo1, photo2 in itertools.product(flat1_photos, flat2_photos):
+        try:
+            req1 = requests.get(photo1["url"])
+            im1 = PIL.Image.open(BytesIO(req1.content))
+            hash1 = imagehash.average_hash(im1)
+
+            req2 = requests.get(photo2["url"])
+            im2 = PIL.Image.open(BytesIO(req2.content))
+            hash2 = imagehash.average_hash(im2)
+
+            if hash1 - hash2 == 0:
+                n_common_photos += 1
+        except (IOError, requests.exceptions.RequestException):
+            pass
+    return n_common_photos
+
+
 def detect(flats_list, key="id", merge=True, should_intersect=False):
     """
     Detect obvious duplicates within a given list of flats.

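A hedged usage sketch for the new homogeneize_phone_number helper; the expected values below follow its docstring rules (French numbers, ten digits, leading zero) and are not taken from the commit:

    assert homogeneize_phone_number("+33 6 12 34 56 78") == "0612345678"
    assert homogeneize_phone_number("06.12.34.56.78") == "0612345678"
    assert homogeneize_phone_number("06-12-34-56-78") == "0612345678"
    assert homogeneize_phone_number("12345") is None  # not ten digits
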
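Two details worth flagging in find_number_common_photos: the docstring mentions dHash, but the code calls imagehash.average_hash (aHash), and both images are re-downloaded for every pair since nothing is cached. A minimal sketch of the hash comparison on already-local files (the file names are placeholders):

    import imagehash
    import PIL.Image

    hash1 = imagehash.average_hash(PIL.Image.open("photo_a.jpg"))
    hash2 = imagehash.average_hash(PIL.Image.open("photo_b.jpg"))

    # Subtracting two hashes yields the number of differing bits; the
    # diff counts a photo as common only on an exact match (difference 0)
    if hash1 - hash2 == 0:
        print("identical under average hash")
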
@@ -111,11 +177,21 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):

 def deep_detect(flats_list):
     """
-    TODO
+    Deeper detection of duplicates based on any available data.
+
+    :param flats_list: A list of flats dicts.
+    :return: A tuple of the deduplicated list of flat dicts and the list of all
+    the flats objects that should be removed and considered as duplicates (they
+    were already merged).
     """
     matching_flats = collections.defaultdict(list)
     for i, flat1 in enumerate(flats_list):
         matching_flats[flat1["id"]].append(flat1["id"])
         for j, flat2 in enumerate(flats_list):
-            if i < j:
+            if i <= j:
                 continue

+            if flat2["id"] in matching_flats[flat1["id"]]:
+                continue
+
             n_common_items = 0

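The change from "if i < j" to "if i <= j" makes the double loop visit each unordered pair exactly once and stop comparing a flat with itself. A quick standalone check of the iteration pattern:

    import itertools

    items = ["a", "b", "c"]
    pairs = [
        (items[i], items[j])
        for i in range(len(items))
        for j in range(len(items))
        if not i <= j  # mirrors "if i <= j: continue"
    ]
    assert pairs == [("b", "a"), ("c", "a"), ("c", "b")]
    assert len(pairs) == len(list(itertools.combinations(items, 2)))
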
@@ -157,26 +233,75 @@ def deep_detect(flats_list):
                     )
                     n_common_items += 1

+                # TODO: Compare texts (one is included in another? fuzzymatch?)
+
                 # They should have the same phone number if it was fetched for
                 # both
                 if flat1["phone"] and flat2["phone"]:
-                    homogeneize_phone_number = lambda number: (
-                        number.replace(".", "").replace(" ", "")
-                    )
-                    pass  # TODO: Homogeneize phone numbers
+                    flat1_phone = homogeneize_phone_number(flat1["phone"])
+                    flat2_phone = homogeneize_phone_number(flat2["phone"])
+                    if flat1_phone and flat2_phone:
+                        assert flat1_phone == flat2_phone
+                        n_common_items += 10  # Counts much more than the rest

-                # TODO: Compare texts (one is included in another? fuzzymatch?)
-            except AssertionError:
+                # They should have at least one photo in common if there
+                # are some photos
+                if flat1["photos"] and flat2["photos"]:
+                    max_number_photos = max(len(flat1["photos"]),
+                                            len(flat2["photos"]))
+                    n_common_photos = find_number_common_photos(
+                        flat1["photos"],
+                        flat2["photos"]
+                    )
+                    assert n_common_photos > 1
+                    n_common_items += int(
+                        20 * n_common_photos / max_number_photos
+                    )
+
+                # Minimal score to consider they are duplicates
+                assert n_common_items >= 15
+            except (AssertionError, TypeError):
                 # Skip and consider as not duplicates whenever the conditions
                 # are not met
                 continue
-            except TypeError:
-                # TypeError occurs when an area or a cost is None, which should
-                # not be considered as duplicates
-                continue

-            # TODO: Check the number of common items
+            # Mark flats as duplicates
             LOGGER.info(
                 ("Found duplicates using deep detection: (%s, %s). "
                  "Score is %d."),
                 flat1["id"],
                 flat2["id"],
                 n_common_items
             )
             matching_flats[flat1["id"]].append(flat2["id"])
             matching_flats[flat2["id"]].append(flat1["id"])

-    # TODO: Merge flats
     seen_ids = []
+    duplicate_flats = []
     unique_flats_list = []
     for flat_id in [flat["id"] for flat in flats_list]:
         if flat_id in seen_ids:
             continue

-        # TODO: Compare photos
         seen_ids.extend(matching_flats[flat_id])
+        to_merge = sorted(
+            [
+                flat
+                for flat in flats_list
+                if flat["id"] in matching_flats[flat_id]
+            ],
+            key=lambda flat: next(
+                i for (i, backend) in enumerate(BACKENDS_PRECEDENCE)
+                if flat["id"].endswith(backend)
+            ),
+            reverse=True
+        )
+        unique_flats_list.append(tools.merge_dicts(*to_merge))
+        # The ID of the added merged flat will be the one of the last item
+        # in ``matching_flats``. Then, any flat object that was before in
+        # the ``matching_flats`` list is to be considered as a duplicate
+        # and should have a ``duplicate`` status.
+        duplicate_flats.extend(to_merge[:-1])
+
+    return unique_flats_list, duplicate_flats

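The scoring inside the try block is assert-driven: each matching field adds 1, a matching phone number adds 10, common photos add up to 20 scaled by the larger photo count, and any failed assert (or the TypeError raised by a None area or cost) rejects the pair through the combined except clause; 15 points are needed to call two flats duplicates. A condensed sketch of that control flow, with made-up numbers:

    def looks_like_duplicate(n_matching_fields, phones_match,
                             n_common_photos, max_number_photos):
        try:
            n_common_items = n_matching_fields
            if phones_match:
                n_common_items += 10  # counts much more than the rest
            if max_number_photos:
                assert n_common_photos > 1
                n_common_items += int(
                    20 * n_common_photos / max_number_photos
                )
            assert n_common_items >= 15  # minimal score
            return True
        except AssertionError:
            return False

    assert looks_like_duplicate(3, True, 3, 4)       # 3 + 10 + 15 = 28
    assert not looks_like_duplicate(5, False, 1, 8)  # photo check fails
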
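Finally, how a duplicate group collapses: flats are sorted so that the most trusted backend ends up last, tools.merge_dicts folds them into one flat whose ID is the last item's (as the comment in the diff notes), and every earlier entry is reported with a duplicate status. A hedged sketch in which the precedence order and the last-wins merge_dicts stand-in are simplifications, not flatisfy's actual definitions:

    BACKENDS_PRECEDENCE = ["seloger", "pap", "leboncoin"]  # illustrative

    def merge_dicts(*dicts):
        merged = {}
        for one_dict in dicts:
            merged.update(one_dict)  # later entries win, so the last ID is kept
        return merged

    group = [
        {"id": "1@leboncoin", "cost": 650},
        {"id": "2@seloger", "cost": 650, "area": 30},
    ]
    to_merge = sorted(
        group,
        key=lambda flat: next(
            i for (i, backend) in enumerate(BACKENDS_PRECEDENCE)
            if flat["id"].endswith(backend)
        ),
        reverse=True
    )
    merged = merge_dicts(*to_merge)
    assert merged["id"] == "2@seloger"  # most trusted backend wins the ID
    assert to_merge[:-1] == [{"id": "1@leboncoin", "cost": 650}]
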
@@ -236,7 +236,6 @@ def guess_stations(flats_list, config, distance_threshold=1500):

     for flat in flats_list:
         flat_station = flat.get("station", None)
-        # TODO: Use flat location field as well?

         if not flat_station:
             # Skip everything if empty station

@@ -5,6 +5,8 @@ bottle-sqlalchemy
 canister
 enum34
 future
+imagehash
+pillow
 request
 sqlalchemy
 unidecode