Better deduplication
Perform deeper deduplication based on all the available data, and try to match common photos.
This commit is contained in:
parent
2af742b764
commit
589bfdfb13
@ -60,7 +60,10 @@ def filter_flats(config, flats_list, fetch_details=True):
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
"new": second_pass_result["new"],
|
"new": second_pass_result["new"],
|
||||||
"duplicate": first_pass_result["duplicate"],
|
"duplicate": (
|
||||||
|
first_pass_result["duplicate"] +
|
||||||
|
second_pass_result["duplicate"]
|
||||||
|
),
|
||||||
"ignored": (
|
"ignored": (
|
||||||
first_pass_result["ignored"] + second_pass_result["ignored"]
|
first_pass_result["ignored"] + second_pass_result["ignored"]
|
||||||
)
|
)
|
||||||
|
@ -54,6 +54,8 @@ class WeboobProxy(object):
|
|||||||
flat[field] = float(flat[field])
|
flat[field] = float(flat[field])
|
||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
flat[field] = None
|
flat[field] = None
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
return flat
|
return flat
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@ -193,15 +195,23 @@ class WeboobProxy(object):
|
|||||||
(ID@BACKEND)
|
(ID@BACKEND)
|
||||||
:return: The details in JSON.
|
:return: The details in JSON.
|
||||||
"""
|
"""
|
||||||
housing = {}
|
|
||||||
flat_id, backend_name = full_flat_id.rsplit("@", 1)
|
flat_id, backend_name = full_flat_id.rsplit("@", 1)
|
||||||
|
try:
|
||||||
backend = next(
|
backend = next(
|
||||||
backend
|
backend
|
||||||
for backend in self.backends
|
for backend in self.backends
|
||||||
if backend.name == backend_name
|
if backend.name == backend_name
|
||||||
)
|
)
|
||||||
|
except StopIteration:
|
||||||
|
LOGGER.error("Backend %s is not available.", backend_name)
|
||||||
|
return "{}"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
housing = backend.get_housing(flat_id)
|
housing = backend.get_housing(flat_id)
|
||||||
|
# Otherwise, we miss the @backend afterwards
|
||||||
|
housing.id = full_flat_id
|
||||||
|
|
||||||
|
return json.dumps(housing, cls=WeboobEncoder)
|
||||||
except CallErrors as exc:
|
except CallErrors as exc:
|
||||||
# If an error occurred, just log it
|
# If an error occurred, just log it
|
||||||
LOGGER.error(
|
LOGGER.error(
|
||||||
@ -210,9 +220,6 @@ class WeboobProxy(object):
|
|||||||
str(exc)
|
str(exc)
|
||||||
)
|
)
|
||||||
|
|
||||||
housing.id = full_flat_id # Otherwise, we miss the @backend afterwards
|
|
||||||
return json.dumps(housing, cls=WeboobEncoder)
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_flats_list(config):
|
def fetch_flats_list(config):
|
||||||
"""
|
"""
|
||||||
|
@ -142,18 +142,20 @@ def second_pass(flats_list, config):
|
|||||||
# Confirm postal code
|
# Confirm postal code
|
||||||
flats_list = metadata.guess_postal_code(flats_list, config)
|
flats_list = metadata.guess_postal_code(flats_list, config)
|
||||||
|
|
||||||
# TODO: Guess the address
|
|
||||||
|
|
||||||
# Better match with stations (confirm and check better)
|
# Better match with stations (confirm and check better)
|
||||||
flats_list = metadata.guess_stations(flats_list, config)
|
flats_list = metadata.guess_stations(flats_list, config)
|
||||||
|
|
||||||
# Compute travel time to specified points
|
# Compute travel time to specified points
|
||||||
flats_list = metadata.compute_travel_times(flats_list, config)
|
flats_list = metadata.compute_travel_times(flats_list, config)
|
||||||
|
|
||||||
|
# Deduplicate the list using every available data
|
||||||
|
flats_list, duplicate_flats = duplicates.deep_detect(flats_list)
|
||||||
|
|
||||||
# Remove returned housing posts that do not match criteria
|
# Remove returned housing posts that do not match criteria
|
||||||
flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
|
flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"new": flats_list,
|
"new": flats_list,
|
||||||
"ignored": ignored_list
|
"ignored": ignored_list,
|
||||||
|
"duplicate": duplicate_flats
|
||||||
}
|
}
|
||||||
|
@ -5,7 +5,15 @@ Filtering functions to detect and merge duplicates.
|
|||||||
from __future__ import absolute_import, print_function, unicode_literals
|
from __future__ import absolute_import, print_function, unicode_literals
|
||||||
|
|
||||||
import collections
|
import collections
|
||||||
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
import imagehash
|
||||||
|
import PIL.Image
|
||||||
|
import requests
|
||||||
|
|
||||||
from flatisfy import tools
|
from flatisfy import tools
|
||||||
|
|
||||||
@ -23,6 +31,64 @@ BACKENDS_PRECEDENCE = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def homogeneize_phone_number(number):
    """
    Homogeneize the phone numbers, by stripping any whitespace, dash, dot or
    parenthesis as well as the international prefix. Assumes it is dealing
    with French phone numbers (starting with a zero and having 10 digits).

    The international prefix is recognized when written either with a
    leading ``+`` (``+33``) or with the ``00`` escape (``0033``).

    :param number: The phone number to homogeneize.
    :return: The cleaned phone number. ``None`` if the number is not valid.
    """
    if not number:
        return None

    # Drop every formatting character in a single pass.
    number = re.sub(r"[\s.\-()]", "", number)

    # Drop a two-digit international prefix written with a leading "+".
    number = re.sub(r"^\+\d\d", "", number)
    # Same prefix written with the "00" escape. Only considered when the
    # number is too long to be a national one, so that a 10-character input
    # is never truncated.
    if len(number) > 10:
        number = re.sub(r"^00\d\d", "", number)

    # Numbers given without the leading zero (typically once the
    # international prefix has been removed) get it back.
    if not number.startswith("0"):
        number = "0" + number

    # A valid number is exactly ten ASCII digits, the first one being zero.
    # Anything else (wrong length, leftover letters or symbols) is rejected.
    if not re.match(r"0[0-9]{9}$", number):
        return None

    return number
|
||||||
|
|
||||||
|
|
||||||
|
def find_number_common_photos(flat1_photos, flat2_photos):
    """
    Compute the number of common photos between the two lists of photos for the
    flats.

    Fetch the photos and compare them using their average hash (aHash), as
    computed by ``imagehash.average_hash``. Two photos are considered to be
    the same when their hashes are strictly identical.

    Each photo is downloaded and hashed at most once, whatever the number of
    pairs it is involved in. Photos which cannot be fetched or decoded are
    silently ignored and never match anything.

    :param flat1_photos: First list of flat photos. Each photo should be a
    ``dict`` with a ``url`` key.
    :param flat2_photos: Second list of flat photos. Each photo should be a
    ``dict`` with a ``url`` key.
    :return: The found number of common photos.
    """
    # URL -> hash of the photo, or ``None`` when it could not be computed.
    # Filled lazily so that nothing is fetched when there is no pair to
    # compare (one of the lists being empty).
    hashes = {}

    def get_hash(photo):
        """
        Return the (cached) hash of a photo, ``None`` if unavailable.
        """
        url = photo["url"]
        if url not in hashes:
            try:
                # Bounded wait, so that a stalled server cannot block the
                # whole deduplication pass.
                req = requests.get(url, timeout=10)
                image = PIL.Image.open(BytesIO(req.content))
                hashes[url] = imagehash.average_hash(image)
            except (IOError, requests.exceptions.RequestException):
                # Network failure or not a decodable image
                hashes[url] = None
        return hashes[url]

    n_common_photos = 0
    for photo1, photo2 in itertools.product(flat1_photos, flat2_photos):
        hash1 = get_hash(photo1)
        if hash1 is None:
            continue

        hash2 = get_hash(photo2)
        if hash2 is None:
            continue

        # Subtracting two hashes gives the number of differing bits
        if hash1 - hash2 == 0:
            n_common_photos += 1

    return n_common_photos
|
||||||
|
|
||||||
|
|
||||||
def detect(flats_list, key="id", merge=True, should_intersect=False):
|
def detect(flats_list, key="id", merge=True, should_intersect=False):
|
||||||
"""
|
"""
|
||||||
Detect obvious duplicates within a given list of flats.
|
Detect obvious duplicates within a given list of flats.
|
||||||
@ -111,11 +177,21 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):
|
|||||||
|
|
||||||
def deep_detect(flats_list):
|
def deep_detect(flats_list):
|
||||||
"""
|
"""
|
||||||
TODO
|
Deeper detection of duplicates based on any available data.
|
||||||
|
|
||||||
|
:param flats_list: A list of flats dicts.
|
||||||
|
:return: A tuple of the deduplicated list of flat dicts and the list of all
|
||||||
|
the flats objects that should be removed and considered as duplicates (they
|
||||||
|
were already merged).
|
||||||
"""
|
"""
|
||||||
|
matching_flats = collections.defaultdict(list)
|
||||||
for i, flat1 in enumerate(flats_list):
|
for i, flat1 in enumerate(flats_list):
|
||||||
|
matching_flats[flat1["id"]].append(flat1["id"])
|
||||||
for j, flat2 in enumerate(flats_list):
|
for j, flat2 in enumerate(flats_list):
|
||||||
if i < j:
|
if i <= j:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if flat2["id"] in matching_flats[flat1["id"]]:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
n_common_items = 0
|
n_common_items = 0
|
||||||
@ -157,26 +233,75 @@ def deep_detect(flats_list):
|
|||||||
)
|
)
|
||||||
n_common_items += 1
|
n_common_items += 1
|
||||||
|
|
||||||
|
# TODO: Compare texts (one is included in another? fuzzymatch?)
|
||||||
|
|
||||||
# They should have the same phone number if it was fetched for
|
# They should have the same phone number if it was fetched for
|
||||||
# both
|
# both
|
||||||
if flat1["phone"] and flat2["phone"]:
|
flat1_phone = homogeneize_phone_number(flat1["phone"])
|
||||||
homogeneize_phone_number = lambda number: (
|
flat2_phone = homogeneize_phone_number(flat2["phone"])
|
||||||
number.replace(".", "").replace(" ", "")
|
if flat1_phone and flat2_phone:
|
||||||
)
|
assert flat1_phone == flat2_phone
|
||||||
pass # TODO: Homogeneize phone numbers
|
n_common_items += 10 # Counts much more that the rest
|
||||||
|
|
||||||
# TODO: Compare texts (one is included in another? fuzzymatch?)
|
# They should have at least one photo in common if there
|
||||||
except AssertionError:
|
# are some photos
|
||||||
|
if flat1["photos"] and flat2["photos"]:
|
||||||
|
max_number_photos = max(len(flat1["photos"]),
|
||||||
|
len(flat2["photos"]))
|
||||||
|
n_common_photos = find_number_common_photos(
|
||||||
|
flat1["photos"],
|
||||||
|
flat2["photos"]
|
||||||
|
)
|
||||||
|
assert n_common_photos > 1
|
||||||
|
n_common_items += int(
|
||||||
|
20 * n_common_photos / max_number_photos
|
||||||
|
)
|
||||||
|
|
||||||
|
# Minimal score to consider they are duplicates
|
||||||
|
assert n_common_items >= 15
|
||||||
|
except (AssertionError, TypeError):
|
||||||
# Skip and consider as not duplicates whenever the conditions
|
# Skip and consider as not duplicates whenever the conditions
|
||||||
# are not met
|
# are not met
|
||||||
continue
|
|
||||||
except TypeError:
|
|
||||||
# TypeError occurs when an area or a cost is None, which should
|
# TypeError occurs when an area or a cost is None, which should
|
||||||
# not be considered as duplicates
|
# not be considered as duplicates
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# TODO: Check the number of common items
|
# Mark flats as duplicates
|
||||||
|
LOGGER.info(
|
||||||
|
("Found duplicates using deep detection: (%s, %s). "
|
||||||
|
"Score is %d."),
|
||||||
|
flat1["id"],
|
||||||
|
flat2["id"],
|
||||||
|
n_common_items
|
||||||
|
)
|
||||||
|
matching_flats[flat1["id"]].append(flat2["id"])
|
||||||
|
matching_flats[flat2["id"]].append(flat1["id"])
|
||||||
|
|
||||||
# TODO: Merge flats
|
seen_ids = []
|
||||||
|
duplicate_flats = []
|
||||||
|
unique_flats_list = []
|
||||||
|
for flat_id in [flat["id"] for flat in flats_list]:
|
||||||
|
if flat_id in seen_ids:
|
||||||
|
continue
|
||||||
|
|
||||||
# TODO: Compare photos
|
seen_ids.extend(matching_flats[flat_id])
|
||||||
|
to_merge = sorted(
|
||||||
|
[
|
||||||
|
flat
|
||||||
|
for flat in flats_list
|
||||||
|
if flat["id"] in matching_flats[flat_id]
|
||||||
|
],
|
||||||
|
key=lambda flat: next(
|
||||||
|
i for (i, backend) in enumerate(BACKENDS_PRECEDENCE)
|
||||||
|
if flat["id"].endswith(backend)
|
||||||
|
),
|
||||||
|
reverse=True
|
||||||
|
)
|
||||||
|
unique_flats_list.append(tools.merge_dicts(*to_merge))
|
||||||
|
# The ID of the added merged flat will be the one of the last item
|
||||||
|
# in ``matching_flats``. Then, any flat object that was before in
|
||||||
|
# the ``matching_flats`` list is to be considered as a duplicate
|
||||||
|
# and should have a ``duplicate`` status.
|
||||||
|
duplicate_flats.extend(to_merge[:-1])
|
||||||
|
|
||||||
|
return unique_flats_list, duplicate_flats
|
||||||
|
@ -236,7 +236,6 @@ def guess_stations(flats_list, config, distance_threshold=1500):
|
|||||||
|
|
||||||
for flat in flats_list:
|
for flat in flats_list:
|
||||||
flat_station = flat.get("station", None)
|
flat_station = flat.get("station", None)
|
||||||
# TODO: Use flat location field as well?
|
|
||||||
|
|
||||||
if not flat_station:
|
if not flat_station:
|
||||||
# Skip everything if empty station
|
# Skip everything if empty station
|
||||||
|
@ -5,6 +5,8 @@ bottle-sqlalchemy
|
|||||||
canister
|
canister
|
||||||
enum34
|
enum34
|
||||||
future
|
future
|
||||||
|
imagehash
|
||||||
|
pillow
|
||||||
requests
|
requests
|
||||||
sqlalchemy
|
sqlalchemy
|
||||||
unidecode
|
unidecode
|
||||||
|
Loading…
Reference in New Issue
Block a user