From 6a0681fc99193a2feded97e20be07ce7d9802682 Mon Sep 17 00:00:00 2001
From: Benjamin Bouvier
Date: Tue, 13 Jun 2017 19:22:08 +0200
Subject: [PATCH] Optimize photo comparisons; fixes #41, fixes #59

---
 flatisfy/filters/__init__.py   |  8 +++++---
 flatisfy/filters/cache.py      | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 flatisfy/filters/duplicates.py | 35 ++++++++++++++++++++++++++++++---------
 flatisfy/tools.py              | 14 ++++++++++++++
 4 files changed, 100 insertions(+), 12 deletions(-)
 create mode 100644 flatisfy/filters/cache.py

diff --git a/flatisfy/filters/__init__.py b/flatisfy/filters/__init__.py
index f11e5a5..0086dde 100644
--- a/flatisfy/filters/__init__.py
+++ b/flatisfy/filters/__init__.py
@@ -80,7 +80,7 @@ def refine_with_housing_criteria(flats_list, constraint):
         ]
     )
 
-
+@tools.timeit
 def first_pass(flats_list, constraint, config):
     """
     First filtering pass.
@@ -123,7 +123,7 @@ def first_pass(flats_list, constraint, config):
         "duplicate": duplicates_by_id + duplicates_by_urls
     }
 
-
+@tools.timeit
 def second_pass(flats_list, constraint, config):
     """
     Second filtering pass.
@@ -163,7 +163,7 @@ def second_pass(flats_list, constraint, config):
         "duplicate": []
     }
 
-
+@tools.timeit
 def third_pass(flats_list, config):
     """
     Third filtering pass.
@@ -175,6 +175,8 @@ def third_pass(flats_list, config):
     :param config: A config dict.
     :return: A dict mapping flat status and list of flat objects.
     """
+    LOGGER.info("Running third filtering pass.")
+
     # Deduplicate the list using every available data
     flats_list, duplicate_flats = duplicates.deep_detect(flats_list)
 
diff --git a/flatisfy/filters/cache.py b/flatisfy/filters/cache.py
new file mode 100644
index 0000000..d53950f
--- /dev/null
+++ b/flatisfy/filters/cache.py
@@ -0,0 +1,55 @@
+# coding: utf-8
+
+"""
+Caching utilities for pictures.
+"""
+
+from __future__ import absolute_import, print_function, unicode_literals
+
+import requests
+
+
+class MemoryCache(object):
+    """
+    An in-memory cache, fetching missing entries with ``on_miss``.
+    """
+    def __init__(self, on_miss):
+        self.hits = 0
+        self.misses = 0
+        self.map = {}
+        self.on_miss = on_miss
+
+    def get(self, key):
+        # Serve the entry from the cache whenever possible.
+        cached = self.map.get(key, None)
+        if cached is not None:
+            self.hits += 1
+            return cached
+
+        # Otherwise, fetch it and store it for the next lookup.
+        item = self.map[key] = self.on_miss(key)
+        self.misses += 1
+        return item
+
+    def total(self):
+        return self.hits + self.misses
+
+    def hit_rate(self):
+        assert self.total() > 0
+        return 100 * self.hits // self.total()
+
+    def miss_rate(self):
+        assert self.total() > 0
+        return 100 * self.misses // self.total()
+
+
+class ImageCache(MemoryCache):
+    """
+    A memory cache for photos, downloading them on miss.
+    """
+    @staticmethod
+    def retrieve_photo(url):
+        return requests.get(url)
+
+    def __init__(self):
+        super(ImageCache, self).__init__(on_miss=ImageCache.retrieve_photo)
diff --git a/flatisfy/filters/duplicates.py b/flatisfy/filters/duplicates.py
index b75bb65..02420e0 100644
--- a/flatisfy/filters/duplicates.py
+++ b/flatisfy/filters/duplicates.py
@@ -16,6 +16,7 @@
 import PIL.Image
 import requests
 
 from flatisfy import tools
+from flatisfy.filters.cache import ImageCache
 
 LOGGER = logging.getLogger(__name__)
@@ -60,8 +61,7 @@ def homogeneize_phone_number(number):
 
     return number
 
-
-def find_number_common_photos(flat1_photos, flat2_photos):
+def find_number_common_photos(photo_cache, flat1_photos, flat2_photos):
     """
     Compute the number of common photos between the two lists of photos for
     the flats.
@@ -75,13 +75,15 @@
+    :param photo_cache: An image cache, to download each photo at most once.
     :return: The found number of common photos.
""" n_common_photos = 0 + for photo1, photo2 in itertools.product(flat1_photos, flat2_photos): try: - req1 = requests.get(photo1["url"]) + req1 = photo_cache.get(photo1["url"]) im1 = PIL.Image.open(BytesIO(req1.content)) hash1 = imagehash.average_hash(im1) - req2 = requests.get(photo2["url"]) + req2 = photo_cache.get(photo2["url"]) im2 = PIL.Image.open(BytesIO(req2.content)) hash2 = imagehash.average_hash(im2) @@ -89,6 +90,7 @@ def find_number_common_photos(flat1_photos, flat2_photos): n_common_photos += 1 except (IOError, requests.exceptions.RequestException): pass + return n_common_photos @@ -187,6 +189,9 @@ def deep_detect(flats_list): the flats objects that should be removed and considered as duplicates (they were already merged). """ + + photo_cache = ImageCache() + LOGGER.info("Running deep duplicates detection.") matching_flats = collections.defaultdict(list) for i, flat1 in enumerate(flats_list): @@ -250,16 +255,22 @@ def deep_detect(flats_list): # They should have at least one photo in common if there # are some photos if flat1["photos"] and flat2["photos"]: - max_number_photos = max(len(flat1["photos"]), - len(flat2["photos"])) n_common_photos = find_number_common_photos( + photo_cache, flat1["photos"], flat2["photos"] ) assert n_common_photos > 1 - n_common_items += int( - 20 * n_common_photos / max_number_photos - ) + + min_number_photos = min(len(flat1["photos"]), + len(flat2["photos"])) + + # Either all the photos are the same, or there are at least + # three common photos. + if n_common_photos == min_number_photos: + n_common_items += 15 + else: + n_common_items += 5 * min(n_common_photos, 3) # Minimal score to consider they are duplicates assert n_common_items >= 15 @@ -281,6 +292,11 @@ def deep_detect(flats_list): matching_flats[flat1["id"]].append(flat2["id"]) matching_flats[flat2["id"]].append(flat1["id"]) + if photo_cache.total(): + LOGGER.debug("Photo cache: hits: %d%% / misses: %d%%.", + photo_cache.hit_rate(), + photo_cache.miss_rate()) + seen_ids = [] duplicate_flats = [] unique_flats_list = [] diff --git a/flatisfy/tools.py b/flatisfy/tools.py index 86045c3..698dddc 100644 --- a/flatisfy/tools.py +++ b/flatisfy/tools.py @@ -13,6 +13,7 @@ import json import logging import math import re +import time import requests import unidecode @@ -299,3 +300,16 @@ def get_travel_time_between(latlng_from, latlng_to, config): "sections": sections } return None + + +def timeit(f): + """ + A decorator that logs how much time was spent in the function. + """ + def wrapped(*args, **kwargs): + before = time.time() + res = f(*args, **kwargs) + runtime = time.time() - before + LOGGER.info("%s -- Execution took %s seconds.", f.__name__, runtime) + return res + return wrapped