Merge branch 'better-dedup' into 'master'

Optimize photo comparisons

Closes #41 and #59

See merge request !9
Lucas Verney 2017-06-29 12:04:44 +02:00
commit 4c07fc8ba1
4 changed files with 89 additions and 12 deletions

flatisfy/filters/__init__.py

@@ -80,7 +80,7 @@ def refine_with_housing_criteria(flats_list, constraint):
         ]
     )
 
+@tools.timeit
 def first_pass(flats_list, constraint, config):
     """
     First filtering pass.
@@ -123,7 +123,7 @@ def first_pass(flats_list, constraint, config):
         "duplicate": duplicates_by_id + duplicates_by_urls
     }
 
+@tools.timeit
 def second_pass(flats_list, constraint, config):
     """
     Second filtering pass.
@@ -163,7 +163,7 @@ def second_pass(flats_list, constraint, config):
         "duplicate": []
     }
 
+@tools.timeit
 def third_pass(flats_list, config):
     """
     Third filtering pass.
@@ -175,6 +175,8 @@ def third_pass(flats_list, config):
     :param config: A config dict.
     :return: A dict mapping flat status and list of flat objects.
     """
+    LOGGER.info("Running third filtering pass.")
+
     # Deduplicate the list using every available data
     flats_list, duplicate_flats = duplicates.deep_detect(flats_list)
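
For context, the tools.timeit decorator added at the end of this merge request only logs each pass's wall-clock runtime; it does not change the return value. A minimal sketch of it in use (slow_pass is a hypothetical stand-in for the real filtering passes, not code from this commit):

    import logging
    import time

    logging.basicConfig(level=logging.INFO)
    LOGGER = logging.getLogger(__name__)

    def timeit(f):
        # Same shape as the decorator added in tools.py below.
        def wrapped(*args, **kwargs):
            before = time.time()
            res = f(*args, **kwargs)
            runtime = time.time() - before
            LOGGER.info("%s -- Execution took %s seconds.", f.__name__, runtime)
            return res
        return wrapped

    @timeit
    def slow_pass(flats_list):
        time.sleep(0.5)  # hypothetical work
        return flats_list

    slow_pass([])  # logs something like: slow_pass -- Execution took 0.5004... seconds.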

flatisfy/filters/cache.py (new file, 45 additions)

@@ -0,0 +1,45 @@
# coding: utf-8
"""
Caching function for pictures.
"""
from __future__ import absolute_import, print_function, unicode_literals

import requests


class MemoryCache(object):
    def __init__(self, on_miss):
        self.hits = 0
        self.misses = 0
        self.map = {}
        self.on_miss = on_miss

    def get(self, key):
        cached = self.map.get(key, None)
        if cached is not None:
            self.hits += 1
            return cached
        item = self.map[key] = self.on_miss(key)
        self.misses += 1
        return item

    def total(self):
        return self.hits + self.misses

    def hit_rate(self):
        assert self.total() > 0
        return 100 * self.hits // self.total()

    def miss_rate(self):
        assert self.total() > 0
        return 100 * self.misses // self.total()


class ImageCache(MemoryCache):
    @staticmethod
    def retrieve_photo(url):
        return requests.get(url)

    def __init__(self):
        super(self.__class__, self).__init__(on_miss=ImageCache.retrieve_photo)
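
A quick illustration, not part of the commit, of how MemoryCache behaves: on_miss runs once per distinct key, and later lookups are served from the in-memory dict. The fake_fetch loader is made up for the example:

    calls = []

    def fake_fetch(url):
        # Hypothetical stand-in for ImageCache.retrieve_photo.
        calls.append(url)
        return "payload for %s" % url

    cache = MemoryCache(on_miss=fake_fetch)
    cache.get("http://example.com/a.jpg")  # miss: fake_fetch is called
    cache.get("http://example.com/a.jpg")  # hit: served from the dict
    assert calls == ["http://example.com/a.jpg"]
    assert cache.hit_rate() == 50 and cache.miss_rate() == 50

One caveat in the file above: super(self.__class__, self) happens to work because ImageCache is never subclassed, but it would recurse forever in any subclass; super(ImageCache, self) is the safer spelling.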

flatisfy/filters/duplicates.py

@@ -16,6 +16,7 @@ import PIL.Image
 import requests
 
 from flatisfy import tools
+from flatisfy.filters.cache import ImageCache
 
 LOGGER = logging.getLogger(__name__)
@@ -60,8 +61,7 @@ def homogeneize_phone_number(number):
     return number
 
-def find_number_common_photos(flat1_photos, flat2_photos):
+def find_number_common_photos(photo_cache, flat1_photos, flat2_photos):
     """
     Compute the number of common photos between the two lists of photos for the
     flats.
@@ -75,13 +75,14 @@ def find_number_common_photos(flat1_photos, flat2_photos):
     :return: The found number of common photos.
     """
     n_common_photos = 0
 
     for photo1, photo2 in itertools.product(flat1_photos, flat2_photos):
         try:
-            req1 = requests.get(photo1["url"])
+            req1 = photo_cache.get(photo1["url"])
             im1 = PIL.Image.open(BytesIO(req1.content))
             hash1 = imagehash.average_hash(im1)
 
-            req2 = requests.get(photo2["url"])
+            req2 = photo_cache.get(photo2["url"])
             im2 = PIL.Image.open(BytesIO(req2.content))
             hash2 = imagehash.average_hash(im2)
@@ -89,6 +90,7 @@ def find_number_common_photos(flat1_photos, flat2_photos):
                 n_common_photos += 1
         except (IOError, requests.exceptions.RequestException):
             pass
+
     return n_common_photos
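
For background on the hashing used above: imagehash.average_hash computes a 64-bit perceptual hash from a downscaled grayscale image, and subtracting two hashes yields their Hamming distance, with 0 meaning visually identical. A self-contained sketch with synthetic images instead of listing photos:

    import PIL.Image
    import imagehash

    # Two images with identical content hash to the same value.
    im1 = PIL.Image.new("RGB", (64, 64), color="white")
    im2 = PIL.Image.new("RGB", (64, 64), color="white")

    hash1 = imagehash.average_hash(im1)
    hash2 = imagehash.average_hash(im2)

    # Hash subtraction is the Hamming distance between the 64 bits.
    assert hash1 - hash2 == 0
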
@@ -187,6 +189,9 @@ def deep_detect(flats_list):
     the flats objects that should be removed and considered as duplicates (they
     were already merged).
     """
+    photo_cache = ImageCache()
+
     LOGGER.info("Running deep duplicates detection.")
     matching_flats = collections.defaultdict(list)
     for i, flat1 in enumerate(flats_list):
@@ -250,16 +255,22 @@ def deep_detect(flats_list):
                 # They should have at least one photo in common if there
                 # are some photos
                 if flat1["photos"] and flat2["photos"]:
-                    max_number_photos = max(len(flat1["photos"]),
-                                            len(flat2["photos"]))
                     n_common_photos = find_number_common_photos(
+                        photo_cache,
                         flat1["photos"],
                         flat2["photos"]
                     )
                     assert n_common_photos > 1
-                    n_common_items += int(
-                        20 * n_common_photos / max_number_photos
-                    )
+
+                    min_number_photos = min(len(flat1["photos"]),
+                                            len(flat2["photos"]))
+                    # Either all the photos are the same, or there are at least
+                    # three common photos.
+                    if n_common_photos == min_number_photos:
+                        n_common_items += 15
+                    else:
+                        n_common_items += 5 * min(n_common_photos, 3)
 
             # Minimal score to consider they are duplicates
             assert n_common_items >= 15
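
A worked restatement of the new scoring branch (photo_score is a hypothetical helper written for illustration): when every photo of the smaller listing matches, the pair earns the full 15 points needed for the duplicate threshold; otherwise each common photo is worth 5 points, capped at three, so three common photos also reach 15:

    def photo_score(n_common_photos, n_photos1, n_photos2):
        # Mirrors the branch added in the hunk above.
        min_number_photos = min(n_photos1, n_photos2)
        if n_common_photos == min_number_photos:
            return 15  # the whole smaller photo set matched
        return 5 * min(n_common_photos, 3)

    assert photo_score(4, 6, 4) == 15  # full overlap of the smaller set
    assert photo_score(2, 6, 4) == 10  # partial overlap
    assert photo_score(3, 6, 4) == 15  # three common photos hit the cap
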
@@ -281,6 +292,11 @@ def deep_detect(flats_list):
             matching_flats[flat1["id"]].append(flat2["id"])
             matching_flats[flat2["id"]].append(flat1["id"])
 
+    if photo_cache.total():
+        LOGGER.debug("Photo cache: hits: %d%% / misses: %d%%.",
+                     photo_cache.hit_rate(),
+                     photo_cache.miss_rate())
+
     seen_ids = []
     duplicate_flats = []
     unique_flats_list = []
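
Note that the percentages logged above come from hit_rate and miss_rate, which use floor division, so the two figures can sum to 99 rather than 100. A minimal sketch (the lambda loader is just a stand-in):

    cache = MemoryCache(on_miss=lambda key: key)
    cache.get("a")  # miss
    cache.get("a")  # hit
    cache.get("b")  # miss
    print(cache.hit_rate(), cache.miss_rate())  # 33 66, truncated by //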

flatisfy/tools.py

@@ -13,6 +13,7 @@ import json
 import logging
 import math
 import re
+import time
 
 import requests
 import unidecode
@@ -299,3 +300,16 @@ def get_travel_time_between(latlng_from, latlng_to, config):
         "sections": sections
     }
     return None
+
+
+def timeit(f):
+    """
+    A decorator that logs how much time was spent in the function.
+    """
+    def wrapped(*args, **kwargs):
+        before = time.time()
+        res = f(*args, **kwargs)
+        runtime = time.time() - before
+        LOGGER.info("%s -- Execution took %s seconds.", f.__name__, runtime)
+        return res
+    return wrapped
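
One possible refinement, not part of this commit: decorating wrapped with functools.wraps(f) preserves the original function's __name__ and docstring, which keeps the logged name accurate even if decorators are stacked. A sketch under that assumption:

    import functools
    import logging
    import time

    LOGGER = logging.getLogger(__name__)

    def timeit(f):
        """
        A decorator that logs how much time was spent in the function.
        """
        @functools.wraps(f)  # copy f.__name__ and f.__doc__ onto the wrapper
        def wrapped(*args, **kwargs):
            before = time.time()
            res = f(*args, **kwargs)
            runtime = time.time() - before
            LOGGER.info("%s -- Execution took %s seconds.", f.__name__, runtime)
            return res
        return wrapped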