Merge branch 'better-dedup' into 'master'

Optimize photo comparisons

Closes #41 and #59

See merge request !9
Author: Lucas Verney
Date:   2017-06-29 12:04:44 +02:00
Commit: 4c07fc8ba1

4 changed files with 89 additions and 12 deletions

flatisfy/filters/__init__.py

@@ -80,7 +80,7 @@ def refine_with_housing_criteria(flats_list, constraint):
         ]
     )


+@tools.timeit
 def first_pass(flats_list, constraint, config):
     """
     First filtering pass.
@@ -123,7 +123,7 @@ def first_pass(flats_list, constraint, config):
         "duplicate": duplicates_by_id + duplicates_by_urls
     }


+@tools.timeit
 def second_pass(flats_list, constraint, config):
     """
     Second filtering pass.
@@ -163,7 +163,7 @@ def second_pass(flats_list, constraint, config):
         "duplicate": []
     }


+@tools.timeit
 def third_pass(flats_list, config):
     """
     Third filtering pass.
@@ -175,6 +175,8 @@ def third_pass(flats_list, config):
     :param config: A config dict.
     :return: A dict mapping flat status and list of flat objects.
     """
+    LOGGER.info("Running third filtering pass.")
+
     # Deduplicate the list using every available data
     flats_list, duplicate_flats = duplicates.deep_detect(flats_list)

flatisfy/filters/cache.py (new file, 45 lines)

@@ -0,0 +1,45 @@
+# coding: utf-8
+"""
+Caching function for pictures.
+"""
+from __future__ import absolute_import, print_function, unicode_literals
+
+import requests
+
+
+class MemoryCache(object):
+    def __init__(self, on_miss):
+        self.hits = 0
+        self.misses = 0
+        self.map = {}
+        self.on_miss = on_miss
+
+    def get(self, key):
+        cached = self.map.get(key, None)
+        if cached is not None:
+            self.hits += 1
+            return cached
+        item = self.map[key] = self.on_miss(key)
+        self.misses += 1
+        return item
+
+    def total(self):
+        return self.hits + self.misses
+
+    def hit_rate(self):
+        assert self.total() > 0
+        return 100 * self.hits // self.total()
+
+    def miss_rate(self):
+        assert self.total() > 0
+        return 100 * self.misses // self.total()
+
+
+class ImageCache(MemoryCache):
+    @staticmethod
+    def retrieve_photo(url):
+        return requests.get(url)
+
+    def __init__(self):
+        super(self.__class__, self).__init__(on_miss=ImageCache.retrieve_photo)
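
As a quick illustration (not part of the commit), here is how MemoryCache behaves: the first get() for a key calls on_miss and memoizes the result, later calls are served from the internal map, and the counters feed the integer-percentage rate helpers.

# Usage sketch for MemoryCache; the lambda is a stand-in for any
# expensive retrieval function such as ImageCache.retrieve_photo.
from flatisfy.filters.cache import MemoryCache

cache = MemoryCache(on_miss=lambda key: key.upper())

assert cache.get("a") == "A"    # miss: on_miss("a") is called
assert cache.get("a") == "A"    # hit: served from the internal map
assert cache.get("b") == "B"    # miss

assert cache.total() == 3
assert cache.hit_rate() == 33   # 100 * 1 // 3
assert cache.miss_rate() == 66  # 100 * 2 // 3

Two design consequences worth noting: a None result is never cached (the `is not None` test treats it as a miss every time), and since requests.get returns a Response object even for HTTP error statuses, ImageCache caches error responses as well.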

flatisfy/filters/duplicates.py

@@ -16,6 +16,7 @@ import PIL.Image
 import requests

 from flatisfy import tools
+from flatisfy.filters.cache import ImageCache

 LOGGER = logging.getLogger(__name__)
@@ -60,8 +61,7 @@ def homogeneize_phone_number(number):
     return number


-def find_number_common_photos(flat1_photos, flat2_photos):
+def find_number_common_photos(photo_cache, flat1_photos, flat2_photos):
     """
     Compute the number of common photos between the two lists of photos for the
     flats.
@@ -75,13 +75,14 @@ def find_number_common_photos(flat1_photos, flat2_photos):
     :return: The found number of common photos.
     """
     n_common_photos = 0

     for photo1, photo2 in itertools.product(flat1_photos, flat2_photos):
         try:
-            req1 = requests.get(photo1["url"])
+            req1 = photo_cache.get(photo1["url"])
             im1 = PIL.Image.open(BytesIO(req1.content))
             hash1 = imagehash.average_hash(im1)

-            req2 = requests.get(photo2["url"])
+            req2 = photo_cache.get(photo2["url"])
             im2 = PIL.Image.open(BytesIO(req2.content))
             hash2 = imagehash.average_hash(im2)
@@ -89,6 +90,7 @@ def find_number_common_photos(flat1_photos, flat2_photos):
                 n_common_photos += 1
         except (IOError, requests.exceptions.RequestException):
             pass

     return n_common_photos
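
For reference, the photo matching above uses imagehash's perceptual average hash: subtracting two hashes gives the number of differing bits, and only a difference of 0 counts as a common photo, which tolerates re-encoding and resizing but not cropping. A standalone sketch, with placeholder file names:

# Standalone sketch of the hash comparison above; the file names are
# placeholders, not part of the commit.
import PIL.Image
import imagehash

im1 = PIL.Image.open("photo_a.jpg")
im2 = PIL.Image.open("photo_b.jpg")

# average_hash shrinks each image to a small grayscale grid and builds
# a binary fingerprint; subtracting two hashes counts differing bits.
hash1 = imagehash.average_hash(im1)
hash2 = imagehash.average_hash(im2)

if hash1 - hash2 == 0:
    print("Same photo, up to re-encoding.")
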
@@ -187,6 +189,9 @@ def deep_detect(flats_list):
     the flats objects that should be removed and considered as duplicates (they
     were already merged).
     """
+    photo_cache = ImageCache()
+
     LOGGER.info("Running deep duplicates detection.")
     matching_flats = collections.defaultdict(list)
     for i, flat1 in enumerate(flats_list):
@@ -250,16 +255,22 @@ def deep_detect(flats_list):
                # They should have at least one photo in common if there
                # are some photos
                if flat1["photos"] and flat2["photos"]:
-                    max_number_photos = max(len(flat1["photos"]),
-                                            len(flat2["photos"]))
                    n_common_photos = find_number_common_photos(
+                        photo_cache,
                        flat1["photos"],
                        flat2["photos"]
                    )
-                    assert n_common_photos > 1
-                    n_common_items += int(
-                        20 * n_common_photos / max_number_photos
-                    )
+
+                    min_number_photos = min(len(flat1["photos"]),
+                                            len(flat2["photos"]))
+
+                    # Either all the photos are the same, or there are at least
+                    # three common photos.
+                    if n_common_photos == min_number_photos:
+                        n_common_items += 15
+                    else:
+                        n_common_items += 5 * min(n_common_photos, 3)

                # Minimal score to consider they are duplicates
                assert n_common_items >= 15
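
The scoring change deserves a note: the old formula scaled the bonus against the larger photo set (20 * n_common_photos / max_number_photos), whereas the new rule grants the full 15-point duplicate threshold when every photo of the smaller listing matches, and otherwise 5 points per common photo, capped at three. A worked sketch of just this rule (photo_score is a hypothetical helper):

# Worked sketch of the new photo scoring rule; photo_score is a
# hypothetical helper, not part of the commit.
def photo_score(n_common_photos, min_number_photos):
    # All photos of the smaller listing match: duplicate threshold hit.
    if n_common_photos == min_number_photos:
        return 15
    # Otherwise 5 points per common photo, capped at 3 (5 * 3 == 15).
    return 5 * min(n_common_photos, 3)


assert photo_score(4, 4) == 15   # full overlap
assert photo_score(2, 5) == 10   # partial overlap: 5 points per photo
assert photo_score(7, 9) == 15   # capped at three common photos
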
@@ -281,6 +292,11 @@ def deep_detect(flats_list):
            matching_flats[flat1["id"]].append(flat2["id"])
            matching_flats[flat2["id"]].append(flat1["id"])

+    if photo_cache.total():
+        LOGGER.debug("Photo cache: hits: %d%% / misses: %d%%.",
+                     photo_cache.hit_rate(),
+                     photo_cache.miss_rate())
+
     seen_ids = []
     duplicate_flats = []
     unique_flats_list = []
seen_ids = []
duplicate_flats = []
unique_flats_list = []

flatisfy/tools.py

@@ -13,6 +13,7 @@ import json
 import logging
 import math
 import re
+import time

 import requests
 import unidecode
@@ -299,3 +300,16 @@ def get_travel_time_between(latlng_from, latlng_to, config):
             "sections": sections
         }
     return None
+
+
+def timeit(f):
+    """
+    A decorator that logs how much time was spent in the function.
+    """
+    def wrapped(*args, **kwargs):
+        before = time.time()
+        res = f(*args, **kwargs)
+        runtime = time.time() - before
+        LOGGER.info("%s -- Execution took %s seconds.", f.__name__, runtime)
+        return res
+    return wrapped
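
Usage follows the standard decorator pattern, which is how the filter passes above pick it up; a minimal sketch (slow_pass is a hypothetical example):

# Hypothetical usage sketch for tools.timeit; slow_pass is not part of
# the commit.
import logging
import time

from flatisfy import tools

logging.basicConfig(level=logging.INFO)


@tools.timeit
def slow_pass():
    time.sleep(0.1)

slow_pass()  # logs: "slow_pass -- Execution took 0.10... seconds."

One caveat of this simple wrapper: it does not copy f's metadata, so the decorated function reports its name as "wrapped" to introspection; applying functools.wraps(f) to wrapped would preserve __name__ and the docstring.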