Merge branch 'better-dedup' into 'master'
Optimize photo comparisons. Closes #41 and #59. See merge request !9.
This commit is contained in:
commit
4c07fc8ba1
@ -80,7 +80,7 @@ def refine_with_housing_criteria(flats_list, constraint):
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@tools.timeit
|
||||
def first_pass(flats_list, constraint, config):
|
||||
"""
|
||||
First filtering pass.
|
||||
@ -123,7 +123,7 @@ def first_pass(flats_list, constraint, config):
|
||||
"duplicate": duplicates_by_id + duplicates_by_urls
|
||||
}
|
||||
|
||||
|
||||
@tools.timeit
|
||||
def second_pass(flats_list, constraint, config):
|
||||
"""
|
||||
Second filtering pass.
|
||||
@ -163,7 +163,7 @@ def second_pass(flats_list, constraint, config):
|
||||
"duplicate": []
|
||||
}
|
||||
|
||||
|
||||
@tools.timeit
|
||||
def third_pass(flats_list, config):
|
||||
"""
|
||||
Third filtering pass.
|
||||
@ -175,6 +175,8 @@ def third_pass(flats_list, config):
|
||||
:param config: A config dict.
|
||||
:return: A dict mapping flat status and list of flat objects.
|
||||
"""
|
||||
LOGGER.info("Running third filtering pass.")
|
||||
|
||||
# Deduplicate the list using every available data
|
||||
flats_list, duplicate_flats = duplicates.deep_detect(flats_list)
|
||||
|
||||
|
45
flatisfy/filters/cache.py
Normal file
45
flatisfy/filters/cache.py
Normal file
@ -0,0 +1,45 @@
|
||||
# coding: utf-8
|
||||
|
||||
"""
|
||||
Caching function for pictures.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, print_function, unicode_literals
|
||||
|
||||
import requests
|
||||
|
||||
class MemoryCache(object):
    """
    A minimal in-memory cache which keeps hit and miss statistics.

    Values are stored in a dict. When a requested key is not stored yet, the
    ``on_miss`` callable is invoked with that key; its result is stored and
    returned.
    """

    def __init__(self, on_miss):
        """
        :param on_miss: A callable taking the requested key and returning the
            value to store for it.
        """
        self.hits = 0
        self.misses = 0
        self.map = {}
        self.on_miss = on_miss

    def get(self, key):
        """
        Get the value associated with a key, computing it on first access.

        :param key: A hashable key.
        :return: The stored value if the key is known, otherwise the result of
            ``on_miss(key)``, which is stored before being returned.
        """
        # Test for membership instead of comparing the stored value with
        # ``None``: a ``None`` result from ``on_miss`` is a valid cached value
        # and must not trigger a new ``on_miss`` call on every lookup.
        if key in self.map:
            self.hits += 1
            return self.map[key]

        # If ``on_miss`` raises, nothing is stored and no miss is counted.
        item = self.map[key] = self.on_miss(key)
        self.misses += 1
        return item

    def total(self):
        """
        :return: The total number of lookups counted so far (hits + misses).
        """
        return self.hits + self.misses

    def hit_rate(self):
        """
        :return: The percentage of lookups served from the cache, as an
            integer rounded down.

        .. note:: At least one lookup must have been counted; check
            ``total()`` before calling.
        """
        assert self.total() > 0
        return 100 * self.hits // self.total()

    def miss_rate(self):
        """
        :return: The percentage of lookups which required an ``on_miss`` call,
            as an integer rounded down. Because both rates are rounded down,
            ``hit_rate() + miss_rate()`` can be 99 rather than 100.

        .. note:: At least one lookup must have been counted; check
            ``total()`` before calling.
        """
        assert self.total() > 0
        return 100 * self.misses // self.total()
|
||||
|
||||
class ImageCache(MemoryCache):
    """
    A ``MemoryCache`` keyed by URL whose values are fetched over HTTP on a
    cache miss.
    """

    @staticmethod
    def retrieve_photo(url):
        """
        Fetch the given URL with an HTTP GET request.

        :param url: The URL to fetch.
        :return: The object returned by ``requests.get`` for this URL.
        """
        return requests.get(url)

    def __init__(self):
        # Name the class explicitly. ``super(self.__class__, self)`` would
        # resolve to this very ``__init__`` again when called on an instance
        # of a subclass, and recurse forever.
        super(ImageCache, self).__init__(on_miss=ImageCache.retrieve_photo)
|
@ -16,6 +16,7 @@ import PIL.Image
|
||||
import requests
|
||||
|
||||
from flatisfy import tools
|
||||
from flatisfy.filters.cache import ImageCache
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
@ -60,8 +61,7 @@ def homogeneize_phone_number(number):
|
||||
|
||||
return number
|
||||
|
||||
|
||||
def find_number_common_photos(flat1_photos, flat2_photos):
|
||||
def find_number_common_photos(photo_cache, flat1_photos, flat2_photos):
|
||||
"""
|
||||
Compute the number of common photos between the two lists of photos for the
|
||||
flats.
|
||||
@ -75,13 +75,14 @@ def find_number_common_photos(flat1_photos, flat2_photos):
|
||||
:return: The found number of common photos.
|
||||
"""
|
||||
n_common_photos = 0
|
||||
|
||||
for photo1, photo2 in itertools.product(flat1_photos, flat2_photos):
|
||||
try:
|
||||
req1 = requests.get(photo1["url"])
|
||||
req1 = photo_cache.get(photo1["url"])
|
||||
im1 = PIL.Image.open(BytesIO(req1.content))
|
||||
hash1 = imagehash.average_hash(im1)
|
||||
|
||||
req2 = requests.get(photo2["url"])
|
||||
req2 = photo_cache.get(photo2["url"])
|
||||
im2 = PIL.Image.open(BytesIO(req2.content))
|
||||
hash2 = imagehash.average_hash(im2)
|
||||
|
||||
@ -89,6 +90,7 @@ def find_number_common_photos(flat1_photos, flat2_photos):
|
||||
n_common_photos += 1
|
||||
except (IOError, requests.exceptions.RequestException):
|
||||
pass
|
||||
|
||||
return n_common_photos
|
||||
|
||||
|
||||
@ -187,6 +189,9 @@ def deep_detect(flats_list):
|
||||
the flats objects that should be removed and considered as duplicates (they
|
||||
were already merged).
|
||||
"""
|
||||
|
||||
photo_cache = ImageCache()
|
||||
|
||||
LOGGER.info("Running deep duplicates detection.")
|
||||
matching_flats = collections.defaultdict(list)
|
||||
for i, flat1 in enumerate(flats_list):
|
||||
@ -250,16 +255,22 @@ def deep_detect(flats_list):
|
||||
# They should have at least one photo in common if there
|
||||
# are some photos
|
||||
if flat1["photos"] and flat2["photos"]:
|
||||
max_number_photos = max(len(flat1["photos"]),
|
||||
len(flat2["photos"]))
|
||||
n_common_photos = find_number_common_photos(
|
||||
photo_cache,
|
||||
flat1["photos"],
|
||||
flat2["photos"]
|
||||
)
|
||||
assert n_common_photos > 1
|
||||
n_common_items += int(
|
||||
20 * n_common_photos / max_number_photos
|
||||
)
|
||||
|
||||
min_number_photos = min(len(flat1["photos"]),
|
||||
len(flat2["photos"]))
|
||||
|
||||
# Either all the photos are the same, or there are at least
|
||||
# three common photos.
|
||||
if n_common_photos == min_number_photos:
|
||||
n_common_items += 15
|
||||
else:
|
||||
n_common_items += 5 * min(n_common_photos, 3)
|
||||
|
||||
# Minimal score to consider they are duplicates
|
||||
assert n_common_items >= 15
|
||||
@ -281,6 +292,11 @@ def deep_detect(flats_list):
|
||||
matching_flats[flat1["id"]].append(flat2["id"])
|
||||
matching_flats[flat2["id"]].append(flat1["id"])
|
||||
|
||||
if photo_cache.total():
|
||||
LOGGER.debug("Photo cache: hits: %d%% / misses: %d%%.",
|
||||
photo_cache.hit_rate(),
|
||||
photo_cache.miss_rate())
|
||||
|
||||
seen_ids = []
|
||||
duplicate_flats = []
|
||||
unique_flats_list = []
|
||||
|
@ -13,6 +13,7 @@ import json
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
import time
|
||||
|
||||
import requests
|
||||
import unidecode
|
||||
@ -299,3 +300,16 @@ def get_travel_time_between(latlng_from, latlng_to, config):
|
||||
"sections": sections
|
||||
}
|
||||
return None
|
||||
|
||||
|
||||
def timeit(f):
    """
    A decorator that logs how much time was spent in the function.

    :param f: The function to wrap.
    :return: A wrapper calling ``f`` with the same arguments, logging the
        elapsed wall-clock time at INFO level and returning the result of
        ``f``. Nothing is logged if ``f`` raises.
    """
    # Imported locally so that this decorator does not rely on a module-level
    # import of ``functools``.
    import functools

    # Keep the name and docstring of the wrapped function, so that decorated
    # functions do not all show up as "wrapped" when introspected.
    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        before = time.time()
        res = f(*args, **kwargs)
        runtime = time.time() - before
        LOGGER.info("%s -- Execution took %s seconds.", f.__name__, runtime)
        return res

    return wrapped
|
||||
|
Loading…
Reference in New Issue
Block a user