Avoid computing the same photo hash multiple times, fix for #111

This commit is contained in:
Lucas Verney 2018-01-18 15:43:39 +01:00
parent deeba0e5f2
commit 7bf08adbce
2 changed files with 39 additions and 12 deletions

View File

@ -49,7 +49,25 @@ def homogeneize_phone_number(number):
return number
def find_number_common_photos(photo_cache, flat1_photos, flat2_photos):
def get_or_compute_photo_hash(photo, photo_cache):
    """
    Return the hash of a photo, computing it only on first use.

    The hash is memoized on the photo ``dict`` itself, under the ``hash``
    key, so later calls for the same photo skip fetching and hashing.

    :param photo: A photo, as a ``dict`` with (at least) a ``url`` key.
    :param photo_cache: An instance of ``ImageCache`` to use to cache images.
    :return: The value stored under ``photo["hash"]``.
    """
    if "hash" not in photo:
        # No memoized hash yet: get the image through the cache and hash it
        response = photo_cache.get(photo["url"])
        picture = PIL.Image.open(BytesIO(response.content))
        photo["hash"] = imagehash.average_hash(picture)
    return photo["hash"]
def find_number_common_photos(flat1_photos, flat2_photos, photo_cache):
"""
Compute the number of common photos between the two lists of photos for the
flats.
@ -57,22 +75,18 @@ def find_number_common_photos(photo_cache, flat1_photos, flat2_photos):
Fetch the photos and compare them with the average hash (aHash) method.
:param flat1_photos: First list of flat photos. Each photo should be a
``dict`` with a ``url`` key.
``dict`` with (at least) a ``url`` key.
:param flat2_photos: Second list of flat photos. Each photo should be a
``dict`` with a ``url`` key.
``dict`` with (at least) a ``url`` key.
:param photo_cache: An instance of ``ImageCache`` to use to cache images.
:return: The found number of common photos.
"""
n_common_photos = 0
for photo1, photo2 in itertools.product(flat1_photos, flat2_photos):
try:
req1 = photo_cache.get(photo1["url"])
im1 = PIL.Image.open(BytesIO(req1.content))
hash1 = imagehash.average_hash(im1)
req2 = photo_cache.get(photo2["url"])
im2 = PIL.Image.open(BytesIO(req2.content))
hash2 = imagehash.average_hash(im2)
hash1 = get_or_compute_photo_hash(photo1, photo_cache)
hash2 = get_or_compute_photo_hash(photo2, photo_cache)
if hash1 - hash2 == 0:
n_common_photos += 1
@ -236,9 +250,9 @@ def get_duplicate_score(flat1, flat2, photo_cache):
# are some photos
if flat1.get("photos", []) and flat2.get("photos", []):
n_common_photos = find_number_common_photos(
photo_cache,
flat1["photos"],
flat2["photos"]
flat2["photos"],
photo_cache
)
assert n_common_photos > 1

View File

@ -152,6 +152,19 @@ class Flat(BASE):
"""
return arrow.get(visit_date).naive
@validates("photos")
def validate_photos(self, _, photos):
    """
    Photos validation method.

    Strip the computed ``hash`` entry from every photo ``dict`` so that it
    is not stored in db. The photo dicts are modified in place.

    :param _: Key passed by the validation hook (unused).
    :param photos: An iterable of photos, each one a ``dict``, or ``None``.
    :return: The same ``photos`` object, with no ``hash`` key left in it.
    """
    if photos is None:
        # No photo list set: nothing to strip, pass the value through
        return photos
    for photo in photos:
        # Remove computed hash to avoid storing it in db
        photo.pop("hash", None)
    return photos
@staticmethod
def from_dict(flat_dict):
"""