Add unit tests for image comparison
@ -13,17 +13,15 @@ class MemoryCache(object):
|
||||
"""
|
||||
A cache in memory.
|
||||
"""
|
||||
def __init__(self, on_miss):
|
||||
"""
|
||||
Constructor
|
||||
|
||||
:param on_miss: Function to call to retrieve item when not already
|
||||
cached.
|
||||
"""
|
||||
@staticmethod
|
||||
def on_miss(key):
|
||||
raise NotImplementedError
|
||||
|
||||
def __init__(self):
|
||||
self.hits = 0
|
||||
self.misses = 0
|
||||
self.map = {}
|
||||
self.on_miss = on_miss
|
||||
|
||||
def get(self, key):
|
||||
"""
|
||||
@ -77,11 +75,8 @@ class ImageCache(MemoryCache):
|
||||
A cache for images, stored in memory.
|
||||
"""
|
||||
@staticmethod
|
||||
def retrieve_photo(url):
|
||||
def on_miss(url):
|
||||
"""
|
||||
Helper to actually retrieve photos if not already cached.
|
||||
"""
|
||||
return requests.get(url)
|
||||
|
||||
def __init__(self):
|
||||
super(ImageCache, self).__init__(on_miss=ImageCache.retrieve_photo)
|
||||
|
@ -67,31 +67,51 @@ def get_or_compute_photo_hash(photo, photo_cache):
|
||||
return photo["hash"]
|
||||
|
||||
|
||||
def find_number_common_photos(flat1_photos, flat2_photos, photo_cache):
|
||||
def compare_photos(photo1, photo2, photo_cache, hash_threshold=10):
|
||||
"""
|
||||
Compares two photos with average hash method.
|
||||
|
||||
:param photo1: First photo url.
|
||||
:param photo2: Second photo url.
|
||||
:param photo_cache: An instance of ``ImageCache`` to use to cache images.
|
||||
:param hash_thresold: The hash threshold between two images. Usually two
|
||||
different photos have a hash difference of 30.
|
||||
:return: ``True`` if the photos are identical, else ``False``.
|
||||
"""
|
||||
try:
|
||||
hash1 = get_or_compute_photo_hash(photo1, photo_cache)
|
||||
hash2 = get_or_compute_photo_hash(photo2, photo_cache)
|
||||
|
||||
return hash1 - hash2 < hash_threshold
|
||||
except (IOError, requests.exceptions.RequestException):
|
||||
return False
|
||||
|
||||
|
||||
def find_number_common_photos(
|
||||
flat1_photos,
|
||||
flat2_photos,
|
||||
photo_cache,
|
||||
hash_threshold=10
|
||||
):
|
||||
"""
|
||||
Compute the number of common photos between the two lists of photos for the
|
||||
flats.
|
||||
|
||||
Fetch the photos and compare them with dHash method.
|
||||
Fetch the photos and compare them with average hash method.
|
||||
|
||||
:param flat1_photos: First list of flat photos. Each photo should be a
|
||||
``dict`` with (at least) a ``url`` key.
|
||||
:param flat2_photos: First list of flat photos. Each photo should be a
|
||||
:param flat2_photos: Second list of flat photos. Each photo should be a
|
||||
``dict`` with (at least) a ``url`` key.
|
||||
:param photo_cache: An instance of ``ImageCache`` to use to cache images.
|
||||
:param hash_thresold: The hash threshold between two images.
|
||||
:return: The found number of common photos.
|
||||
"""
|
||||
n_common_photos = 0
|
||||
|
||||
for photo1, photo2 in itertools.product(flat1_photos, flat2_photos):
|
||||
try:
|
||||
hash1 = get_or_compute_photo_hash(photo1, photo_cache)
|
||||
hash2 = get_or_compute_photo_hash(photo2, photo_cache)
|
||||
|
||||
if hash1 - hash2 == 0:
|
||||
if compare_photos(photo1, photo2, photo_cache, hash_threshold):
|
||||
n_common_photos += 1
|
||||
except (IOError, requests.exceptions.RequestException):
|
||||
pass
|
||||
|
||||
return n_common_photos
|
||||
|
||||
@ -182,7 +202,7 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):
|
||||
return unique_flats_list, duplicate_flats
|
||||
|
||||
|
||||
def get_duplicate_score(flat1, flat2, photo_cache):
|
||||
def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10):
|
||||
"""
|
||||
Compute the duplicate score between two flats. The higher the score, the
|
||||
more likely the two flats to be duplicates.
|
||||
@ -190,6 +210,7 @@ def get_duplicate_score(flat1, flat2, photo_cache):
|
||||
:param flat1: First flat dict.
|
||||
:param flat2: Second flat dict.
|
||||
:param photo_cache: An instance of ``ImageCache`` to use to cache images.
|
||||
:param hash_thresold: The hash threshold between two images.
|
||||
:return: The duplicate score as ``int``.
|
||||
"""
|
||||
n_common_items = 0
|
||||
@ -314,7 +335,12 @@ def deep_detect(flats_list, config):
|
||||
if flat2["id"] in matching_flats[flat1["id"]]:
|
||||
continue
|
||||
|
||||
n_common_items = get_duplicate_score(flat1, flat2, photo_cache)
|
||||
n_common_items = get_duplicate_score(
|
||||
flat1,
|
||||
flat2,
|
||||
photo_cache,
|
||||
config["duplicate_image_hash_threshold"]
|
||||
)
|
||||
|
||||
# Minimal score to consider they are duplicates
|
||||
if n_common_items >= config["duplicate_threshold"]:
|
||||
|
BIN
flatisfy/test_files/127028739-2@seloger.jpg
Normal file
After Width: | Height: | Size: 122 KiB |
BIN
flatisfy/test_files/127028739-3@seloger.jpg
Normal file
After Width: | Height: | Size: 114 KiB |
BIN
flatisfy/test_files/127028739-watermark@seloger.jpg
Normal file
After Width: | Height: | Size: 24 KiB |
BIN
flatisfy/test_files/127028739@seloger.jpg
Normal file
After Width: | Height: | Size: 81 KiB |
BIN
flatisfy/test_files/14428129-2@explorimmo.jpg
Normal file
After Width: | Height: | Size: 40 KiB |
BIN
flatisfy/test_files/14428129-3@explorimmo.jpg
Normal file
After Width: | Height: | Size: 36 KiB |
BIN
flatisfy/test_files/14428129@explorimmo.jpg
Normal file
After Width: | Height: | Size: 25 KiB |
@ -9,6 +9,8 @@ import os
|
||||
import random
|
||||
import sys
|
||||
import unittest
|
||||
import requests
|
||||
import requests_mock
|
||||
|
||||
from flatisfy import tools
|
||||
from flatisfy.filters import duplicates
|
||||
@ -19,6 +21,22 @@ LOGGER = logging.getLogger(__name__)
|
||||
TESTS_DATA_DIR = os.path.dirname(os.path.realpath(__file__)) + "/test_files/"
|
||||
|
||||
|
||||
class LocalImageCache(ImageCache):
|
||||
"""
|
||||
A local cache for images, stored in memory.
|
||||
"""
|
||||
@staticmethod
|
||||
def on_miss(path):
|
||||
"""
|
||||
Helper to actually retrieve photos if not already cached.
|
||||
"""
|
||||
url = "mock://flatisfy" + path
|
||||
with requests_mock.Mocker() as mock:
|
||||
with open(path, "rb") as fh:
|
||||
mock.get(url, content=fh.read())
|
||||
return requests.get(url)
|
||||
|
||||
|
||||
class TestTexts(unittest.TestCase):
|
||||
"""
|
||||
Checks string normalizations.
|
||||
@ -118,6 +136,68 @@ class TestPhoneNumbers(unittest.TestCase):
|
||||
)
|
||||
|
||||
|
||||
class TestPhotos(unittest.TestCase):
|
||||
IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name
|
||||
|
||||
def test_same_photo_twice(self):
|
||||
"""
|
||||
Compares a photo against itself.
|
||||
"""
|
||||
photo = {
|
||||
"url": TESTS_DATA_DIR + "127028739@seloger.jpg"
|
||||
}
|
||||
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
photo,
|
||||
photo,
|
||||
TestPhotos.IMAGE_CACHE
|
||||
))
|
||||
|
||||
def test_different_photos(self):
|
||||
"""
|
||||
Compares two different photos.
|
||||
"""
|
||||
self.assertFalse(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
||||
TestPhotos.IMAGE_CACHE
|
||||
))
|
||||
|
||||
self.assertFalse(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
|
||||
TestPhotos.IMAGE_CACHE
|
||||
))
|
||||
|
||||
def test_matching_photos(self):
|
||||
"""
|
||||
Compares two matching photos with different size and source.
|
||||
"""
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"},
|
||||
TestPhotos.IMAGE_CACHE
|
||||
))
|
||||
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"},
|
||||
TestPhotos.IMAGE_CACHE
|
||||
))
|
||||
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"},
|
||||
TestPhotos.IMAGE_CACHE
|
||||
))
|
||||
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"},
|
||||
TestPhotos.IMAGE_CACHE
|
||||
))
|
||||
|
||||
|
||||
class TestDuplicates(unittest.TestCase):
|
||||
"""
|
||||
Checks duplicates detection.
|
||||
@ -286,5 +366,9 @@ def run():
|
||||
suite = unittest.TestLoader().loadTestsFromTestCase(TestDuplicates)
|
||||
result = unittest.TextTestRunner(verbosity=2).run(suite)
|
||||
assert result.wasSuccessful()
|
||||
|
||||
suite = unittest.TestLoader().loadTestsFromTestCase(TestPhotos)
|
||||
result = unittest.TextTestRunner(verbosity=2).run(suite)
|
||||
assert result.wasSuccessful()
|
||||
except AssertionError:
|
||||
sys.exit(1)
|
||||
|
@ -10,6 +10,7 @@ future
|
||||
imagehash
|
||||
pillow
|
||||
requests
|
||||
requests_mock
|
||||
sqlalchemy
|
||||
titlecase
|
||||
unidecode
|
||||
|