diff --git a/flatisfy/filters/cache.py b/flatisfy/filters/cache.py
index b0a73c4..fbd2541 100644
--- a/flatisfy/filters/cache.py
+++ b/flatisfy/filters/cache.py
@@ -13,17 +13,15 @@ class MemoryCache(object):
     """
     A cache in memory.
     """
-    def __init__(self, on_miss):
-        """
-        Constructor
-        :param on_miss: Function to call to retrieve item when not already
-            cached.
-        """
+    @staticmethod
+    def on_miss(key):
+        raise NotImplementedError
+
+    def __init__(self):
         self.hits = 0
         self.misses = 0
         self.map = {}
-        self.on_miss = on_miss
 
     def get(self, key):
         """
@@ -77,11 +75,8 @@ class ImageCache(MemoryCache):
     A cache for images, stored in memory.
     """
     @staticmethod
-    def retrieve_photo(url):
+    def on_miss(url):
         """
         Helper to actually retrieve photos if not already cached.
         """
         return requests.get(url)
-
-    def __init__(self):
-        super(ImageCache, self).__init__(on_miss=ImageCache.retrieve_photo)
diff --git a/flatisfy/filters/duplicates.py b/flatisfy/filters/duplicates.py
index 1cee476..2b28ca8 100644
--- a/flatisfy/filters/duplicates.py
+++ b/flatisfy/filters/duplicates.py
@@ -67,31 +67,51 @@ def get_or_compute_photo_hash(photo, photo_cache):
     return photo["hash"]
 
 
-def find_number_common_photos(flat1_photos, flat2_photos, photo_cache):
+def compare_photos(photo1, photo2, photo_cache, hash_threshold=10):
+    """
+    Compares two photos with average hash method.
+
+    :param photo1: First photo url.
+    :param photo2: Second photo url.
+    :param photo_cache: An instance of ``ImageCache`` to use to cache images.
+    :param hash_threshold: The hash threshold between two images. Usually two
+        different photos have a hash difference of 30.
+    :return: ``True`` if the photos are identical, else ``False``.
+    """
+    try:
+        hash1 = get_or_compute_photo_hash(photo1, photo_cache)
+        hash2 = get_or_compute_photo_hash(photo2, photo_cache)
+
+        return hash1 - hash2 < hash_threshold
+    except (IOError, requests.exceptions.RequestException):
+        return False
+
+
+def find_number_common_photos(
+    flat1_photos,
+    flat2_photos,
+    photo_cache,
+    hash_threshold=10
+):
     """
     Compute the number of common photos between the two lists of photos for
     the flats.
 
-    Fetch the photos and compare them with dHash method.
+    Fetch the photos and compare them with average hash method.
 
     :param flat1_photos: First list of flat photos. Each photo should be a
         ``dict`` with (at least) a ``url`` key.
-    :param flat2_photos: First list of flat photos. Each photo should be a
+    :param flat2_photos: Second list of flat photos. Each photo should be a
         ``dict`` with (at least) a ``url`` key.
     :param photo_cache: An instance of ``ImageCache`` to use to cache images.
+    :param hash_threshold: The hash threshold between two images.
     :return: The found number of common photos.
     """
     n_common_photos = 0
 
     for photo1, photo2 in itertools.product(flat1_photos, flat2_photos):
-        try:
-            hash1 = get_or_compute_photo_hash(photo1, photo_cache)
-            hash2 = get_or_compute_photo_hash(photo2, photo_cache)
-
-            if hash1 - hash2 == 0:
-                n_common_photos += 1
-        except (IOError, requests.exceptions.RequestException):
-            pass
+        if compare_photos(photo1, photo2, photo_cache, hash_threshold):
+            n_common_photos += 1
 
     return n_common_photos
 
@@ -182,7 +202,7 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):
     return unique_flats_list, duplicate_flats
 
 
-def get_duplicate_score(flat1, flat2, photo_cache):
+def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10):
     """
     Compute the duplicate score between two flats. The higher the score, the
     more likely the two flats to be duplicates.
@@ -190,6 +210,7 @@ def get_duplicate_score(flat1, flat2, photo_cache):
     :param flat1: First flat dict.
     :param flat2: Second flat dict.
     :param photo_cache: An instance of ``ImageCache`` to use to cache images.
+    :param hash_threshold: The hash threshold between two images.
     :return: The duplicate score as ``int``.
     """
     n_common_items = 0
@@ -314,7 +335,12 @@ def deep_detect(flats_list, config):
             if flat2["id"] in matching_flats[flat1["id"]]:
                 continue
 
-            n_common_items = get_duplicate_score(flat1, flat2, photo_cache)
+            n_common_items = get_duplicate_score(
+                flat1,
+                flat2,
+                photo_cache,
+                config["duplicate_image_hash_threshold"]
+            )
 
             # Minimal score to consider they are duplicates
             if n_common_items >= config["duplicate_threshold"]:
diff --git a/flatisfy/test_files/127028739-2@seloger.jpg b/flatisfy/test_files/127028739-2@seloger.jpg
new file mode 100644
index 0000000..e76ff9b
Binary files /dev/null and b/flatisfy/test_files/127028739-2@seloger.jpg differ
diff --git a/flatisfy/test_files/127028739-3@seloger.jpg b/flatisfy/test_files/127028739-3@seloger.jpg
new file mode 100644
index 0000000..4233e75
Binary files /dev/null and b/flatisfy/test_files/127028739-3@seloger.jpg differ
diff --git a/flatisfy/test_files/127028739-watermark@seloger.jpg b/flatisfy/test_files/127028739-watermark@seloger.jpg
new file mode 100644
index 0000000..5365c27
Binary files /dev/null and b/flatisfy/test_files/127028739-watermark@seloger.jpg differ
diff --git a/flatisfy/test_files/127028739@seloger.jpg b/flatisfy/test_files/127028739@seloger.jpg
new file mode 100644
index 0000000..7b6553f
Binary files /dev/null and b/flatisfy/test_files/127028739@seloger.jpg differ
diff --git a/flatisfy/test_files/14428129-2@explorimmo.jpg b/flatisfy/test_files/14428129-2@explorimmo.jpg
new file mode 100644
index 0000000..4396475
Binary files /dev/null and b/flatisfy/test_files/14428129-2@explorimmo.jpg differ
diff --git a/flatisfy/test_files/14428129-3@explorimmo.jpg b/flatisfy/test_files/14428129-3@explorimmo.jpg
new file mode 100644
index 0000000..137c09a
Binary files /dev/null and b/flatisfy/test_files/14428129-3@explorimmo.jpg differ
diff --git a/flatisfy/test_files/14428129@explorimmo.jpg b/flatisfy/test_files/14428129@explorimmo.jpg
new file mode 100644
index 0000000..cefbba3
Binary files /dev/null and b/flatisfy/test_files/14428129@explorimmo.jpg differ
diff --git a/flatisfy/tests.py b/flatisfy/tests.py
index df3e48e..26823f1 100644
--- a/flatisfy/tests.py
+++ b/flatisfy/tests.py
@@ -9,6 +9,8 @@ import os
 import random
 import sys
 import unittest
+import requests
+import requests_mock
 
 from flatisfy import tools
 from flatisfy.filters import duplicates
@@ -19,6 +21,22 @@ LOGGER = logging.getLogger(__name__)
 TESTS_DATA_DIR = os.path.dirname(os.path.realpath(__file__)) + "/test_files/"
 
 
+class LocalImageCache(ImageCache):
+    """
+    A local cache for images, stored in memory.
+    """
+    @staticmethod
+    def on_miss(path):
+        """
+        Helper to actually retrieve photos if not already cached.
+        """
+        url = "mock://flatisfy" + path
+        with requests_mock.Mocker() as mock:
+            with open(path, "rb") as fh:
+                mock.get(url, content=fh.read())
+            return requests.get(url)
+
+
 class TestTexts(unittest.TestCase):
     """
     Checks string normalizations.
@@ -118,6 +136,68 @@ class TestPhoneNumbers(unittest.TestCase):
         )
 
 
+class TestPhotos(unittest.TestCase):
+    IMAGE_CACHE = LocalImageCache()  # pylint: disable=invalid-name
+
+    def test_same_photo_twice(self):
+        """
+        Compares a photo against itself.
+        """
+        photo = {
+            "url": TESTS_DATA_DIR + "127028739@seloger.jpg"
+        }
+
+        self.assertTrue(duplicates.compare_photos(
+            photo,
+            photo,
+            TestPhotos.IMAGE_CACHE
+        ))
+
+    def test_different_photos(self):
+        """
+        Compares two different photos.
+        """
+        self.assertFalse(duplicates.compare_photos(
+            {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
+            {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
+            TestPhotos.IMAGE_CACHE
+        ))
+
+        self.assertFalse(duplicates.compare_photos(
+            {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
+            {"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
+            TestPhotos.IMAGE_CACHE
+        ))
+
+    def test_matching_photos(self):
+        """
+        Compares two matching photos with different size and source.
+        """
+        self.assertTrue(duplicates.compare_photos(
+            {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
+            {"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"},
+            TestPhotos.IMAGE_CACHE
+        ))
+
+        self.assertTrue(duplicates.compare_photos(
+            {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
+            {"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"},
+            TestPhotos.IMAGE_CACHE
+        ))
+
+        self.assertTrue(duplicates.compare_photos(
+            {"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
+            {"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"},
+            TestPhotos.IMAGE_CACHE
+        ))
+
+        self.assertTrue(duplicates.compare_photos(
+            {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
+            {"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"},
+            TestPhotos.IMAGE_CACHE
+        ))
+
+
 class TestDuplicates(unittest.TestCase):
     """
     Checks duplicates detection.
@@ -286,5 +366,9 @@ def run():
         suite = unittest.TestLoader().loadTestsFromTestCase(TestDuplicates)
         result = unittest.TextTestRunner(verbosity=2).run(suite)
         assert result.wasSuccessful()
+
+        suite = unittest.TestLoader().loadTestsFromTestCase(TestPhotos)
+        result = unittest.TextTestRunner(verbosity=2).run(suite)
+        assert result.wasSuccessful()
     except AssertionError:
         sys.exit(1)
diff --git a/requirements.txt b/requirements.txt
index ca1cda6..e42027d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ future
 imagehash
 pillow
 requests
+requests_mock
 sqlalchemy
 titlecase
 unidecode
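
Usage note (illustrative sketch, not part of the patch): the snippet below shows how the new compare_photos() helper and the reworked ImageCache fit together after this change. The URLs are placeholders, and the threshold value simply mirrors the new default and the config["duplicate_image_hash_threshold"] entry that deep_detect() now reads.

    # Illustrative only: compare two listing photos with the average hash method.
    from flatisfy.filters.cache import ImageCache
    from flatisfy.filters import duplicates

    cache = ImageCache()  # no on_miss argument anymore; on_miss is a static method

    # Placeholder URLs; compare_photos() only needs dicts with a "url" key.
    photo1 = {"url": "https://example.com/listing-photo-a.jpg"}
    photo2 = {"url": "https://example.com/listing-photo-b.jpg"}

    # hash_threshold defaults to 10; deep_detect() passes
    # config["duplicate_image_hash_threshold"] instead of a hardcoded value.
    if duplicates.compare_photos(photo1, photo2, cache, hash_threshold=10):
        print("Photos look like duplicates")
    else:
        print("Photos differ")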