diff --git a/flatisfy/config.py b/flatisfy/config.py index 19c82c0..68df028 100644 --- a/flatisfy/config.py +++ b/flatisfy/config.py @@ -49,6 +49,8 @@ DEFAULT_CONFIG = { "max_distance_housing_station": 1500, # Score to consider two flats as being duplicates "duplicate_threshold": 15, + # Score to consider two images as being duplicates through hash comparison + "duplicate_image_hash_threshold": 10, # Navitia API key "navitia_api_key": None, # Number of filtering passes to run @@ -144,6 +146,7 @@ def validate_config(config, check_with_data): assert isinstance(config["store_personal_data"], bool) assert isinstance(config["max_distance_housing_station"], (int, float)) assert isinstance(config["duplicate_threshold"], int) + assert isinstance(config["duplicate_image_hash_threshold"], int) # Ensure constraints are ok assert config["constraints"] diff --git a/flatisfy/filters/cache.py b/flatisfy/filters/cache.py index fbd2541..a98b74b 100644 --- a/flatisfy/filters/cache.py +++ b/flatisfy/filters/cache.py @@ -13,9 +13,15 @@ class MemoryCache(object): """ A cache in memory. """ - @staticmethod def on_miss(key): + """ + Method to be called whenever an object is requested from the cache but + was not already cached. Typically, make a HTTP query to fetch it. + + :param key: Key of the requested object. + :return: The object content. + """ raise NotImplementedError def __init__(self): diff --git a/flatisfy/filters/duplicates.py b/flatisfy/filters/duplicates.py index 2b28ca8..b5e2803 100644 --- a/flatisfy/filters/duplicates.py +++ b/flatisfy/filters/duplicates.py @@ -67,14 +67,14 @@ def get_or_compute_photo_hash(photo, photo_cache): return photo["hash"] -def compare_photos(photo1, photo2, photo_cache, hash_threshold=10): +def compare_photos(photo1, photo2, photo_cache, hash_threshold): """ Compares two photos with average hash method. :param photo1: First photo url. :param photo2: Second photo url. :param photo_cache: An instance of ``ImageCache`` to use to cache images. - :param hash_thresold: The hash threshold between two images. Usually two + :param hash_threshold: The hash threshold between two images. Usually two different photos have a hash difference of 30. :return: ``True`` if the photos are identical, else ``False``. """ @@ -91,7 +91,7 @@ def find_number_common_photos( flat1_photos, flat2_photos, photo_cache, - hash_threshold=10 + hash_threshold ): """ Compute the number of common photos between the two lists of photos for the @@ -104,7 +104,7 @@ def find_number_common_photos( :param flat2_photos: Second list of flat photos. Each photo should be a ``dict`` with (at least) a ``url`` key. :param photo_cache: An instance of ``ImageCache`` to use to cache images. - :param hash_thresold: The hash threshold between two images. + :param hash_threshold: The hash threshold between two images. :return: The found number of common photos. """ n_common_photos = 0 @@ -202,7 +202,7 @@ def detect(flats_list, key="id", merge=True, should_intersect=False): return unique_flats_list, duplicate_flats -def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10): +def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold): """ Compute the duplicate score between two flats. The higher the score, the more likely the two flats to be duplicates. @@ -210,7 +210,7 @@ def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10): :param flat1: First flat dict. :param flat2: Second flat dict. :param photo_cache: An instance of ``ImageCache`` to use to cache images. - :param hash_thresold: The hash threshold between two images. + :param hash_threshold: The hash threshold between two images. :return: The duplicate score as ``int``. """ n_common_items = 0 @@ -273,7 +273,8 @@ def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10): n_common_photos = find_number_common_photos( flat1["photos"], flat2["photos"], - photo_cache + photo_cache, + hash_threshold ) assert n_common_photos > 1 diff --git a/flatisfy/tests.py b/flatisfy/tests.py index 26823f1..15a5d57 100644 --- a/flatisfy/tests.py +++ b/flatisfy/tests.py @@ -138,6 +138,7 @@ class TestPhoneNumbers(unittest.TestCase): class TestPhotos(unittest.TestCase): IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name + HASH_THRESHOLD = 10 # pylint: disable=invalid-name def test_same_photo_twice(self): """ @@ -150,7 +151,8 @@ class TestPhotos(unittest.TestCase): self.assertTrue(duplicates.compare_photos( photo, photo, - TestPhotos.IMAGE_CACHE + TestPhotos.IMAGE_CACHE, + TestPhotos.HASH_THRESHOLD )) def test_different_photos(self): @@ -160,13 +162,15 @@ class TestPhotos(unittest.TestCase): self.assertFalse(duplicates.compare_photos( {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"}, - TestPhotos.IMAGE_CACHE + TestPhotos.IMAGE_CACHE, + TestPhotos.HASH_THRESHOLD )) self.assertFalse(duplicates.compare_photos( {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"}, - TestPhotos.IMAGE_CACHE + TestPhotos.IMAGE_CACHE, + TestPhotos.HASH_THRESHOLD )) def test_matching_photos(self): @@ -176,25 +180,29 @@ class TestPhotos(unittest.TestCase): self.assertTrue(duplicates.compare_photos( {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"}, {"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"}, - TestPhotos.IMAGE_CACHE + TestPhotos.IMAGE_CACHE, + TestPhotos.HASH_THRESHOLD )) self.assertTrue(duplicates.compare_photos( {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"}, {"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"}, - TestPhotos.IMAGE_CACHE + TestPhotos.IMAGE_CACHE, + TestPhotos.HASH_THRESHOLD )) self.assertTrue(duplicates.compare_photos( {"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"}, {"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"}, - TestPhotos.IMAGE_CACHE + TestPhotos.IMAGE_CACHE, + TestPhotos.HASH_THRESHOLD )) self.assertTrue(duplicates.compare_photos( {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"}, - TestPhotos.IMAGE_CACHE + TestPhotos.IMAGE_CACHE, + TestPhotos.HASH_THRESHOLD )) @@ -204,6 +212,7 @@ class TestDuplicates(unittest.TestCase): """ DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name + HASH_THRESHOLD = 10 # pylint: disable=invalid-name IMAGE_CACHE = ImageCache() # pylint: disable=invalid-name @staticmethod @@ -246,7 +255,8 @@ class TestDuplicates(unittest.TestCase): flat1 = self.generate_fake_flat() flat2 = copy.deepcopy(flat1) score = duplicates.get_duplicate_score( - flat1, flat2, TestDuplicates.IMAGE_CACHE + flat1, flat2, + TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD ) self.assertTrue( score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS @@ -261,7 +271,8 @@ class TestDuplicates(unittest.TestCase): flat2["cost"] += 1000 score = duplicates.get_duplicate_score( - flat1, flat2, TestDuplicates.IMAGE_CACHE + flat1, flat2, + TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD ) self.assertTrue( score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS @@ -277,7 +288,8 @@ class TestDuplicates(unittest.TestCase): flat2["rooms"] += 1 score = duplicates.get_duplicate_score( - flat1, flat2, TestDuplicates.IMAGE_CACHE + flat1, flat2, + TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD ) self.assertTrue( score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS @@ -292,7 +304,8 @@ class TestDuplicates(unittest.TestCase): flat2["area"] += 10 score = duplicates.get_duplicate_score( - flat1, flat2, TestDuplicates.IMAGE_CACHE + flat1, flat2, + TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD ) self.assertTrue( score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS @@ -309,7 +322,8 @@ class TestDuplicates(unittest.TestCase): flat2["area"] = 50.37 score = duplicates.get_duplicate_score( - flat1, flat2, TestDuplicates.IMAGE_CACHE + flat1, flat2, + TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD ) self.assertTrue( score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS @@ -325,7 +339,8 @@ class TestDuplicates(unittest.TestCase): flat2["phone"] = "0708091011" score = duplicates.get_duplicate_score( - flat1, flat2, TestDuplicates.IMAGE_CACHE + flat1, flat2, + TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD ) self.assertTrue( score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS @@ -342,7 +357,8 @@ class TestDuplicates(unittest.TestCase): ) score = duplicates.get_duplicate_score( - flats[0], flats[1], TestDuplicates.IMAGE_CACHE + flats[0], flats[1], + TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD ) self.assertTrue( score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS