Address a few nits in review

This commit is contained in:
Lucas Verney 2018-01-21 11:52:52 +01:00
parent 9fa2177087
commit 08599d91de
4 changed files with 48 additions and 22 deletions

View File

@ -49,6 +49,8 @@ DEFAULT_CONFIG = {
"max_distance_housing_station": 1500, "max_distance_housing_station": 1500,
# Score to consider two flats as being duplicates # Score to consider two flats as being duplicates
"duplicate_threshold": 15, "duplicate_threshold": 15,
# Score to consider two images as being duplicates through hash comparison
"duplicate_image_hash_threshold": 10,
# Navitia API key # Navitia API key
"navitia_api_key": None, "navitia_api_key": None,
# Number of filtering passes to run # Number of filtering passes to run
@ -144,6 +146,7 @@ def validate_config(config, check_with_data):
assert isinstance(config["store_personal_data"], bool) assert isinstance(config["store_personal_data"], bool)
assert isinstance(config["max_distance_housing_station"], (int, float)) assert isinstance(config["max_distance_housing_station"], (int, float))
assert isinstance(config["duplicate_threshold"], int) assert isinstance(config["duplicate_threshold"], int)
assert isinstance(config["duplicate_image_hash_threshold"], int)
# Ensure constraints are ok # Ensure constraints are ok
assert config["constraints"] assert config["constraints"]

View File

@ -13,9 +13,15 @@ class MemoryCache(object):
""" """
A cache in memory. A cache in memory.
""" """
@staticmethod @staticmethod
def on_miss(key): def on_miss(key):
"""
Method to be called whenever an object is requested from the cache but
was not already cached. Typically, make an HTTP query to fetch it.
:param key: Key of the requested object.
:return: The object content.
"""
raise NotImplementedError raise NotImplementedError
def __init__(self): def __init__(self):

View File

@ -67,14 +67,14 @@ def get_or_compute_photo_hash(photo, photo_cache):
return photo["hash"] return photo["hash"]
def compare_photos(photo1, photo2, photo_cache, hash_threshold=10): def compare_photos(photo1, photo2, photo_cache, hash_threshold):
""" """
Compares two photos with average hash method. Compares two photos with average hash method.
:param photo1: First photo url. :param photo1: First photo url.
:param photo2: Second photo url. :param photo2: Second photo url.
:param photo_cache: An instance of ``ImageCache`` to use to cache images. :param photo_cache: An instance of ``ImageCache`` to use to cache images.
:param hash_thresold: The hash threshold between two images. Usually two :param hash_threshold: The hash threshold between two images. Usually two
different photos have a hash difference of 30. different photos have a hash difference of 30.
:return: ``True`` if the photos are identical, else ``False``. :return: ``True`` if the photos are identical, else ``False``.
""" """
@ -91,7 +91,7 @@ def find_number_common_photos(
flat1_photos, flat1_photos,
flat2_photos, flat2_photos,
photo_cache, photo_cache,
hash_threshold=10 hash_threshold
): ):
""" """
Compute the number of common photos between the two lists of photos for the Compute the number of common photos between the two lists of photos for the
@ -104,7 +104,7 @@ def find_number_common_photos(
:param flat2_photos: Second list of flat photos. Each photo should be a :param flat2_photos: Second list of flat photos. Each photo should be a
``dict`` with (at least) a ``url`` key. ``dict`` with (at least) a ``url`` key.
:param photo_cache: An instance of ``ImageCache`` to use to cache images. :param photo_cache: An instance of ``ImageCache`` to use to cache images.
:param hash_thresold: The hash threshold between two images. :param hash_threshold: The hash threshold between two images.
:return: The found number of common photos. :return: The found number of common photos.
""" """
n_common_photos = 0 n_common_photos = 0
@ -202,7 +202,7 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):
return unique_flats_list, duplicate_flats return unique_flats_list, duplicate_flats
def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10): def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold):
""" """
Compute the duplicate score between two flats. The higher the score, the Compute the duplicate score between two flats. The higher the score, the
more likely the two flats to be duplicates. more likely the two flats to be duplicates.
@ -210,7 +210,7 @@ def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10):
:param flat1: First flat dict. :param flat1: First flat dict.
:param flat2: Second flat dict. :param flat2: Second flat dict.
:param photo_cache: An instance of ``ImageCache`` to use to cache images. :param photo_cache: An instance of ``ImageCache`` to use to cache images.
:param hash_thresold: The hash threshold between two images. :param hash_threshold: The hash threshold between two images.
:return: The duplicate score as ``int``. :return: The duplicate score as ``int``.
""" """
n_common_items = 0 n_common_items = 0
@ -273,7 +273,8 @@ def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10):
n_common_photos = find_number_common_photos( n_common_photos = find_number_common_photos(
flat1["photos"], flat1["photos"],
flat2["photos"], flat2["photos"],
photo_cache photo_cache,
hash_threshold
) )
assert n_common_photos > 1 assert n_common_photos > 1

View File

@ -138,6 +138,7 @@ class TestPhoneNumbers(unittest.TestCase):
class TestPhotos(unittest.TestCase): class TestPhotos(unittest.TestCase):
IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
def test_same_photo_twice(self): def test_same_photo_twice(self):
""" """
@ -150,7 +151,8 @@ class TestPhotos(unittest.TestCase):
self.assertTrue(duplicates.compare_photos( self.assertTrue(duplicates.compare_photos(
photo, photo,
photo, photo,
TestPhotos.IMAGE_CACHE TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
)) ))
def test_different_photos(self): def test_different_photos(self):
@ -160,13 +162,15 @@ class TestPhotos(unittest.TestCase):
self.assertFalse(duplicates.compare_photos( self.assertFalse(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
TestPhotos.IMAGE_CACHE TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
)) ))
self.assertFalse(duplicates.compare_photos( self.assertFalse(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
TestPhotos.IMAGE_CACHE TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
)) ))
def test_matching_photos(self): def test_matching_photos(self):
@ -176,25 +180,29 @@ class TestPhotos(unittest.TestCase):
self.assertTrue(duplicates.compare_photos( self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
{"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"}, {"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"},
TestPhotos.IMAGE_CACHE TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
)) ))
self.assertTrue(duplicates.compare_photos( self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
{"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"}, {"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"},
TestPhotos.IMAGE_CACHE TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
)) ))
self.assertTrue(duplicates.compare_photos( self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
{"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"}, {"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"},
TestPhotos.IMAGE_CACHE TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
)) ))
self.assertTrue(duplicates.compare_photos( self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
{"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"},
TestPhotos.IMAGE_CACHE TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
)) ))
@ -204,6 +212,7 @@ class TestDuplicates(unittest.TestCase):
""" """
DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name
DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
IMAGE_CACHE = ImageCache() # pylint: disable=invalid-name IMAGE_CACHE = ImageCache() # pylint: disable=invalid-name
@staticmethod @staticmethod
@ -246,7 +255,8 @@ class TestDuplicates(unittest.TestCase):
flat1 = self.generate_fake_flat() flat1 = self.generate_fake_flat()
flat2 = copy.deepcopy(flat1) flat2 = copy.deepcopy(flat1)
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flat1, flat2, TestDuplicates.IMAGE_CACHE flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
@ -261,7 +271,8 @@ class TestDuplicates(unittest.TestCase):
flat2["cost"] += 1000 flat2["cost"] += 1000
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flat1, flat2, TestDuplicates.IMAGE_CACHE flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
@ -277,7 +288,8 @@ class TestDuplicates(unittest.TestCase):
flat2["rooms"] += 1 flat2["rooms"] += 1
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flat1, flat2, TestDuplicates.IMAGE_CACHE flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
@ -292,7 +304,8 @@ class TestDuplicates(unittest.TestCase):
flat2["area"] += 10 flat2["area"] += 10
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flat1, flat2, TestDuplicates.IMAGE_CACHE flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
@ -309,7 +322,8 @@ class TestDuplicates(unittest.TestCase):
flat2["area"] = 50.37 flat2["area"] = 50.37
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flat1, flat2, TestDuplicates.IMAGE_CACHE flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
@ -325,7 +339,8 @@ class TestDuplicates(unittest.TestCase):
flat2["phone"] = "0708091011" flat2["phone"] = "0708091011"
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flat1, flat2, TestDuplicates.IMAGE_CACHE flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
@ -342,7 +357,8 @@ class TestDuplicates(unittest.TestCase):
) )
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flats[0], flats[1], TestDuplicates.IMAGE_CACHE flats[0], flats[1],
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS