Address a few nits in review

Lucas Verney 2018-01-21 11:52:52 +01:00
parent 9fa2177087
commit 08599d91de
4 changed files with 48 additions and 22 deletions

View File

@@ -49,6 +49,8 @@ DEFAULT_CONFIG = {
"max_distance_housing_station": 1500,
# Score to consider two flats as being duplicates
"duplicate_threshold": 15,
# Score to consider two images as being duplicates through hash comparison
"duplicate_image_hash_threshold": 10,
# Navitia API key
"navitia_api_key": None,
# Number of filtering passes to run
@@ -144,6 +146,7 @@ def validate_config(config, check_with_data):
assert isinstance(config["store_personal_data"], bool)
assert isinstance(config["max_distance_housing_station"], (int, float))
assert isinstance(config["duplicate_threshold"], int)
assert isinstance(config["duplicate_image_hash_threshold"], int)
# Ensure constraints are ok
assert config["constraints"]
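For illustration, a minimal sketch of how a user value for the new key would override the default and satisfy the check added above; the merging code is an assumption, only the key name and the isinstance assertion come from this diff.

config = dict(DEFAULT_CONFIG)                  # start from the defaults above
config["duplicate_image_hash_threshold"] = 8   # hypothetical stricter user override

# Mirrors the new check in validate_config()
assert isinstance(config["duplicate_image_hash_threshold"], int)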

View File

@@ -13,9 +13,15 @@ class MemoryCache(object):
"""
A cache in memory.
"""
@staticmethod
def on_miss(key):
"""
Method to be called whenever an object is requested from the cache but
was not already cached. Typically, make a HTTP query to fetch it.
:param key: Key of the requested object.
:return: The object content.
"""
raise NotImplementedError
def __init__(self):

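For context on the new hook, here is a hedged sketch of a concrete subclass that fetches photos over HTTP on a cache miss; the requests/Pillow body is only an illustration, not necessarily the exact ImageCache implementation used elsewhere in this commit.

import requests
from io import BytesIO
from PIL import Image


class HTTPImageCache(MemoryCache):  # MemoryCache as defined in this module
    """
    Hypothetical cache of images fetched over HTTP (illustration only).
    """
    @staticmethod
    def on_miss(url):
        # Download the photo and decode it with Pillow; error handling is
        # omitted in this sketch.
        response = requests.get(url, timeout=10)
        return Image.open(BytesIO(response.content))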
View File

@@ -67,14 +67,14 @@ def get_or_compute_photo_hash(photo, photo_cache):
return photo["hash"]
def compare_photos(photo1, photo2, photo_cache, hash_threshold=10):
def compare_photos(photo1, photo2, photo_cache, hash_threshold):
"""
Compares two photos with average hash method.
:param photo1: First photo url.
:param photo2: Second photo url.
:param photo_cache: An instance of ``ImageCache`` to use to cache images.
:param hash_thresold: The hash threshold between two images. Usually two
:param hash_threshold: The hash threshold between two images. Usually two
different photos have a hash difference of 30.
:return: ``True`` if the photos are identical, else ``False``.
"""
@@ -91,7 +91,7 @@ def find_number_common_photos(
flat1_photos,
flat2_photos,
photo_cache,
hash_threshold=10
hash_threshold
):
"""
Compute the number of common photos between the two lists of photos for the
@@ -104,7 +104,7 @@ def find_number_common_photos(
:param flat2_photos: Second list of flat photos. Each photo should be a
``dict`` with (at least) a ``url`` key.
:param photo_cache: An instance of ``ImageCache`` to use to cache images.
:param hash_thresold: The hash threshold between two images.
:param hash_threshold: The hash threshold between two images.
:return: The found number of common photos.
"""
n_common_photos = 0
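Since the default was dropped here as well, the threshold now has to be threaded through explicitly. A hedged usage sketch with placeholder photo lists (each photo dict needs at least a ``url`` key, as documented above; ``photo_cache`` stands for any ImageCache-like instance):

n_common = find_number_common_photos(
    [{"url": "http://example.com/flat1-1.jpg"},   # hypothetical URLs
     {"url": "http://example.com/flat1-2.jpg"}],
    [{"url": "http://example.com/flat2-1.jpg"}],
    photo_cache,
    10,  # e.g. config["duplicate_image_hash_threshold"]
)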
@@ -202,7 +202,7 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):
return unique_flats_list, duplicate_flats
def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10):
def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold):
"""
Compute the duplicate score between two flats. The higher the score, the
more likely the two flats to be duplicates.
@@ -210,7 +210,7 @@ def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10):
:param flat1: First flat dict.
:param flat2: Second flat dict.
:param photo_cache: An instance of ``ImageCache`` to use to cache images.
:param hash_thresold: The hash threshold between two images.
:param hash_threshold: The hash threshold between two images.
:return: The duplicate score as ``int``.
"""
n_common_items = 0
@@ -273,7 +273,8 @@ def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10):
n_common_photos = find_number_common_photos(
flat1["photos"],
flat2["photos"],
photo_cache
photo_cache,
hash_threshold
)
assert n_common_photos > 1

View File

@@ -138,6 +138,7 @@ class TestPhoneNumbers(unittest.TestCase):
class TestPhotos(unittest.TestCase):
IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
def test_same_photo_twice(self):
"""
@@ -150,7 +151,8 @@ class TestPhotos(unittest.TestCase):
self.assertTrue(duplicates.compare_photos(
photo,
photo,
TestPhotos.IMAGE_CACHE
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
))
def test_different_photos(self):
@@ -160,13 +162,15 @@ class TestPhotos(unittest.TestCase):
self.assertFalse(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
TestPhotos.IMAGE_CACHE
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
))
self.assertFalse(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
TestPhotos.IMAGE_CACHE
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
))
def test_matching_photos(self):
@@ -176,25 +180,29 @@ class TestPhotos(unittest.TestCase):
self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
{"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"},
TestPhotos.IMAGE_CACHE
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
))
self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
{"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"},
TestPhotos.IMAGE_CACHE
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
))
self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
{"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"},
TestPhotos.IMAGE_CACHE
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
))
self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
{"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"},
TestPhotos.IMAGE_CACHE
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
))
@@ -204,6 +212,7 @@ class TestDuplicates(unittest.TestCase):
"""
DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name
DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
IMAGE_CACHE = ImageCache() # pylint: disable=invalid-name
@staticmethod
@@ -246,7 +255,8 @@ class TestDuplicates(unittest.TestCase):
flat1 = self.generate_fake_flat()
flat2 = copy.deepcopy(flat1)
score = duplicates.get_duplicate_score(
flat1, flat2, TestDuplicates.IMAGE_CACHE
flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
)
self.assertTrue(
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
@@ -261,7 +271,8 @@ class TestDuplicates(unittest.TestCase):
flat2["cost"] += 1000
score = duplicates.get_duplicate_score(
flat1, flat2, TestDuplicates.IMAGE_CACHE
flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
)
self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
@@ -277,7 +288,8 @@ class TestDuplicates(unittest.TestCase):
flat2["rooms"] += 1
score = duplicates.get_duplicate_score(
flat1, flat2, TestDuplicates.IMAGE_CACHE
flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
)
self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
@@ -292,7 +304,8 @@ class TestDuplicates(unittest.TestCase):
flat2["area"] += 10
score = duplicates.get_duplicate_score(
flat1, flat2, TestDuplicates.IMAGE_CACHE
flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
)
self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
@@ -309,7 +322,8 @@ class TestDuplicates(unittest.TestCase):
flat2["area"] = 50.37
score = duplicates.get_duplicate_score(
flat1, flat2, TestDuplicates.IMAGE_CACHE
flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
)
self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
@@ -325,7 +339,8 @@ class TestDuplicates(unittest.TestCase):
flat2["phone"] = "0708091011"
score = duplicates.get_duplicate_score(
flat1, flat2, TestDuplicates.IMAGE_CACHE
flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
)
self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
@@ -342,7 +357,8 @@ class TestDuplicates(unittest.TestCase):
)
score = duplicates.get_duplicate_score(
flats[0], flats[1], TestDuplicates.IMAGE_CACHE
flats[0], flats[1],
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
)
self.assertTrue(
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS