Address a few nits in review
This commit is contained in:
parent
9fa2177087
commit
08599d91de
@ -49,6 +49,8 @@ DEFAULT_CONFIG = {
|
||||
"max_distance_housing_station": 1500,
|
||||
# Score to consider two flats as being duplicates
|
||||
"duplicate_threshold": 15,
|
||||
# Score to consider two images as being duplicates through hash comparison
|
||||
"duplicate_image_hash_threshold": 10,
|
||||
# Navitia API key
|
||||
"navitia_api_key": None,
|
||||
# Number of filtering passes to run
|
||||
@ -144,6 +146,7 @@ def validate_config(config, check_with_data):
|
||||
assert isinstance(config["store_personal_data"], bool)
|
||||
assert isinstance(config["max_distance_housing_station"], (int, float))
|
||||
assert isinstance(config["duplicate_threshold"], int)
|
||||
assert isinstance(config["duplicate_image_hash_threshold"], int)
|
||||
|
||||
# Ensure constraints are ok
|
||||
assert config["constraints"]
|
||||
|
@ -13,9 +13,15 @@ class MemoryCache(object):
|
||||
"""
|
||||
A cache in memory.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def on_miss(key):
|
||||
"""
|
||||
Method to be called whenever an object is requested from the cache but
|
||||
was not already cached. Typically, make an HTTP query to fetch it.
|
||||
|
||||
:param key: Key of the requested object.
|
||||
:return: The object content.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def __init__(self):
|
||||
|
@ -67,14 +67,14 @@ def get_or_compute_photo_hash(photo, photo_cache):
|
||||
return photo["hash"]
|
||||
|
||||
|
||||
def compare_photos(photo1, photo2, photo_cache, hash_threshold=10):
|
||||
def compare_photos(photo1, photo2, photo_cache, hash_threshold):
|
||||
"""
|
||||
Compares two photos with average hash method.
|
||||
|
||||
:param photo1: First photo url.
|
||||
:param photo2: Second photo url.
|
||||
:param photo_cache: An instance of ``ImageCache`` to use to cache images.
|
||||
:param hash_thresold: The hash threshold between two images. Usually two
|
||||
:param hash_threshold: The hash threshold between two images. Usually two
|
||||
different photos have a hash difference of 30.
|
||||
:return: ``True`` if the photos are identical, else ``False``.
|
||||
"""
|
||||
@ -91,7 +91,7 @@ def find_number_common_photos(
|
||||
flat1_photos,
|
||||
flat2_photos,
|
||||
photo_cache,
|
||||
hash_threshold=10
|
||||
hash_threshold
|
||||
):
|
||||
"""
|
||||
Compute the number of common photos between the two lists of photos for the
|
||||
@ -104,7 +104,7 @@ def find_number_common_photos(
|
||||
:param flat2_photos: Second list of flat photos. Each photo should be a
|
||||
``dict`` with (at least) a ``url`` key.
|
||||
:param photo_cache: An instance of ``ImageCache`` to use to cache images.
|
||||
:param hash_thresold: The hash threshold between two images.
|
||||
:param hash_threshold: The hash threshold between two images.
|
||||
:return: The found number of common photos.
|
||||
"""
|
||||
n_common_photos = 0
|
||||
@ -202,7 +202,7 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):
|
||||
return unique_flats_list, duplicate_flats
|
||||
|
||||
|
||||
def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10):
|
||||
def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold):
|
||||
"""
|
||||
Compute the duplicate score between two flats. The higher the score, the
|
||||
more likely the two flats are to be duplicates.
|
||||
@ -210,7 +210,7 @@ def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10):
|
||||
:param flat1: First flat dict.
|
||||
:param flat2: Second flat dict.
|
||||
:param photo_cache: An instance of ``ImageCache`` to use to cache images.
|
||||
:param hash_thresold: The hash threshold between two images.
|
||||
:param hash_threshold: The hash threshold between two images.
|
||||
:return: The duplicate score as ``int``.
|
||||
"""
|
||||
n_common_items = 0
|
||||
@ -273,7 +273,8 @@ def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold=10):
|
||||
n_common_photos = find_number_common_photos(
|
||||
flat1["photos"],
|
||||
flat2["photos"],
|
||||
photo_cache
|
||||
photo_cache,
|
||||
hash_threshold
|
||||
)
|
||||
assert n_common_photos > 1
|
||||
|
||||
|
@ -138,6 +138,7 @@ class TestPhoneNumbers(unittest.TestCase):
|
||||
|
||||
class TestPhotos(unittest.TestCase):
|
||||
IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name
|
||||
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
|
||||
|
||||
def test_same_photo_twice(self):
|
||||
"""
|
||||
@ -150,7 +151,8 @@ class TestPhotos(unittest.TestCase):
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
photo,
|
||||
photo,
|
||||
TestPhotos.IMAGE_CACHE
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
def test_different_photos(self):
|
||||
@ -160,13 +162,15 @@ class TestPhotos(unittest.TestCase):
|
||||
self.assertFalse(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
||||
TestPhotos.IMAGE_CACHE
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
self.assertFalse(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
|
||||
TestPhotos.IMAGE_CACHE
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
def test_matching_photos(self):
|
||||
@ -176,25 +180,29 @@ class TestPhotos(unittest.TestCase):
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"},
|
||||
TestPhotos.IMAGE_CACHE
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"},
|
||||
TestPhotos.IMAGE_CACHE
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"},
|
||||
TestPhotos.IMAGE_CACHE
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"},
|
||||
TestPhotos.IMAGE_CACHE
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
|
||||
@ -204,6 +212,7 @@ class TestDuplicates(unittest.TestCase):
|
||||
"""
|
||||
DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name
|
||||
DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name
|
||||
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
|
||||
IMAGE_CACHE = ImageCache() # pylint: disable=invalid-name
|
||||
|
||||
@staticmethod
|
||||
@ -246,7 +255,8 @@ class TestDuplicates(unittest.TestCase):
|
||||
flat1 = self.generate_fake_flat()
|
||||
flat2 = copy.deepcopy(flat1)
|
||||
score = duplicates.get_duplicate_score(
|
||||
flat1, flat2, TestDuplicates.IMAGE_CACHE
|
||||
flat1, flat2,
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
@ -261,7 +271,8 @@ class TestDuplicates(unittest.TestCase):
|
||||
flat2["cost"] += 1000
|
||||
|
||||
score = duplicates.get_duplicate_score(
|
||||
flat1, flat2, TestDuplicates.IMAGE_CACHE
|
||||
flat1, flat2,
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
@ -277,7 +288,8 @@ class TestDuplicates(unittest.TestCase):
|
||||
flat2["rooms"] += 1
|
||||
|
||||
score = duplicates.get_duplicate_score(
|
||||
flat1, flat2, TestDuplicates.IMAGE_CACHE
|
||||
flat1, flat2,
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
@ -292,7 +304,8 @@ class TestDuplicates(unittest.TestCase):
|
||||
flat2["area"] += 10
|
||||
|
||||
score = duplicates.get_duplicate_score(
|
||||
flat1, flat2, TestDuplicates.IMAGE_CACHE
|
||||
flat1, flat2,
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
@ -309,7 +322,8 @@ class TestDuplicates(unittest.TestCase):
|
||||
flat2["area"] = 50.37
|
||||
|
||||
score = duplicates.get_duplicate_score(
|
||||
flat1, flat2, TestDuplicates.IMAGE_CACHE
|
||||
flat1, flat2,
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
@ -325,7 +339,8 @@ class TestDuplicates(unittest.TestCase):
|
||||
flat2["phone"] = "0708091011"
|
||||
|
||||
score = duplicates.get_duplicate_score(
|
||||
flat1, flat2, TestDuplicates.IMAGE_CACHE
|
||||
flat1, flat2,
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
@ -342,7 +357,8 @@ class TestDuplicates(unittest.TestCase):
|
||||
)
|
||||
|
||||
score = duplicates.get_duplicate_score(
|
||||
flats[0], flats[1], TestDuplicates.IMAGE_CACHE
|
||||
flats[0], flats[1],
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS
|
||||
|
Loading…
Reference in New Issue
Block a user