Phone numbers should not count as much in duplicate detection + better handling of multiple phone numbers

This commit is contained in:
Lucas Verney 2018-01-26 10:51:16 +01:00
parent e7218e90f3
commit d6b82b24c6
1 changed file with 28 additions and 18 deletions

View File

@ -22,31 +22,39 @@ from flatisfy.filters.cache import ImageCache
LOGGER = logging.getLogger(__name__)
# Formatting characters that are dropped before validating a phone number.
_PHONE_SEPARATORS_RE = re.compile(r"[. ()-]")
# International prefix written as "+" followed by a two-digit country code.
_PHONE_INTL_PREFIX_RE = re.compile(r"^\+\d\d")
# A homogenized number: a leading zero followed by exactly nine ASCII digits.
_PHONE_VALID_RE = re.compile(r"0[0-9]{9}\Z")


def homogeneize_phone_number(numbers):
    """
    Homogenize phone numbers, by stripping any space, dash, dot or parenthesis
    as well as the international prefix. Assumes it is dealing with French
    phone numbers (starting with a zero and having 10 characters).

    :param numbers: The phone number string to homogenize (can contain
        multiple phone numbers, separated by commas).
    :return: The valid cleaned phone numbers, without duplicates and in their
        order of first appearance, joined with ``", "``. ``None`` if the input
        is empty or contains no valid number.
    """
    if not numbers:
        return None

    clean_numbers = []
    for number in numbers.split(','):
        number = _PHONE_SEPARATORS_RE.sub("", number.strip())
        # "+33612345678" and "+33(0)612345678" both end up as "0612345678".
        number = _PHONE_INTL_PREFIX_RE.sub("", number)
        if not number.startswith("0"):
            number = "0" + number

        # Checking the length alone would let arbitrary 10-character text
        # through, so require the digits explicitly.
        if not _PHONE_VALID_RE.match(number):
            continue
        # Keep a single copy of each number: callers compare the joined
        # strings, and repeated entries would only get in the way.
        if number not in clean_numbers:
            clean_numbers.append(number)

    if not clean_numbers:
        return None
    return ", ".join(clean_numbers)
def get_or_compute_photo_hash(photo, photo_cache):
@ -264,8 +272,10 @@ def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold):
flat1_phone = homogeneize_phone_number(flat1["phone"])
flat2_phone = homogeneize_phone_number(flat2["phone"])
if flat1_phone and flat2_phone:
assert flat1_phone == flat2_phone
n_common_items += 10 # Counts much more than the rest
# Use an "in" test as there could be multiple phone numbers
# returned by a weboob module
if flat1_phone in flat2_phone or flat2_phone in flat1_phone:
n_common_items += 4 # Counts much more than the rest
# If the two flats are from the same website and have a
# different float part, consider they cannot be duplicates. See