Phone number should count for less in duplicate detection + better handling of multiple phone numbers

This commit is contained in:
Lucas Verney 2018-01-26 10:51:16 +01:00
parent e7218e90f3
commit d6b82b24c6
1 changed file with 28 additions and 18 deletions

View File

@ -22,31 +22,39 @@ from flatisfy.filters.cache import ImageCache
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
def homogeneize_phone_number(number): def homogeneize_phone_number(numbers):
""" """
Homogeneize the phone numbers, by stripping any space, dash or dot as well Homogeneize the phone numbers, by stripping any space, dash or dot as well
as the international prefix. Assumes it is dealing with French phone as the international prefix. Assumes it is dealing with French phone
numbers (starting with a zero and having 10 characters). numbers (starting with a zero and having 10 characters).
:param number: The phone number to homogeneize. :param numbers: The phone number string to homogeneize (can contain
multiple phone numbers).
:return: The cleaned phone number. ``None`` if the number is not valid. :return: The cleaned phone number. ``None`` if the number is not valid.
""" """
if not number: if not numbers:
return None
number = number.replace(".", "")
number = number.replace(" ", "")
number = number.replace("-", "")
number = number.replace("(", "")
number = number.replace(")", "")
number = re.sub(r'^\+\d\d', "", number)
if not number.startswith("0"):
number = "0" + number
if len(number) != 10:
return None return None
return number clean_numbers = []
for number in numbers.split(','):
number = number.strip()
number = number.replace(".", "")
number = number.replace(" ", "")
number = number.replace("-", "")
number = number.replace("(", "")
number = number.replace(")", "")
number = re.sub(r'^\+\d\d', "", number)
if not number.startswith("0"):
number = "0" + number
if len(number) == 10:
clean_numbers.append(number)
if not clean_numbers:
return None
return ", ".join(clean_numbers)
def get_or_compute_photo_hash(photo, photo_cache): def get_or_compute_photo_hash(photo, photo_cache):
@ -264,8 +272,10 @@ def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold):
flat1_phone = homogeneize_phone_number(flat1["phone"]) flat1_phone = homogeneize_phone_number(flat1["phone"])
flat2_phone = homogeneize_phone_number(flat2["phone"]) flat2_phone = homogeneize_phone_number(flat2["phone"])
if flat1_phone and flat2_phone: if flat1_phone and flat2_phone:
assert flat1_phone == flat2_phone # Use an "in" test as there could be multiple phone numbers
n_common_items += 10 # Counts much more than the rest # returned by a weboob module
if flat1_phone in flat2_phone or flat2_phone in flat1_phone:
n_common_items += 4 # Counts much more than the rest
# If the two flats are from the same website and have a # If the two flats are from the same website and have a
# different float part, consider they cannot be duplicates. See # different float part, consider they cannot be duplicates. See