Phone numbers should count for less in duplicate detection; handle multiple phone numbers better
This commit is contained in:
parent
e7218e90f3
commit
d6b82b24c6
@ -22,31 +22,39 @@ from flatisfy.filters.cache import ImageCache
|
|||||||
LOGGER = logging.getLogger(__name__)
|
LOGGER = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def homogeneize_phone_number(number):
|
def homogeneize_phone_number(numbers):
|
||||||
"""
|
"""
|
||||||
Homogeneize the phone numbers, by stripping any space, dash or dot as well
|
Homogeneize the phone numbers, by stripping any space, dash or dot as well
|
||||||
as the international prefix. Assumes it is dealing with French phone
|
as the international prefix. Assumes it is dealing with French phone
|
||||||
numbers (starting with a zero and having 10 characters).
|
numbers (starting with a zero and having 10 characters).
|
||||||
|
|
||||||
:param number: The phone number to homogeneize.
|
:param numbers: The phone number string to homogeneize (can contain
|
||||||
|
multiple phone numbers).
|
||||||
:return: The cleaned phone number. ``None`` if the number is not valid.
|
:return: The cleaned phone number. ``None`` if the number is not valid.
|
||||||
"""
|
"""
|
||||||
if not number:
|
if not numbers:
|
||||||
return None
|
|
||||||
number = number.replace(".", "")
|
|
||||||
number = number.replace(" ", "")
|
|
||||||
number = number.replace("-", "")
|
|
||||||
number = number.replace("(", "")
|
|
||||||
number = number.replace(")", "")
|
|
||||||
number = re.sub(r'^\+\d\d', "", number)
|
|
||||||
|
|
||||||
if not number.startswith("0"):
|
|
||||||
number = "0" + number
|
|
||||||
|
|
||||||
if len(number) != 10:
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return number
|
clean_numbers = []
|
||||||
|
|
||||||
|
for number in numbers.split(','):
|
||||||
|
number = number.strip()
|
||||||
|
number = number.replace(".", "")
|
||||||
|
number = number.replace(" ", "")
|
||||||
|
number = number.replace("-", "")
|
||||||
|
number = number.replace("(", "")
|
||||||
|
number = number.replace(")", "")
|
||||||
|
number = re.sub(r'^\+\d\d', "", number)
|
||||||
|
|
||||||
|
if not number.startswith("0"):
|
||||||
|
number = "0" + number
|
||||||
|
|
||||||
|
if len(number) == 10:
|
||||||
|
clean_numbers.append(number)
|
||||||
|
|
||||||
|
if not clean_numbers:
|
||||||
|
return None
|
||||||
|
return ", ".join(clean_numbers)
|
||||||
|
|
||||||
|
|
||||||
def get_or_compute_photo_hash(photo, photo_cache):
|
def get_or_compute_photo_hash(photo, photo_cache):
|
||||||
@ -264,8 +272,10 @@ def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold):
|
|||||||
flat1_phone = homogeneize_phone_number(flat1["phone"])
|
flat1_phone = homogeneize_phone_number(flat1["phone"])
|
||||||
flat2_phone = homogeneize_phone_number(flat2["phone"])
|
flat2_phone = homogeneize_phone_number(flat2["phone"])
|
||||||
if flat1_phone and flat2_phone:
|
if flat1_phone and flat2_phone:
|
||||||
assert flat1_phone == flat2_phone
|
# Use an "in" test as there could be multiple phone numbers
|
||||||
n_common_items += 10 # Counts much more than the rest
|
# returned by a weboob module
|
||||||
|
if flat1_phone in flat2_phone or flat2_phone in flat1_phone:
|
||||||
|
n_common_items += 4 # Counts much more than the rest
|
||||||
|
|
||||||
# If the two flats are from the same website and have a
|
# If the two flats are from the same website and have a
|
||||||
# different float part, consider they cannot be duplicates. See
|
# different float part, consider they cannot be duplicates. See
|
||||||
|
Loading…
Reference in New Issue
Block a user