57 lines
2.1 KiB
Python
57 lines
2.1 KiB
Python
|
# coding: utf-8
|
||
|
"""
|
||
|
Filtering functions to detect and merge duplicates.
|
||
|
"""
|
||
|
from __future__ import absolute_import, print_function, unicode_literals
|
||
|
|
||
|
import collections
|
||
|
|
||
|
from flatisfy import tools
|
||
|
|
||
|
|
||
|
def detect(flats_list, key="id", merge=True):
    """
    Detect obvious duplicates within a given list of flats.

    There may be duplicates found, as some queries could overlap (especially
    since when asking for a given place, websites tend to return housings in
    nearby locations as well). We need to handle them, by either deleting the
    duplicates (``merge=False``) or merging them together in a single flat
    object.

    Flats whose ``key`` entry is missing or ``None`` are never deduplicated:
    they are all kept as-is in the returned list.

    :param flats_list: A list of flats dicts.
    :param key: The flat dicts key on which the duplicate detection should be
        done.
    :param merge: Whether the found duplicates should be merged or we should
        only keep one of them. When ``False``, the first flat encountered for
        a given key value is the one kept.

    :return: A deduplicated list of flat dicts. Entries are ordered by the
        first occurrence of their key value in ``flats_list``; flats without
        a usable key are emitted together, at the position of the first such
        flat.
    """
    # TODO: Keep track of found duplicates?
    # ``buckets`` aggregates the flats by deduplication key value: one bucket
    # per key value, flats in the same bucket being duplicates of each other.
    # An ordered mapping is used so that the output order follows the input
    # order on every interpreter, including those where plain dicts do not
    # preserve insertion order.
    buckets = collections.OrderedDict()
    for flat in flats_list:
        buckets.setdefault(flat.get(key), []).append(flat)

    # Generate the unique flats list based on these buckets
    unique_flats_list = []
    for flat_key, matching_flats in buckets.items():
        if flat_key is None:
            # If the key is None, it means Weboob could not load the data. In
            # this case, we consider every matching item as being independent
            # of the others, to avoid over-deduplication.
            unique_flats_list.extend(matching_flats)
        elif merge:
            # If a merge is requested, merge the whole bucket in a single flat
            unique_flats_list.append(tools.merge_dicts(*matching_flats))
        else:
            # Otherwise, just keep the first one seen
            unique_flats_list.append(matching_flats[0])
    return unique_flats_list
|