Better deduplication
* Improve deduplication on URLs (match sets).
* Keep track of duplicates and update their status on refiltering.
This commit is contained in:
parent
1d98c631e0
commit
5f2f4d0ccf
@ -68,6 +68,8 @@ List of configuration options:
|
||||
* `webserver` is a server to use instead of the default Bottle built-in
|
||||
webserver, see [Bottle deployment
|
||||
doc](http://bottlepy.org/docs/dev/deployment.html).
|
||||
* `backends` is a list of Weboob backends to enable. It defaults to any
|
||||
available and supported Weboob backend.
|
||||
|
||||
_Note:_ In production, you can either use the `serve` command with a reliable
|
||||
webserver instead of the default Bottle webserver (specifying a `webserver`
|
||||
|
@ -155,8 +155,8 @@ def main():
|
||||
if args.cmd == "fetch":
|
||||
# Fetch and filter flats list
|
||||
flats_list = fetch.fetch_flats_list(config)
|
||||
flats_list, _ = cmds.filter_flats(config, flats_list=flats_list,
|
||||
fetch_details=True)
|
||||
flats_list = cmds.filter_flats(config, flats_list=flats_list,
|
||||
fetch_details=True)["new"]
|
||||
# Sort by cost
|
||||
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
|
||||
|
||||
@ -169,8 +169,8 @@ def main():
|
||||
if args.input:
|
||||
flats_list = fetch.load_flats_list_from_file(args.input)
|
||||
|
||||
flats_list, _ = cmds.filter_flats(config, flats_list=flats_list,
|
||||
fetch_details=False)
|
||||
flats_list = cmds.filter_flats(config, flats_list=flats_list,
|
||||
fetch_details=False)["new"]
|
||||
|
||||
# Sort by cost
|
||||
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
|
||||
|
@ -4,6 +4,7 @@ Main commands available for flatisfy.
|
||||
"""
|
||||
from __future__ import absolute_import, print_function, unicode_literals
|
||||
|
||||
import collections
|
||||
import logging
|
||||
|
||||
import flatisfy.filters
|
||||
@ -11,13 +12,14 @@ from flatisfy import database
|
||||
from flatisfy.models import flat as flat_model
|
||||
from flatisfy import fetch
|
||||
from flatisfy import tools
|
||||
from flatisfy.filters import metadata
|
||||
from flatisfy.web import app as web_app
|
||||
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def filter_flats(config, flats_list=None, fetch_details=True):
|
||||
def filter_flats(config, flats_list, fetch_details=True):
|
||||
"""
|
||||
Filter the available flats list. Then, filter it according to criteria.
|
||||
|
||||
@ -25,30 +27,43 @@ def filter_flats(config, flats_list=None, fetch_details=True):
|
||||
:param fetch_details: Whether additional details should be fetched between
|
||||
the two passes.
|
||||
:param flats_list: The initial list of flat objects to filter.
|
||||
:return: A tuple of the list of all matching flats and the list of ignored
|
||||
flats.
|
||||
:return: A dict mapping each flat status to a list of flat objects.
|
||||
"""
|
||||
# Add the flatisfy metadata entry and prepare the flat objects
|
||||
flats_list = metadata.init(flats_list)
|
||||
|
||||
first_pass_result = collections.defaultdict(list)
|
||||
second_pass_result = collections.defaultdict(list)
|
||||
# Do a first pass with the available infos to try to remove as much
|
||||
# unwanted postings as possible
|
||||
if config["passes"] > 0:
|
||||
flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list,
|
||||
config)
|
||||
first_pass_result = flatisfy.filters.first_pass(flats_list,
|
||||
config)
|
||||
else:
|
||||
first_pass_result["new"] = flats_list
|
||||
|
||||
# Load additional infos
|
||||
if fetch_details:
|
||||
for i, flat in enumerate(first_pass_result["new"]):
|
||||
details = fetch.fetch_details(config, flat["id"])
|
||||
first_pass_result["new"][i] = tools.merge_dicts(flat, details)
|
||||
|
||||
# Do a second pass to consolidate all the infos we found and make use of
|
||||
# additional infos
|
||||
if config["passes"] > 1:
|
||||
# Load additional infos
|
||||
if fetch_details:
|
||||
for i, flat in enumerate(flats_list):
|
||||
details = fetch.fetch_details(config, flat["id"])
|
||||
flats_list[i] = tools.merge_dicts(flat, details)
|
||||
|
||||
flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
|
||||
flats_list, config
|
||||
second_pass_result = flatisfy.filters.second_pass(
|
||||
first_pass_result["new"], config
|
||||
)
|
||||
ignored_flats.extend(extra_ignored_flats)
|
||||
else:
|
||||
second_pass_result["new"] = first_pass_result["new"]
|
||||
|
||||
return flats_list, ignored_flats
|
||||
return {
|
||||
"new": second_pass_result["new"],
|
||||
"duplicate": first_pass_result["duplicate"],
|
||||
"ignored": (
|
||||
first_pass_result["ignored"] + second_pass_result["ignored"]
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
def import_and_filter(config, load_from_db=False):
|
||||
@ -66,20 +81,17 @@ def import_and_filter(config, load_from_db=False):
|
||||
flats_list = fetch.load_flats_list_from_db(config)
|
||||
else:
|
||||
flats_list = fetch.fetch_flats_list(config)
|
||||
flats_list, ignored_list = filter_flats(config, flats_list=flats_list,
|
||||
fetch_details=True)
|
||||
flats_list_by_status = filter_flats(config, flats_list=flats_list,
|
||||
fetch_details=True)
|
||||
# Create database connection
|
||||
get_session = database.init_db(config["database"])
|
||||
|
||||
with get_session() as session:
|
||||
for flat_dict in flats_list:
|
||||
flat = flat_model.Flat.from_dict(flat_dict)
|
||||
session.merge(flat)
|
||||
|
||||
for flat_dict in ignored_list:
|
||||
flat = flat_model.Flat.from_dict(flat_dict)
|
||||
flat.status = flat_model.FlatStatus.ignored
|
||||
session.merge(flat)
|
||||
for status, flats_list in flats_list_by_status.items():
|
||||
for flat_dict in flats_list:
|
||||
flat = flat_model.Flat.from_dict(flat_dict)
|
||||
flat.status = getattr(flat_model.FlatStatus, status)
|
||||
session.merge(flat)
|
||||
|
||||
|
||||
def purge_db(config):
|
||||
|
@ -89,25 +89,23 @@ def first_pass(flats_list, config):
|
||||
|
||||
:param flats_list: A list of flats dict to filter.
|
||||
:param config: A config dict.
|
||||
:return: A tuple of processed flats and ignored flats.
|
||||
:return: A dict mapping each flat status to a list of flat objects.
|
||||
"""
|
||||
LOGGER.info("Running first filtering pass.")
|
||||
|
||||
# Handle duplicates based on ids
|
||||
# Just remove them (no merge) as they should be the exact same object.
|
||||
flats_list = duplicates.detect(
|
||||
flats_list, key="id", merge=False
|
||||
flats_list, duplicates_by_id = duplicates.detect(
|
||||
flats_list, key="id", merge=False, should_intersect=False
|
||||
)
|
||||
# Also merge duplicates based on url (these may come from different
|
||||
# Also merge duplicates based on urls (these may come from different
|
||||
# flatboob backends)
|
||||
# This is especially useful as some websites such as entreparticuliers
|
||||
# contain a lot of leboncoin housing posts.
|
||||
flats_list = duplicates.detect(
|
||||
flats_list, key="url", merge=True
|
||||
flats_list, duplicates_by_urls = duplicates.detect(
|
||||
flats_list, key="urls", merge=True, should_intersect=True
|
||||
)
|
||||
|
||||
# Add the flatisfy metadata entry and prepare the flat objects
|
||||
flats_list = metadata.init(flats_list)
|
||||
# Guess the postal codes
|
||||
flats_list = metadata.guess_postal_code(flats_list, config)
|
||||
# Try to match with stations
|
||||
@ -115,7 +113,11 @@ def first_pass(flats_list, config):
|
||||
# Remove returned housing posts that do not match criteria
|
||||
flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
|
||||
|
||||
return (flats_list, ignored_list)
|
||||
return {
|
||||
"new": flats_list,
|
||||
"ignored": ignored_list,
|
||||
"duplicate": duplicates_by_id + duplicates_by_urls
|
||||
}
|
||||
|
||||
|
||||
def second_pass(flats_list, config):
|
||||
@ -131,7 +133,7 @@ def second_pass(flats_list, config):
|
||||
|
||||
:param flats_list: A list of flats dict to filter.
|
||||
:param config: A config dict.
|
||||
:return: A tuple of processed flats and ignored flats.
|
||||
:return: A dict mapping each flat status to a list of flat objects.
|
||||
"""
|
||||
LOGGER.info("Running second filtering pass.")
|
||||
# Assumed to run after first pass, so there should be no obvious duplicates
|
||||
@ -151,4 +153,7 @@ def second_pass(flats_list, config):
|
||||
# Remove returned housing posts that do not match criteria
|
||||
flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
|
||||
|
||||
return (flats_list, ignored_list)
|
||||
return {
|
||||
"new": flats_list,
|
||||
"ignored": ignored_list
|
||||
}
|
||||
|
@ -23,7 +23,7 @@ BACKENDS_PRECEDENCE = [
|
||||
]
|
||||
|
||||
|
||||
def detect(flats_list, key="id", merge=True):
|
||||
def detect(flats_list, key="id", merge=True, should_intersect=False):
|
||||
"""
|
||||
Detect obvious duplicates within a given list of flats.
|
||||
|
||||
@ -38,18 +38,32 @@ def detect(flats_list, key="id", merge=True):
|
||||
done.
|
||||
:param merge: Whether the found duplicates should be merged or we should
|
||||
only keep one of them.
|
||||
:param should_intersect: Set to ``True`` if the values in the flat dicts
|
||||
are lists and you want to deduplicate on non-empty intersection (typically
|
||||
if they have a common url).
|
||||
|
||||
:return: A deduplicated list of flat dicts.
|
||||
:return: A tuple of the deduplicated list of flat dicts and the list of all
|
||||
the flat objects that should be removed and considered as duplicates (they
|
||||
were already merged).
|
||||
"""
|
||||
# ``seen`` is a dict aggregating the flats by the deduplication
|
||||
# keys. We basically make buckets of flats for every key value. Flats in
|
||||
# the same bucket should be merged together afterwards.
|
||||
seen = collections.defaultdict(list)
|
||||
for flat in flats_list:
|
||||
seen[flat.get(key, None)].append(flat)
|
||||
if should_intersect:
|
||||
# We add each value separately. We will add some flats multiple
|
||||
# times, but we deduplicate again on id below to compensate.
|
||||
for value in flat.get(key, []):
|
||||
seen[value].append(flat)
|
||||
else:
|
||||
seen[flat.get(key, None)].append(flat)
|
||||
|
||||
# Generate the unique flats list based on these buckets
|
||||
unique_flats_list = []
|
||||
# Keep track of all the flats that were removed by deduplication
|
||||
duplicate_flats = []
|
||||
|
||||
for flat_key, matching_flats in seen.items():
|
||||
if flat_key is None:
|
||||
# If the key is None, it means Weboob could not load the data. In
|
||||
@ -67,7 +81,8 @@ def detect(flats_list, key="id", merge=True):
|
||||
)
|
||||
|
||||
if len(matching_flats) > 1:
|
||||
LOGGER.info("Found duplicates: %s.",
|
||||
LOGGER.info("Found duplicates using key \"%s\": %s.",
|
||||
key,
|
||||
[flat["id"] for flat in matching_flats])
|
||||
# Otherwise, check the policy
|
||||
if merge:
|
||||
@ -76,6 +91,19 @@ def detect(flats_list, key="id", merge=True):
|
||||
tools.merge_dicts(*matching_flats)
|
||||
)
|
||||
else:
|
||||
# Otherwise, just keep any of them
|
||||
unique_flats_list.append(matching_flats[0])
|
||||
return unique_flats_list
|
||||
# Otherwise, just keep the most important of them
|
||||
unique_flats_list.append(matching_flats[-1])
|
||||
|
||||
# The ID of the added merged flat will be the one of the last item
|
||||
# in ``matching_flats``. Then, any flat object that was before in
|
||||
# the ``matching_flats`` list is to be considered as a duplicate
|
||||
# and should have a ``duplicate`` status.
|
||||
duplicate_flats.extend(matching_flats[:-1])
|
||||
|
||||
if should_intersect:
|
||||
# We added some flats twice with the above method, let's deduplicate on
|
||||
# id.
|
||||
unique_flats_list, _ = detect(unique_flats_list, key="id", merge=True,
|
||||
should_intersect=False)
|
||||
|
||||
return unique_flats_list, duplicate_flats
|
||||
|
@ -33,6 +33,7 @@ class FlatStatus(enum.Enum):
|
||||
An enum of the possible status for a flat entry.
|
||||
"""
|
||||
user_deleted = -100
|
||||
duplicate = -20
|
||||
ignored = -10
|
||||
new = 0
|
||||
followed = 10
|
||||
@ -83,16 +84,17 @@ class Flat(BASE):
|
||||
"""
|
||||
# Handle flatisfy metadata
|
||||
flat_dict = flat_dict.copy()
|
||||
flat_dict["flatisfy_stations"] = (
|
||||
flat_dict["flatisfy"].get("matched_stations", [])
|
||||
)
|
||||
flat_dict["flatisfy_postal_code"] = (
|
||||
flat_dict["flatisfy"].get("postal_code", None)
|
||||
)
|
||||
flat_dict["flatisfy_time_to"] = (
|
||||
flat_dict["flatisfy"].get("time_to", {})
|
||||
)
|
||||
del flat_dict["flatisfy"]
|
||||
if "flatisfy" in flat_dict:
|
||||
flat_dict["flatisfy_stations"] = (
|
||||
flat_dict["flatisfy"].get("matched_stations", [])
|
||||
)
|
||||
flat_dict["flatisfy_postal_code"] = (
|
||||
flat_dict["flatisfy"].get("postal_code", None)
|
||||
)
|
||||
flat_dict["flatisfy_time_to"] = (
|
||||
flat_dict["flatisfy"].get("time_to", {})
|
||||
)
|
||||
del flat_dict["flatisfy"]
|
||||
|
||||
# Handle utilities field
|
||||
if not isinstance(flat_dict["utilities"], FlatUtilities):
|
||||
|
@ -55,7 +55,7 @@
|
||||
</th>
|
||||
<td>
|
||||
<template v-if="flat.flatisfy_postal_code.postal_code">
|
||||
{{ flat.flatisfy_postal_code.name }} ( {{ flat.flatisfy_postal_code.postal_code }} )
|
||||
{{ flat.flatisfy_postal_code.name }} ({{ flat.flatisfy_postal_code.postal_code }})
|
||||
</template>
|
||||
<template v-else>
|
||||
?
|
||||
|
Loading…
Reference in New Issue
Block a user