Better deduplication

* Improve deduplication on URLs (match sets).
* Keep track of duplicates and update their status on refiltering.
This commit is contained in:
Lucas Verney 2017-04-27 17:08:10 +02:00
parent 1d98c631e0
commit 5f2f4d0ccf
7 changed files with 107 additions and 58 deletions

View File

@ -68,6 +68,8 @@ List of configuration options:
* `webserver` is a server to use instead of the default Bottle built-in * `webserver` is a server to use instead of the default Bottle built-in
webserver, see [Bottle deployment webserver, see [Bottle deployment
doc](http://bottlepy.org/docs/dev/deployment.html). doc](http://bottlepy.org/docs/dev/deployment.html).
* `backends` is a list of Weboob backends to enable. It defaults to any
available and supported Weboob backend.
_Note:_ In production, you can either use the `serve` command with a reliable _Note:_ In production, you can either use the `serve` command with a reliable
webserver instead of the default Bottle webserver (specifying a `webserver` webserver instead of the default Bottle webserver (specifying a `webserver`

View File

@ -155,8 +155,8 @@ def main():
if args.cmd == "fetch": if args.cmd == "fetch":
# Fetch and filter flats list # Fetch and filter flats list
flats_list = fetch.fetch_flats_list(config) flats_list = fetch.fetch_flats_list(config)
flats_list, _ = cmds.filter_flats(config, flats_list=flats_list, flats_list = cmds.filter_flats(config, flats_list=flats_list,
fetch_details=True) fetch_details=True)["new"]
# Sort by cost # Sort by cost
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost") flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
@ -169,8 +169,8 @@ def main():
if args.input: if args.input:
flats_list = fetch.load_flats_list_from_file(args.input) flats_list = fetch.load_flats_list_from_file(args.input)
flats_list, _ = cmds.filter_flats(config, flats_list=flats_list, flats_list = cmds.filter_flats(config, flats_list=flats_list,
fetch_details=False) fetch_details=False)["new"]
# Sort by cost # Sort by cost
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost") flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")

View File

@ -4,6 +4,7 @@ Main commands available for flatisfy.
""" """
from __future__ import absolute_import, print_function, unicode_literals from __future__ import absolute_import, print_function, unicode_literals
import collections
import logging import logging
import flatisfy.filters import flatisfy.filters
@ -11,13 +12,14 @@ from flatisfy import database
from flatisfy.models import flat as flat_model from flatisfy.models import flat as flat_model
from flatisfy import fetch from flatisfy import fetch
from flatisfy import tools from flatisfy import tools
from flatisfy.filters import metadata
from flatisfy.web import app as web_app from flatisfy.web import app as web_app
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
def filter_flats(config, flats_list=None, fetch_details=True): def filter_flats(config, flats_list, fetch_details=True):
""" """
Filter the available flats list. Then, filter it according to criteria. Filter the available flats list. Then, filter it according to criteria.
@ -25,30 +27,43 @@ def filter_flats(config, flats_list=None, fetch_details=True):
:param fetch_details: Whether additional details should be fetched between :param fetch_details: Whether additional details should be fetched between
the two passes. the two passes.
:param flats_list: The initial list of flat objects to filter. :param flats_list: The initial list of flat objects to filter.
:return: A tuple of the list of all matching flats and the list of ignored :return: A dict mapping flat status and list of flat objects.
flats.
""" """
# Add the flatisfy metadata entry and prepare the flat objects
flats_list = metadata.init(flats_list)
first_pass_result = collections.defaultdict(list)
second_pass_result = collections.defaultdict(list)
# Do a first pass with the available infos to try to remove as much # Do a first pass with the available infos to try to remove as much
# unwanted postings as possible # unwanted postings as possible
if config["passes"] > 0: if config["passes"] > 0:
flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list, first_pass_result = flatisfy.filters.first_pass(flats_list,
config) config)
else:
first_pass_result["new"] = flats_list
# Load additional infos
if fetch_details:
for i, flat in enumerate(first_pass_result["new"]):
details = fetch.fetch_details(config, flat["id"])
first_pass_result["new"][i] = tools.merge_dicts(flat, details)
# Do a second pass to consolidate all the infos we found and make use of # Do a second pass to consolidate all the infos we found and make use of
# additional infos # additional infos
if config["passes"] > 1: if config["passes"] > 1:
# Load additional infos second_pass_result = flatisfy.filters.second_pass(
if fetch_details: first_pass_result["new"], config
for i, flat in enumerate(flats_list):
details = fetch.fetch_details(config, flat["id"])
flats_list[i] = tools.merge_dicts(flat, details)
flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
flats_list, config
) )
ignored_flats.extend(extra_ignored_flats) else:
second_pass_result["new"] = first_pass_result["new"]
return flats_list, ignored_flats return {
"new": second_pass_result["new"],
"duplicate": first_pass_result["duplicate"],
"ignored": (
first_pass_result["ignored"] + second_pass_result["ignored"]
)
}
def import_and_filter(config, load_from_db=False): def import_and_filter(config, load_from_db=False):
@ -66,19 +81,16 @@ def import_and_filter(config, load_from_db=False):
flats_list = fetch.load_flats_list_from_db(config) flats_list = fetch.load_flats_list_from_db(config)
else: else:
flats_list = fetch.fetch_flats_list(config) flats_list = fetch.fetch_flats_list(config)
flats_list, ignored_list = filter_flats(config, flats_list=flats_list, flats_list_by_status = filter_flats(config, flats_list=flats_list,
fetch_details=True) fetch_details=True)
# Create database connection # Create database connection
get_session = database.init_db(config["database"]) get_session = database.init_db(config["database"])
with get_session() as session: with get_session() as session:
for status, flats_list in flats_list_by_status.items():
for flat_dict in flats_list: for flat_dict in flats_list:
flat = flat_model.Flat.from_dict(flat_dict) flat = flat_model.Flat.from_dict(flat_dict)
session.merge(flat) flat.status = getattr(flat_model.FlatStatus, status)
for flat_dict in ignored_list:
flat = flat_model.Flat.from_dict(flat_dict)
flat.status = flat_model.FlatStatus.ignored
session.merge(flat) session.merge(flat)

View File

@ -89,25 +89,23 @@ def first_pass(flats_list, config):
:param flats_list: A list of flats dict to filter. :param flats_list: A list of flats dict to filter.
:param config: A config dict. :param config: A config dict.
:return: A tuple of processed flats and ignored flats. :return: A dict mapping flat status and list of flat objects.
""" """
LOGGER.info("Running first filtering pass.") LOGGER.info("Running first filtering pass.")
# Handle duplicates based on ids # Handle duplicates based on ids
# Just remove them (no merge) as they should be the exact same object. # Just remove them (no merge) as they should be the exact same object.
flats_list = duplicates.detect( flats_list, duplicates_by_id = duplicates.detect(
flats_list, key="id", merge=False flats_list, key="id", merge=False, should_intersect=False
) )
# Also merge duplicates based on url (these may come from different # Also merge duplicates based on urls (these may come from different
# flatboob backends) # flatboob backends)
# This is especially useful as some websites such as entreparticuliers # This is especially useful as some websites such as entreparticuliers
contain a lot of leboncoin housing posts. contain a lot of leboncoin housing posts.
flats_list = duplicates.detect( flats_list, duplicates_by_urls = duplicates.detect(
flats_list, key="url", merge=True flats_list, key="urls", merge=True, should_intersect=True
) )
# Add the flatisfy metadata entry and prepare the flat objects
flats_list = metadata.init(flats_list)
# Guess the postal codes # Guess the postal codes
flats_list = metadata.guess_postal_code(flats_list, config) flats_list = metadata.guess_postal_code(flats_list, config)
# Try to match with stations # Try to match with stations
@ -115,7 +113,11 @@ def first_pass(flats_list, config):
# Remove returned housing posts that do not match criteria # Remove returned housing posts that do not match criteria
flats_list, ignored_list = refine_with_housing_criteria(flats_list, config) flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
return (flats_list, ignored_list) return {
"new": flats_list,
"ignored": ignored_list,
"duplicate": duplicates_by_id + duplicates_by_urls
}
def second_pass(flats_list, config): def second_pass(flats_list, config):
@ -131,7 +133,7 @@ def second_pass(flats_list, config):
:param flats_list: A list of flats dict to filter. :param flats_list: A list of flats dict to filter.
:param config: A config dict. :param config: A config dict.
:return: A tuple of processed flats and ignored flats. :return: A dict mapping flat status and list of flat objects.
""" """
LOGGER.info("Running second filtering pass.") LOGGER.info("Running second filtering pass.")
# Assumed to run after first pass, so there should be no obvious duplicates # Assumed to run after first pass, so there should be no obvious duplicates
@ -151,4 +153,7 @@ def second_pass(flats_list, config):
# Remove returned housing posts that do not match criteria # Remove returned housing posts that do not match criteria
flats_list, ignored_list = refine_with_housing_criteria(flats_list, config) flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
return (flats_list, ignored_list) return {
"new": flats_list,
"ignored": ignored_list
}

View File

@ -23,7 +23,7 @@ BACKENDS_PRECEDENCE = [
] ]
def detect(flats_list, key="id", merge=True): def detect(flats_list, key="id", merge=True, should_intersect=False):
""" """
Detect obvious duplicates within a given list of flats. Detect obvious duplicates within a given list of flats.
@ -38,18 +38,32 @@ def detect(flats_list, key="id", merge=True):
done. done.
:param merge: Whether the found duplicates should be merged or we should :param merge: Whether the found duplicates should be merged or we should
only keep one of them. only keep one of them.
:param should_intersect: Set to ``True`` if the values in the flat dicts
are lists and you want to deduplicate on non-empty intersection (typically
if they have a common url).
:return: A deduplicated list of flat dicts. :return: A tuple of the deduplicated list of flat dicts and the list of all
the flat objects that should be removed and considered as duplicates (they
were already merged).
""" """
# ``seen`` is a dict mapping aggregating the flats by the deduplication # ``seen`` is a dict mapping aggregating the flats by the deduplication
# keys. We basically make buckets of flats for every key value. Flats in # keys. We basically make buckets of flats for every key value. Flats in
# the same bucket should be merged together afterwards. # the same bucket should be merged together afterwards.
seen = collections.defaultdict(list) seen = collections.defaultdict(list)
for flat in flats_list: for flat in flats_list:
if should_intersect:
# We add each value separately. We will add some flats multiple
# times, but we deduplicate again on id below to compensate.
for value in flat.get(key, []):
seen[value].append(flat)
else:
seen[flat.get(key, None)].append(flat) seen[flat.get(key, None)].append(flat)
# Generate the unique flats list based on these buckets # Generate the unique flats list based on these buckets
unique_flats_list = [] unique_flats_list = []
# Keep track of all the flats that were removed by deduplication
duplicate_flats = []
for flat_key, matching_flats in seen.items(): for flat_key, matching_flats in seen.items():
if flat_key is None: if flat_key is None:
# If the key is None, it means Weboob could not load the data. In # If the key is None, it means Weboob could not load the data. In
@ -67,7 +81,8 @@ def detect(flats_list, key="id", merge=True):
) )
if len(matching_flats) > 1: if len(matching_flats) > 1:
LOGGER.info("Found duplicates: %s.", LOGGER.info("Found duplicates using key \"%s\": %s.",
key,
[flat["id"] for flat in matching_flats]) [flat["id"] for flat in matching_flats])
# Otherwise, check the policy # Otherwise, check the policy
if merge: if merge:
@ -76,6 +91,19 @@ def detect(flats_list, key="id", merge=True):
tools.merge_dicts(*matching_flats) tools.merge_dicts(*matching_flats)
) )
else: else:
# Otherwise, just keep any of them # Otherwise, just keep the most important of them
unique_flats_list.append(matching_flats[0]) unique_flats_list.append(matching_flats[-1])
return unique_flats_list
# The ID of the added merged flat will be the one of the last item
# in ``matching_flats``. Then, any flat object that was before in
# the ``matching_flats`` list is to be considered as a duplicate
# and should have a ``duplicate`` status.
duplicate_flats.extend(matching_flats[:-1])
if should_intersect:
# We added some flats twice with the above method, let's deduplicate on
# id.
unique_flats_list, _ = detect(unique_flats_list, key="id", merge=True,
should_intersect=False)
return unique_flats_list, duplicate_flats

View File

@ -33,6 +33,7 @@ class FlatStatus(enum.Enum):
An enum of the possible status for a flat entry. An enum of the possible status for a flat entry.
""" """
user_deleted = -100 user_deleted = -100
duplicate = -20
ignored = -10 ignored = -10
new = 0 new = 0
followed = 10 followed = 10
@ -83,6 +84,7 @@ class Flat(BASE):
""" """
# Handle flatisfy metadata # Handle flatisfy metadata
flat_dict = flat_dict.copy() flat_dict = flat_dict.copy()
if "flatisfy" in flat_dict:
flat_dict["flatisfy_stations"] = ( flat_dict["flatisfy_stations"] = (
flat_dict["flatisfy"].get("matched_stations", []) flat_dict["flatisfy"].get("matched_stations", [])
) )

View File

@ -55,7 +55,7 @@
</th> </th>
<td> <td>
<template v-if="flat.flatisfy_postal_code.postal_code"> <template v-if="flat.flatisfy_postal_code.postal_code">
{{ flat.flatisfy_postal_code.name }} ( {{ flat.flatisfy_postal_code.postal_code }} ) {{ flat.flatisfy_postal_code.name }} ({{ flat.flatisfy_postal_code.postal_code }})
</template> </template>
<template v-else> <template v-else>
? ?