Better deduplication

* Improve deduplication on URLs (match on sets of URLs rather than a single URL).
* Keep track of duplicates and update their status on refiltering.
Lucas Verney 2017-04-27 17:08:10 +02:00
parent 1d98c631e0
commit 5f2f4d0ccf
7 changed files with 107 additions and 58 deletions
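
After this change, the filtering entry points no longer return a `(flats, ignored)` tuple: they return the flats bucketed by status. A minimal sketch of the new contract as it appears in the diffs below (contents are hypothetical):

    result = cmds.filter_flats(config, flats_list=flats_list, fetch_details=True)
    # ``result`` maps a status name to a list of flat dicts:
    # {
    #     "new": [...],        # postings that survived both filtering passes
    #     "duplicate": [...],  # postings merged into another posting
    #     "ignored": [...],    # postings failing the housing criteria
    # }
    flats_list = result["new"]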

View File

@@ -68,6 +68,8 @@ List of configuration options:
 * `webserver` is a server to use instead of the default Bottle built-in
   webserver, see [Bottle deployment
   doc](http://bottlepy.org/docs/dev/deployment.html).
+* `backends` is a list of Weboob backends to enable. It defaults to any
+  available and supported Weboob backend.

 _Note:_ In production, you can either use the `serve` command with a reliable
 webserver instead of the default Bottle webserver (specifying a `webserver`
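
The `backends` option goes into the configuration file. A hypothetical excerpt, assuming the JSON config format flatisfy reads (the backend names are only illustrative):

    import json

    # Hypothetical config excerpt enabling two Weboob backends explicitly
    # instead of the default "any available and supported" behaviour.
    config = json.loads('{"backends": ["seloger", "leboncoin"]}')
    assert config["backends"] == ["seloger", "leboncoin"]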

View File

@@ -155,8 +155,8 @@ def main():
     if args.cmd == "fetch":
         # Fetch and filter flats list
         flats_list = fetch.fetch_flats_list(config)
-        flats_list, _ = cmds.filter_flats(config, flats_list=flats_list,
-                                          fetch_details=True)
+        flats_list = cmds.filter_flats(config, flats_list=flats_list,
+                                       fetch_details=True)["new"]
         # Sort by cost
         flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
@@ -169,8 +169,8 @@
     if args.input:
         flats_list = fetch.load_flats_list_from_file(args.input)
-        flats_list, _ = cmds.filter_flats(config, flats_list=flats_list,
-                                          fetch_details=False)
+        flats_list = cmds.filter_flats(config, flats_list=flats_list,
+                                       fetch_details=False)["new"]
     # Sort by cost
     flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")

View File

@@ -4,6 +4,7 @@ Main commands available for flatisfy.
 """
 from __future__ import absolute_import, print_function, unicode_literals

+import collections
 import logging

 import flatisfy.filters
@@ -11,13 +12,14 @@ from flatisfy import database
 from flatisfy.models import flat as flat_model
 from flatisfy import fetch
 from flatisfy import tools
+from flatisfy.filters import metadata
 from flatisfy.web import app as web_app


 LOGGER = logging.getLogger(__name__)


-def filter_flats(config, flats_list=None, fetch_details=True):
+def filter_flats(config, flats_list, fetch_details=True):
     """
     Filter the available flats list. Then, filter it according to criteria.
@@ -25,30 +27,43 @@ def filter_flats(config, flats_list=None, fetch_details=True):
     :param fetch_details: Whether additional details should be fetched between
     the two passes.
     :param flats_list: The initial list of flat objects to filter.
-    :return: A tuple of the list of all matching flats and the list of ignored
-    flats.
+    :return: A dict mapping flat status and list of flat objects.
     """
+    # Add the flatisfy metadata entry and prepare the flat objects
+    flats_list = metadata.init(flats_list)
+
+    first_pass_result = collections.defaultdict(list)
+    second_pass_result = collections.defaultdict(list)
     # Do a first pass with the available infos to try to remove as much
     # unwanted postings as possible
     if config["passes"] > 0:
-        flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list,
-                                                                config)
+        first_pass_result = flatisfy.filters.first_pass(flats_list,
+                                                        config)
+    else:
+        first_pass_result["new"] = flats_list
+
+    # Load additional infos
+    if fetch_details:
+        for i, flat in enumerate(first_pass_result["new"]):
+            details = fetch.fetch_details(config, flat["id"])
+            first_pass_result["new"][i] = tools.merge_dicts(flat, details)

     # Do a second pass to consolidate all the infos we found and make use of
     # additional infos
     if config["passes"] > 1:
-        # Load additional infos
-        if fetch_details:
-            for i, flat in enumerate(flats_list):
-                details = fetch.fetch_details(config, flat["id"])
-                flats_list[i] = tools.merge_dicts(flat, details)
-
-        flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
-            flats_list, config
+        second_pass_result = flatisfy.filters.second_pass(
+            first_pass_result["new"], config
         )
-        ignored_flats.extend(extra_ignored_flats)
+    else:
+        second_pass_result["new"] = first_pass_result["new"]

-    return flats_list, ignored_flats
+    return {
+        "new": second_pass_result["new"],
+        "duplicate": first_pass_result["duplicate"],
+        "ignored": (
+            first_pass_result["ignored"] + second_pass_result["ignored"]
+        )
+    }


 def import_and_filter(config, load_from_db=False):
@@ -66,20 +81,17 @@ def import_and_filter(config, load_from_db=False):
         flats_list = fetch.load_flats_list_from_db(config)
     else:
         flats_list = fetch.fetch_flats_list(config)
-    flats_list, ignored_list = filter_flats(config, flats_list=flats_list,
-                                            fetch_details=True)
+    flats_list_by_status = filter_flats(config, flats_list=flats_list,
+                                        fetch_details=True)

     # Create database connection
     get_session = database.init_db(config["database"])

     with get_session() as session:
-        for flat_dict in flats_list:
-            flat = flat_model.Flat.from_dict(flat_dict)
-            session.merge(flat)
-
-        for flat_dict in ignored_list:
-            flat = flat_model.Flat.from_dict(flat_dict)
-            flat.status = flat_model.FlatStatus.ignored
-            session.merge(flat)
+        for status, flats_list in flats_list_by_status.items():
+            for flat_dict in flats_list:
+                flat = flat_model.Flat.from_dict(flat_dict)
+                flat.status = getattr(flat_model.FlatStatus, status)
+                session.merge(flat)


 def purge_db(config):
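
Note the coupling the rewritten loop introduces: the keys of the dict returned by `filter_flats` ("new", "duplicate", "ignored") must match the `FlatStatus` member names exactly, since the status is resolved with `getattr`. A minimal sketch of that lookup, reusing the enum values from the model diff below:

    import enum

    class FlatStatus(enum.Enum):
        duplicate = -20
        ignored = -10
        new = 0

    # getattr turns each status key into the matching enum member.
    for status in ("new", "duplicate", "ignored"):
        assert getattr(FlatStatus, status).name == status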

View File

@@ -89,25 +89,23 @@ def first_pass(flats_list, config):
     :param flats_list: A list of flats dict to filter.
     :param config: A config dict.
-    :return: A tuple of processed flats and ignored flats.
+    :return: A dict mapping flat status and list of flat objects.
     """
     LOGGER.info("Running first filtering pass.")
     # Handle duplicates based on ids
     # Just remove them (no merge) as they should be the exact same object.
-    flats_list = duplicates.detect(
-        flats_list, key="id", merge=False
+    flats_list, duplicates_by_id = duplicates.detect(
+        flats_list, key="id", merge=False, should_intersect=False
     )
-    # Also merge duplicates based on url (these may come from different
+    # Also merge duplicates based on urls (these may come from different
     # flatboob backends)
     # This is especially useful as some websites such as entreparticuliers
     # contains a lot of leboncoin housings posts.
-    flats_list = duplicates.detect(
-        flats_list, key="url", merge=True
+    flats_list, duplicates_by_urls = duplicates.detect(
+        flats_list, key="urls", merge=True, should_intersect=True
     )
-    # Add the flatisfy metadata entry and prepare the flat objects
-    flats_list = metadata.init(flats_list)

     # Guess the postal codes
     flats_list = metadata.guess_postal_code(flats_list, config)
     # Try to match with stations
@@ -115,7 +113,11 @@ def first_pass(flats_list, config):
     # Remove returned housing posts that do not match criteria
     flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)

-    return (flats_list, ignored_list)
+    return {
+        "new": flats_list,
+        "ignored": ignored_list,
+        "duplicate": duplicates_by_id + duplicates_by_urls
+    }


 def second_pass(flats_list, config):
@@ -131,7 +133,7 @@ def second_pass(flats_list, config):
     :param flats_list: A list of flats dict to filter.
     :param config: A config dict.
-    :return: A tuple of processed flats and ignored flats.
+    :return: A dict mapping flat status and list of flat objects.
     """
     LOGGER.info("Running second filtering pass.")
     # Assumed to run after first pass, so there should be no obvious duplicates
@@ -151,4 +153,7 @@ def second_pass(flats_list, config):
     # Remove returned housing posts that do not match criteria
     flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)

-    return (flats_list, ignored_list)
+    return {
+        "new": flats_list,
+        "ignored": ignored_list
+    }
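
To make the two `detect` calls in `first_pass` concrete, here is a toy run under the semantics described above (ids and URLs are hypothetical; `duplicates.detect` is the function changed in the next file):

    flats = [
        {"id": "123@seloger", "urls": ["https://example.com/ad/1"]},
        {"id": "456@leboncoin", "urls": ["https://example.com/ad/1"]},
    ]

    # Pass 1: ids differ, so nothing is dropped here.
    flats, dupes_by_id = duplicates.detect(flats, key="id", merge=False,
                                           should_intersect=False)

    # Pass 2: the URL sets intersect, so the two postings are merged into
    # one and the other ends up in the duplicates list.
    flats, dupes_by_urls = duplicates.detect(flats, key="urls", merge=True,
                                             should_intersect=True)
    # Expected: len(flats) == 1 and len(dupes_by_urls) == 1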

View File

@@ -23,7 +23,7 @@ BACKENDS_PRECEDENCE = [
 ]


-def detect(flats_list, key="id", merge=True):
+def detect(flats_list, key="id", merge=True, should_intersect=False):
     """
     Detect obvious duplicates within a given list of flats.
@@ -38,18 +38,32 @@ def detect(flats_list, key="id", merge=True):
     done.
     :param merge: Whether the found duplicates should be merged or we should
     only keep one of them.
-    :return: A deduplicated list of flat dicts.
+    :param should_intersect: Set to ``True`` if the values in the flat dicts
+    are lists and you want to deduplicate on non-empty intersection (typically
+    if they have a common url).
+    :return: A tuple of the deduplicated list of flat dicts and the list of all
+    the flats objects that should be removed and considered as duplicates (they
+    were already merged).
     """
     # ``seen`` is a dict mapping aggregating the flats by the deduplication
     # keys. We basically make buckets of flats for every key value. Flats in
     # the same bucket should be merged together afterwards.
     seen = collections.defaultdict(list)
     for flat in flats_list:
-        seen[flat.get(key, None)].append(flat)
+        if should_intersect:
+            # We add each value separately. We will add some flats multiple
+            # times, but we deduplicate again on id below to compensate.
+            for value in flat.get(key, []):
+                seen[value].append(flat)
+        else:
+            seen[flat.get(key, None)].append(flat)

     # Generate the unique flats list based on these buckets
     unique_flats_list = []
+    # Keep track of all the flats that were removed by deduplication
+    duplicate_flats = []
     for flat_key, matching_flats in seen.items():
         if flat_key is None:
             # If the key is None, it means Weboob could not load the data. In
@@ -67,7 +81,8 @@
             )

         if len(matching_flats) > 1:
-            LOGGER.info("Found duplicates: %s.",
+            LOGGER.info("Found duplicates using key \"%s\": %s.",
+                        key,
                         [flat["id"] for flat in matching_flats])
         # Otherwise, check the policy
         if merge:
@@ -76,6 +91,19 @@
                 tools.merge_dicts(*matching_flats)
             )
         else:
-            # Otherwise, just keep any of them
-            unique_flats_list.append(matching_flats[0])
-
-    return unique_flats_list
+            # Otherwise, just keep the most important of them
+            unique_flats_list.append(matching_flats[-1])
+
+        # The ID of the added merged flat will be the one of the last item
+        # in ``matching_flats``. Then, any flat object that was before in
+        # the ``matching_flats`` list is to be considered as a duplicate
+        # and should have a ``duplicate`` status.
+        duplicate_flats.extend(matching_flats[:-1])
+
+    if should_intersect:
+        # We added some flats twice with the above method, let's deduplicate on
+        # id.
+        unique_flats_list, _ = detect(unique_flats_list, key="id", merge=True,
+                                      should_intersect=False)
+
+    return unique_flats_list, duplicate_flats
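
The bucketing trick used for `should_intersect` can be isolated into a self-contained sketch: index each flat under every value of its list-valued key, so that two flats sharing any single value land in a common bucket. The helper name below is illustrative, not part of the codebase:

    import collections

    def group_by_intersection(items, key):
        # Index each item under every value of ``item[key]``. Items sharing
        # any value end up in a common bucket; an item with several values
        # appears in several buckets, hence the re-deduplication on id that
        # ``detect`` performs afterwards.
        seen = collections.defaultdict(list)
        for item in items:
            for value in item.get(key, []):
                seen[value].append(item)
        return seen

    buckets = group_by_intersection(
        [{"id": 1, "urls": ["a", "b"]}, {"id": 2, "urls": ["b", "c"]}],
        key="urls",
    )
    # buckets["b"] now holds both items; buckets "a" and "c" hold one each.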

View File

@@ -33,6 +33,7 @@ class FlatStatus(enum.Enum):
     An enum of the possible status for a flat entry.
     """
     user_deleted = -100
+    duplicate = -20
     ignored = -10
     new = 0
     followed = 10
@@ -83,16 +84,17 @@ class Flat(BASE):
         """
         # Handle flatisfy metadata
         flat_dict = flat_dict.copy()
-        flat_dict["flatisfy_stations"] = (
-            flat_dict["flatisfy"].get("matched_stations", [])
-        )
-        flat_dict["flatisfy_postal_code"] = (
-            flat_dict["flatisfy"].get("postal_code", None)
-        )
-        flat_dict["flatisfy_time_to"] = (
-            flat_dict["flatisfy"].get("time_to", {})
-        )
-        del flat_dict["flatisfy"]
+        if "flatisfy" in flat_dict:
+            flat_dict["flatisfy_stations"] = (
+                flat_dict["flatisfy"].get("matched_stations", [])
+            )
+            flat_dict["flatisfy_postal_code"] = (
+                flat_dict["flatisfy"].get("postal_code", None)
+            )
+            flat_dict["flatisfy_time_to"] = (
+                flat_dict["flatisfy"].get("time_to", {})
+            )
+            del flat_dict["flatisfy"]

         # Handle utilities field
         if not isinstance(flat_dict["utilities"], FlatUtilities):
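
The new guard makes `from_dict` tolerant of flat dicts that never received a `flatisfy` metadata entry: previously the unconditional `flat_dict["flatisfy"]` accesses raised `KeyError` on such input. A hedged sketch of the behaviour change (dict contents are hypothetical, other required fields elided):

    raw = {"id": "123@seloger", "utilities": ""}
    flat = Flat.from_dict(raw.copy())
    # Before this commit: KeyError: 'flatisfy'
    # After: the flatisfy_* attributes are simply left unset.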

View File

@@ -55,7 +55,7 @@
                 </th>
                 <td>
                     <template v-if="flat.flatisfy_postal_code.postal_code">
-                        {{ flat.flatisfy_postal_code.name }} ( {{ flat.flatisfy_postal_code.postal_code }} )
+                        {{ flat.flatisfy_postal_code.name }} ({{ flat.flatisfy_postal_code.postal_code }})
                     </template>
                     <template v-else>
                         ?