Better deduplication

* Improve deduplication on URLs (match sets).
* Keep track of duplicates and update their status on refiltering.
This commit is contained in:
Lucas Verney 2017-04-27 17:08:10 +02:00
parent 1d98c631e0
commit 5f2f4d0ccf
7 changed files with 107 additions and 58 deletions

View File

@ -68,6 +68,8 @@ List of configuration options:
* `webserver` is a server to use instead of the default Bottle built-in * `webserver` is a server to use instead of the default Bottle built-in
webserver, see [Bottle deployment webserver, see [Bottle deployment
doc](http://bottlepy.org/docs/dev/deployment.html). doc](http://bottlepy.org/docs/dev/deployment.html).
* `backends` is a list of Weboob backends to enable. It defaults to any
available and supported Weboob backend.
_Note:_ In production, you can either use the `serve` command with a reliable _Note:_ In production, you can either use the `serve` command with a reliable
webserver instead of the default Bottle webserver (specifying a `webserver` webserver instead of the default Bottle webserver (specifying a `webserver`

View File

@ -155,8 +155,8 @@ def main():
if args.cmd == "fetch": if args.cmd == "fetch":
# Fetch and filter flats list # Fetch and filter flats list
flats_list = fetch.fetch_flats_list(config) flats_list = fetch.fetch_flats_list(config)
flats_list, _ = cmds.filter_flats(config, flats_list=flats_list, flats_list = cmds.filter_flats(config, flats_list=flats_list,
fetch_details=True) fetch_details=True)["new"]
# Sort by cost # Sort by cost
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost") flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
@ -169,8 +169,8 @@ def main():
if args.input: if args.input:
flats_list = fetch.load_flats_list_from_file(args.input) flats_list = fetch.load_flats_list_from_file(args.input)
flats_list, _ = cmds.filter_flats(config, flats_list=flats_list, flats_list = cmds.filter_flats(config, flats_list=flats_list,
fetch_details=False) fetch_details=False)["new"]
# Sort by cost # Sort by cost
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost") flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")

View File

@ -4,6 +4,7 @@ Main commands available for flatisfy.
""" """
from __future__ import absolute_import, print_function, unicode_literals from __future__ import absolute_import, print_function, unicode_literals
import collections
import logging import logging
import flatisfy.filters import flatisfy.filters
@ -11,13 +12,14 @@ from flatisfy import database
from flatisfy.models import flat as flat_model from flatisfy.models import flat as flat_model
from flatisfy import fetch from flatisfy import fetch
from flatisfy import tools from flatisfy import tools
from flatisfy.filters import metadata
from flatisfy.web import app as web_app from flatisfy.web import app as web_app
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
def filter_flats(config, flats_list=None, fetch_details=True): def filter_flats(config, flats_list, fetch_details=True):
""" """
Filter the available flats list. Then, filter it according to criteria. Filter the available flats list. Then, filter it according to criteria.
@ -25,30 +27,43 @@ def filter_flats(config, flats_list=None, fetch_details=True):
:param fetch_details: Whether additional details should be fetched between :param fetch_details: Whether additional details should be fetched between
the two passes. the two passes.
:param flats_list: The initial list of flat objects to filter. :param flats_list: The initial list of flat objects to filter.
:return: A tuple of the list of all matching flats and the list of ignored :return: A dict mapping flat status and list of flat objects.
flats.
""" """
# Add the flatisfy metadata entry and prepare the flat objects
flats_list = metadata.init(flats_list)
first_pass_result = collections.defaultdict(list)
second_pass_result = collections.defaultdict(list)
# Do a first pass with the available infos to try to remove as much # Do a first pass with the available infos to try to remove as much
# unwanted postings as possible # unwanted postings as possible
if config["passes"] > 0: if config["passes"] > 0:
flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list, first_pass_result = flatisfy.filters.first_pass(flats_list,
config) config)
else:
first_pass_result["new"] = flats_list
# Load additional infos
if fetch_details:
for i, flat in enumerate(first_pass_result["new"]):
details = fetch.fetch_details(config, flat["id"])
first_pass_result["new"][i] = tools.merge_dicts(flat, details)
# Do a second pass to consolidate all the infos we found and make use of # Do a second pass to consolidate all the infos we found and make use of
# additional infos # additional infos
if config["passes"] > 1: if config["passes"] > 1:
# Load additional infos second_pass_result = flatisfy.filters.second_pass(
if fetch_details: first_pass_result["new"], config
for i, flat in enumerate(flats_list):
details = fetch.fetch_details(config, flat["id"])
flats_list[i] = tools.merge_dicts(flat, details)
flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
flats_list, config
) )
ignored_flats.extend(extra_ignored_flats) else:
second_pass_result["new"] = first_pass_result["new"]
return flats_list, ignored_flats return {
"new": second_pass_result["new"],
"duplicate": first_pass_result["duplicate"],
"ignored": (
first_pass_result["ignored"] + second_pass_result["ignored"]
)
}
def import_and_filter(config, load_from_db=False): def import_and_filter(config, load_from_db=False):
@ -66,19 +81,16 @@ def import_and_filter(config, load_from_db=False):
flats_list = fetch.load_flats_list_from_db(config) flats_list = fetch.load_flats_list_from_db(config)
else: else:
flats_list = fetch.fetch_flats_list(config) flats_list = fetch.fetch_flats_list(config)
flats_list, ignored_list = filter_flats(config, flats_list=flats_list, flats_list_by_status = filter_flats(config, flats_list=flats_list,
fetch_details=True) fetch_details=True)
# Create database connection # Create database connection
get_session = database.init_db(config["database"]) get_session = database.init_db(config["database"])
with get_session() as session: with get_session() as session:
for status, flats_list in flats_list_by_status.items():
for flat_dict in flats_list: for flat_dict in flats_list:
flat = flat_model.Flat.from_dict(flat_dict) flat = flat_model.Flat.from_dict(flat_dict)
session.merge(flat) flat.status = getattr(flat_model.FlatStatus, status)
for flat_dict in ignored_list:
flat = flat_model.Flat.from_dict(flat_dict)
flat.status = flat_model.FlatStatus.ignored
session.merge(flat) session.merge(flat)

View File

@ -89,25 +89,23 @@ def first_pass(flats_list, config):
:param flats_list: A list of flats dict to filter. :param flats_list: A list of flats dict to filter.
:param config: A config dict. :param config: A config dict.
:return: A tuple of processed flats and ignored flats. :return: A dict mapping flat status and list of flat objects.
""" """
LOGGER.info("Running first filtering pass.") LOGGER.info("Running first filtering pass.")
# Handle duplicates based on ids # Handle duplicates based on ids
# Just remove them (no merge) as they should be the exact same object. # Just remove them (no merge) as they should be the exact same object.
flats_list = duplicates.detect( flats_list, duplicates_by_id = duplicates.detect(
flats_list, key="id", merge=False flats_list, key="id", merge=False, should_intersect=False
) )
# Also merge duplicates based on url (these may come from different # Also merge duplicates based on urls (these may come from different
# flatboob backends) # flatboob backends)
# This is especially useful as some websites such as entreparticuliers # This is especially useful as some websites such as entreparticuliers
contain a lot of leboncoin housing posts. contain a lot of leboncoin housing posts.
flats_list = duplicates.detect( flats_list, duplicates_by_urls = duplicates.detect(
flats_list, key="url", merge=True flats_list, key="urls", merge=True, should_intersect=True
) )
# Add the flatisfy metadata entry and prepare the flat objects
flats_list = metadata.init(flats_list)
# Guess the postal codes # Guess the postal codes
flats_list = metadata.guess_postal_code(flats_list, config) flats_list = metadata.guess_postal_code(flats_list, config)
# Try to match with stations # Try to match with stations
@ -115,7 +113,11 @@ def first_pass(flats_list, config):
# Remove returned housing posts that do not match criteria # Remove returned housing posts that do not match criteria
flats_list, ignored_list = refine_with_housing_criteria(flats_list, config) flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
return (flats_list, ignored_list) return {
"new": flats_list,
"ignored": ignored_list,
"duplicate": duplicates_by_id + duplicates_by_urls
}
def second_pass(flats_list, config): def second_pass(flats_list, config):
@ -131,7 +133,7 @@ def second_pass(flats_list, config):
:param flats_list: A list of flats dict to filter. :param flats_list: A list of flats dict to filter.
:param config: A config dict. :param config: A config dict.
:return: A tuple of processed flats and ignored flats. :return: A dict mapping flat status and list of flat objects.
""" """
LOGGER.info("Running second filtering pass.") LOGGER.info("Running second filtering pass.")
# Assumed to run after first pass, so there should be no obvious duplicates # Assumed to run after first pass, so there should be no obvious duplicates
@ -151,4 +153,7 @@ def second_pass(flats_list, config):
# Remove returned housing posts that do not match criteria # Remove returned housing posts that do not match criteria
flats_list, ignored_list = refine_with_housing_criteria(flats_list, config) flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
return (flats_list, ignored_list) return {
"new": flats_list,
"ignored": ignored_list
}

View File

@ -23,7 +23,7 @@ BACKENDS_PRECEDENCE = [
] ]
def detect(flats_list, key="id", merge=True): def detect(flats_list, key="id", merge=True, should_intersect=False):
""" """
Detect obvious duplicates within a given list of flats. Detect obvious duplicates within a given list of flats.
@ -38,18 +38,32 @@ def detect(flats_list, key="id", merge=True):
done. done.
:param merge: Whether the found duplicates should be merged or we should :param merge: Whether the found duplicates should be merged or we should
only keep one of them. only keep one of them.
:param should_intersect: Set to ``True`` if the values in the flat dicts
are lists and you want to deduplicate on non-empty intersection (typically
if they have a common url).
:return: A deduplicated list of flat dicts. :return: A tuple of the deduplicated list of flat dicts and the list of all
the flat objects that should be removed and considered as duplicates (they
were already merged).
""" """
# ``seen`` is a dict mapping aggregating the flats by the deduplication # ``seen`` is a dict mapping aggregating the flats by the deduplication
# keys. We basically make buckets of flats for every key value. Flats in # keys. We basically make buckets of flats for every key value. Flats in
# the same bucket should be merged together afterwards. # the same bucket should be merged together afterwards.
seen = collections.defaultdict(list) seen = collections.defaultdict(list)
for flat in flats_list: for flat in flats_list:
if should_intersect:
# We add each value separately. We will add some flats multiple
# times, but we deduplicate again on id below to compensate.
for value in flat.get(key, []):
seen[value].append(flat)
else:
seen[flat.get(key, None)].append(flat) seen[flat.get(key, None)].append(flat)
# Generate the unique flats list based on these buckets # Generate the unique flats list based on these buckets
unique_flats_list = [] unique_flats_list = []
# Keep track of all the flats that were removed by deduplication
duplicate_flats = []
for flat_key, matching_flats in seen.items(): for flat_key, matching_flats in seen.items():
if flat_key is None: if flat_key is None:
# If the key is None, it means Weboob could not load the data. In # If the key is None, it means Weboob could not load the data. In
@ -67,7 +81,8 @@ def detect(flats_list, key="id", merge=True):
) )
if len(matching_flats) > 1: if len(matching_flats) > 1:
LOGGER.info("Found duplicates: %s.", LOGGER.info("Found duplicates using key \"%s\": %s.",
key,
[flat["id"] for flat in matching_flats]) [flat["id"] for flat in matching_flats])
# Otherwise, check the policy # Otherwise, check the policy
if merge: if merge:
@ -76,6 +91,19 @@ def detect(flats_list, key="id", merge=True):
tools.merge_dicts(*matching_flats) tools.merge_dicts(*matching_flats)
) )
else: else:
# Otherwise, just keep any of them # Otherwise, just keep the most important of them
unique_flats_list.append(matching_flats[0]) unique_flats_list.append(matching_flats[-1])
return unique_flats_list
# The ID of the added merged flat will be the one of the last item
# in ``matching_flats``. Then, any flat object that was before in
# the ``matching_flats`` list is to be considered as a duplicate
# and should have a ``duplicate`` status.
duplicate_flats.extend(matching_flats[:-1])
if should_intersect:
# We added some flats twice with the above method, let's deduplicate on
# id.
unique_flats_list, _ = detect(unique_flats_list, key="id", merge=True,
should_intersect=False)
return unique_flats_list, duplicate_flats

View File

@ -33,6 +33,7 @@ class FlatStatus(enum.Enum):
An enum of the possible status for a flat entry. An enum of the possible status for a flat entry.
""" """
user_deleted = -100 user_deleted = -100
duplicate = -20
ignored = -10 ignored = -10
new = 0 new = 0
followed = 10 followed = 10
@ -83,6 +84,7 @@ class Flat(BASE):
""" """
# Handle flatisfy metadata # Handle flatisfy metadata
flat_dict = flat_dict.copy() flat_dict = flat_dict.copy()
if "flatisfy" in flat_dict:
flat_dict["flatisfy_stations"] = ( flat_dict["flatisfy_stations"] = (
flat_dict["flatisfy"].get("matched_stations", []) flat_dict["flatisfy"].get("matched_stations", [])
) )

View File

@ -55,7 +55,7 @@
</th> </th>
<td> <td>
<template v-if="flat.flatisfy_postal_code.postal_code"> <template v-if="flat.flatisfy_postal_code.postal_code">
{{ flat.flatisfy_postal_code.name }} ( {{ flat.flatisfy_postal_code.postal_code }} ) {{ flat.flatisfy_postal_code.name }} ({{ flat.flatisfy_postal_code.postal_code }})
</template> </template>
<template v-else> <template v-else>
? ?