diff --git a/doc/0.getting_started.md b/doc/0.getting_started.md
index 08be9ae..3f3b98f 100644
--- a/doc/0.getting_started.md
+++ b/doc/0.getting_started.md
@@ -68,6 +68,8 @@ List of configuration options:
 
 * `webserver` is a server to use instead of the default Bottle built-in
   webserver, see [Bottle deployment doc](http://bottlepy.org/docs/dev/deployment.html).
+* `backends` is a list of Weboob backends to enable. It defaults to any
+  available and supported Weboob backend.
 
 _Note:_ In production, you can either use the `serve` command with a reliable
 webserver instead of the default Bottle webserver (specifying a `webserver`
diff --git a/flatisfy/__main__.py b/flatisfy/__main__.py
index 492c39d..20dde81 100644
--- a/flatisfy/__main__.py
+++ b/flatisfy/__main__.py
@@ -155,8 +155,8 @@ def main():
     if args.cmd == "fetch":
         # Fetch and filter flats list
        flats_list = fetch.fetch_flats_list(config)
-        flats_list, _ = cmds.filter_flats(config, flats_list=flats_list,
-                                          fetch_details=True)
+        flats_list = cmds.filter_flats(config, flats_list=flats_list,
+                                       fetch_details=True)["new"]
         # Sort by cost
         flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
@@ -169,8 +169,8 @@
         if args.input:
             flats_list = fetch.load_flats_list_from_file(args.input)
 
-        flats_list, _ = cmds.filter_flats(config, flats_list=flats_list,
-                                          fetch_details=False)
+        flats_list = cmds.filter_flats(config, flats_list=flats_list,
+                                       fetch_details=False)["new"]
         # Sort by cost
         flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
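To make the new call-site contract above concrete: `cmds.filter_flats` now returns a dict of status buckets instead of a `(flats, ignored)` tuple, and callers pick the bucket they need, as in the two `["new"]` lookups. A minimal, self-contained sketch — the flat dicts here are invented placeholders, not real Weboob postings:

```python
# Invented sample data standing in for real fetched postings.
flats_by_status = {
    "new": [{"id": "flat-1", "cost": 1200}, {"id": "flat-2", "cost": 950}],
    "duplicate": [{"id": "flat-3", "cost": 950}],
    "ignored": [{"id": "flat-4", "cost": 3000}],
}

# Equivalent of: cmds.filter_flats(config, ...)["new"]
flats_list = flats_by_status["new"]

# Rough equivalent of: tools.sort_list_of_dicts_by(flats_list, "cost")
flats_list = sorted(flats_list, key=lambda flat: flat["cost"])
print([flat["id"] for flat in flats_list])  # ['flat-2', 'flat-1']
```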
""" + # Add the flatisfy metadata entry and prepare the flat objects + flats_list = metadata.init(flats_list) + + first_pass_result = collections.defaultdict(list) + second_pass_result = collections.defaultdict(list) # Do a first pass with the available infos to try to remove as much # unwanted postings as possible if config["passes"] > 0: - flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list, - config) + first_pass_result = flatisfy.filters.first_pass(flats_list, + config) + else: + first_pass_result["new"] = flats_list + + # Load additional infos + if fetch_details: + for i, flat in enumerate(first_pass_result["new"]): + details = fetch.fetch_details(config, flat["id"]) + first_pass_result["new"][i] = tools.merge_dicts(flat, details) # Do a second pass to consolidate all the infos we found and make use of # additional infos if config["passes"] > 1: - # Load additional infos - if fetch_details: - for i, flat in enumerate(flats_list): - details = fetch.fetch_details(config, flat["id"]) - flats_list[i] = tools.merge_dicts(flat, details) - - flats_list, extra_ignored_flats = flatisfy.filters.second_pass( - flats_list, config + second_pass_result = flatisfy.filters.second_pass( + first_pass_result["new"], config ) - ignored_flats.extend(extra_ignored_flats) + else: + second_pass_result["new"] = first_pass_result["new"] - return flats_list, ignored_flats + return { + "new": second_pass_result["new"], + "duplicate": first_pass_result["duplicate"], + "ignored": ( + first_pass_result["ignored"] + second_pass_result["ignored"] + ) + } def import_and_filter(config, load_from_db=False): @@ -66,20 +81,17 @@ def import_and_filter(config, load_from_db=False): flats_list = fetch.load_flats_list_from_db(config) else: flats_list = fetch.fetch_flats_list(config) - flats_list, ignored_list = filter_flats(config, flats_list=flats_list, - fetch_details=True) + flats_list_by_status = filter_flats(config, flats_list=flats_list, + fetch_details=True) # Create database connection get_session = database.init_db(config["database"]) with get_session() as session: - for flat_dict in flats_list: - flat = flat_model.Flat.from_dict(flat_dict) - session.merge(flat) - - for flat_dict in ignored_list: - flat = flat_model.Flat.from_dict(flat_dict) - flat.status = flat_model.FlatStatus.ignored - session.merge(flat) + for status, flats_list in flats_list_by_status.items(): + for flat_dict in flats_list: + flat = flat_model.Flat.from_dict(flat_dict) + flat.status = getattr(flat_model.FlatStatus, status) + session.merge(flat) def purge_db(config): diff --git a/flatisfy/filters/__init__.py b/flatisfy/filters/__init__.py index 2220a8b..6233511 100644 --- a/flatisfy/filters/__init__.py +++ b/flatisfy/filters/__init__.py @@ -89,25 +89,23 @@ def first_pass(flats_list, config): :param flats_list: A list of flats dict to filter. :param config: A config dict. - :return: A tuple of processed flats and ignored flats. + :return: A dict mapping flat status and list of flat objects. """ LOGGER.info("Running first filtering pass.") # Handle duplicates based on ids # Just remove them (no merge) as they should be the exact same object. 
diff --git a/flatisfy/filters/__init__.py b/flatisfy/filters/__init__.py
index 2220a8b..6233511 100644
--- a/flatisfy/filters/__init__.py
+++ b/flatisfy/filters/__init__.py
@@ -89,25 +89,23 @@
 
     :param flats_list: A list of flats dict to filter.
     :param config: A config dict.
-    :return: A tuple of processed flats and ignored flats.
+    :return: A dict mapping flat status to the list of matching flat objects.
     """
     LOGGER.info("Running first filtering pass.")
     # Handle duplicates based on ids
     # Just remove them (no merge) as they should be the exact same object.
-    flats_list = duplicates.detect(
-        flats_list, key="id", merge=False
+    flats_list, duplicates_by_id = duplicates.detect(
+        flats_list, key="id", merge=False, should_intersect=False
     )
 
-    # Also merge duplicates based on url (these may come from different
+    # Also merge duplicates based on urls (these may come from different
     # flatboob backends)
     # This is especially useful as some websites such as entreparticuliers
     # contains a lot of leboncoin housings posts.
-    flats_list = duplicates.detect(
-        flats_list, key="url", merge=True
+    flats_list, duplicates_by_urls = duplicates.detect(
+        flats_list, key="urls", merge=True, should_intersect=True
     )
 
-    # Add the flatisfy metadata entry and prepare the flat objects
-    flats_list = metadata.init(flats_list)
     # Guess the postal codes
     flats_list = metadata.guess_postal_code(flats_list, config)
     # Try to match with stations
@@ -115,7 +113,11 @@
     # Remove returned housing posts that do not match criteria
     flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
 
-    return (flats_list, ignored_list)
+    return {
+        "new": flats_list,
+        "ignored": ignored_list,
+        "duplicate": duplicates_by_id + duplicates_by_urls
+    }
 
 
 def second_pass(flats_list, config):
@@ -131,7 +133,7 @@
 
     :param flats_list: A list of flats dict to filter.
     :param config: A config dict.
-    :return: A tuple of processed flats and ignored flats.
+    :return: A dict mapping flat status to the list of matching flat objects.
     """
     LOGGER.info("Running second filtering pass.")
     # Assumed to run after first pass, so there should be no obvious duplicates
@@ -151,4 +153,7 @@
     # Remove returned housing posts that do not match criteria
     flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
 
-    return (flats_list, ignored_list)
+    return {
+        "new": flats_list,
+        "ignored": ignored_list
+    }
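On the new `key="urls", should_intersect=True` call above: the `urls` values are lists, so two postings count as duplicates as soon as their URL lists intersect. A simplified sketch of the bucketing idea behind it, with invented ids and URLs — the real implementation is `detect()` in `flatisfy/filters/duplicates.py` below:

```python
import collections

flats_list = [
    {"id": "1@leboncoin", "urls": ["http://a.example", "http://b.example"]},
    {"id": "2@entreparticuliers", "urls": ["http://b.example"]},
    {"id": "3@pap", "urls": ["http://c.example"]},
]

# File each flat under every one of its urls, so flats sharing any url land
# in a common bucket. The real detect() then merges each bucket and
# deduplicates on id again, since a flat may appear in several buckets.
seen = collections.defaultdict(list)
for flat in flats_list:
    for url in flat.get("urls", []):
        seen[url].append(flat)

for url, matching_flats in seen.items():
    if len(matching_flats) > 1:
        print(url, [flat["id"] for flat in matching_flats])
# http://b.example ['1@leboncoin', '2@entreparticuliers']
```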
diff --git a/flatisfy/filters/duplicates.py b/flatisfy/filters/duplicates.py
index a7365c3..7dae825 100644
--- a/flatisfy/filters/duplicates.py
+++ b/flatisfy/filters/duplicates.py
@@ -23,7 +23,7 @@ BACKENDS_PRECEDENCE = [
 ]
 
 
-def detect(flats_list, key="id", merge=True):
+def detect(flats_list, key="id", merge=True, should_intersect=False):
     """
     Detect obvious duplicates within a given list of flats.
 
@@ -38,18 +38,32 @@
     done.
     :param merge: Whether the found duplicates should be merged or we should
     only keep one of them.
+    :param should_intersect: Set to ``True`` if the values in the flat dicts
+    are lists and you want to deduplicate on non-empty intersection (typically
+    if they have a common url).
 
-    :return: A deduplicated list of flat dicts.
+    :return: A tuple of the deduplicated list of flat dicts and the list of all
+    the flat objects that should be removed and considered as duplicates (they
+    were already merged).
     """
     # ``seen`` is a dict mapping aggregating the flats by the deduplication
     # keys. We basically make buckets of flats for every key value. Flats in
     # the same bucket should be merged together afterwards.
     seen = collections.defaultdict(list)
     for flat in flats_list:
-        seen[flat.get(key, None)].append(flat)
+        if should_intersect:
+            # We add each value separately. We will add some flats multiple
+            # times, but we deduplicate again on id below to compensate.
+            for value in flat.get(key, []):
+                seen[value].append(flat)
+        else:
+            seen[flat.get(key, None)].append(flat)
 
     # Generate the unique flats list based on these buckets
     unique_flats_list = []
+    # Keep track of all the flats that were removed by deduplication
+    duplicate_flats = []
+
     for flat_key, matching_flats in seen.items():
         if flat_key is None:
             # If the key is None, it means Weboob could not load the data. In
@@ -67,7 +81,8 @@
         )
 
         if len(matching_flats) > 1:
-            LOGGER.info("Found duplicates: %s.",
+            LOGGER.info("Found duplicates using key \"%s\": %s.",
+                        key,
                         [flat["id"] for flat in matching_flats])
 
         # Otherwise, check the policy
         if merge:
@@ -76,6 +91,19 @@
                 tools.merge_dicts(*matching_flats)
             )
         else:
-            # Otherwise, just keep any of them
-            unique_flats_list.append(matching_flats[0])
-    return unique_flats_list
+            # Otherwise, just keep the most important one
+            unique_flats_list.append(matching_flats[-1])
+
+        # The ID of the added merged flat will be the one of the last item
+        # in ``matching_flats``. Then, any flat object that was before in
+        # the ``matching_flats`` list is to be considered as a duplicate
+        # and should have a ``duplicate`` status.
+        duplicate_flats.extend(matching_flats[:-1])
+
+    if should_intersect:
+        # We added some flats twice with the above method, let's deduplicate on
+        # id.
+        unique_flats_list, _ = detect(unique_flats_list, key="id", merge=True,
+                                      should_intersect=False)
+
+    return unique_flats_list, duplicate_flats
diff --git a/flatisfy/models/flat.py b/flatisfy/models/flat.py
index 12d2c74..cd74840 100644
--- a/flatisfy/models/flat.py
+++ b/flatisfy/models/flat.py
@@ -33,6 +33,7 @@ class FlatStatus(enum.Enum):
     """
     An enum of the possible status for a flat entry.
     """
     user_deleted = -100
+    duplicate = -20
     ignored = -10
     new = 0
     followed = 10
@@ -83,16 +84,17 @@
         """
         # Handle flatisfy metadata
         flat_dict = flat_dict.copy()
-        flat_dict["flatisfy_stations"] = (
-            flat_dict["flatisfy"].get("matched_stations", [])
-        )
-        flat_dict["flatisfy_postal_code"] = (
-            flat_dict["flatisfy"].get("postal_code", None)
-        )
-        flat_dict["flatisfy_time_to"] = (
-            flat_dict["flatisfy"].get("time_to", {})
-        )
-        del flat_dict["flatisfy"]
+        if "flatisfy" in flat_dict:
+            flat_dict["flatisfy_stations"] = (
+                flat_dict["flatisfy"].get("matched_stations", [])
+            )
+            flat_dict["flatisfy_postal_code"] = (
+                flat_dict["flatisfy"].get("postal_code", None)
+            )
+            flat_dict["flatisfy_time_to"] = (
+                flat_dict["flatisfy"].get("time_to", {})
+            )
+            del flat_dict["flatisfy"]
 
         # Handle utilities field
         if not isinstance(flat_dict["utilities"], FlatUtilities):
diff --git a/flatisfy/web/js_src/views/details.vue b/flatisfy/web/js_src/views/details.vue
index 63dd8d0..268f627 100644
--- a/flatisfy/web/js_src/views/details.vue
+++ b/flatisfy/web/js_src/views/details.vue
@@ -55,7 +55,7 @@
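Finally, on the `from_dict` change in `flatisfy/models/flat.py`: the `flatisfy` metadata entry becomes optional, presumably because a flat dict can now reach `from_dict` without having gone through `metadata.init` (which moved into `filter_flats`). A rough, model-free sketch of the guarded flattening, with plain dicts standing in for the SQLAlchemy model:

```python
def flatten_flatisfy_metadata(flat_dict):
    """Rough sketch of the guard added to Flat.from_dict() above."""
    flat_dict = flat_dict.copy()
    if "flatisfy" in flat_dict:
        metadata = flat_dict.pop("flatisfy")
        flat_dict["flatisfy_stations"] = metadata.get("matched_stations", [])
        flat_dict["flatisfy_postal_code"] = metadata.get("postal_code", None)
        flat_dict["flatisfy_time_to"] = metadata.get("time_to", {})
    return flat_dict


# Both shapes are now accepted instead of raising a KeyError:
print(flatten_flatisfy_metadata({"id": "flat-1"}))
print(flatten_flatisfy_metadata(
    {"id": "flat-2", "flatisfy": {"postal_code": "75014"}}
))
```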