From 1d98c631e081f455352e82963091d8d28b501bd7 Mon Sep 17 00:00:00 2001 From: "Phyks (Lucas Verney)" Date: Thu, 27 Apr 2017 16:37:39 +0200 Subject: [PATCH] Refilter command and backends in config * Add a refilter command * Add a backend option in config to only enable some backends. --- doc/0.getting_started.md | 6 ++- flatisfy/__main__.py | 45 ++++++++++++++------- flatisfy/cmds.py | 57 +++++++++------------------ flatisfy/config.py | 5 ++- flatisfy/data.py | 5 ++- flatisfy/fetch.py | 33 ++++++++++++++-- flatisfy/filters/metadata.py | 19 ++++++--- flatisfy/web/dbplugin.py | 19 +++++---- flatisfy/web/js_src/views/details.vue | 2 +- 9 files changed, 112 insertions(+), 79 deletions(-) diff --git a/doc/0.getting_started.md b/doc/0.getting_started.md index d299121..08be9ae 100644 --- a/doc/0.getting_started.md +++ b/doc/0.getting_started.md @@ -32,9 +32,11 @@ The available commands are: * `init-config` to generate an empty configuration file, either on the `stdin` or in the specified file. +* `build-data` to rebuild OpenData datasets. * `fetch` to load and filter housings posts and output a JSON dump. -* `filter` to filter a previously fetched list of housings posts, provided as - a JSON dump. +* `filter` to re-filter the flats already in the database (and update their status) + according to changes in config. It can also filter a previously fetched list + of housing posts, provided as a JSON dump (with a `--input` argument). * `import` to import and filter housing posts into the database. * `serve` to serve the built-in webapp with the development server. Do not use in production. 
diff --git a/flatisfy/__main__.py b/flatisfy/__main__.py index 2229173..492c39d 100644 --- a/flatisfy/__main__.py +++ b/flatisfy/__main__.py @@ -11,6 +11,7 @@ import sys import flatisfy.config from flatisfy import cmds from flatisfy import data +from flatisfy import fetch from flatisfy import tools @@ -76,14 +77,18 @@ def parse_args(argv=None): help="Fetch housings posts") # Filter subcommand parser - parser_filter = subparsers.add_parser("filter", parents=[parent_parser], - help=( - "Filter housings posts. No " - "fetching of additional infos " - "is done.")) + parser_filter = subparsers.add_parser( + "filter", parents=[parent_parser], + help="Filter housings posts according to constraints in config." + ) parser_filter.add_argument( - "input", - help="JSON dump of the housings post to filter." + "--input", + help=( + "Optional JSON dump of the housings post to filter. If provided, " + "no additional fetching of infos is done, and the script outputs " + "a filtered JSON dump on stdout. If not provided, update status " + "of the flats in the database." 
+ ) ) # Import subcommand parser @@ -149,7 +154,9 @@ def main(): # Fetch command if args.cmd == "fetch": # Fetch and filter flats list - flats_list, _ = cmds.fetch_and_filter(config) + flats_list = fetch.fetch_flats_list(config) + flats_list, _ = cmds.filter_flats(config, flats_list=flats_list, + fetch_details=True) # Sort by cost flats_list = tools.sort_list_of_dicts_by(flats_list, "cost") @@ -159,18 +166,26 @@ def main(): # Filter command elif args.cmd == "filter": # Load and filter flats list - flats_list = cmds.load_and_filter(args.input, config) - # Sort by cost - flats_list = tools.sort_list_of_dicts_by(flats_list, "cost") + if args.input: + flats_list = fetch.load_flats_list_from_file(args.input) - print( - tools.pretty_json(flats_list) - ) + flats_list, _ = cmds.filter_flats(config, flats_list=flats_list, + fetch_details=False) + + # Sort by cost + flats_list = tools.sort_list_of_dicts_by(flats_list, "cost") + + # Output to stdout + print( + tools.pretty_json(flats_list) + ) + else: + cmds.import_and_filter(config, load_from_db=True) # Import command elif args.cmd == "import": # TODO: Do not fetch details for already imported flats / use the last # timestamp - cmds.import_and_filter(config) + cmds.import_and_filter(config, load_from_db=False) # Purge command elif args.cmd == "purge": cmds.purge_db(config) diff --git a/flatisfy/cmds.py b/flatisfy/cmds.py index f91678f..ca8271c 100644 --- a/flatisfy/cmds.py +++ b/flatisfy/cmds.py @@ -17,18 +17,17 @@ from flatisfy.web import app as web_app LOGGER = logging.getLogger(__name__) -def fetch_and_filter(config): +def filter_flats(config, flats_list=None, fetch_details=True): """ - Fetch the available flats list. Then, filter it according to criteria. + Filter the available flats list. Then, filter it according to criteria. :param config: A config dict. + :param fetch_details: Whether additional details should be fetched between + the two passes. + :param flats_list: The initial list of flat objects to filter. 
:return: A tuple of the list of all matching flats and the list of ignored flats. """ - # TODO: Reduce load on housings listing websites - # Fetch flats list with flatboobs - flats_list = fetch.fetch_flats_list(config) - # Do a first pass with the available infos to try to remove as much # unwanted postings as possible if config["passes"] > 0: @@ -39,9 +38,10 @@ def fetch_and_filter(config): # additional infos if config["passes"] > 1: # Load additional infos - for i, flat in enumerate(flats_list): - details = fetch.fetch_details(config, flat["id"]) - flats_list[i] = tools.merge_dicts(flat, details) + if fetch_details: + for i, flat in enumerate(flats_list): + details = fetch.fetch_details(config, flat["id"]) + flats_list[i] = tools.merge_dicts(flat, details) flats_list, extra_ignored_flats = flatisfy.filters.second_pass( flats_list, config @@ -51,44 +51,23 @@ def fetch_and_filter(config): return flats_list, ignored_flats -def load_and_filter(housing_file, config): - """ - Load the dumped flats list. Then, filter it according to criteria. - - :param housing_file: The JSON file to load flats from. - :param config: A config dict. - :return: A tuple of the list of all matching flats and the list of ignored - flats. - """ - # Load flats list - flats_list = fetch.load_flats_list(housing_file) - - # Do a first pass with the available infos to try to remove as much - # unwanted postings as possible - if config["passes"] > 0: - flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list, - config) - - # Do a second pass to consolidate all the infos we found - if config["passes"] > 1: - flats_list, extra_ignored_flats = flatisfy.filters.second_pass( - flats_list, config - ) - ignored_flats.extend(extra_ignored_flats) - - return flats_list, ignored_flats - - -def import_and_filter(config): +def import_and_filter(config, load_from_db=False): """ Fetch the available flats list. Then, filter it according to criteria. Finally, store it in the database. 
:param config: A config dict. + :param load_from_db: Whether to load flats from database or fetch them + using Weboob. :return: ``None``. """ # Fetch and filter flats list - flats_list, ignored_list = fetch_and_filter(config) + if load_from_db: + flats_list = fetch.load_flats_list_from_db(config) + else: + flats_list = fetch.fetch_flats_list(config) + flats_list, ignored_list = filter_flats(config, flats_list=flats_list, + fetch_details=True) # Create database connection get_session = database.init_db(config["database"]) diff --git a/flatisfy/config.py b/flatisfy/config.py index b9693ca..0cc4514 100644 --- a/flatisfy/config.py +++ b/flatisfy/config.py @@ -54,7 +54,9 @@ DEFAULT_CONFIG = { # Web app host to listen on "host": "127.0.0.1", # Web server to use to serve the webapp (see Bottle deployment doc) - "webserver": None + "webserver": None, + # List of Weboob backends to use (default to any backend available) + "backends": None } LOGGER = logging.getLogger(__name__) @@ -135,6 +137,7 @@ def validate_config(config): assert isinstance(config["port"], int) assert isinstance(config["host"], str) assert config["webserver"] is None or isinstance(config["webserver"], str) # noqa: E501 + assert config["backends"] is None or isinstance(config["backends"], list) # noqa: E501 return True except (AssertionError, KeyError): diff --git a/flatisfy/data.py b/flatisfy/data.py index 7f51efb..8dd23b7 100644 --- a/flatisfy/data.py +++ b/flatisfy/data.py @@ -37,7 +37,10 @@ def _preprocess_ratp(output_dir): ratp_data = collections.defaultdict(list) for item in ratp_data_raw: stop_name = item["fields"]["stop_name"].lower() - ratp_data[stop_name].append(item["fields"]["coord"]) + ratp_data[stop_name].append({ + "gps": item["fields"]["coord"], + "name": item["fields"]["stop_name"] + }) # Output it with open(os.path.join(output_dir, "ratp.json"), "w") as fh: diff --git a/flatisfy/fetch.py b/flatisfy/fetch.py index 68e949f..449c825 100644 --- a/flatisfy/fetch.py +++ b/flatisfy/fetch.py @@ 
-8,7 +8,9 @@ import itertools import json import logging +from flatisfy import database from flatisfy import tools +from flatisfy.models import flat as flat_model LOGGER = logging.getLogger(__name__) @@ -59,6 +61,13 @@ class WeboobProxy(object): :param config: A config dict. """ + # Default backends + if not config["backends"]: + backends = ["seloger", "pap", "leboncoin", "logicimmo", + "explorimmo", "entreparticuliers"] + else: + backends = config["backends"] + # Create base WebNip object self.webnip = WebNip(modules_path=config["modules_path"]) @@ -69,8 +78,7 @@ class WeboobProxy(object): module, params={} ) - for module in ["seloger", "pap", "leboncoin", "logicimmo", - "explorimmo", "entreparticuliers"] + for module in backends ] def __enter__(self): @@ -210,13 +218,13 @@ def fetch_details(config, flat_id): weboob_output = weboob_proxy.info(flat_id) flat_details = json.loads(weboob_output) - flats_details = WeboobProxy.restore_decimal_fields(flat_details) + flat_details = WeboobProxy.restore_decimal_fields(flat_details) LOGGER.info("Fetched details for flat %s.", flat_id) return flat_details -def load_flats_list(json_file): +def load_flats_list_from_file(json_file): """ Load a dumped flats list from JSON file. @@ -232,3 +240,20 @@ def load_flats_list(json_file): except (IOError, ValueError): LOGGER.error("File %s is not a valid dump file.", json_file) return flats_list + + +def load_flats_list_from_db(config): + """ + Load flats from database. + + :param config: A config dict. + :return: A list of all the flats in the database. 
+ """ + flats_list = [] + get_session = database.init_db(config["database"]) + + with get_session() as session: + # TODO: Better serialization + flats_list = [flat.json_api_repr() + for flat in session.query(flat_model.Flat).all()] + return flats_list diff --git a/flatisfy/filters/metadata.py b/flatisfy/filters/metadata.py index f0cab7e..2bbaca2 100644 --- a/flatisfy/filters/metadata.py +++ b/flatisfy/filters/metadata.py @@ -31,9 +31,14 @@ def init(flats_list): if "flatisfy" not in flat: flat["flatisfy"] = {} # Move url key to urls - flat["urls"] = [flat["url"]] + if "urls" not in flat: + if "url" in flat: + flat["urls"] = [flat["url"]] + else: + flat["urls"] = [] # Create merged_ids key - flat["merged_ids"] = [flat["id"]] + if "merged_ids" not in flat: + flat["merged_ids"] = [flat["id"]] return flats_list @@ -261,16 +266,18 @@ def guess_stations(flats_list, config, distance_threshold=1500): # of coordinates, for efficiency. Note that multiple stations # with the same name exist in a city, hence the list of # coordinates. 
- for station_gps in opendata["stations"][station[0]]: - distance = tools.distance(station_gps, postal_code_gps) + for station_data in opendata["stations"][station[0]]: + distance = tools.distance(station_data["gps"], + postal_code_gps) if distance < distance_threshold: # If at least one of the coordinates for a given # station is close enough, that's ok and we can add # the station good_matched_stations.append({ - "name": station[0], + "key": station[0], + "name": station_data["name"], "confidence": station[1], - "gps": station_gps + "gps": station_data["gps"] }) break LOGGER.debug( diff --git a/flatisfy/web/dbplugin.py b/flatisfy/web/dbplugin.py index 1ebc09c..fa7bb68 100644 --- a/flatisfy/web/dbplugin.py +++ b/flatisfy/web/dbplugin.py @@ -64,16 +64,15 @@ class DatabasePlugin(object): if self.KEYWORD not in callback_args: # If no need for a db session, call the route callback return callback - else: - def wrapper(*args, **kwargs): - """ - Wrap the callback in a call to get_session. - """ - with self.get_session() as session: - # Get a db session and pass it to the callback - kwargs[self.KEYWORD] = session - return callback(*args, **kwargs) - return wrapper + def wrapper(*args, **kwargs): + """ + Wrap the callback in a call to get_session. 
+ """ + with self.get_session() as session: + # Get a db session and pass it to the callback + kwargs[self.KEYWORD] = session + return callback(*args, **kwargs) + return wrapper Plugin = DatabasePlugin diff --git a/flatisfy/web/js_src/views/details.vue b/flatisfy/web/js_src/views/details.vue index 4cada5d..63dd8d0 100644 --- a/flatisfy/web/js_src/views/details.vue +++ b/flatisfy/web/js_src/views/details.vue @@ -197,7 +197,7 @@ export default { }, displayedStations () { if (this.flat.flatisfy_stations.length > 0) { - const stationsNames = this.flat.flatisfy_stations.map(station => capitalize(station.name)) + const stationsNames = this.flat.flatisfy_stations.map(station => station.name) return stationsNames.join(', ') } else { return null