From c659dc6b76d510e52a68f57cd1e7234a7c223ac0 Mon Sep 17 00:00:00 2001 From: Gautier P Date: Tue, 19 Jan 2021 09:42:12 +0100 Subject: [PATCH] Add --new-only import option --- flatisfy/__main__.py | 14 ++++++++++--- flatisfy/cmds.py | 38 ++++++++++++++++++++++++------------ flatisfy/filters/metadata.py | 7 +++---- 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/flatisfy/__main__.py b/flatisfy/__main__.py index f80c4e9..f012ef4 100644 --- a/flatisfy/__main__.py +++ b/flatisfy/__main__.py @@ -101,8 +101,16 @@ def parse_args(argv=None): ) # Import subcommand parser - subparsers.add_parser("import", parents=[parent_parser], - help="Import housing posts in database.") + import_filter = subparsers.add_parser( + "import", parents=[parent_parser], + help="Import housing posts in database.") + import_filter.add_argument( + "--new-only", + action="store_true", + help=( + "Download new housing posts only but do not refresh existing ones" + ) + ) # Purge subcommand parser subparsers.add_parser("purge", parents=[parent_parser], @@ -211,7 +219,7 @@ def main(): return # Import command elif args.cmd == "import": - cmds.import_and_filter(config, load_from_db=False) + cmds.import_and_filter(config, load_from_db=False, new_only=args.new_only) return # Serve command elif args.cmd == "serve": diff --git a/flatisfy/cmds.py b/flatisfy/cmds.py index e67445b..20316c1 100644 --- a/flatisfy/cmds.py +++ b/flatisfy/cmds.py @@ -23,16 +23,17 @@ import time LOGGER = logging.getLogger(__name__) -def filter_flats_list(config, constraint_name, flats_list, fetch_details=True): +def filter_flats_list(config, constraint_name, flats_list, fetch_details=True, past_flats=None): """ Filter the available flats list. Then, filter it according to criteria. :param config: A config dict. :param constraint_name: The constraint name that the ``flats_list`` should satisfy. + :param flats_list: The initial list of flat objects to filter. :param fetch_details: Whether additional details should be fetched between the two passes. - :param flats_list: The initial list of flat objects to filter. + :param past_flats: The list of already fetched flats :return: A dict mapping flat status and list of flat objects. """ # Add the flatisfy metadata entry and prepare the flat objects @@ -66,13 +67,21 @@ def filter_flats_list(config, constraint_name, flats_list, fetch_details=True): # Load additional infos if fetch_details: + past_ids = {x["id"]: x for x in past_flats} if past_flats else {} for i, flat in enumerate(first_pass_result["new"]): - details = fetch.fetch_details(config, flat["id"]) - first_pass_result["new"][i] = tools.merge_dicts(flat, details) - if flat["id"].endswith("@leboncoin"): - # sleep 0.5s to avoid rate-kick - time.sleep(0.5) + details = None + use_cache = past_ids.get(flat["id"]) + if use_cache: + LOGGER.info("Skipping details download for %s.", flat["id"]) + details = use_cache + else: + details = fetch.fetch_details(config, flat["id"]) + if flat["id"].endswith("@leboncoin"): + # sleep 0.5s to avoid rate-kick + time.sleep(0.5) + + first_pass_result["new"][i] = tools.merge_dicts(flat, details) # Do a second pass to consolidate all the infos we found and make use of # additional infos @@ -107,7 +116,7 @@ def filter_flats_list(config, constraint_name, flats_list, fetch_details=True): } -def filter_fetched_flats(config, fetched_flats, fetch_details=True): +def filter_fetched_flats(config, fetched_flats, fetch_details=True, past_flats={}): """ Filter the available flats list. Then, filter it according to criteria. @@ -124,12 +133,13 @@ def filter_fetched_flats(config, fetched_flats, fetch_details=True): config, constraint_name, flats_list, - fetch_details + fetch_details, + past_flats.get(constraint_name, None) ) return fetched_flats -def import_and_filter(config, load_from_db=False): +def import_and_filter(config, load_from_db=False, new_only=False): """ Fetch the available flats list. Then, filter it according to criteria. Finally, store it in the database. @@ -140,13 +150,15 @@ def import_and_filter(config, load_from_db=False): :return: ``None``. """ # Fetch and filter flats list + past_flats = fetch.load_flats_from_db(config) if load_from_db: - fetched_flats = fetch.load_flats_from_db(config) + fetched_flats = past_flats else: fetched_flats = fetch.fetch_flats(config) # Do not fetch additional details if we loaded data from the db. flats_by_status = filter_fetched_flats(config, fetched_flats=fetched_flats, - fetch_details=(not load_from_db)) + fetch_details=(not load_from_db), + past_flats=past_flats if new_only else {}) # Create database connection get_session = database.init_db(config["database"], config["search_index"]) @@ -208,6 +220,8 @@ def import_and_filter(config, load_from_db=False): if config["send_email"]: email.send_notification(config, new_flats) + LOGGER.info(f"Found {len(new_flats)} new flats.") + # Touch a file to indicate last update timestamp ts_file = os.path.join( config["data_directory"], diff --git a/flatisfy/filters/metadata.py b/flatisfy/filters/metadata.py index 569ccd0..68941a6 100644 --- a/flatisfy/filters/metadata.py +++ b/flatisfy/filters/metadata.py @@ -171,7 +171,7 @@ def guess_location_position(location, cities, constraint): ] if len(postal_code_objects_for_city): position = {"lat": postal_code_objects_for_city[0].lat, "lng": postal_code_objects_for_city[0].lng} - LOGGER.info( + LOGGER.debug( ("Found position %s using city %s."), position, matched_city_name ) @@ -228,7 +228,7 @@ def guess_postal_code(flats_list, constraint, config, distance_threshold=20000): # Check the postal code is within the db assert postal_code in [x.postal_code for x in opendata["postal_codes"]] - LOGGER.info( + LOGGER.debug( "Found postal code in location field for flat %s: %s.", flat["id"], postal_code ) @@ -266,13 +266,12 @@ def guess_postal_code(flats_list, constraint, config, distance_threshold=20000): ("Postal code %s found for flat %s @ %s is off-constraints " "(distance is %dm > %dm). Let's consider it is an " "artifact match and keep the post without this postal " - "code. (%s)"), + "code."), postal_code, flat["id"], location, int(distance), int(distance_threshold), - flat ) postal_code = None position = None