Add --new-only import option

This commit is contained in:
Gautier P 2021-01-19 09:42:12 +01:00
parent 5a3a82ca8d
commit c659dc6b76
3 changed files with 40 additions and 19 deletions

View File

@ -101,8 +101,16 @@ def parse_args(argv=None):
) )
# Import subcommand parser # Import subcommand parser
subparsers.add_parser("import", parents=[parent_parser], import_filter = subparsers.add_parser(
help="Import housing posts in database.") "import", parents=[parent_parser],
help="Import housing posts in database.")
import_filter.add_argument(
"--new-only",
action="store_true",
help=(
"Download new housing posts only but do not refresh existing ones"
)
)
# Purge subcommand parser # Purge subcommand parser
subparsers.add_parser("purge", parents=[parent_parser], subparsers.add_parser("purge", parents=[parent_parser],
@ -211,7 +219,7 @@ def main():
return return
# Import command # Import command
elif args.cmd == "import": elif args.cmd == "import":
cmds.import_and_filter(config, load_from_db=False) cmds.import_and_filter(config, load_from_db=False, new_only=args.new_only)
return return
# Serve command # Serve command
elif args.cmd == "serve": elif args.cmd == "serve":

View File

@ -23,16 +23,17 @@ import time
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
def filter_flats_list(config, constraint_name, flats_list, fetch_details=True): def filter_flats_list(config, constraint_name, flats_list, fetch_details=True, past_flats=None):
""" """
Filter the available flats list. Then, filter it according to criteria. Filter the available flats list. Then, filter it according to criteria.
:param config: A config dict. :param config: A config dict.
:param constraint_name: The constraint name that the ``flats_list`` should :param constraint_name: The constraint name that the ``flats_list`` should
satisfy. satisfy.
:param flats_list: The initial list of flat objects to filter.
:param fetch_details: Whether additional details should be fetched between :param fetch_details: Whether additional details should be fetched between
the two passes. the two passes.
:param flats_list: The initial list of flat objects to filter. :param past_flats: The list of already fetched flats
:return: A dict mapping flat status and list of flat objects. :return: A dict mapping flat status and list of flat objects.
""" """
# Add the flatisfy metadata entry and prepare the flat objects # Add the flatisfy metadata entry and prepare the flat objects
@ -66,13 +67,21 @@ def filter_flats_list(config, constraint_name, flats_list, fetch_details=True):
# Load additional infos # Load additional infos
if fetch_details: if fetch_details:
past_ids = {x["id"]: x for x in past_flats} if past_flats else {}
for i, flat in enumerate(first_pass_result["new"]): for i, flat in enumerate(first_pass_result["new"]):
details = fetch.fetch_details(config, flat["id"]) details = None
first_pass_result["new"][i] = tools.merge_dicts(flat, details)
if flat["id"].endswith("@leboncoin"):
# sleep 0.5s to avoid rate-kick
time.sleep(0.5)
use_cache = past_ids.get(flat["id"])
if use_cache:
LOGGER.info("Skipping details download for %s.", flat["id"])
details = use_cache
else:
details = fetch.fetch_details(config, flat["id"])
if flat["id"].endswith("@leboncoin"):
# sleep 0.5s to avoid rate-kick
time.sleep(0.5)
first_pass_result["new"][i] = tools.merge_dicts(flat, details)
# Do a second pass to consolidate all the infos we found and make use of # Do a second pass to consolidate all the infos we found and make use of
# additional infos # additional infos
@ -107,7 +116,7 @@ def filter_flats_list(config, constraint_name, flats_list, fetch_details=True):
} }
def filter_fetched_flats(config, fetched_flats, fetch_details=True): def filter_fetched_flats(config, fetched_flats, fetch_details=True, past_flats={}):
""" """
Filter the available flats list. Then, filter it according to criteria. Filter the available flats list. Then, filter it according to criteria.
@ -124,12 +133,13 @@ def filter_fetched_flats(config, fetched_flats, fetch_details=True):
config, config,
constraint_name, constraint_name,
flats_list, flats_list,
fetch_details fetch_details,
past_flats.get(constraint_name, None)
) )
return fetched_flats return fetched_flats
def import_and_filter(config, load_from_db=False): def import_and_filter(config, load_from_db=False, new_only=False):
""" """
Fetch the available flats list. Then, filter it according to criteria. Fetch the available flats list. Then, filter it according to criteria.
Finally, store it in the database. Finally, store it in the database.
@ -140,13 +150,15 @@ def import_and_filter(config, load_from_db=False):
:return: ``None``. :return: ``None``.
""" """
# Fetch and filter flats list # Fetch and filter flats list
past_flats = fetch.load_flats_from_db(config)
if load_from_db: if load_from_db:
fetched_flats = fetch.load_flats_from_db(config) fetched_flats = past_flats
else: else:
fetched_flats = fetch.fetch_flats(config) fetched_flats = fetch.fetch_flats(config)
# Do not fetch additional details if we loaded data from the db. # Do not fetch additional details if we loaded data from the db.
flats_by_status = filter_fetched_flats(config, fetched_flats=fetched_flats, flats_by_status = filter_fetched_flats(config, fetched_flats=fetched_flats,
fetch_details=(not load_from_db)) fetch_details=(not load_from_db),
past_flats=past_flats if new_only else {})
# Create database connection # Create database connection
get_session = database.init_db(config["database"], config["search_index"]) get_session = database.init_db(config["database"], config["search_index"])
@ -208,6 +220,8 @@ def import_and_filter(config, load_from_db=False):
if config["send_email"]: if config["send_email"]:
email.send_notification(config, new_flats) email.send_notification(config, new_flats)
LOGGER.info(f"Found {len(new_flats)} new flats.")
# Touch a file to indicate last update timestamp # Touch a file to indicate last update timestamp
ts_file = os.path.join( ts_file = os.path.join(
config["data_directory"], config["data_directory"],

View File

@ -171,7 +171,7 @@ def guess_location_position(location, cities, constraint):
] ]
if len(postal_code_objects_for_city): if len(postal_code_objects_for_city):
position = {"lat": postal_code_objects_for_city[0].lat, "lng": postal_code_objects_for_city[0].lng} position = {"lat": postal_code_objects_for_city[0].lat, "lng": postal_code_objects_for_city[0].lng}
LOGGER.info( LOGGER.debug(
("Found position %s using city %s."), ("Found position %s using city %s."),
position, matched_city_name position, matched_city_name
) )
@ -228,7 +228,7 @@ def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
# Check the postal code is within the db # Check the postal code is within the db
assert postal_code in [x.postal_code for x in opendata["postal_codes"]] assert postal_code in [x.postal_code for x in opendata["postal_codes"]]
LOGGER.info( LOGGER.debug(
"Found postal code in location field for flat %s: %s.", "Found postal code in location field for flat %s: %s.",
flat["id"], postal_code flat["id"], postal_code
) )
@ -266,13 +266,12 @@ def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
("Postal code %s found for flat %s @ %s is off-constraints " ("Postal code %s found for flat %s @ %s is off-constraints "
"(distance is %dm > %dm). Let's consider it is an " "(distance is %dm > %dm). Let's consider it is an "
"artifact match and keep the post without this postal " "artifact match and keep the post without this postal "
"code. (%s)"), "code."),
postal_code, postal_code,
flat["id"], flat["id"],
location, location,
int(distance), int(distance),
int(distance_threshold), int(distance_threshold),
flat
) )
postal_code = None postal_code = None
position = None position = None