Add --new-only import option
This commit is contained in:
parent
5a3a82ca8d
commit
c659dc6b76
@ -101,8 +101,16 @@ def parse_args(argv=None):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Import subcommand parser
|
# Import subcommand parser
|
||||||
subparsers.add_parser("import", parents=[parent_parser],
|
import_filter = subparsers.add_parser(
|
||||||
|
"import", parents=[parent_parser],
|
||||||
help="Import housing posts in database.")
|
help="Import housing posts in database.")
|
||||||
|
import_filter.add_argument(
|
||||||
|
"--new-only",
|
||||||
|
action="store_true",
|
||||||
|
help=(
|
||||||
|
"Download new housing posts only but do not refresh existing ones"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
# Purge subcommand parser
|
# Purge subcommand parser
|
||||||
subparsers.add_parser("purge", parents=[parent_parser],
|
subparsers.add_parser("purge", parents=[parent_parser],
|
||||||
@ -211,7 +219,7 @@ def main():
|
|||||||
return
|
return
|
||||||
# Import command
|
# Import command
|
||||||
elif args.cmd == "import":
|
elif args.cmd == "import":
|
||||||
cmds.import_and_filter(config, load_from_db=False)
|
cmds.import_and_filter(config, load_from_db=False, new_only=args.new_only)
|
||||||
return
|
return
|
||||||
# Serve command
|
# Serve command
|
||||||
elif args.cmd == "serve":
|
elif args.cmd == "serve":
|
||||||
|
@ -23,16 +23,17 @@ import time
|
|||||||
LOGGER = logging.getLogger(__name__)
|
LOGGER = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def filter_flats_list(config, constraint_name, flats_list, fetch_details=True):
|
def filter_flats_list(config, constraint_name, flats_list, fetch_details=True, past_flats=None):
|
||||||
"""
|
"""
|
||||||
Filter the available flats list. Then, filter it according to criteria.
|
Filter the available flats list. Then, filter it according to criteria.
|
||||||
|
|
||||||
:param config: A config dict.
|
:param config: A config dict.
|
||||||
:param constraint_name: The constraint name that the ``flats_list`` should
|
:param constraint_name: The constraint name that the ``flats_list`` should
|
||||||
satisfy.
|
satisfy.
|
||||||
|
:param flats_list: The initial list of flat objects to filter.
|
||||||
:param fetch_details: Whether additional details should be fetched between
|
:param fetch_details: Whether additional details should be fetched between
|
||||||
the two passes.
|
the two passes.
|
||||||
:param flats_list: The initial list of flat objects to filter.
|
:param past_flats: The list of already fetched flats; a flat whose id is found in it reuses those stored details instead of downloading them again.
|
||||||
:return: A dict mapping flat status and list of flat objects.
|
:return: A dict mapping flat status and list of flat objects.
|
||||||
"""
|
"""
|
||||||
# Add the flatisfy metadata entry and prepare the flat objects
|
# Add the flatisfy metadata entry and prepare the flat objects
|
||||||
@ -66,13 +67,21 @@ def filter_flats_list(config, constraint_name, flats_list, fetch_details=True):
|
|||||||
|
|
||||||
# Load additional infos
|
# Load additional infos
|
||||||
if fetch_details:
|
if fetch_details:
|
||||||
|
past_ids = {x["id"]: x for x in past_flats} if past_flats else {}
|
||||||
for i, flat in enumerate(first_pass_result["new"]):
|
for i, flat in enumerate(first_pass_result["new"]):
|
||||||
|
details = None
|
||||||
|
|
||||||
|
use_cache = past_ids.get(flat["id"])
|
||||||
|
if use_cache:
|
||||||
|
LOGGER.info("Skipping details download for %s.", flat["id"])
|
||||||
|
details = use_cache
|
||||||
|
else:
|
||||||
details = fetch.fetch_details(config, flat["id"])
|
details = fetch.fetch_details(config, flat["id"])
|
||||||
first_pass_result["new"][i] = tools.merge_dicts(flat, details)
|
|
||||||
if flat["id"].endswith("@leboncoin"):
|
if flat["id"].endswith("@leboncoin"):
|
||||||
# sleep 0.5s to avoid rate-kick
|
# sleep 0.5s to avoid rate-kick
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
first_pass_result["new"][i] = tools.merge_dicts(flat, details)
|
||||||
|
|
||||||
# Do a second pass to consolidate all the infos we found and make use of
|
# Do a second pass to consolidate all the infos we found and make use of
|
||||||
# additional infos
|
# additional infos
|
||||||
@ -107,7 +116,7 @@ def filter_flats_list(config, constraint_name, flats_list, fetch_details=True):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def filter_fetched_flats(config, fetched_flats, fetch_details=True):
|
def filter_fetched_flats(config, fetched_flats, fetch_details=True, past_flats={}):
|
||||||
"""
|
"""
|
||||||
Filter the available flats list. Then, filter it according to criteria.
|
Filter the available flats list. Then, filter it according to criteria.
|
||||||
|
|
||||||
@ -124,12 +133,13 @@ def filter_fetched_flats(config, fetched_flats, fetch_details=True):
|
|||||||
config,
|
config,
|
||||||
constraint_name,
|
constraint_name,
|
||||||
flats_list,
|
flats_list,
|
||||||
fetch_details
|
fetch_details,
|
||||||
|
past_flats.get(constraint_name, None)
|
||||||
)
|
)
|
||||||
return fetched_flats
|
return fetched_flats
|
||||||
|
|
||||||
|
|
||||||
def import_and_filter(config, load_from_db=False):
|
def import_and_filter(config, load_from_db=False, new_only=False):
|
||||||
"""
|
"""
|
||||||
Fetch the available flats list. Then, filter it according to criteria.
|
Fetch the available flats list. Then, filter it according to criteria.
|
||||||
Finally, store it in the database.
|
Finally, store it in the database.
|
||||||
@ -140,13 +150,15 @@ def import_and_filter(config, load_from_db=False):
|
|||||||
:return: ``None``.
|
:return: ``None``.
|
||||||
"""
|
"""
|
||||||
# Fetch and filter flats list
|
# Fetch and filter flats list
|
||||||
|
past_flats = fetch.load_flats_from_db(config)
|
||||||
if load_from_db:
|
if load_from_db:
|
||||||
fetched_flats = fetch.load_flats_from_db(config)
|
fetched_flats = past_flats
|
||||||
else:
|
else:
|
||||||
fetched_flats = fetch.fetch_flats(config)
|
fetched_flats = fetch.fetch_flats(config)
|
||||||
# Do not fetch additional details if we loaded data from the db.
|
# Do not fetch additional details if we loaded data from the db.
|
||||||
flats_by_status = filter_fetched_flats(config, fetched_flats=fetched_flats,
|
flats_by_status = filter_fetched_flats(config, fetched_flats=fetched_flats,
|
||||||
fetch_details=(not load_from_db))
|
fetch_details=(not load_from_db),
|
||||||
|
past_flats=past_flats if new_only else {})
|
||||||
# Create database connection
|
# Create database connection
|
||||||
get_session = database.init_db(config["database"], config["search_index"])
|
get_session = database.init_db(config["database"], config["search_index"])
|
||||||
|
|
||||||
@ -208,6 +220,8 @@ def import_and_filter(config, load_from_db=False):
|
|||||||
if config["send_email"]:
|
if config["send_email"]:
|
||||||
email.send_notification(config, new_flats)
|
email.send_notification(config, new_flats)
|
||||||
|
|
||||||
|
LOGGER.info(f"Found {len(new_flats)} new flats.")
|
||||||
|
|
||||||
# Touch a file to indicate last update timestamp
|
# Touch a file to indicate last update timestamp
|
||||||
ts_file = os.path.join(
|
ts_file = os.path.join(
|
||||||
config["data_directory"],
|
config["data_directory"],
|
||||||
|
@ -171,7 +171,7 @@ def guess_location_position(location, cities, constraint):
|
|||||||
]
|
]
|
||||||
if len(postal_code_objects_for_city):
|
if len(postal_code_objects_for_city):
|
||||||
position = {"lat": postal_code_objects_for_city[0].lat, "lng": postal_code_objects_for_city[0].lng}
|
position = {"lat": postal_code_objects_for_city[0].lat, "lng": postal_code_objects_for_city[0].lng}
|
||||||
LOGGER.info(
|
LOGGER.debug(
|
||||||
("Found position %s using city %s."),
|
("Found position %s using city %s."),
|
||||||
position, matched_city_name
|
position, matched_city_name
|
||||||
)
|
)
|
||||||
@ -228,7 +228,7 @@ def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
|
|||||||
# Check the postal code is within the db
|
# Check the postal code is within the db
|
||||||
assert postal_code in [x.postal_code for x in opendata["postal_codes"]]
|
assert postal_code in [x.postal_code for x in opendata["postal_codes"]]
|
||||||
|
|
||||||
LOGGER.info(
|
LOGGER.debug(
|
||||||
"Found postal code in location field for flat %s: %s.",
|
"Found postal code in location field for flat %s: %s.",
|
||||||
flat["id"], postal_code
|
flat["id"], postal_code
|
||||||
)
|
)
|
||||||
@ -266,13 +266,12 @@ def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
|
|||||||
("Postal code %s found for flat %s @ %s is off-constraints "
|
("Postal code %s found for flat %s @ %s is off-constraints "
|
||||||
"(distance is %dm > %dm). Let's consider it is an "
|
"(distance is %dm > %dm). Let's consider it is an "
|
||||||
"artifact match and keep the post without this postal "
|
"artifact match and keep the post without this postal "
|
||||||
"code. (%s)"),
|
"code."),
|
||||||
postal_code,
|
postal_code,
|
||||||
flat["id"],
|
flat["id"],
|
||||||
location,
|
location,
|
||||||
int(distance),
|
int(distance),
|
||||||
int(distance_threshold),
|
int(distance_threshold),
|
||||||
flat
|
|
||||||
)
|
)
|
||||||
postal_code = None
|
postal_code = None
|
||||||
position = None
|
position = None
|
||||||
|
Loading…
Reference in New Issue
Block a user