249 lines
9.4 KiB
Python
249 lines
9.4 KiB
Python
# coding: utf-8
|
|
"""
|
|
This module contains all the filtering functions. It exposes ``first_pass`` and
|
|
``second_pass`` functions which are a set of filters applied during the first
|
|
pass and the second pass.
|
|
"""
|
|
from __future__ import absolute_import, print_function, unicode_literals
|
|
|
|
import logging
|
|
|
|
from flatisfy import tools
|
|
from flatisfy.filters import duplicates
|
|
from flatisfy.filters import images
|
|
from flatisfy.filters import metadata
|
|
|
|
|
|
LOGGER = logging.getLogger(__name__)
|
|
|
|
|
|
def refine_with_housing_criteria(flats_list, constraint):
    """
    Filter a list of flats according to criteria.

    Housings posts websites tend to return broader results than what was
    actually asked for. Then, we should filter out the list to match the
    user criteria, and avoid exposing unwanted flats.

    :param flats_list: A list of flats dict to filter.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :return: A tuple of flats to keep and flats to delete.
    """
    kept_flats = []
    discarded_flats = []

    for flat in flats_list:
        # A flat is kept unless at least one criterion rules it out. All
        # criteria are still evaluated so that every violation gets logged.
        keep = True

        # Check postal code
        postal_code = flat["flatisfy"].get("postal_code", None)
        if postal_code and postal_code not in constraint["postal_codes"]:
            LOGGER.info(
                "Postal code %s for flat %s is out of range (%s).",
                postal_code,
                flat["id"],
                ", ".join(constraint["postal_codes"]),
            )
            keep = False

        # Check insee code (only when the constraint actually defines one)
        insee_code = flat["flatisfy"].get("insee_code", None)
        if insee_code and "insee_codes" in constraint and insee_code not in constraint["insee_codes"]:
            LOGGER.info(
                "insee code %s for flat %s is out of range (%s).",
                insee_code,
                flat["id"],
                ", ".join(constraint["insee_codes"]),
            )
            keep = False

        # Check that the travel time to every requested place stays within
        # the configured interval.
        for place_name, time_entry in flat["flatisfy"].get("time_to", {}).items():
            duration = time_entry["time"]
            if not tools.is_within_interval(duration, *(constraint["time_to"][place_name]["time"])):
                LOGGER.info(
                    "Flat %s is too far from place %s: %ds.",
                    flat["id"],
                    place_name,
                    duration,
                )
                keep = False

        # Check each numeric field against its configured interval.
        for field in ["area", "cost", "rooms", "bedrooms"]:
            if not tools.is_within_interval(flat.get(field, None), *constraint[field]):
                LOGGER.info(
                    "%s %s for flat %s is out of range.", field.capitalize(), str(flat.get(field, None)), flat["id"]
                )
                keep = False

        (kept_flats if keep else discarded_flats).append(flat)

    return (kept_flats, discarded_flats)
|
|
|
|
|
|
def refine_with_details_criteria(flats_list, constraint):
    """
    Filter a list of flats according to the criteria which require the full
    details to be fetched. These include minimum number of photos and terms
    that should appear in description.

    .. note ::

        This has to be done in a separate function and not with the other
        criterias as photos and full description are only fetched in the second
        pass.

    :param flats_list: A list of flats dict to filter.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :return: A tuple of flats to keep and flats to delete.
    """
    # For each flat, the associated `is_ok` value indicate whether it should be
    # kept or discarded.
    is_ok = [True for _ in flats_list]

    for i, flat in enumerate(flats_list):
        # Check number of pictures
        nb_photos = len(flat.get("photos", []))
        has_enough_photos = tools.is_within_interval(nb_photos, constraint["minimum_nb_photos"], None)
        if not has_enough_photos:
            LOGGER.info(
                "Flat %s only has %d photos, it should have at least %d.",
                flat["id"],
                # Use the already computed count; ``flat["photos"]`` raised a
                # KeyError here when the "photos" key was missing entirely.
                nb_photos,
                constraint["minimum_nb_photos"],
            )
            is_ok[i] = False

        # Lowercase the description once instead of on every term check. The
        # guard keeps the original behavior of not touching ``flat["text"]``
        # at all when no term constraint is configured.
        if constraint["description_should_contain"] or constraint["description_should_not_contain"]:
            description = flat["text"].lower()

        for term in constraint["description_should_contain"]:
            # A string term must appear verbatim; a list term is satisfied if
            # any of its alternatives appears.
            if isinstance(term, str) and term.lower() not in description:
                LOGGER.info(
                    ("Description for flat %s does not contain required term '%s'."),
                    flat["id"],
                    term,
                )
                is_ok[i] = False
            elif isinstance(term, list) and all(x.lower() not in description for x in term):
                LOGGER.info(
                    ("Description for flat %s does not contain any of required terms '%s'."),
                    flat["id"],
                    term,
                )
                is_ok[i] = False
        for term in constraint["description_should_not_contain"]:
            if term.lower() in description:
                LOGGER.info(
                    ("Description for flat %s contains blacklisted term '%s'."),
                    flat["id"],
                    term,
                )
                is_ok[i] = False

    return (
        [flat for i, flat in enumerate(flats_list) if is_ok[i]],
        [flat for i, flat in enumerate(flats_list) if not is_ok[i]],
    )
|
|
|
|
|
|
@tools.timeit
def first_pass(flats_list, constraint, config):
    """
    First filtering pass.

    Flatboob only fetches data from the listing of the available housing. Then,
    we should do a first pass to filter based on the already available data and
    only request more data for the remaining housings.

    :param flats_list: A list of flats dict to filter.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :return: A dict mapping flat status and list of flat objects.
    """
    LOGGER.info("Running first filtering pass.")

    # Exact duplicates (same id) necessarily describe the very same posting,
    # so they are simply dropped without any merging.
    flats_list, _ = duplicates.detect(flats_list, key="id", merge=False, should_intersect=False)
    # Posts sharing an URL may come from different flatboob backends (websites
    # such as entreparticuliers mirror a lot of leboncoin housing posts), so
    # those are merged together instead of being dropped.
    flats_list, url_duplicates = duplicates.detect(flats_list, key="urls", merge=True, should_intersect=True)

    # Guess the postal codes
    flats_list = metadata.guess_postal_code(flats_list, constraint, config)

    if not config["ignore_station"]:
        # Try to match with stations
        flats_list = metadata.guess_stations(flats_list, constraint, config)

    # Drop the housing posts that do not match the user criteria.
    flats_list, dropped_flats = refine_with_housing_criteria(flats_list, constraint)

    return {"new": flats_list, "ignored": dropped_flats, "duplicate": url_duplicates}
|
|
|
|
|
|
@tools.timeit
def second_pass(flats_list, constraint, config):
    """
    Second filtering pass.

    This pass is expected to have as most information as possible on the
    available housings. Plus it runs after first pass which already
    consolidated data.

    It should consolidate everything and try to extract as many data as
    possible from the fetched housings.

    :param flats_list: A list of flats dict to filter.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :return: A dict mapping flat status and list of flat objects.
    """
    LOGGER.info("Running second filtering pass.")
    # Assumed to run after first pass, so there should be no obvious duplicates
    # left and we already tried to find postal code and nearby stations.

    # Confirm postal code
    flats_list = metadata.guess_postal_code(flats_list, constraint, config)

    # Better match with stations (confirm and check better)
    if not config["ignore_station"]:
        flats_list = metadata.guess_stations(flats_list, constraint, config)

    # Compute travel time to specified points
    flats_list = metadata.compute_travel_times(flats_list, constraint, config)

    # Remove returned housing posts that do not match criteria
    flats_list, ignored_list = refine_with_housing_criteria(flats_list, constraint)

    # Remove returned housing posts which do not match criteria relying on
    # fetched details. Accumulate into ``ignored_list`` instead of rebinding
    # it: the previous code overwrote the housing-criteria rejects, silently
    # dropping them from the returned "ignored" list.
    flats_list, ignored_details = refine_with_details_criteria(flats_list, constraint)
    ignored_list += ignored_details

    if config["serve_images_locally"]:
        images.download_images(flats_list, config)

    return {"new": flats_list, "ignored": ignored_list, "duplicate": []}
|
|
|
|
|
|
@tools.timeit
def third_pass(flats_list, config):
    """
    Third filtering pass.

    This pass is expected to perform deep duplicate detection on available
    flats.

    :param flats_list: A list of flats dict to filter.
    :param config: A config dict.
    :return: A dict mapping flat status and list of flat objects.
    """
    LOGGER.info("Running third filtering pass.")

    # Deduplicate the list using every piece of available data.
    unique_flats, duplicate_flats = duplicates.deep_detect(flats_list, config)

    return {"new": unique_flats, "ignored": [], "duplicate": duplicate_flats}
|