2017-04-03 17:29:29 +02:00
|
|
|
# coding: utf-8
|
|
|
|
"""
|
|
|
|
This module contains all the filtering functions. It exposes ``first_pass`` and
|
|
|
|
``second_pass`` functions which are a set of filters applied during the first
|
|
|
|
pass and the second pass.
|
|
|
|
"""
|
|
|
|
from __future__ import absolute_import, print_function, unicode_literals
|
|
|
|
|
|
|
|
import logging
|
|
|
|
|
|
|
|
from flatisfy import tools
|
|
|
|
from flatisfy.filters import duplicates
|
2018-01-22 01:06:09 +01:00
|
|
|
from flatisfy.filters import images
|
2017-04-03 17:29:29 +02:00
|
|
|
from flatisfy.filters import metadata
|
|
|
|
|
|
|
|
|
|
|
|
LOGGER = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
2017-06-20 13:37:22 +02:00
|
|
|
def refine_with_housing_criteria(flats_list, constraint):
    """
    Filter a list of flats according to criteria.

    Housings posts websites tend to return broader results than what was
    actually asked for. Then, we should filter out the list to match the
    user criteria, and avoid exposing unwanted flats.

    :param flats_list: A list of flats dict to filter.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :return: A tuple of flats to keep and flats to delete.
    """
    # For each flat, the associated `is_ok` value indicates whether it should
    # be kept or discarded.
    is_ok = [True for _ in flats_list]

    for i, flat in enumerate(flats_list):
        # Check the guessed postal code, if any, against the allowed ones.
        postal_code = flat["flatisfy"].get("postal_code", None)
        if (
                postal_code and
                postal_code not in constraint["postal_codes"]
        ):
            LOGGER.info("Postal code for flat %s is out of range.", flat["id"])
            is_ok[i] = False

        # Check the travel time to every requested place.
        for place_name, time in flat["flatisfy"].get("time_to", {}).items():
            time = time["time"]
            is_within_interval = tools.is_within_interval(
                time,
                *(constraint["time_to"][place_name]["time"])
            )
            if not is_within_interval:
                LOGGER.info("Flat %s is too far from place %s: %ds.",
                            flat["id"], place_name, time)
            is_ok[i] = is_ok[i] and is_within_interval

        # Check the other numeric fields against their configured intervals.
        for field in ["area", "cost", "rooms", "bedrooms"]:
            interval = constraint[field]
            is_within_interval = tools.is_within_interval(
                flat.get(field, None),
                *interval
            )
            if not is_within_interval:
                LOGGER.info("%s for flat %s is out of range.",
                            field.capitalize(), flat["id"])
            is_ok[i] = is_ok[i] and is_within_interval

    return (
        [flat for i, flat in enumerate(flats_list) if is_ok[i]],
        [flat for i, flat in enumerate(flats_list) if not is_ok[i]]
    )
|
|
|
|
|
|
|
|
|
2017-10-29 20:16:33 +01:00
|
|
|
def refine_with_details_criteria(flats_list, constraint):
    """
    Filter a list of flats according to the criteria which require the full
    details to be fetched. These include minimum number of photos and terms
    that should appear in description.

    .. note ::

        This has to be done in a separate function and not with the other
        criterias as photos and full description are only fetched in the second
        pass.

    :param flats_list: A list of flats dict to filter.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :return: A tuple of flats to keep and flats to delete.
    """
    # For each flat, the associated `is_ok` value indicates whether it should
    # be kept or discarded.
    is_ok = [True for _ in flats_list]

    for i, flat in enumerate(flats_list):
        # Check number of pictures
        has_enough_photos = tools.is_within_interval(
            len(flat.get('photos', [])),
            constraint['minimum_nb_photos'],
            None
        )
        if not has_enough_photos:
            LOGGER.info(
                "Flat %s only has %d photos, it should have at least %d.",
                flat["id"],
                # BUGFIX: use .get with a default here as well; the flat may
                # have no "photos" key at all, which made this line raise a
                # KeyError exactly when the check above failed.
                len(flat.get('photos', [])),
                constraint['minimum_nb_photos']
            )
            is_ok[i] = False

        # All whitelisted terms (if any) must be found in the description.
        has_all_good_terms_in_description = True
        if constraint["description_should_contain"]:
            has_all_good_terms_in_description = all(
                term in flat['text']
                for term in constraint["description_should_contain"]
            )

        # No blacklisted term (if any) may appear in the description.
        has_a_bad_term_in_description = False
        if constraint["description_should_not_contain"]:
            has_a_bad_term_in_description = any(
                term in flat['text']
                for term in constraint["description_should_not_contain"]
            )

        if (not has_all_good_terms_in_description
                or has_a_bad_term_in_description):
            LOGGER.info(
                ("Description for flat %s does not contain all the required "
                 "terms, or contains a blacklisted term."),
                flat["id"]
            )
            is_ok[i] = False

    return (
        [flat for i, flat in enumerate(flats_list) if is_ok[i]],
        [flat for i, flat in enumerate(flats_list) if not is_ok[i]]
    )
|
|
|
|
|
2017-10-29 03:05:35 +01:00
|
|
|
|
2017-06-13 19:22:08 +02:00
|
|
|
@tools.timeit
def first_pass(flats_list, constraint, config):
    """
    First filtering pass.

    Flatboob only fetches data from the listing of the available housing.
    Then, we should do a first pass to filter based on the already available
    data and only request more data for the remaining housings.

    :param flats_list: A list of flats dict to filter.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :return: A dict mapping flat status and list of flat objects.
    """
    LOGGER.info("Running first filtering pass.")

    # Drop exact duplicates, matched on their unique id. No merging is
    # needed, as posts sharing an id are the very same object.
    flats_list, _ = duplicates.detect(
        flats_list, key="id", merge=False, should_intersect=False
    )
    # Then, merge the duplicates sharing urls (these may come from different
    # flatboob backends). This is especially useful as some websites such as
    # entreparticuliers contains a lot of leboncoin housings posts.
    flats_list, url_duplicates = duplicates.detect(
        flats_list, key="urls", merge=True, should_intersect=True
    )

    # Enrich the remaining posts: guess postal codes, then nearby stations.
    flats_list = metadata.guess_postal_code(flats_list, constraint, config)
    flats_list = metadata.guess_stations(flats_list, constraint, config)

    # Finally, drop the posts which do not match the user criteria.
    flats_list, ignored_list = refine_with_housing_criteria(flats_list,
                                                            constraint)

    return {
        "new": flats_list,
        "ignored": ignored_list,
        "duplicate": url_duplicates
    }
|
2017-04-03 17:29:29 +02:00
|
|
|
|
2017-06-13 19:22:08 +02:00
|
|
|
@tools.timeit
def second_pass(flats_list, constraint, config):
    """
    Second filtering pass.

    This pass is expected to have as most information as possible on the
    available housings. Plus it runs after first pass which already
    consolidated data.

    It should consolidate everything and try to extract as many data as
    possible from the fetched housings.

    :param flats_list: A list of flats dict to filter.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :return: A dict mapping flat status and list of flat objects.
    """
    LOGGER.info("Running second filtering pass.")
    # Assumed to run after first pass, so there should be no obvious duplicates
    # left and we already tried to find postal code and nearby stations.

    # Confirm postal code
    flats_list = metadata.guess_postal_code(flats_list, constraint, config)

    # Better match with stations (confirm and check better)
    flats_list = metadata.guess_stations(flats_list, constraint, config)

    # Compute travel time to specified points
    flats_list = metadata.compute_travel_times(flats_list, constraint, config)

    # Remove returned housing posts that do not match criteria
    flats_list, ignored_list = refine_with_housing_criteria(flats_list,
                                                            constraint)

    # Remove returned housing posts which do not match criteria relying on
    # fetched details.
    # BUGFIX: accumulate into ignored_list instead of overwriting it, so the
    # flats rejected by the housing criteria above are still reported as
    # ignored instead of silently disappearing.
    flats_list, ignored_details = refine_with_details_criteria(flats_list,
                                                               constraint)
    ignored_list += ignored_details

    if config["serve_images_locally"]:
        images.download_images(flats_list, config)

    return {
        "new": flats_list,
        "ignored": ignored_list,
        "duplicate": []
    }
|
|
|
|
|
2017-06-13 19:22:08 +02:00
|
|
|
@tools.timeit
def third_pass(flats_list, config):
    """
    Third filtering pass.

    This pass is expected to perform deep duplicate detection on available
    flats.

    :param flats_list: A list of flats dict to filter.
    :param config: A config dict.
    :return: A dict mapping flat status and list of flat objects.
    """
    LOGGER.info("Running third filtering pass.")

    # Deduplicate the list using every available data
    flats_list, deep_duplicates = duplicates.deep_detect(flats_list, config)

    return {
        "new": flats_list,
        "ignored": [],
        "duplicate": deep_duplicates
    }
|