Handle multiple constraints in the config

The whole backend has been rewritten to handle multiple constraints
in the config (i.e. multiple queries).

Also did some linting.

Still to be done: the frontend part and the documentation.
Lucas Verney 2017-06-16 16:21:13 +02:00
parent 12a55e64be
commit bdf8a6b8d2
12 changed files with 195 additions and 118 deletions
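
To make the new config shape concrete, here is a hedged sketch of a "constraints" section with two named entries, mirroring the structure of DEFAULT_CONFIG in the diff below; the constraint names and all values are hypothetical, not taken from this commit. Each named constraint translates into its own set of Weboob queries, which is what "multiple queries" means above.

    # Hypothetical example: two named constraints, each with the same shape
    # as the "default" entry of DEFAULT_CONFIG further down in this diff.
    config = {
        "constraints": {
            "paris-rent": {
                "type": "RENT",                      # RENT, SALE, SHARING
                "house_types": ["APART"],
                "postal_codes": ["75010", "75011"],
                "area": (30, None),                  # (min, max) in m^2
                "cost": (None, 1500),                # (min, max) in currency unit
                "rooms": (1, 3),
                "bedrooms": (None, None),
                "time_to": {
                    "work": {"gps": [48.8768, 2.3592], "time": (None, 1800)},
                },
            },
            "suburb-sale": {
                "type": "SALE",
                "house_types": ["HOUSE"],
                "postal_codes": ["94300"],
                "area": (60, None),
                "cost": (None, 400000),
                "rooms": (3, None),
                "bedrooms": (2, None),
                "time_to": {},
            },
        },
    }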

View File

@ -10,11 +10,13 @@ import sys
logging.basicConfig()
# pylint: disable=locally-disabled,wrong-import-position
import flatisfy.config
from flatisfy import cmds
from flatisfy import data
from flatisfy import fetch
from flatisfy import tools
# pylint: enable=locally-disabled,wrong-import-position
LOGGER = logging.getLogger("flatisfy")
@ -166,31 +168,35 @@ def main():
# Fetch command
if args.cmd == "fetch":
# Fetch and filter flats list
flats_list = fetch.fetch_flats_list(config)
flats_list = cmds.filter_flats(config, flats_list=flats_list,
fetch_details=True)["new"]
fetched_flats = fetch.fetch_flats(config)
fetched_flats = cmds.filter_fetched_flats(config,
fetched_flats=fetched_flats,
fetch_details=True)["new"]
# Sort by cost
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
fetched_flats = tools.sort_list_of_dicts_by(fetched_flats, "cost")
print(
tools.pretty_json(flats_list)
tools.pretty_json(sum(fetched_flats.values(), []))
)
return
# Filter command
elif args.cmd == "filter":
# Load and filter flats list
if args.input:
flats_list = fetch.load_flats_list_from_file(args.input)
fetched_flats = fetch.load_flats_from_file(args.input, config)
flats_list = cmds.filter_flats(config, flats_list=flats_list,
fetch_details=False)["new"]
fetched_flats = cmds.filter_fetched_flats(
config,
fetched_flats=fetched_flats,
fetch_details=False
)["new"]
# Sort by cost
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
fetched_flats = tools.sort_list_of_dicts_by(fetched_flats, "cost")
# Output to stdout
print(
tools.pretty_json(flats_list)
tools.pretty_json(sum(fetched_flats.values(), []))
)
else:
cmds.import_and_filter(config, load_from_db=True)

View File

@ -21,19 +21,28 @@ from flatisfy.web import app as web_app
LOGGER = logging.getLogger(__name__)
def filter_flats(config, flats_list, fetch_details=True):
def filter_flats_list(config, constraint_name, flats_list, fetch_details=True):
"""
Filter the given flats list according to the named constraint and the criteria in the config.
:param config: A config dict.
:param constraint_name: The constraint name that the ``flats_list`` should
satisfy.
:param fetch_details: Whether additional details should be fetched between
the two passes.
:param flats_list: The initial list of flat objects to filter.
:return: A dict mapping flat status and list of flat objects.
"""
# pylint: disable=locally-disabled,redefined-variable-type
# Add the flatisfy metadata entry and prepare the flat objects
flats_list = metadata.init(flats_list)
flats_list = metadata.init(flats_list, constraint_name)
# Get the associated constraint from config
try:
constraint = config["constraints"][constraint_name]
except KeyError:
LOGGER.warning("Missing constraint %s. Using default one.",
constraint_name)
constraint = config["constraints"]["default"]
first_pass_result = collections.defaultdict(list)
second_pass_result = collections.defaultdict(list)
@ -42,6 +51,7 @@ def filter_flats(config, flats_list, fetch_details=True):
# unwanted postings as possible
if config["passes"] > 0:
first_pass_result = flatisfy.filters.first_pass(flats_list,
constraint,
config)
else:
first_pass_result["new"] = flats_list
@ -56,7 +66,7 @@ def filter_flats(config, flats_list, fetch_details=True):
# additional infos
if config["passes"] > 1:
second_pass_result = flatisfy.filters.second_pass(
first_pass_result["new"], config
first_pass_result["new"], constraint, config
)
else:
second_pass_result["new"] = first_pass_result["new"]
@ -84,6 +94,28 @@ def filter_flats(config, flats_list, fetch_details=True):
}
def filter_fetched_flats(config, fetched_flats, fetch_details=True):
"""
Filter the fetched flats, constraint by constraint, according to the configured criteria.
:param config: A config dict.
:param fetch_details: Whether additional details should be fetched between
the two passes.
:param fetched_flats: The initial dict mapping constraints to the list of
fetched flat objects to filter.
:return: A dict mapping constraints to a dict mapping flat status and list
of flat objects.
"""
for constraint_name, flats_list in fetched_flats.items():
fetched_flats[constraint_name] = filter_flats_list(
config,
constraint_name,
flats_list,
fetch_details
)
return fetched_flats
def import_and_filter(config, load_from_db=False):
"""
Fetch the available flats list. Then, filter it according to criteria.
@ -96,18 +128,24 @@ def import_and_filter(config, load_from_db=False):
"""
# Fetch and filter flats list
if load_from_db:
flats_list = fetch.load_flats_list_from_db(config)
fetched_flats = fetch.load_flats_from_db(config)
else:
flats_list = fetch.fetch_flats_list(config)
fetched_flats = fetch.fetch_flats(config)
# Do not fetch additional details if we loaded data from the db.
flats_list_by_status = filter_flats(config, flats_list=flats_list,
fetch_details=(not load_from_db))
flats_by_status = filter_fetched_flats(config, fetched_flats=fetched_flats,
fetch_details=(not load_from_db))
# Create database connection
get_session = database.init_db(config["database"], config["search_index"])
LOGGER.info("Merging fetched flats in database...")
# Flatten the flats_by_status dict
flatten_flats_by_status = collections.defaultdict(list)
for flats in flats_by_status.values():
for status, flats_list in flats.items():
flatten_flats_by_status[status].extend(flats_list)
with get_session() as session:
for status, flats_list in flats_list_by_status.items():
for status, flats_list in flatten_flats_by_status.items():
# Build SQLAlchemy Flat model objects for every available flat
flats_objects = {
flat_dict["id"]: flat_model.Flat.from_dict(flat_dict)

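To clarify the data shapes introduced here (made-up values, not from the commit): filter_fetched_flats returns a dict keyed by constraint name whose values are the usual status-to-list dicts, and import_and_filter flattens them before the database merge, as in this small self-contained sketch.

    import collections

    # Hypothetical output of filter_fetched_flats, for illustration only.
    flats_by_status = {
        "default": {"new": [{"id": "a@seloger"}], "duplicate": []},
        "second-search": {"new": [{"id": "b@pap"}], "duplicate": [{"id": "c@pap"}]},
    }

    # Same flattening pattern as in import_and_filter above.
    flatten_flats_by_status = collections.defaultdict(list)
    for flats in flats_by_status.values():
        for status, flats_list in flats.items():
            flatten_flats_by_status[status].extend(flats_list)

    assert len(flatten_flats_by_status["new"]) == 2
    assert len(flatten_flats_by_status["duplicate"]) == 1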
View File

@ -23,17 +23,19 @@ from flatisfy import tools
DEFAULT_CONFIG = {
# Constraints to match
"constraints": {
"type": None, # RENT, SALE, SHARING
"house_types": [], # List of house types, must be in APART, HOUSE,
# PARKING, LAND, OTHER or UNKNOWN
"postal_codes": [], # List of postal codes
"area": (None, None), # (min, max) in m^2
"cost": (None, None), # (min, max) in currency unit
"rooms": (None, None), # (min, max)
"bedrooms": (None, None), # (min, max)
"time_to": {} # Dict mapping names to {"gps": [lat, lng],
# "time": (min, max) }
# Time is in seconds
"default": {
"type": None, # RENT, SALE, SHARING
"house_types": [], # List of house types, must be in APART, HOUSE,
# PARKING, LAND, OTHER or UNKNOWN
"postal_codes": [], # List of postal codes
"area": (None, None), # (min, max) in m^2
"cost": (None, None), # (min, max) in currency unit
"rooms": (None, None), # (min, max)
"bedrooms": (None, None), # (min, max)
"time_to": {} # Dict mapping names to {"gps": [lat, lng],
# "time": (min, max) }
# Time is in seconds
}
},
# Navitia API key
"navitia_api_key": None,
@ -94,41 +96,44 @@ def validate_config(config):
# and use long lines whenever needed, in order to have the full assert
# message in the log output.
# pylint: disable=locally-disabled,line-too-long
assert "type" in config["constraints"]
assert isinstance(config["constraints"]["type"], str)
assert config["constraints"]["type"].upper() in ["RENT",
"SALE", "SHARING"]
assert "house_types" in config["constraints"]
assert config["constraints"]["house_types"]
for house_type in config["constraints"]["house_types"]:
assert house_type.upper() in ["APART", "HOUSE", "PARKING", "LAND",
"OTHER", "UNKNOWN"]
# Ensure default constraint is here
assert "default" in config["constraints"]
# Ensure constraints are ok
for constraint in config["constraints"].values():
assert "type" in constraint
assert isinstance(constraint["type"], str)
assert constraint["type"].upper() in ["RENT", "SALE", "SHARING"]
assert "postal_codes" in config["constraints"]
assert config["constraints"]["postal_codes"]
assert "house_types" in constraint
assert constraint["house_types"]
for house_type in constraint["house_types"]:
assert house_type.upper() in ["APART", "HOUSE", "PARKING", "LAND", "OTHER", "UNKNOWN"] # noqa: E501
assert "area" in config["constraints"]
_check_constraints_bounds(config["constraints"]["area"])
assert "postal_codes" in constraint
assert constraint["postal_codes"]
assert "cost" in config["constraints"]
_check_constraints_bounds(config["constraints"]["cost"])
assert "area" in constraint
_check_constraints_bounds(constraint["area"])
assert "rooms" in config["constraints"]
_check_constraints_bounds(config["constraints"]["rooms"])
assert "cost" in constraint
_check_constraints_bounds(constraint["cost"])
assert "bedrooms" in config["constraints"]
_check_constraints_bounds(config["constraints"]["bedrooms"])
assert "rooms" in constraint
_check_constraints_bounds(constraint["rooms"])
assert "time_to" in config["constraints"]
assert isinstance(config["constraints"]["time_to"], dict)
for name, item in config["constraints"]["time_to"].items():
assert isinstance(name, str)
assert "gps" in item
assert isinstance(item["gps"], list)
assert len(item["gps"]) == 2
assert "time" in item
_check_constraints_bounds(item["time"])
assert "bedrooms" in constraint
_check_constraints_bounds(constraint["bedrooms"])
assert "time_to" in constraint
assert isinstance(constraint["time_to"], dict)
for name, item in constraint["time_to"].items():
assert isinstance(name, str)
assert "gps" in item
assert isinstance(item["gps"], list)
assert len(item["gps"]) == 2
assert "time" in item
_check_constraints_bounds(item["time"])
assert config["passes"] in [0, 1, 2, 3]
assert config["max_entries"] is None or (isinstance(config["max_entries"], int) and config["max_entries"] > 0) # noqa: E501

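The helper _check_constraints_bounds used above is not part of this diff; the following is only a sketch of its assumed behaviour, namely validating a (min, max) pair in which either end may be None.

    def _check_constraints_bounds(bounds):
        """
        Sketch of the assumed semantics (the real helper lives elsewhere in
        flatisfy/config.py): a bound is a (min, max) pair, each end being a
        number or None for an open-ended bound.
        """
        assert len(bounds) == 2
        assert all(x is None or isinstance(x, (int, float)) for x in bounds)
        if all(x is not None for x in bounds):
            assert bounds[0] < bounds[1]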
View File

@ -24,6 +24,9 @@ except ImportError:
from functools32 import lru_cache
except ImportError:
def lru_cache(maxsize=None):
"""
Identity implementation of ``lru_cache`` for fallback.
"""
return lambda func: func
LOGGER.warning(
"`functools.lru_cache` is not available on your system. Consider "
@ -66,19 +69,21 @@ def preprocess_data(config, force=False):
@lru_cache(maxsize=5)
def load_data(model, config):
def load_data(model, constraint, config):
"""
Load data of the specified model from the database. Only load data for the
specific areas of the postal codes in config.
:param model: SQLAlchemy model to load.
:param constraint: A constraint from the configuration, used to limit the
spatial extent of the loaded data.
:param config: A config dictionary.
:returns: A list of loaded SQLAlchemy objects from the db
"""
get_session = database.init_db(config["database"], config["search_index"])
results = []
with get_session() as session:
for postal_code in config["constraints"]["postal_codes"]:
for postal_code in constraint["postal_codes"]:
area = data_files.french_postal_codes_to_iso_3166(postal_code)
results.extend(
session.query(model)

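A hedged call-site sketch of the new load_data signature, mirroring how metadata.guess_postal_code invokes it later in this diff; the helper name and the PostalCode import path are assumptions, not part of the commit.

    from flatisfy import data
    from flatisfy.models.postal_code import PostalCode  # assumed import path

    def postal_codes_for(constraint_name, config):
        """
        Hypothetical helper: load the opendata postal codes scoped to a single
        named constraint instead of the whole config.
        """
        constraint = config["constraints"][constraint_name]
        return data.load_data(PostalCode, constraint, config)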
View File

@ -38,9 +38,9 @@ def french_postal_codes_to_iso_3166(postal_code):
"FR-IDF": ["75", "77", "78", "91", "92", "93", "94", "95"],
"FR-NOR": ["14", "27", "50", "61", "76"],
"FR-NAQ": ["16", "17", "19", "23", "24", "33", "40", "47", "64", "79",
"86", "87"],
"86", "87"],
"FR-OCC": ["09", "11", "12", "30", "31", "32", "34", "46", "48", "65",
"66", "81", "82"],
"66", "81", "82"],
"FR-PDL": ["44", "49", "53", "72", "85"],
"FR-PAC": ["04", "05", "06", "13", "83", "84"]
}

View File

@ -4,6 +4,7 @@ This module contains all the code related to fetching and loading flats lists.
"""
from __future__ import absolute_import, print_function, unicode_literals
import collections
import itertools
import json
import logging
@ -225,29 +226,32 @@ class WeboobProxy(object):
return "{}"
def fetch_flats_list(config):
def fetch_flats(config):
"""
Fetch the available flats using the Flatboob / Weboob config.
:param config: A config dict.
:return: A list of all available flats.
:return: A dict mapping each constraint name in the config to the list of
matching flats fetched for it.
"""
flats_list = []
fetched_flats = {}
with WeboobProxy(config) as weboob_proxy:
LOGGER.info("Loading flats...")
queries = weboob_proxy.build_queries(config["constraints"])
housing_posts = []
for query in queries:
housing_posts.extend(
weboob_proxy.query(query, config["max_entries"])
)
for constraint_name, constraint in config["constraints"].items():
LOGGER.info("Loading flats for constraint %s...", constraint_name)
with WeboobProxy(config) as weboob_proxy:
queries = weboob_proxy.build_queries(constraint)
housing_posts = []
for query in queries:
housing_posts.extend(
weboob_proxy.query(query, config["max_entries"])
)
LOGGER.info("Fetched %d flats.", len(housing_posts))
flats_list = [json.loads(flat) for flat in housing_posts]
flats_list = [WeboobProxy.restore_decimal_fields(flat)
for flat in flats_list]
return flats_list
constraint_flats_list = [json.loads(flat) for flat in housing_posts]
constraint_flats_list = [WeboobProxy.restore_decimal_fields(flat)
for flat in constraint_flats_list]
fetched_flats[constraint_name] = constraint_flats_list
return fetched_flats
def fetch_details(config, flat_id):
@ -269,12 +273,18 @@ def fetch_details(config, flat_id):
return flat_details
def load_flats_list_from_file(json_file):
def load_flats_from_file(json_file, config):
"""
Load a dumped flats list from JSON file.
:param json_file: The file to load housings list from.
:return: A list of all the flats in the dump file.
:return: A dict mapping each constraint name in the config to the list of
loaded flats.
.. note::
As the dump does not record which constraint a given flat matches, the
whole list is returned for every constraint; the flats are then
filtered against each constraint afterwards.
"""
flats_list = []
try:
@ -284,21 +294,24 @@ def load_flats_list_from_file(json_file):
LOGGER.info("Found %d flats.", len(flats_list))
except (IOError, ValueError):
LOGGER.error("File %s is not a valid dump file.", json_file)
return flats_list
return {
constraint_name: flats_list
for constraint_name in config["constraints"]
}
def load_flats_list_from_db(config):
def load_flats_from_db(config):
"""
Load flats from database.
:param config: A config dict.
:return: A list of all the flats in the database.
:return: A dict mapping each constraint name to the list of flats stored
for it in the database.
"""
flats_list = []
get_session = database.init_db(config["database"], config["search_index"])
loaded_flats = collections.defaultdict(list)
with get_session() as session:
# TODO: Better serialization
flats_list = [flat.json_api_repr()
for flat in session.query(flat_model.Flat).all()]
return flats_list
for flat in session.query(flat_model.Flat).all():
loaded_flats[flat.flatisfy_constraint].append(flat.json_api_repr())
return loaded_flats

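A short illustration of the shape now returned by all three loaders, a dict keyed by constraint name; since the dump file does not record which constraint a flat matched, load_flats_from_file repeats the loaded list under every constraint (values below are made up).

    # Hypothetical dump content and config, for illustration of the shape only.
    flats_list = [{"id": "flat-1@pap"}, {"id": "flat-2@seloger"}]
    config = {"constraints": {"default": {}, "second-search": {}}}

    # Same dict comprehension as in load_flats_from_file above.
    fetched_flats = {
        constraint_name: flats_list
        for constraint_name in config["constraints"]
    }

    assert sorted(fetched_flats) == ["default", "second-search"]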
View File

@ -16,7 +16,7 @@ from flatisfy.filters import metadata
LOGGER = logging.getLogger(__name__)
def refine_with_housing_criteria(flats_list, config):
def refine_with_housing_criteria(flats_list, constraint, config):
"""
Filter a list of flats according to criteria.
@ -25,6 +25,7 @@ def refine_with_housing_criteria(flats_list, config):
user criteria, and avoid exposing unwanted flats.
:param flats_list: A list of flats dict to filter.
:param constraint: The constraint that the ``flats_list`` should satisfy.
:param config: A config dict.
:return: A tuple of flats to keep and flats to delete.
"""
@ -37,7 +38,7 @@ def refine_with_housing_criteria(flats_list, config):
postal_code = flat["flatisfy"].get("postal_code", None)
if (
postal_code and
postal_code not in config["constraints"]["postal_codes"]
postal_code not in constraint["postal_codes"]
):
LOGGER.info("Postal code for flat %s is out of range.", flat["id"])
is_ok[i] = is_ok[i] and False
@ -47,7 +48,7 @@ def refine_with_housing_criteria(flats_list, config):
time = time["time"]
is_within_interval = tools.is_within_interval(
time,
*(config["constraints"]["time_to"][place_name]["time"])
*(constraint["time_to"][place_name]["time"])
)
if not is_within_interval:
LOGGER.info("Flat %s is too far from place %s: %ds.",
@ -56,7 +57,7 @@ def refine_with_housing_criteria(flats_list, config):
# Check other fields
for field in ["area", "cost", "rooms", "bedrooms"]:
interval = config["constraints"][field]
interval = constraint[field]
is_within_interval = tools.is_within_interval(
flat.get(field, None),
*interval
@ -80,7 +81,7 @@ def refine_with_housing_criteria(flats_list, config):
)
def first_pass(flats_list, config):
def first_pass(flats_list, constraint, config):
"""
First filtering pass.
@ -89,6 +90,7 @@ def first_pass(flats_list, config):
only request more data for the remaining housings.
:param flats_list: A list of flats dict to filter.
:param constraint: The constraint that the ``flats_list`` should satisfy.
:param config: A config dict.
:return: A dict mapping flat status and list of flat objects.
"""
@ -108,11 +110,12 @@ def first_pass(flats_list, config):
)
# Guess the postal codes
flats_list = metadata.guess_postal_code(flats_list, config)
flats_list = metadata.guess_postal_code(flats_list, constraint, config)
# Try to match with stations
flats_list = metadata.guess_stations(flats_list, config)
flats_list = metadata.guess_stations(flats_list, constraint, config)
# Remove returned housing posts that do not match criteria
flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
flats_list, ignored_list = refine_with_housing_criteria(flats_list,
constraint, config)
return {
"new": flats_list,
@ -121,7 +124,7 @@ def first_pass(flats_list, config):
}
def second_pass(flats_list, config):
def second_pass(flats_list, constraint, config):
"""
Second filtering pass.
@ -133,6 +136,7 @@ def second_pass(flats_list, config):
possible from the fetched housings.
:param flats_list: A list of flats dict to filter.
:param constraint: The constraint that the ``flats_list`` should satisfy.
:param config: A config dict.
:return: A dict mapping flat status and list of flat objects.
"""
@ -141,16 +145,17 @@ def second_pass(flats_list, config):
# left and we already tried to find postal code and nearby stations.
# Confirm postal code
flats_list = metadata.guess_postal_code(flats_list, config)
flats_list = metadata.guess_postal_code(flats_list, constraint, config)
# Better match with stations (confirm and check better)
flats_list = metadata.guess_stations(flats_list, config)
flats_list = metadata.guess_stations(flats_list, constraint, config)
# Compute travel time to specified points
flats_list = metadata.compute_travel_times(flats_list, config)
flats_list = metadata.compute_travel_times(flats_list, constraint, config)
# Remove returned housing posts that do not match criteria
flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
flats_list, ignored_list = refine_with_housing_criteria(flats_list,
constraint, config)
return {
"new": flats_list,

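A tiny self-contained sketch (made-up data) of the postal-code check above, showing why the constraint dict is now threaded through the passes: the check reads constraint["postal_codes"] instead of the former global config["constraints"]["postal_codes"].

    # Illustrative data only; mirrors the check in refine_with_housing_criteria.
    constraint = {"postal_codes": ["75010", "75011"]}
    flat = {"id": "x@pap", "flatisfy": {"postal_code": "93100"}}

    postal_code = flat["flatisfy"].get("postal_code", None)
    keep = not (postal_code and postal_code not in constraint["postal_codes"])
    assert keep is False  # 93100 is outside this constraint's postal codes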
View File

@ -19,13 +19,14 @@ from flatisfy.models.public_transport import PublicTransport
LOGGER = logging.getLogger(__name__)
def init(flats_list):
def init(flats_list, constraint):
"""
Create a flatisfy key containing a dict of metadata fetched by flatisfy for
each flat in the list. Also perform some basic transform on flat objects to
prepare for the metadata fetching.
:param flats_list: A list of flats dict.
:param constraint: The constraint that the ``flats_list`` should satisfy.
:return: The updated list
"""
for flat in flats_list:
@ -41,6 +42,8 @@ def init(flats_list):
# Create merged_ids key
if "merged_ids" not in flat:
flat["merged_ids"] = [flat["id"]]
if "constraint" not in flat:
flat["constraint"] = constraint
return flats_list
@ -119,11 +122,12 @@ def fuzzy_match(query, choices, limit=3, threshold=75):
return matches
def guess_postal_code(flats_list, config, distance_threshold=20000):
def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
"""
Try to guess the postal code from the location of the flats.
:param flats_list: A list of flats dict.
:param constraint: The constraint that the ``flats_list`` should satisfy.
:param config: A config dict.
:param distance_threshold: Maximum distance in meters between the
constraint postal codes (from config) and the one found by this function,
@ -132,7 +136,7 @@ def guess_postal_code(flats_list, config, distance_threshold=20000):
:return: An updated list of flats dict with guessed postal code.
"""
opendata = {
"postal_codes": data.load_data(PostalCode, config)
"postal_codes": data.load_data(PostalCode, constraint, config)
}
for flat in flats_list:
@ -200,10 +204,10 @@ def guess_postal_code(flats_list, config, distance_threshold=20000):
next(
(x.lat, x.lng)
for x in opendata["postal_codes"]
if x.postal_code == constraint
if x.postal_code == constraint_postal_code
)
)
for constraint in config["constraints"]["postal_codes"]
for constraint_postal_code in constraint["postal_codes"]
)
if distance > distance_threshold:
@ -229,21 +233,21 @@ def guess_postal_code(flats_list, config, distance_threshold=20000):
return flats_list
def guess_stations(flats_list, config, distance_threshold=1500):
def guess_stations(flats_list, constraint, config, distance_threshold=1500):
"""
Try to match the station field with a list of available stations nearby.
:param flats_list: A list of flats dict.
:param constraint: The constraint that the ``flats_list`` should satisfy.
:param config: A config dict.
:param distance_threshold: Maximum distance (in meters) between the center
of the postal code and the station to consider it ok.
:return: An updated list of flats dict with guessed nearby stations.
"""
# TODO: opendata["stations"]
opendata = {
"postal_codes": data.load_data(PostalCode, config),
"stations": data.load_data(PublicTransport, config)
"postal_codes": data.load_data(PostalCode, constraint, config),
"stations": data.load_data(PublicTransport, constraint, config)
}
for flat in flats_list:
@ -343,12 +347,13 @@ def guess_stations(flats_list, config, distance_threshold=1500):
return flats_list
def compute_travel_times(flats_list, config):
def compute_travel_times(flats_list, constraint, config):
"""
Compute the travel time between each flat and the points listed in the
constraints.
:param flats_list: A list of flats dict.
:param constraint: The constraint that the ``flats_list`` should satisfy.
:param config: A config dict.
:return: An updated list of flats dict with computed travel times.
@ -371,7 +376,7 @@ def compute_travel_times(flats_list, config):
# For each place, loop over the stations close to the flat, and find
# the minimum travel time.
for place_name, place in config["constraints"]["time_to"].items():
for place_name, place in constraint["time_to"].items():
time_to_place = None
for station in flat["flatisfy"]["matched_stations"]:
time_from_station = tools.get_travel_time_between(

View File

@ -86,6 +86,7 @@ class Flat(BASE):
flatisfy_stations = Column(MagicJSON)
flatisfy_postal_code = Column(String)
flatisfy_time_to = Column(MagicJSON)
flatisfy_constraint = Column(String)
# Status
status = Column(Enum(FlatStatus), default=FlatStatus.new)

View File

@ -298,5 +298,4 @@ def get_travel_time_between(latlng_from, latlng_to, config):
"time": time,
"sections": sections
}
else:
return None
return None

View File

@ -5,7 +5,7 @@ export default {
flat: (state, getters) => id => state.flats.find(flat => flat.id === id),
isLoading: state => state.loading > 0,
isLoading: state => state.loading > 0,
postalCodesFlatsBuckets: (state, getters) => filter => {
const postalCodeBuckets = {}

View File

@ -39,7 +39,7 @@ def flats_v1(config, db):
:return: The available flats objects in a JSON ``data`` dict.
"""
postal_codes = flatisfy.data.load_data(PostalCode, config)
postal_codes = flatisfy.data.load_data(PostalCode, config) # TODO
flats = [
flat.json_api_repr()
@ -99,7 +99,7 @@ def flat_v1(flat_id, config, db):
:return: The flat object in a JSON ``data`` dict.
"""
postal_codes = flatisfy.data.load_data(PostalCode, config)
postal_codes = flatisfy.data.load_data(PostalCode, config) # TODO
flat = db.query(flat_model.Flat).filter_by(id=flat_id).first()
@ -222,7 +222,7 @@ def time_to_places_v1(config):
"""
places = {
k: v["gps"]
for k, v in config["constraints"]["time_to"].items()
for k, v in config["constraints"]["time_to"].items() # TODO: Constraints should be named and stored in the db along with the flats
}
return {
"data": places
@ -240,7 +240,7 @@ def search_v1(db, config):
:return: The matching flat objects in a JSON ``data`` dict.
"""
postal_codes = flatisfy.data.load_data(PostalCode, config)
postal_codes = flatisfy.data.load_data(PostalCode, config) # TODO
try:
query = json.load(bottle.request.body)["query"]