Refilter command and backends in config

* Add a refilter command
* Add a backend option in config to only enable some backends.
This commit is contained in:
Lucas Verney 2017-04-27 16:37:39 +02:00
parent 18ef841672
commit 1d98c631e0
9 changed files with 112 additions and 79 deletions

View File

@ -32,9 +32,11 @@ The available commands are:
* `init-config` to generate an empty configuration file, either on the `stdout` * `init-config` to generate an empty configuration file, either on the `stdout`
or in the specified file. or in the specified file.
* `build-data` to rebuild OpenData datasets.
* `fetch` to load and filter housings posts and output a JSON dump. * `fetch` to load and filter housings posts and output a JSON dump.
* `filter` to filter a previously fetched list of housings posts, provided as * `filter` to re-filter the flats already in the database (and update their status)
a JSON dump. according to changes in config. It can also filter a previously fetched list
of housing posts, provided as a JSON dump (with the `--input` argument).
* `import` to import and filter housing posts into the database. * `import` to import and filter housing posts into the database.
* `serve` to serve the built-in webapp with the development server. Do not use * `serve` to serve the built-in webapp with the development server. Do not use
in production. in production.

View File

@ -11,6 +11,7 @@ import sys
import flatisfy.config import flatisfy.config
from flatisfy import cmds from flatisfy import cmds
from flatisfy import data from flatisfy import data
from flatisfy import fetch
from flatisfy import tools from flatisfy import tools
@ -76,14 +77,18 @@ def parse_args(argv=None):
help="Fetch housings posts") help="Fetch housings posts")
# Filter subcommand parser # Filter subcommand parser
parser_filter = subparsers.add_parser("filter", parents=[parent_parser], parser_filter = subparsers.add_parser(
help=( "filter", parents=[parent_parser],
"Filter housings posts. No " help="Filter housings posts according to constraints in config."
"fetching of additional infos " )
"is done."))
parser_filter.add_argument( parser_filter.add_argument(
"input", "--input",
help="JSON dump of the housings post to filter." help=(
"Optional JSON dump of the housings post to filter. If provided, "
"no additional fetching of infos is done, and the script outputs "
"a filtered JSON dump on stdout. If not provided, update status "
"of the flats in the database."
)
) )
# Import subcommand parser # Import subcommand parser
@ -149,7 +154,9 @@ def main():
# Fetch command # Fetch command
if args.cmd == "fetch": if args.cmd == "fetch":
# Fetch and filter flats list # Fetch and filter flats list
flats_list, _ = cmds.fetch_and_filter(config) flats_list = fetch.fetch_flats_list(config)
flats_list, _ = cmds.filter_flats(config, flats_list=flats_list,
fetch_details=True)
# Sort by cost # Sort by cost
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost") flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
@ -159,18 +166,26 @@ def main():
# Filter command # Filter command
elif args.cmd == "filter": elif args.cmd == "filter":
# Load and filter flats list # Load and filter flats list
flats_list = cmds.load_and_filter(args.input, config) if args.input:
# Sort by cost flats_list = fetch.load_flats_list_from_file(args.input)
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
print( flats_list, _ = cmds.filter_flats(config, flats_list=flats_list,
tools.pretty_json(flats_list) fetch_details=False)
)
# Sort by cost
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
# Output to stdout
print(
tools.pretty_json(flats_list)
)
else:
cmds.import_and_filter(config, load_from_db=True)
# Import command # Import command
elif args.cmd == "import": elif args.cmd == "import":
# TODO: Do not fetch details for already imported flats / use the last # TODO: Do not fetch details for already imported flats / use the last
# timestamp # timestamp
cmds.import_and_filter(config) cmds.import_and_filter(config, load_from_db=False)
# Purge command # Purge command
elif args.cmd == "purge": elif args.cmd == "purge":
cmds.purge_db(config) cmds.purge_db(config)

View File

@ -17,18 +17,17 @@ from flatisfy.web import app as web_app
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
def fetch_and_filter(config): def filter_flats(config, flats_list=None, fetch_details=True):
""" """
Fetch the available flats list. Then, filter it according to criteria. Filter the available flats list. Then, filter it according to criteria.
:param config: A config dict. :param config: A config dict.
:param fetch_details: Whether additional details should be fetched between
the two passes.
:param flats_list: The initial list of flat objects to filter.
:return: A tuple of the list of all matching flats and the list of ignored :return: A tuple of the list of all matching flats and the list of ignored
flats. flats.
""" """
# TODO: Reduce load on housings listing websites
# Fetch flats list with flatboobs
flats_list = fetch.fetch_flats_list(config)
# Do a first pass with the available infos to try to remove as much # Do a first pass with the available infos to try to remove as much
# unwanted postings as possible # unwanted postings as possible
if config["passes"] > 0: if config["passes"] > 0:
@ -39,9 +38,10 @@ def fetch_and_filter(config):
# additional infos # additional infos
if config["passes"] > 1: if config["passes"] > 1:
# Load additional infos # Load additional infos
for i, flat in enumerate(flats_list): if fetch_details:
details = fetch.fetch_details(config, flat["id"]) for i, flat in enumerate(flats_list):
flats_list[i] = tools.merge_dicts(flat, details) details = fetch.fetch_details(config, flat["id"])
flats_list[i] = tools.merge_dicts(flat, details)
flats_list, extra_ignored_flats = flatisfy.filters.second_pass( flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
flats_list, config flats_list, config
@ -51,44 +51,23 @@ def fetch_and_filter(config):
return flats_list, ignored_flats return flats_list, ignored_flats
def load_and_filter(housing_file, config): def import_and_filter(config, load_from_db=False):
"""
Load the dumped flats list. Then, filter it according to criteria.
:param housing_file: The JSON file to load flats from.
:param config: A config dict.
:return: A tuple of the list of all matching flats and the list of ignored
flats.
"""
# Load flats list
flats_list = fetch.load_flats_list(housing_file)
# Do a first pass with the available infos to try to remove as much
# unwanted postings as possible
if config["passes"] > 0:
flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list,
config)
# Do a second pass to consolidate all the infos we found
if config["passes"] > 1:
flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
flats_list, config
)
ignored_flats.extend(extra_ignored_flats)
return flats_list, ignored_flats
def import_and_filter(config):
""" """
Fetch the available flats list. Then, filter it according to criteria. Fetch the available flats list. Then, filter it according to criteria.
Finally, store it in the database. Finally, store it in the database.
:param config: A config dict. :param config: A config dict.
:param load_from_db: Whether to load flats from database or fetch them
using Weboob.
:return: ``None``. :return: ``None``.
""" """
# Fetch and filter flats list # Fetch and filter flats list
flats_list, ignored_list = fetch_and_filter(config) if load_from_db:
flats_list = fetch.load_flats_list_from_db(config)
else:
flats_list = fetch.fetch_flats_list(config)
flats_list, ignored_list = filter_flats(config, flats_list=flats_list,
fetch_details=True)
# Create database connection # Create database connection
get_session = database.init_db(config["database"]) get_session = database.init_db(config["database"])

View File

@ -54,7 +54,9 @@ DEFAULT_CONFIG = {
# Web app host to listen on # Web app host to listen on
"host": "127.0.0.1", "host": "127.0.0.1",
# Web server to use to serve the webapp (see Bottle deployment doc) # Web server to use to serve the webapp (see Bottle deployment doc)
"webserver": None "webserver": None,
# List of Weboob backends to use (default to any backend available)
"backends": None
} }
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
@ -135,6 +137,7 @@ def validate_config(config):
assert isinstance(config["port"], int) assert isinstance(config["port"], int)
assert isinstance(config["host"], str) assert isinstance(config["host"], str)
assert config["webserver"] is None or isinstance(config["webserver"], str) # noqa: E501 assert config["webserver"] is None or isinstance(config["webserver"], str) # noqa: E501
assert config["backends"] is None or isinstance(config["backends"], list) # noqa: E501
return True return True
except (AssertionError, KeyError): except (AssertionError, KeyError):

View File

@ -37,7 +37,10 @@ def _preprocess_ratp(output_dir):
ratp_data = collections.defaultdict(list) ratp_data = collections.defaultdict(list)
for item in ratp_data_raw: for item in ratp_data_raw:
stop_name = item["fields"]["stop_name"].lower() stop_name = item["fields"]["stop_name"].lower()
ratp_data[stop_name].append(item["fields"]["coord"]) ratp_data[stop_name].append({
"gps": item["fields"]["coord"],
"name": item["fields"]["stop_name"]
})
# Output it # Output it
with open(os.path.join(output_dir, "ratp.json"), "w") as fh: with open(os.path.join(output_dir, "ratp.json"), "w") as fh:

View File

@ -8,7 +8,9 @@ import itertools
import json import json
import logging import logging
from flatisfy import database
from flatisfy import tools from flatisfy import tools
from flatisfy.models import flat as flat_model
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
@ -59,6 +61,13 @@ class WeboobProxy(object):
:param config: A config dict. :param config: A config dict.
""" """
# Default backends
if not config["backends"]:
backends = ["seloger", "pap", "leboncoin", "logicimmo",
"explorimmo", "entreparticuliers"]
else:
backends = config["backends"]
# Create base WebNip object # Create base WebNip object
self.webnip = WebNip(modules_path=config["modules_path"]) self.webnip = WebNip(modules_path=config["modules_path"])
@ -69,8 +78,7 @@ class WeboobProxy(object):
module, module,
params={} params={}
) )
for module in ["seloger", "pap", "leboncoin", "logicimmo", for module in backends
"explorimmo", "entreparticuliers"]
] ]
def __enter__(self): def __enter__(self):
@ -210,13 +218,13 @@ def fetch_details(config, flat_id):
weboob_output = weboob_proxy.info(flat_id) weboob_output = weboob_proxy.info(flat_id)
flat_details = json.loads(weboob_output) flat_details = json.loads(weboob_output)
flats_details = WeboobProxy.restore_decimal_fields(flat_details) flat_details = WeboobProxy.restore_decimal_fields(flat_details)
LOGGER.info("Fetched details for flat %s.", flat_id) LOGGER.info("Fetched details for flat %s.", flat_id)
return flat_details return flat_details
def load_flats_list(json_file): def load_flats_list_from_file(json_file):
""" """
Load a dumped flats list from JSON file. Load a dumped flats list from JSON file.
@ -232,3 +240,20 @@ def load_flats_list(json_file):
except (IOError, ValueError): except (IOError, ValueError):
LOGGER.error("File %s is not a valid dump file.", json_file) LOGGER.error("File %s is not a valid dump file.", json_file)
return flats_list return flats_list
def load_flats_list_from_db(config):
    """
    Read every flat stored in the database.

    :param config: A config dict. Its ``database`` entry is passed to
        ``database.init_db`` to obtain the session factory.
    :return: A list holding the ``json_api_repr()`` of each flat found in the
        database.
    """
    serialized_flats = []
    session_factory = database.init_db(config["database"])
    with session_factory() as session:
        all_flats = session.query(flat_model.Flat).all()
        # TODO: Better serialization
        serialized_flats = [
            stored_flat.json_api_repr() for stored_flat in all_flats
        ]
    return serialized_flats

View File

@ -31,9 +31,14 @@ def init(flats_list):
if "flatisfy" not in flat: if "flatisfy" not in flat:
flat["flatisfy"] = {} flat["flatisfy"] = {}
# Move url key to urls # Move url key to urls
flat["urls"] = [flat["url"]] if "urls" not in flat:
if "url" in flat:
flat["urls"] = [flat["url"]]
else:
flat["urls"] = []
# Create merged_ids key # Create merged_ids key
flat["merged_ids"] = [flat["id"]] if "merged_ids" not in flat:
flat["merged_ids"] = [flat["id"]]
return flats_list return flats_list
@ -261,16 +266,18 @@ def guess_stations(flats_list, config, distance_threshold=1500):
# of coordinates, for efficiency. Note that multiple stations # of coordinates, for efficiency. Note that multiple stations
# with the same name exist in a city, hence the list of # with the same name exist in a city, hence the list of
# coordinates. # coordinates.
for station_gps in opendata["stations"][station[0]]: for station_data in opendata["stations"][station[0]]:
distance = tools.distance(station_gps, postal_code_gps) distance = tools.distance(station_data["gps"],
postal_code_gps)
if distance < distance_threshold: if distance < distance_threshold:
# If at least one of the coordinates for a given # If at least one of the coordinates for a given
# station is close enough, that's ok and we can add # station is close enough, that's ok and we can add
# the station # the station
good_matched_stations.append({ good_matched_stations.append({
"name": station[0], "key": station[0],
"name": station_data["name"],
"confidence": station[1], "confidence": station[1],
"gps": station_gps "gps": station_data["gps"]
}) })
break break
LOGGER.debug( LOGGER.debug(

View File

@ -64,16 +64,15 @@ class DatabasePlugin(object):
if self.KEYWORD not in callback_args: if self.KEYWORD not in callback_args:
# If no need for a db session, call the route callback # If no need for a db session, call the route callback
return callback return callback
else: def wrapper(*args, **kwargs):
def wrapper(*args, **kwargs): """
""" Wrap the callback in a call to get_session.
Wrap the callback in a call to get_session. """
""" with self.get_session() as session:
with self.get_session() as session: # Get a db session and pass it to the callback
# Get a db session and pass it to the callback kwargs[self.KEYWORD] = session
kwargs[self.KEYWORD] = session return callback(*args, **kwargs)
return callback(*args, **kwargs) return wrapper
return wrapper
Plugin = DatabasePlugin Plugin = DatabasePlugin

View File

@ -197,7 +197,7 @@ export default {
}, },
displayedStations () { displayedStations () {
if (this.flat.flatisfy_stations.length > 0) { if (this.flat.flatisfy_stations.length > 0) {
const stationsNames = this.flat.flatisfy_stations.map(station => capitalize(station.name)) const stationsNames = this.flat.flatisfy_stations.map(station => station.name)
return stationsNames.join(', ') return stationsNames.join(', ')
} else { } else {
return null return null