Refilter command and backends in config

* Add a refilter command
* Add a `backends` option in the config to enable only some backends.
Lucas Verney 2017-04-27 16:37:39 +02:00
parent 18ef841672
commit 1d98c631e0
9 changed files with 112 additions and 79 deletions

View File

@ -32,9 +32,11 @@ The available commands are:
* `init-config` to generate an empty configuration file, either on `stdout`
or in the specified file.
* `build-data` to rebuild OpenData datasets.
* `fetch` to load and filter housings posts and output a JSON dump.
* `filter` to filter a previously fetched list of housings posts, provided as
a JSON dump.
* `filter` to re-filter the flats already in the database (and update their
status) according to changes in the config. It can also filter a previously
fetched list of housing posts, provided as a JSON dump (with the `--input`
argument); see the usage sketch after this list.
* `import` to import and filter housing posts into the database.
* `serve` to serve the built-in webapp with the development server. Do not use
in production.
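
For reference, a minimal sketch of what the two `filter` modes boil down to, using only the functions touched by this commit (it assumes a `config` dict has already been loaded and validated via `flatisfy.config`; the dump path is illustrative):

```python
from flatisfy import cmds, fetch, tools

# `filter --input dump.json`: re-filter a previously fetched JSON dump without
# hitting the housing websites again, and print the result on stdout.
flats_list = fetch.load_flats_list_from_file("dump.json")  # illustrative path
flats_list, _ignored = cmds.filter_flats(config, flats_list=flats_list,
                                         fetch_details=False)
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
print(tools.pretty_json(flats_list))

# `filter` without `--input`: reload the flats stored in the database, filter
# them again with the current config and update their status.
cmds.import_and_filter(config, load_from_db=True)
```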

View File

@ -11,6 +11,7 @@ import sys
import flatisfy.config
from flatisfy import cmds
from flatisfy import data
from flatisfy import fetch
from flatisfy import tools
@ -76,14 +77,18 @@ def parse_args(argv=None):
help="Fetch housings posts")
# Filter subcommand parser
parser_filter = subparsers.add_parser("filter", parents=[parent_parser],
help=(
"Filter housings posts. No "
"fetching of additional infos "
"is done."))
parser_filter = subparsers.add_parser(
"filter", parents=[parent_parser],
help="Filter housings posts according to constraints in config."
)
parser_filter.add_argument(
"input",
help="JSON dump of the housings post to filter."
"--input",
help=(
"Optional JSON dump of the housings post to filter. If provided, "
"no additional fetching of infos is done, and the script outputs "
"a filtered JSON dump on stdout. If not provided, update status "
"of the flats in the database."
)
)
# Import subcommand parser
@ -149,7 +154,9 @@ def main():
# Fetch command
if args.cmd == "fetch":
# Fetch and filter flats list
flats_list, _ = cmds.fetch_and_filter(config)
flats_list = fetch.fetch_flats_list(config)
flats_list, _ = cmds.filter_flats(config, flats_list=flats_list,
fetch_details=True)
# Sort by cost
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
@ -159,18 +166,26 @@ def main():
# Filter command
elif args.cmd == "filter":
# Load and filter flats list
flats_list = cmds.load_and_filter(args.input, config)
# Sort by cost
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
if args.input:
flats_list = fetch.load_flats_list_from_file(args.input)
print(
tools.pretty_json(flats_list)
)
flats_list, _ = cmds.filter_flats(config, flats_list=flats_list,
fetch_details=False)
# Sort by cost
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
# Output to stdout
print(
tools.pretty_json(flats_list)
)
else:
cmds.import_and_filter(config, load_from_db=True)
# Import command
elif args.cmd == "import":
# TODO: Do not fetch details for already imported flats / use the last
# timestamp
cmds.import_and_filter(config)
cmds.import_and_filter(config, load_from_db=False)
# Purge command
elif args.cmd == "purge":
cmds.purge_db(config)

View File

@ -17,18 +17,17 @@ from flatisfy.web import app as web_app
LOGGER = logging.getLogger(__name__)
def fetch_and_filter(config):
def filter_flats(config, flats_list=None, fetch_details=True):
"""
Fetch the available flats list. Then, filter it according to criteria.
Filter the given flats list according to the criteria in the config.
:param config: A config dict.
:param fetch_details: Whether additional details should be fetched between
the two passes.
:param flats_list: The initial list of flat objects to filter.
:return: A tuple of the list of all matching flats and the list of ignored
flats.
"""
# TODO: Reduce load on housings listing websites
# Fetch flats list with flatboobs
flats_list = fetch.fetch_flats_list(config)
# Do a first pass with the available infos to try to remove as much
# unwanted postings as possible
if config["passes"] > 0:
@ -39,9 +38,10 @@ def fetch_and_filter(config):
# additional infos
if config["passes"] > 1:
# Load additional infos
for i, flat in enumerate(flats_list):
details = fetch.fetch_details(config, flat["id"])
flats_list[i] = tools.merge_dicts(flat, details)
if fetch_details:
for i, flat in enumerate(flats_list):
details = fetch.fetch_details(config, flat["id"])
flats_list[i] = tools.merge_dicts(flat, details)
flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
flats_list, config
@ -51,44 +51,23 @@ def fetch_and_filter(config):
return flats_list, ignored_flats
def load_and_filter(housing_file, config):
"""
Load the dumped flats list. Then, filter it according to criteria.
:param housing_file: The JSON file to load flats from.
:param config: A config dict.
:return: A tuple of the list of all matching flats and the list of ignored
flats.
"""
# Load flats list
flats_list = fetch.load_flats_list(housing_file)
# Do a first pass with the available infos to try to remove as much
# unwanted postings as possible
if config["passes"] > 0:
flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list,
config)
# Do a second pass to consolidate all the infos we found
if config["passes"] > 1:
flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
flats_list, config
)
ignored_flats.extend(extra_ignored_flats)
return flats_list, ignored_flats
def import_and_filter(config):
def import_and_filter(config, load_from_db=False):
"""
Fetch the available flats list. Then, filter it according to criteria.
Finally, store it in the database.
:param config: A config dict.
:param load_from_db: Whether to load flats from database or fetch them
using Weboob.
:return: ``None``.
"""
# Fetch and filter flats list
flats_list, ignored_list = fetch_and_filter(config)
if load_from_db:
flats_list = fetch.load_flats_list_from_db(config)
else:
flats_list = fetch.fetch_flats_list(config)
flats_list, ignored_list = filter_flats(config, flats_list=flats_list,
fetch_details=True)
# Create database connection
get_session = database.init_db(config["database"])
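
A hedged usage sketch of the reworked entry point (assuming `config` and `flats_list` are already in scope): when `fetch_details` is `False`, the per-flat detail fetching between the two passes is skipped, while `first_pass` and `second_pass` still run according to `config["passes"]`.

```python
# Re-filter an in-memory list without querying the housing websites again:
matching, ignored = filter_flats(config, flats_list=flats_list,
                                 fetch_details=False)

# Full run, fetching details between the two passes as before:
matching, ignored = filter_flats(config, flats_list=flats_list,
                                 fetch_details=True)
```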

View File

@ -54,7 +54,9 @@ DEFAULT_CONFIG = {
# Web app host to listen on
"host": "127.0.0.1",
# Web server to use to serve the webapp (see Bottle deployment doc)
"webserver": None
"webserver": None,
# List of Weboob backends to use (defaults to all available backends)
"backends": None
}
LOGGER = logging.getLogger(__name__)
@ -135,6 +137,7 @@ def validate_config(config):
assert isinstance(config["port"], int)
assert isinstance(config["host"], str)
assert config["webserver"] is None or isinstance(config["webserver"], str) # noqa: E501
assert config["backends"] is None or isinstance(config["backends"], list) # noqa: E501
return True
except (AssertionError, KeyError):
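
For illustration, a user config enabling only a subset of backends might look like the fragment below (shown as a Python dict mirroring `DEFAULT_CONFIG`; the backend names are taken from the defaults listed in `fetch.py` further down, and leaving `backends` to `None` keeps the old behaviour of loading every known module):

```python
# Hypothetical override merged on top of DEFAULT_CONFIG:
config_overrides = {
    # Only query these two Weboob backends instead of all six defaults.
    "backends": ["seloger", "leboncoin"],
}
```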

View File

@ -37,7 +37,10 @@ def _preprocess_ratp(output_dir):
ratp_data = collections.defaultdict(list)
for item in ratp_data_raw:
stop_name = item["fields"]["stop_name"].lower()
ratp_data[stop_name].append(item["fields"]["coord"])
ratp_data[stop_name].append({
"gps": item["fields"]["coord"],
"name": item["fields"]["stop_name"]
})
# Output it
with open(os.path.join(output_dir, "ratp.json"), "w") as fh:
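
Each station entry in `ratp.json` now carries both the coordinates and the display name, roughly as sketched below (the coordinates are illustrative):

```python
# Sketch of the new structure, keyed by lower-cased stop name:
ratp_data = {
    "châtelet": [
        {"gps": [48.8583, 2.3470], "name": "Châtelet"},
        {"gps": [48.8567, 2.3462], "name": "Châtelet"},  # same name, another stop
    ],
}
```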

View File

@ -8,7 +8,9 @@ import itertools
import json
import logging
from flatisfy import database
from flatisfy import tools
from flatisfy.models import flat as flat_model
LOGGER = logging.getLogger(__name__)
@ -59,6 +61,13 @@ class WeboobProxy(object):
:param config: A config dict.
"""
# Default backends
if not config["backends"]:
backends = ["seloger", "pap", "leboncoin", "logicimmo",
"explorimmo", "entreparticuliers"]
else:
backends = config["backends"]
# Create base WebNip object
self.webnip = WebNip(modules_path=config["modules_path"])
@ -69,8 +78,7 @@ class WeboobProxy(object):
module,
params={}
)
for module in ["seloger", "pap", "leboncoin", "logicimmo",
"explorimmo", "entreparticuliers"]
for module in backends
]
def __enter__(self):
@ -210,13 +218,13 @@ def fetch_details(config, flat_id):
weboob_output = weboob_proxy.info(flat_id)
flat_details = json.loads(weboob_output)
flats_details = WeboobProxy.restore_decimal_fields(flat_details)
flat_details = WeboobProxy.restore_decimal_fields(flat_details)
LOGGER.info("Fetched details for flat %s.", flat_id)
return flat_details
def load_flats_list(json_file):
def load_flats_list_from_file(json_file):
"""
Load a dumped flats list from JSON file.
@ -232,3 +240,20 @@ def load_flats_list(json_file):
except (IOError, ValueError):
LOGGER.error("File %s is not a valid dump file.", json_file)
return flats_list
def load_flats_list_from_db(config):
"""
Load flats from database.
:param config: A config dict.
:return: A list of all the flats in the database.
"""
flats_list = []
get_session = database.init_db(config["database"])
with get_session() as session:
# TODO: Better serialization
flats_list = [flat.json_api_repr()
for flat in session.query(flat_model.Flat).all()]
return flats_list
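
A minimal sketch of the backend selection now done in `WeboobProxy.__init__` (the `select_backends` helper is hypothetical and the WebNip/module-loading boilerplate is elided):

```python
DEFAULT_BACKENDS = ["seloger", "pap", "leboncoin", "logicimmo",
                    "explorimmo", "entreparticuliers"]

def select_backends(config):
    """Return the Weboob module names to load, honouring config["backends"]."""
    # A missing, empty or None "backends" value falls back to every known backend.
    return config.get("backends") or DEFAULT_BACKENDS
```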

View File

@ -31,9 +31,14 @@ def init(flats_list):
if "flatisfy" not in flat:
flat["flatisfy"] = {}
# Move url key to urls
flat["urls"] = [flat["url"]]
if "urls" not in flat:
if "url" in flat:
flat["urls"] = [flat["url"]]
else:
flat["urls"] = []
# Create merged_ids key
flat["merged_ids"] = [flat["id"]]
if "merged_ids" not in flat:
flat["merged_ids"] = [flat["id"]]
return flats_list
@ -261,16 +266,18 @@ def guess_stations(flats_list, config, distance_threshold=1500):
# of coordinates, for efficiency. Note that multiple stations
# with the same name exist in a city, hence the list of
# coordinates.
for station_gps in opendata["stations"][station[0]]:
distance = tools.distance(station_gps, postal_code_gps)
for station_data in opendata["stations"][station[0]]:
distance = tools.distance(station_data["gps"],
postal_code_gps)
if distance < distance_threshold:
# If at least one of the coordinates for a given
# station is close enough, that's ok and we can add
# the station
good_matched_stations.append({
"name": station[0],
"key": station[0],
"name": station_data["name"],
"confidence": station[1],
"gps": station_gps
"gps": station_data["gps"]
})
break
LOGGER.debug(
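
The shape of a matched entry appended to `good_matched_stations` after this change, which the web component below relies on for display (values are illustrative; `confidence` is the fuzzy-matching score for the station name):

```python
# Hypothetical matched station entry:
matched_station = {
    "key": "châtelet",         # normalized key into the opendata "stations" dict
    "name": "Châtelet",        # display name taken from the RATP dataset
    "confidence": 92,          # fuzzy name-matching confidence
    "gps": [48.8583, 2.3470],  # coordinates of the matched stop
}
```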

View File

@ -64,16 +64,15 @@ class DatabasePlugin(object):
if self.KEYWORD not in callback_args:
# If no need for a db session, call the route callback
return callback
else:
def wrapper(*args, **kwargs):
"""
Wrap the callback in a call to get_session.
"""
with self.get_session() as session:
# Get a db session and pass it to the callback
kwargs[self.KEYWORD] = session
return callback(*args, **kwargs)
return wrapper
def wrapper(*args, **kwargs):
"""
Wrap the callback in a call to get_session.
"""
with self.get_session() as session:
# Get a db session and pass it to the callback
kwargs[self.KEYWORD] = session
return callback(*args, **kwargs)
return wrapper
Plugin = DatabasePlugin

View File

@ -197,7 +197,7 @@ export default {
},
displayedStations () {
if (this.flat.flatisfy_stations.length > 0) {
const stationsNames = this.flat.flatisfy_stations.map(station => capitalize(station.name))
const stationsNames = this.flat.flatisfy_stations.map(station => station.name)
return stationsNames.join(', ')
} else {
return null