You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

260 lines
9.1 KiB

# coding: utf-8
Main commands available for flatisfy.
from __future__ import absolute_import, print_function, unicode_literals
import collections
import logging
import os
import flatisfy.filters
from flatisfy import database
from flatisfy import email
from flatisfy.models import flat as flat_model
from flatisfy.models import postal_code as postal_code_model
from flatisfy.models import public_transport as public_transport_model
from flatisfy import fetch
from flatisfy import tools
from flatisfy.filters import metadata
from flatisfy.web import app as web_app
import time
LOGGER = logging.getLogger(__name__)
def filter_flats_list(config, constraint_name, flats_list, fetch_details=True):
Filter the available flats list. Then, filter it according to criteria.
:param config: A config dict.
:param constraint_name: The constraint name that the ``flats_list`` should
:param fetch_details: Whether additional details should be fetched between
the two passes.
:param flats_list: The initial list of flat objects to filter.
:return: A dict mapping flat status and list of flat objects.
# Add the flatisfy metadata entry and prepare the flat objects
flats_list = metadata.init(flats_list, constraint_name)
# Get the associated constraint from config
constraint = config["constraints"][constraint_name]
except KeyError:
"Missing constraint %s. Skipping filtering for these posts.",
return {
"new": [],
"duplicate": [],
"ignored": []
first_pass_result = collections.defaultdict(list)
second_pass_result = collections.defaultdict(list)
third_pass_result = collections.defaultdict(list)
# Do a first pass with the available infos to try to remove as much
# unwanted postings as possible
if config["passes"] > 0:
first_pass_result = flatisfy.filters.first_pass(flats_list,
first_pass_result["new"] = flats_list
# Load additional infos
if fetch_details:
for i, flat in enumerate(first_pass_result["new"]):
details = fetch.fetch_details(config, flat["id"])
first_pass_result["new"][i] = tools.merge_dicts(flat, details)
if flat["id"].endswith("@leboncoin"):
# sleep 0.5s to avoid rate-kick
# Do a second pass to consolidate all the infos we found and make use of
# additional infos
if config["passes"] > 1:
second_pass_result = flatisfy.filters.second_pass(
first_pass_result["new"], constraint, config
second_pass_result["new"] = first_pass_result["new"]
# Do a third pass to deduplicate better
if config["passes"] > 2:
third_pass_result = flatisfy.filters.third_pass(
third_pass_result["new"] = second_pass_result["new"]
return {
"new": third_pass_result["new"],
"duplicate": (
first_pass_result["duplicate"] +
second_pass_result["duplicate"] +
"ignored": (
first_pass_result["ignored"] +
second_pass_result["ignored"] +
def filter_fetched_flats(config, fetched_flats, fetch_details=True):
Filter the available flats list. Then, filter it according to criteria.
:param config: A config dict.
:param fetch_details: Whether additional details should be fetched between
the two passes.
:param fetched_flats: The initial dict mapping constraints to the list of
fetched flat objects to filter.
:return: A dict mapping constraints to a dict mapping flat status and list
of flat objects.
for constraint_name, flats_list in fetched_flats.items():
fetched_flats[constraint_name] = filter_flats_list(
return fetched_flats
def import_and_filter(config, load_from_db=False):
Fetch the available flats list. Then, filter it according to criteria.
Finally, store it in the database.
:param config: A config dict.
:param load_from_db: Whether to load flats from database or fetch them
using WebOOB.
:return: ``None``.
# Fetch and filter flats list
if load_from_db:
fetched_flats = fetch.load_flats_from_db(config)
fetched_flats = fetch.fetch_flats(config)
# Do not fetch additional details if we loaded data from the db.
flats_by_status = filter_fetched_flats(config, fetched_flats=fetched_flats,
fetch_details=(not load_from_db))
# Create database connection
get_session = database.init_db(config["database"], config["search_index"])
new_flats = []"Merging fetched flats in database...")
# Flatten the flats_by_status dict
flatten_flats_by_status = collections.defaultdict(list)
for flats in flats_by_status.values():
for status, flats_list in flats.items():
with get_session() as session:
# Set is_expired to true for all existing flats.
# This will be set back to false if we find them during importing.
for flat in session.query(flat_model.Flat).all():
flat.is_expired = True;
for status, flats_list in flatten_flats_by_status.items():
# Build SQLAlchemy Flat model objects for every available flat
flats_objects = {
flat_dict["id"]: flat_model.Flat.from_dict(flat_dict)
for flat_dict in flats_list
if flats_objects:
# If there are some flats, try to merge them with the ones in
# db
existing_flats_queries = session.query(flat_model.Flat).filter(
for each in existing_flats_queries.all():
# For each flat to merge, take care not to overwrite the
# status if the user defined it
flat_object = flats_objects[]
if each.status in flat_model.AUTOMATED_STATUSES:
flat_object.status = getattr(
flat_model.FlatStatus, status
flat_object.status = each.status
# Every flat we fetched isn't expired
flat_object.is_expired = False
# For each flat already in the db, merge it (UPDATE)
# instead of adding it
# For any other flat, it is not already in the database, so we can
# just set the status field without worrying
for flat in flats_objects.values():
flat.status = getattr(flat_model.FlatStatus, status)
if flat.status ==
if config["send_email"]:
email.send_notification(config, new_flats)
# Touch a file to indicate last update timestamp
ts_file = os.path.join(
with open(ts_file, 'w'):
os.utime(ts_file, None)"Done!")
def purge_db(config):
Purge the database.
:param config: A config dict.
:return: ``None``
get_session = database.init_db(config["database"], config["search_index"])
with get_session() as session:
# Delete every flat in the db"Purge all flats from the database.")
for flat in session.query(flat_model.Flat).all():
# Use (slower) deletion by object, to ensure whoosh index is
# updated
session.delete(flat)"Purge all postal codes from the database.")
session.query(postal_code_model.PostalCode).delete()"Purge all public transportations from the database.")
def serve(config):
Serve the web app.
:param config: A config dict.
:return: ``None``, long-running process.
app = web_app.get_app(config)
server = config.get("webserver", None)
if not server:
# Default webserver is quiet, as Bottle is used with Canister for
# standard logging
server = web_app.QuietWSGIRefServer
print("Launching web viewer running on http://%s:%s" % (config["host"], config["port"]))["host"], port=config["port"], server=server)