Reduce number of requests to housing websites

Keep track of the last seen date and start crawling again from there for
the next crawl, instead of crawling everything at each invocation.

Can be configured through configuration options.
This commit is contained in:
Lucas Verney 2021-04-12 23:28:42 +02:00
parent 0b86a8fd23
commit 977e354646
5 changed files with 144 additions and 35 deletions

View File

@ -135,7 +135,11 @@ List of configuration options:
doc](http://bottlepy.org/docs/dev/deployment.html). doc](http://bottlepy.org/docs/dev/deployment.html).
* `backends` is a list of Woob backends to enable. It defaults to any * `backends` is a list of Woob backends to enable. It defaults to any
available and supported Woob backend. available and supported Woob backend.
* `store_personal_data` is a boolean indicated whether or not Flatisfy should * `force_fetch_all` is a boolean indicating whether or not Flatisfy should
fetch all available flats or only the ones added since the last fetch (relying
on last known housing date). By default, Flatisfy will only iterate on
housings until the last known housing date.
* `store_personal_data` is a boolean indicating whether or not Flatisfy should
fetch personal data from housing posts and store them in database. Such fetch personal data from housing posts and store them in database. Such
personal data include contact phone number for instance. By default, personal data include contact phone number for instance. By default,
Flatisfy does not store such personal data. Flatisfy does not store such personal data.

View File

@ -55,6 +55,9 @@ DEFAULT_CONFIG = {
# Time is in seconds # Time is in seconds
} }
}, },
# Whether to force fetching all available flats at each time or only fetch
# diff
"force_fetch_all": False,
# Whether or not to store personal data from housing posts (phone number # Whether or not to store personal data from housing posts (phone number
# etc) # etc)
"store_personal_data": False, "store_personal_data": False,
@ -162,6 +165,7 @@ def validate_config(config, check_with_data):
assert config["smtp_to"] is None or isinstance(config["smtp_to"], list) assert config["smtp_to"] is None or isinstance(config["smtp_to"], list)
assert config["notification_lang"] is None or isinstance(config["notification_lang"], str) assert config["notification_lang"] is None or isinstance(config["notification_lang"], str)
assert isinstance(config["force_fetch_all"], bool)
assert isinstance(config["store_personal_data"], bool) assert isinstance(config["store_personal_data"], bool)
assert isinstance(config["max_distance_housing_station"], (int, float)) assert isinstance(config["max_distance_housing_station"], (int, float))
assert isinstance(config["duplicate_threshold"], int) assert isinstance(config["duplicate_threshold"], int)

View File

@ -5,7 +5,9 @@ This module contains all the code related to fetching and loading flats lists.
from __future__ import absolute_import, print_function, unicode_literals from __future__ import absolute_import, print_function, unicode_literals
from builtins import str from builtins import str
import arrow
import collections import collections
import datetime
import itertools import itertools
import json import json
import logging import logging
@ -15,6 +17,7 @@ from flatisfy import database
from flatisfy import tools from flatisfy import tools
from flatisfy.constants import BACKENDS_BY_PRECEDENCE from flatisfy.constants import BACKENDS_BY_PRECEDENCE
from flatisfy.models import flat as flat_model from flatisfy.models import flat as flat_model
from flatisfy.models import last_fetch as last_fetch_model
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
@ -161,7 +164,11 @@ class WoobProxy(object):
return queries return queries
def query(self, query, max_entries=None, store_personal_data=False): def query(
self, query,
max_entries=None, store_personal_data=False, force_fetch_all=False,
last_fetch_by_backend=None
):
""" """
Fetch the housings posts matching a given Woob query. Fetch the housings posts matching a given Woob query.
@ -169,12 +176,18 @@ class WoobProxy(object):
:param max_entries: Maximum number of entries to fetch. :param max_entries: Maximum number of entries to fetch.
:param store_personal_data: Whether personal data should be fetched :param store_personal_data: Whether personal data should be fetched
from housing posts (phone number etc). from housing posts (phone number etc).
:param force_fetch_all: Whether to force fetching all available flats
or only diff from last fetch (based on timestamps).
:param last_fetch_by_backend: A dict mapping all backends to last fetch
datetimes.
:return: The matching housing posts, dumped as a list of JSON objects. :return: The matching housing posts, dumped as a list of JSON objects.
""" """
if last_fetch_by_backend is None:
last_fetch_by_backend = {}
housings = [] housings = []
# List the useful backends for this specific query # List the useful backends for this specific query
useful_backends = [x.backend for x in query.cities] useful_backends = [x.backend for x in query.cities]
# TODO: Handle max_entries better
try: try:
for housing in itertools.islice( for housing in itertools.islice(
self.webnip.do( self.webnip.do(
@ -187,6 +200,16 @@ class WoobProxy(object):
), ),
max_entries, max_entries,
): ):
if not force_fetch_all:
# Check whether we should continue iterating or not
last_fetch_datetime = last_fetch_by_backend.get(housing.backend)
if last_fetch_datetime and housing.date and housing.date < last_fetch_datetime:
LOGGER.info(
'Done iterating till last fetch (housing.date=%s, last_fetch=%s). Stopping iteration.',
housing.date,
last_fetch_datetime
)
break
if not store_personal_data: if not store_personal_data:
housing.phone = None housing.phone = None
housings.append(json.dumps(housing, cls=WoobEncoder)) housings.append(json.dumps(housing, cls=WoobEncoder))
@ -240,19 +263,66 @@ def fetch_flats(config):
""" """
fetched_flats = {} fetched_flats = {}
# Get last fetch datetimes for all constraints / backends
get_session = database.init_db(config["database"], config["search_index"])
with get_session() as session:
last_fetch = collections.defaultdict(dict)
for item in session.query(last_fetch_model.LastFetch).all():
last_fetch[item.constraint_name][item.backend] = item.last_fetch
# Do the actual fetching
for constraint_name, constraint in config["constraints"].items(): for constraint_name, constraint in config["constraints"].items():
LOGGER.info("Loading flats for constraint %s...", constraint_name) LOGGER.info("Loading flats for constraint %s...", constraint_name)
with WoobProxy(config) as woob_proxy: with WoobProxy(config) as woob_proxy:
queries = woob_proxy.build_queries(constraint) queries = woob_proxy.build_queries(constraint)
housing_posts = [] housing_posts = []
for query in queries: for query in queries:
housing_posts.extend(woob_proxy.query(query, config["max_entries"], config["store_personal_data"])) housing_posts.extend(
woob_proxy.query(
query,
config["max_entries"],
config["store_personal_data"],
config["force_fetch_all"],
last_fetch[constraint_name]
)
)
housing_posts = [json.loads(flat) for flat in housing_posts]
# Update last_fetch
last_fetch_by_backends = collections.defaultdict(lambda: None)
for flat in housing_posts:
backend = flat['id'].split('@')[-1]
if (
last_fetch_by_backends[backend] is None
or last_fetch_by_backends[backend] < flat['date']
):
last_fetch_by_backends[backend] = flat['date']
for backend in last_fetch_by_backends:
last_fetch_in_db = session.query(last_fetch_model.LastFetch).where(
last_fetch_model.LastFetch.constraint_name == constraint_name,
last_fetch_model.LastFetch.backend == backend
).first()
if last_fetch_in_db:
last_fetch_in_db.last_fetch = arrow.get(
last_fetch_by_backends[backend]
).date()
else:
last_fetch_in_db = last_fetch_model.LastFetch(
constraint_name=constraint_name,
backend=backend,
last_fetch=arrow.get(last_fetch_by_backends[backend]).date()
)
session.add(last_fetch_in_db)
session.commit()
housing_posts = housing_posts[: config["max_entries"]] housing_posts = housing_posts[: config["max_entries"]]
LOGGER.info("Fetched %d flats.", len(housing_posts)) LOGGER.info("Fetched %d flats.", len(housing_posts))
constraint_flats_list = [json.loads(flat) for flat in housing_posts] constraint_flats_list = [WoobProxy.restore_decimal_fields(flat) for flat in housing_posts]
constraint_flats_list = [WoobProxy.restore_decimal_fields(flat) for flat in constraint_flats_list]
fetched_flats[constraint_name] = constraint_flats_list fetched_flats[constraint_name] = constraint_flats_list
return fetched_flats return fetched_flats

View File

@ -0,0 +1,31 @@
# coding: utf-8
"""
This module defines an SQLAlchemy ORM model to keep track of the datetime of
the last fetch, per backend and per constraint.
"""
# pylint: disable=locally-disabled,invalid-name,too-few-public-methods
from __future__ import absolute_import, print_function, unicode_literals

import logging

from sqlalchemy import (
    Column,
    DateTime,
    String,
)

from flatisfy.database.base import BASE

LOGGER = logging.getLogger(__name__)


class LastFetch(BASE):
    """
    SQLAlchemy ORM model to store the last fetch datetime for each
    (backend, constraint) pair.
    """

    __tablename__ = "last_fetch"

    # The same backend can be used by several constraints, and the fetching
    # code stores one row per (constraint, backend) pair. The primary key
    # must therefore be composite: with ``backend`` alone as primary key,
    # inserting a row for a second constraint reusing the same backend would
    # raise an integrity error.
    backend = Column(String, primary_key=True)
    constraint_name = Column(String, primary_key=True)
    # Datetime of the most recent housing post seen for this pair; used to
    # stop iterating early on the next crawl.
    last_fetch = Column(DateTime)

View File

@ -9,80 +9,80 @@ export default {
isLoading: (state) => state.loading > 0, isLoading: (state) => state.loading > 0,
inseeCodesFlatsBuckets: (state, getters) => (filter) => { inseeCodesFlatsBuckets: (state, getters) => (filter) => {
const buckets = {}; const buckets = {}
state.flats.forEach((flat) => { state.flats.forEach((flat) => {
if (!filter || filter(flat)) { if (!filter || filter(flat)) {
const insee = flat.flatisfy_postal_code.insee_code; const insee = flat.flatisfy_postal_code.insee_code
if (!buckets[insee]) { if (!buckets[insee]) {
buckets[insee] = { buckets[insee] = {
name: flat.flatisfy_postal_code.name, name: flat.flatisfy_postal_code.name,
flats: [], flats: []
}; }
} }
buckets[insee].flats.push(flat); buckets[insee].flats.push(flat)
} }
}); })
return buckets; return buckets
}, },
flatsMarkers: (state, getters) => (router, filter) => { flatsMarkers: (state, getters) => (router, filter) => {
const markers = []; const markers = []
state.flats.forEach((flat) => { state.flats.forEach((flat) => {
if (filter && filter(flat)) { if (filter && filter(flat)) {
const gps = findFlatGPS(flat); const gps = findFlatGPS(flat)
if (gps) { if (gps) {
const previousMarker = markers.find( const previousMarker = markers.find(
(marker) => (marker) =>
marker.gps[0] === gps[0] && marker.gps[1] === gps[1] marker.gps[0] === gps[0] && marker.gps[1] === gps[1]
); )
if (previousMarker) { if (previousMarker) {
// randomize position a bit // randomize position a bit
// gps[0] += (Math.random() - 0.5) / 500 // gps[0] += (Math.random() - 0.5) / 500
// gps[1] += (Math.random() - 0.5) / 500 // gps[1] += (Math.random() - 0.5) / 500
} }
const href = router.resolve({ const href = router.resolve({
name: "details", name: 'details',
params: { id: flat.id }, params: { id: flat.id }
}).href; }).href
const cost = flat.cost const cost = flat.cost
? costFilter(flat.cost, flat.currency) ? costFilter(flat.cost, flat.currency)
: ""; : ''
markers.push({ markers.push({
title: "", title: '',
content: content:
'<a href="' + '<a href="' +
href + href +
'">' + '">' +
flat.title + flat.title +
"</a>" + '</a>' +
cost, cost,
gps: gps, gps: gps,
flatId: flat.id, flatId: flat.id
}); })
} }
} }
}); })
return markers; return markers
}, },
allTimeToPlaces: (state) => { allTimeToPlaces: (state) => {
const places = {}; const places = {}
Object.keys(state.timeToPlaces).forEach((constraint) => { Object.keys(state.timeToPlaces).forEach((constraint) => {
const constraintTimeToPlaces = state.timeToPlaces[constraint]; const constraintTimeToPlaces = state.timeToPlaces[constraint]
Object.keys(constraintTimeToPlaces).forEach((name) => { Object.keys(constraintTimeToPlaces).forEach((name) => {
places[name] = constraintTimeToPlaces[name]; places[name] = constraintTimeToPlaces[name]
}); })
}); })
return places; return places
}, },
timeToPlaces: (state, getters) => (constraintName) => { timeToPlaces: (state, getters) => (constraintName) => {
return state.timeToPlaces[constraintName]; return state.timeToPlaces[constraintName]
}, },
metadata: (state) => state.metadata, metadata: (state) => state.metadata
}; }