Reduce number of requests to housing websites
Keep track of the last seen date and start crawling again from there for the next crawl, instead of crawling everything at each invocation. Can be configured through configuration options.
This commit is contained in:
parent
0b86a8fd23
commit
977e354646
@ -135,7 +135,11 @@ List of configuration options:
|
|||||||
doc](http://bottlepy.org/docs/dev/deployment.html).
|
doc](http://bottlepy.org/docs/dev/deployment.html).
|
||||||
* `backends` is a list of Woob backends to enable. It defaults to any
|
* `backends` is a list of Woob backends to enable. It defaults to any
|
||||||
available and supported Woob backend.
|
available and supported Woob backend.
|
||||||
* `store_personal_data` is a boolean indicated whether or not Flatisfy should
|
* `force_fetch_all` is a boolean indicating whether or not Flatisfy should
|
||||||
|
fetch all available flats or only the ones added since the last fetch (relying
|
||||||
|
on last known housing date). By default, Flatisfy will only iterate on
|
||||||
|
housings until the last known housing date.
|
||||||
|
* `store_personal_data` is a boolean indicating whether or not Flatisfy should
|
||||||
fetch personal data from housing posts and store them in database. Such
|
fetch personal data from housing posts and store them in database. Such
|
||||||
personal data include contact phone number for instance. By default,
|
personal data include contact phone number for instance. By default,
|
||||||
Flatisfy does not store such personal data.
|
Flatisfy does not store such personal data.
|
||||||
|
@ -55,6 +55,9 @@ DEFAULT_CONFIG = {
|
|||||||
# Time is in seconds
|
# Time is in seconds
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
# Whether to force fetching all available flats at each time or only fetch
|
||||||
|
# the diff since the last fetch
|
||||||
|
"force_fetch_all": False,
|
||||||
# Whether or not to store personal data from housing posts (phone number
|
# Whether or not to store personal data from housing posts (phone number
|
||||||
# etc)
|
# etc)
|
||||||
"store_personal_data": False,
|
"store_personal_data": False,
|
||||||
@ -162,6 +165,7 @@ def validate_config(config, check_with_data):
|
|||||||
assert config["smtp_to"] is None or isinstance(config["smtp_to"], list)
|
assert config["smtp_to"] is None or isinstance(config["smtp_to"], list)
|
||||||
assert config["notification_lang"] is None or isinstance(config["notification_lang"], str)
|
assert config["notification_lang"] is None or isinstance(config["notification_lang"], str)
|
||||||
|
|
||||||
|
assert isinstance(config["force_fetch_all"], bool)
|
||||||
assert isinstance(config["store_personal_data"], bool)
|
assert isinstance(config["store_personal_data"], bool)
|
||||||
assert isinstance(config["max_distance_housing_station"], (int, float))
|
assert isinstance(config["max_distance_housing_station"], (int, float))
|
||||||
assert isinstance(config["duplicate_threshold"], int)
|
assert isinstance(config["duplicate_threshold"], int)
|
||||||
|
@ -5,7 +5,9 @@ This module contains all the code related to fetching and loading flats lists.
|
|||||||
from __future__ import absolute_import, print_function, unicode_literals
|
from __future__ import absolute_import, print_function, unicode_literals
|
||||||
from builtins import str
|
from builtins import str
|
||||||
|
|
||||||
|
import arrow
|
||||||
import collections
|
import collections
|
||||||
|
import datetime
|
||||||
import itertools
|
import itertools
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
@ -15,6 +17,7 @@ from flatisfy import database
|
|||||||
from flatisfy import tools
|
from flatisfy import tools
|
||||||
from flatisfy.constants import BACKENDS_BY_PRECEDENCE
|
from flatisfy.constants import BACKENDS_BY_PRECEDENCE
|
||||||
from flatisfy.models import flat as flat_model
|
from flatisfy.models import flat as flat_model
|
||||||
|
from flatisfy.models import last_fetch as last_fetch_model
|
||||||
|
|
||||||
LOGGER = logging.getLogger(__name__)
|
LOGGER = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -161,7 +164,11 @@ class WoobProxy(object):
|
|||||||
|
|
||||||
return queries
|
return queries
|
||||||
|
|
||||||
def query(self, query, max_entries=None, store_personal_data=False):
|
def query(
|
||||||
|
self, query,
|
||||||
|
max_entries=None, store_personal_data=False, force_fetch_all=False,
|
||||||
|
last_fetch_by_backend=None
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Fetch the housings posts matching a given Woob query.
|
Fetch the housings posts matching a given Woob query.
|
||||||
|
|
||||||
@ -169,12 +176,18 @@ class WoobProxy(object):
|
|||||||
:param max_entries: Maximum number of entries to fetch.
|
:param max_entries: Maximum number of entries to fetch.
|
||||||
:param store_personal_data: Whether personal data should be fetched
|
:param store_personal_data: Whether personal data should be fetched
|
||||||
from housing posts (phone number etc).
|
from housing posts (phone number etc).
|
||||||
|
:param force_fetch_all: Whether to force fetching all available flats
|
||||||
|
or only the diff since the last fetch (based on timestamps).
|
||||||
|
:param last_fetch_by_backend: A dict mapping all backends to last fetch
|
||||||
|
datetimes.
|
||||||
:return: The matching housing posts, dumped as a list of JSON objects.
|
:return: The matching housing posts, dumped as a list of JSON objects.
|
||||||
"""
|
"""
|
||||||
|
if last_fetch_by_backend is None:
|
||||||
|
last_fetch_by_backend = {}
|
||||||
|
|
||||||
housings = []
|
housings = []
|
||||||
# List the useful backends for this specific query
|
# List the useful backends for this specific query
|
||||||
useful_backends = [x.backend for x in query.cities]
|
useful_backends = [x.backend for x in query.cities]
|
||||||
# TODO: Handle max_entries better
|
|
||||||
try:
|
try:
|
||||||
for housing in itertools.islice(
|
for housing in itertools.islice(
|
||||||
self.webnip.do(
|
self.webnip.do(
|
||||||
@ -187,6 +200,16 @@ class WoobProxy(object):
|
|||||||
),
|
),
|
||||||
max_entries,
|
max_entries,
|
||||||
):
|
):
|
||||||
|
if not force_fetch_all:
|
||||||
|
# Check whether we should continue iterating or not
|
||||||
|
last_fetch_datetime = last_fetch_by_backend.get(housing.backend)
|
||||||
|
if last_fetch_datetime and housing.date and housing.date < last_fetch_datetime:
|
||||||
|
LOGGER.info(
|
||||||
|
'Done iterating till last fetch (housing.date=%s, last_fetch=%s). Stopping iteration.',
|
||||||
|
housing.date,
|
||||||
|
last_fetch_datetime
|
||||||
|
)
|
||||||
|
break
|
||||||
if not store_personal_data:
|
if not store_personal_data:
|
||||||
housing.phone = None
|
housing.phone = None
|
||||||
housings.append(json.dumps(housing, cls=WoobEncoder))
|
housings.append(json.dumps(housing, cls=WoobEncoder))
|
||||||
@ -240,19 +263,66 @@ def fetch_flats(config):
|
|||||||
"""
|
"""
|
||||||
fetched_flats = {}
|
fetched_flats = {}
|
||||||
|
|
||||||
|
# Get last fetch datetimes for all constraints / backends
|
||||||
|
get_session = database.init_db(config["database"], config["search_index"])
|
||||||
|
with get_session() as session:
|
||||||
|
last_fetch = collections.defaultdict(dict)
|
||||||
|
for item in session.query(last_fetch_model.LastFetch).all():
|
||||||
|
last_fetch[item.constraint_name][item.backend] = item.last_fetch
|
||||||
|
|
||||||
|
# Do the actual fetching
|
||||||
for constraint_name, constraint in config["constraints"].items():
|
for constraint_name, constraint in config["constraints"].items():
|
||||||
LOGGER.info("Loading flats for constraint %s...", constraint_name)
|
LOGGER.info("Loading flats for constraint %s...", constraint_name)
|
||||||
|
|
||||||
with WoobProxy(config) as woob_proxy:
|
with WoobProxy(config) as woob_proxy:
|
||||||
queries = woob_proxy.build_queries(constraint)
|
queries = woob_proxy.build_queries(constraint)
|
||||||
housing_posts = []
|
housing_posts = []
|
||||||
for query in queries:
|
for query in queries:
|
||||||
housing_posts.extend(woob_proxy.query(query, config["max_entries"], config["store_personal_data"]))
|
housing_posts.extend(
|
||||||
|
woob_proxy.query(
|
||||||
|
query,
|
||||||
|
config["max_entries"],
|
||||||
|
config["store_personal_data"],
|
||||||
|
config["force_fetch_all"],
|
||||||
|
last_fetch[constraint_name]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
housing_posts = [json.loads(flat) for flat in housing_posts]
|
||||||
|
|
||||||
|
# Update last_fetch
|
||||||
|
last_fetch_by_backends = collections.defaultdict(lambda: None)
|
||||||
|
for flat in housing_posts:
|
||||||
|
backend = flat['id'].split('@')[-1]
|
||||||
|
if (
|
||||||
|
last_fetch_by_backends[backend] is None
|
||||||
|
or last_fetch_by_backends[backend] < flat['date']
|
||||||
|
):
|
||||||
|
last_fetch_by_backends[backend] = flat['date']
|
||||||
|
for backend in last_fetch_by_backends:
|
||||||
|
last_fetch_in_db = session.query(last_fetch_model.LastFetch).where(
|
||||||
|
last_fetch_model.LastFetch.constraint_name == constraint_name,
|
||||||
|
last_fetch_model.LastFetch.backend == backend
|
||||||
|
).first()
|
||||||
|
if last_fetch_in_db:
|
||||||
|
last_fetch_in_db.last_fetch = arrow.get(
|
||||||
|
last_fetch_by_backends[backend]
|
||||||
|
).date()
|
||||||
|
else:
|
||||||
|
last_fetch_in_db = last_fetch_model.LastFetch(
|
||||||
|
constraint_name=constraint_name,
|
||||||
|
backend=backend,
|
||||||
|
last_fetch=arrow.get(last_fetch_by_backends[backend]).date()
|
||||||
|
)
|
||||||
|
session.add(last_fetch_in_db)
|
||||||
|
session.commit()
|
||||||
|
|
||||||
housing_posts = housing_posts[: config["max_entries"]]
|
housing_posts = housing_posts[: config["max_entries"]]
|
||||||
LOGGER.info("Fetched %d flats.", len(housing_posts))
|
LOGGER.info("Fetched %d flats.", len(housing_posts))
|
||||||
|
|
||||||
constraint_flats_list = [json.loads(flat) for flat in housing_posts]
|
constraint_flats_list = [WoobProxy.restore_decimal_fields(flat) for flat in housing_posts]
|
||||||
constraint_flats_list = [WoobProxy.restore_decimal_fields(flat) for flat in constraint_flats_list]
|
|
||||||
fetched_flats[constraint_name] = constraint_flats_list
|
fetched_flats[constraint_name] = constraint_flats_list
|
||||||
|
|
||||||
return fetched_flats
|
return fetched_flats
|
||||||
|
|
||||||
|
|
||||||
|
31
flatisfy/models/last_fetch.py
Normal file
31
flatisfy/models/last_fetch.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
# coding: utf-8
|
||||||
|
"""
|
||||||
|
This module defines an SQLAlchemy ORM model for the last fetch timestamps.
|
||||||
|
"""
|
||||||
|
# pylint: disable=locally-disabled,invalid-name,too-few-public-methods
|
||||||
|
from __future__ import absolute_import, print_function, unicode_literals
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from sqlalchemy import (
|
||||||
|
Column,
|
||||||
|
DateTime,
|
||||||
|
String,
|
||||||
|
)
|
||||||
|
|
||||||
|
from flatisfy.database.base import BASE
|
||||||
|
|
||||||
|
|
||||||
|
LOGGER = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class LastFetch(BASE):
|
||||||
|
"""
|
||||||
|
SQLAlchemy ORM model to store last timestamp of fetch by backend.
|
||||||
|
"""
|
||||||
|
|
||||||
|
__tablename__ = "last_fetch"
|
||||||
|
|
||||||
|
backend = Column(String, primary_key=True)
|
||||||
|
last_fetch = Column(DateTime)
|
||||||
|
constraint_name = Column(String)
|
@ -9,80 +9,80 @@ export default {
|
|||||||
isLoading: (state) => state.loading > 0,
|
isLoading: (state) => state.loading > 0,
|
||||||
|
|
||||||
inseeCodesFlatsBuckets: (state, getters) => (filter) => {
|
inseeCodesFlatsBuckets: (state, getters) => (filter) => {
|
||||||
const buckets = {};
|
const buckets = {}
|
||||||
|
|
||||||
state.flats.forEach((flat) => {
|
state.flats.forEach((flat) => {
|
||||||
if (!filter || filter(flat)) {
|
if (!filter || filter(flat)) {
|
||||||
const insee = flat.flatisfy_postal_code.insee_code;
|
const insee = flat.flatisfy_postal_code.insee_code
|
||||||
if (!buckets[insee]) {
|
if (!buckets[insee]) {
|
||||||
buckets[insee] = {
|
buckets[insee] = {
|
||||||
name: flat.flatisfy_postal_code.name,
|
name: flat.flatisfy_postal_code.name,
|
||||||
flats: [],
|
flats: []
|
||||||
};
|
|
||||||
}
|
}
|
||||||
buckets[insee].flats.push(flat);
|
|
||||||
}
|
}
|
||||||
});
|
buckets[insee].flats.push(flat)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
return buckets;
|
return buckets
|
||||||
},
|
},
|
||||||
|
|
||||||
flatsMarkers: (state, getters) => (router, filter) => {
|
flatsMarkers: (state, getters) => (router, filter) => {
|
||||||
const markers = [];
|
const markers = []
|
||||||
state.flats.forEach((flat) => {
|
state.flats.forEach((flat) => {
|
||||||
if (filter && filter(flat)) {
|
if (filter && filter(flat)) {
|
||||||
const gps = findFlatGPS(flat);
|
const gps = findFlatGPS(flat)
|
||||||
|
|
||||||
if (gps) {
|
if (gps) {
|
||||||
const previousMarker = markers.find(
|
const previousMarker = markers.find(
|
||||||
(marker) =>
|
(marker) =>
|
||||||
marker.gps[0] === gps[0] && marker.gps[1] === gps[1]
|
marker.gps[0] === gps[0] && marker.gps[1] === gps[1]
|
||||||
);
|
)
|
||||||
if (previousMarker) {
|
if (previousMarker) {
|
||||||
// randomize position a bit
|
// randomize position a bit
|
||||||
// gps[0] += (Math.random() - 0.5) / 500
|
// gps[0] += (Math.random() - 0.5) / 500
|
||||||
// gps[1] += (Math.random() - 0.5) / 500
|
// gps[1] += (Math.random() - 0.5) / 500
|
||||||
}
|
}
|
||||||
const href = router.resolve({
|
const href = router.resolve({
|
||||||
name: "details",
|
name: 'details',
|
||||||
params: { id: flat.id },
|
params: { id: flat.id }
|
||||||
}).href;
|
}).href
|
||||||
const cost = flat.cost
|
const cost = flat.cost
|
||||||
? costFilter(flat.cost, flat.currency)
|
? costFilter(flat.cost, flat.currency)
|
||||||
: "";
|
: ''
|
||||||
markers.push({
|
markers.push({
|
||||||
title: "",
|
title: '',
|
||||||
content:
|
content:
|
||||||
'<a href="' +
|
'<a href="' +
|
||||||
href +
|
href +
|
||||||
'">' +
|
'">' +
|
||||||
flat.title +
|
flat.title +
|
||||||
"</a>" +
|
'</a>' +
|
||||||
cost,
|
cost,
|
||||||
gps: gps,
|
gps: gps,
|
||||||
flatId: flat.id,
|
flatId: flat.id
|
||||||
});
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
})
|
||||||
|
|
||||||
return markers;
|
return markers
|
||||||
},
|
},
|
||||||
|
|
||||||
allTimeToPlaces: (state) => {
|
allTimeToPlaces: (state) => {
|
||||||
const places = {};
|
const places = {}
|
||||||
Object.keys(state.timeToPlaces).forEach((constraint) => {
|
Object.keys(state.timeToPlaces).forEach((constraint) => {
|
||||||
const constraintTimeToPlaces = state.timeToPlaces[constraint];
|
const constraintTimeToPlaces = state.timeToPlaces[constraint]
|
||||||
Object.keys(constraintTimeToPlaces).forEach((name) => {
|
Object.keys(constraintTimeToPlaces).forEach((name) => {
|
||||||
places[name] = constraintTimeToPlaces[name];
|
places[name] = constraintTimeToPlaces[name]
|
||||||
});
|
})
|
||||||
});
|
})
|
||||||
return places;
|
return places
|
||||||
},
|
},
|
||||||
|
|
||||||
timeToPlaces: (state, getters) => (constraintName) => {
|
timeToPlaces: (state, getters) => (constraintName) => {
|
||||||
return state.timeToPlaces[constraintName];
|
return state.timeToPlaces[constraintName]
|
||||||
},
|
},
|
||||||
|
|
||||||
metadata: (state) => state.metadata,
|
metadata: (state) => state.metadata
|
||||||
};
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user