diff --git a/doc/0.getting_started.md b/doc/0.getting_started.md index 1e365e8..5a6ef0c 100644 --- a/doc/0.getting_started.md +++ b/doc/0.getting_started.md @@ -135,7 +135,11 @@ List of configuration options: doc](http://bottlepy.org/docs/dev/deployment.html). * `backends` is a list of Woob backends to enable. It defaults to any available and supported Woob backend. -* `store_personal_data` is a boolean indicated whether or not Flatisfy should +* `force_fetch_all` is a boolean indicating whether or not Flatisfy should + fetch all available flats or only the ones added since the last fetch (relying + on last known housing date). By default, Flatisfy will only iterate on + housings until the last known housing date. +* `store_personal_data` is a boolean indicating whether or not Flatisfy should fetch personal data from housing posts and store them in database. Such personal data include contact phone number for instance. By default, Flatisfy does not store such personal data. diff --git a/flatisfy/config.py b/flatisfy/config.py index 4c20e3f..7182140 100644 --- a/flatisfy/config.py +++ b/flatisfy/config.py @@ -55,6 +55,9 @@ DEFAULT_CONFIG = { # Time is in seconds } }, + # Whether to force fetching all available flats at each time or only fetch + # diff + "force_fetch_all": False, # Whether or not to store personal data from housing posts (phone number # etc) "store_personal_data": False, @@ -162,6 +165,7 @@ def validate_config(config, check_with_data): assert config["smtp_to"] is None or isinstance(config["smtp_to"], list) assert config["notification_lang"] is None or isinstance(config["notification_lang"], str) + assert isinstance(config["force_fetch_all"], bool) assert isinstance(config["store_personal_data"], bool) assert isinstance(config["max_distance_housing_station"], (int, float)) assert isinstance(config["duplicate_threshold"], int) diff --git a/flatisfy/fetch.py b/flatisfy/fetch.py index 4e79a83..7d1cc5a 100644 --- a/flatisfy/fetch.py +++ b/flatisfy/fetch.py 
@@ -5,7 +5,9 @@ This module contains all the code related to fetching and loading flats lists. from __future__ import absolute_import, print_function, unicode_literals from builtins import str +import arrow import collections +import datetime import itertools import json import logging @@ -15,6 +17,7 @@ from flatisfy import database from flatisfy import tools from flatisfy.constants import BACKENDS_BY_PRECEDENCE from flatisfy.models import flat as flat_model +from flatisfy.models import last_fetch as last_fetch_model LOGGER = logging.getLogger(__name__) @@ -161,7 +164,11 @@ class WoobProxy(object): return queries - def query(self, query, max_entries=None, store_personal_data=False): + def query( + self, query, + max_entries=None, store_personal_data=False, force_fetch_all=False, + last_fetch_by_backend=None + ): """ Fetch the housings posts matching a given Woob query. @@ -169,12 +176,18 @@ class WoobProxy(object): :param max_entries: Maximum number of entries to fetch. :param store_personal_data: Whether personal data should be fetched from housing posts (phone number etc). + :param force_fetch_all: Whether to force fetching all available flats + or only diff from last fetch (based on timestamps). + :param last_fetch_by_backend: A dict mapping all backends to last fetch + datetimes. :return: The matching housing posts, dumped as a list of JSON objects. 
""" + if last_fetch_by_backend is None: + last_fetch_by_backend = {} + housings = [] # List the useful backends for this specific query useful_backends = [x.backend for x in query.cities] - # TODO: Handle max_entries better try: for housing in itertools.islice( self.webnip.do( @@ -187,6 +200,16 @@ class WoobProxy(object): ), max_entries, ): + if not force_fetch_all: + # Check whether we should continue iterating or not + last_fetch_datetime = last_fetch_by_backend.get(housing.backend) + if last_fetch_datetime and housing.date and housing.date < last_fetch_datetime: + LOGGER.info( + 'Done iterating till last fetch (housing.date=%s, last_fetch=%s). Stopping iteration.', + housing.date, + last_fetch_datetime + ) + break if not store_personal_data: housing.phone = None housings.append(json.dumps(housing, cls=WoobEncoder)) @@ -240,19 +263,66 @@ def fetch_flats(config): """ fetched_flats = {} + # Get last fetch datetimes for all constraints / backends + get_session = database.init_db(config["database"], config["search_index"]) + with get_session() as session: + last_fetch = collections.defaultdict(dict) + for item in session.query(last_fetch_model.LastFetch).all(): + last_fetch[item.constraint_name][item.backend] = item.last_fetch + + # Do the actual fetching for constraint_name, constraint in config["constraints"].items(): LOGGER.info("Loading flats for constraint %s...", constraint_name) + with WoobProxy(config) as woob_proxy: queries = woob_proxy.build_queries(constraint) housing_posts = [] for query in queries: - housing_posts.extend(woob_proxy.query(query, config["max_entries"], config["store_personal_data"])) + housing_posts.extend( + woob_proxy.query( + query, + config["max_entries"], + config["store_personal_data"], + config["force_fetch_all"], + last_fetch[constraint_name] + ) + ) + + housing_posts = [json.loads(flat) for flat in housing_posts] + + # Update last_fetch + last_fetch_by_backends = collections.defaultdict(lambda: None) + for flat in housing_posts: + 
backend = flat['id'].split('@')[-1] + if ( + last_fetch_by_backends[backend] is None + or last_fetch_by_backends[backend] < flat['date'] + ): + last_fetch_by_backends[backend] = flat['date'] + for backend in last_fetch_by_backends: + last_fetch_in_db = session.query(last_fetch_model.LastFetch).where( + last_fetch_model.LastFetch.constraint_name == constraint_name, + last_fetch_model.LastFetch.backend == backend + ).first() + if last_fetch_in_db: + last_fetch_in_db.last_fetch = arrow.get( + last_fetch_by_backends[backend] + ).date() + else: + last_fetch_in_db = last_fetch_model.LastFetch( + constraint_name=constraint_name, + backend=backend, + last_fetch=arrow.get(last_fetch_by_backends[backend]).date() + ) + session.add(last_fetch_in_db) + session.commit() + housing_posts = housing_posts[: config["max_entries"]] LOGGER.info("Fetched %d flats.", len(housing_posts)) - constraint_flats_list = [json.loads(flat) for flat in housing_posts] - constraint_flats_list = [WoobProxy.restore_decimal_fields(flat) for flat in constraint_flats_list] + constraint_flats_list = [WoobProxy.restore_decimal_fields(flat) for flat in housing_posts] fetched_flats[constraint_name] = constraint_flats_list + return fetched_flats diff --git a/flatisfy/models/last_fetch.py b/flatisfy/models/last_fetch.py new file mode 100644 index 0000000..281fed9 --- /dev/null +++ b/flatisfy/models/last_fetch.py @@ -0,0 +1,31 @@ +# coding: utf-8 +""" +This module defines an SQLAlchemy ORM model for the last fetch timestamps. +""" +# pylint: disable=locally-disabled,invalid-name,too-few-public-methods +from __future__ import absolute_import, print_function, unicode_literals + +import logging + +from sqlalchemy import ( + Column, + DateTime, + String, +) + +from flatisfy.database.base import BASE + + +LOGGER = logging.getLogger(__name__) + + +class LastFetch(BASE): + """ + SQLAlchemy ORM model to store last timestamp of fetch by backend. 
+ """ + + __tablename__ = "last_fetch" + + backend = Column(String, primary_key=True) + last_fetch = Column(DateTime) + constraint_name = Column(String) diff --git a/flatisfy/web/js_src/store/getters.js b/flatisfy/web/js_src/store/getters.js index 4d9b94e..0f3fea9 100644 --- a/flatisfy/web/js_src/store/getters.js +++ b/flatisfy/web/js_src/store/getters.js @@ -9,80 +9,80 @@ export default { isLoading: (state) => state.loading > 0, inseeCodesFlatsBuckets: (state, getters) => (filter) => { - const buckets = {}; + const buckets = {} state.flats.forEach((flat) => { if (!filter || filter(flat)) { - const insee = flat.flatisfy_postal_code.insee_code; + const insee = flat.flatisfy_postal_code.insee_code if (!buckets[insee]) { buckets[insee] = { name: flat.flatisfy_postal_code.name, - flats: [], - }; + flats: [] + } } - buckets[insee].flats.push(flat); + buckets[insee].flats.push(flat) } - }); + }) - return buckets; + return buckets }, flatsMarkers: (state, getters) => (router, filter) => { - const markers = []; + const markers = [] state.flats.forEach((flat) => { if (filter && filter(flat)) { - const gps = findFlatGPS(flat); + const gps = findFlatGPS(flat) if (gps) { const previousMarker = markers.find( (marker) => marker.gps[0] === gps[0] && marker.gps[1] === gps[1] - ); + ) if (previousMarker) { // randomize position a bit // gps[0] += (Math.random() - 0.5) / 500 // gps[1] += (Math.random() - 0.5) / 500 } const href = router.resolve({ - name: "details", - params: { id: flat.id }, - }).href; + name: 'details', + params: { id: flat.id } + }).href const cost = flat.cost ? 
costFilter(flat.cost, flat.currency) - : ""; + : '' markers.push({ - title: "", + title: '', content: '' + flat.title + - "" + + '' + cost, gps: gps, - flatId: flat.id, - }); + flatId: flat.id + }) } } - }); + }) - return markers; + return markers }, allTimeToPlaces: (state) => { - const places = {}; + const places = {} Object.keys(state.timeToPlaces).forEach((constraint) => { - const constraintTimeToPlaces = state.timeToPlaces[constraint]; + const constraintTimeToPlaces = state.timeToPlaces[constraint] Object.keys(constraintTimeToPlaces).forEach((name) => { - places[name] = constraintTimeToPlaces[name]; - }); - }); - return places; + places[name] = constraintTimeToPlaces[name] + }) + }) + return places }, timeToPlaces: (state, getters) => (constraintName) => { - return state.timeToPlaces[constraintName]; + return state.timeToPlaces[constraintName] }, - metadata: (state) => state.metadata, -}; + metadata: (state) => state.metadata +}