From ee2880326c60093fc6c57772a64c1f89eb15380d Mon Sep 17 00:00:00 2001
From: "Phyks (Lucas Verney)"
Date: Mon, 22 Jan 2018 01:06:09 +0100
Subject: [PATCH] Add a way to download photos locally

Fix for #94.
---
 flatisfy/config.py             |  3 +++
 flatisfy/filters/__init__.py   |  4 ++++
 flatisfy/filters/cache.py      | 46 +++++++++++++++++++++++++++++++++++++++++++---
 flatisfy/filters/duplicates.py | 16 +++++++++-------
 flatisfy/filters/images.py     | 37 +++++++++++++++++++++++++++++++++++++
 flatisfy/tests.py              | 16 ++++++++++++++--
 6 files changed, 110 insertions(+), 12 deletions(-)
 create mode 100644 flatisfy/filters/images.py

diff --git a/flatisfy/config.py b/flatisfy/config.py
index 68df028..e9c01ac 100644
--- a/flatisfy/config.py
+++ b/flatisfy/config.py
@@ -51,6 +51,8 @@ DEFAULT_CONFIG = {
     "duplicate_threshold": 15,
     # Score to consider two images as being duplicates through hash comparison
     "duplicate_image_hash_threshold": 10,
+    # Whether images should be downloaded and served locally
+    "serve_images_locally": True,
     # Navitia API key
     "navitia_api_key": None,
     # Number of filtering passes to run
@@ -275,6 +277,7 @@ def load_config(args=None, check_with_data=True):
         LOGGER.info("Creating data directory according to config: %s",
                     config_data["data_directory"])
         os.makedirs(config_data["data_directory"])
+        os.makedirs(os.path.join(config_data["data_directory"], "images"))
 
     if config_data["database"] is None:
         config_data["database"] = "sqlite:///" + os.path.join(
diff --git a/flatisfy/filters/__init__.py b/flatisfy/filters/__init__.py
index 4efadac..2e9bf62 100644
--- a/flatisfy/filters/__init__.py
+++ b/flatisfy/filters/__init__.py
@@ -10,6 +10,7 @@ import logging
 
 from flatisfy import tools
 from flatisfy.filters import duplicates
+from flatisfy.filters import images
 from flatisfy.filters import metadata
 
 
@@ -226,6 +227,9 @@ def second_pass(flats_list, constraint, config):
     flats_list, ignored_list = refine_with_details_criteria(flats_list,
                                                             constraint)
 
+    if config["serve_images_locally"]:
+        images.download_images(flats_list, config)
+
     return {
         "new": flats_list,
         "ignored": ignored_list,
diff --git a/flatisfy/filters/cache.py b/flatisfy/filters/cache.py
index a98b74b..6d5de6d 100644
--- a/flatisfy/filters/cache.py
+++ b/flatisfy/filters/cache.py
@@ -1,12 +1,16 @@
 # coding: utf-8
-
 """
 Caching function for pictures.
 """
 
 from __future__ import absolute_import, print_function, unicode_literals
 
+import hashlib
+import os
 import requests
+from io import BytesIO
+
+import PIL.Image
 
 
 class MemoryCache(object):
@@ -81,8 +85,44 @@ class ImageCache(MemoryCache):
     A cache for images, stored in memory.
     """
     @staticmethod
-    def on_miss(url):
+    def compute_filename(url):
+        """
+        Compute filename (hash of the URL) for the cached image.
+
+        :param url: The URL of the image.
+        :return: The filename, with its extension.
+        """
+        # Always store as JPEG
+        return "%s.jpg" % hashlib.sha1(url.encode("utf-8")).hexdigest()
+
+    def on_miss(self, url):
         """
         Helper to actually retrieve photos if not already cached.
+
+        :param url: The URL of the image.
+        :return: The image, as returned by ``PIL.Image.open``.
         """
-        return requests.get(url)
+        filepath = None
+        if self.storage_dir:
+            # Only touch the disk when a storage directory is set:
+            # ``os.path.join`` raises on a ``None`` directory.
+            filepath = os.path.join(
+                self.storage_dir,
+                self.compute_filename(url)
+            )
+            if os.path.isfile(filepath):
+                return PIL.Image.open(filepath)
+        image = PIL.Image.open(BytesIO(requests.get(url).content))
+        if filepath:
+            image.save(filepath, format=image.format)
+        return image
+
+    def __init__(self, storage_dir=None):
+        """
+        :param storage_dir: Directory to store the downloaded images in. Use
+            ``None`` (default) to not store them on disk.
+        """
+        self.storage_dir = storage_dir
+        if self.storage_dir and not os.path.isdir(self.storage_dir):
+            os.makedirs(self.storage_dir)
+        super(ImageCache, self).__init__()
diff --git a/flatisfy/filters/duplicates.py b/flatisfy/filters/duplicates.py
index e41d828..c12309a 100644
--- a/flatisfy/filters/duplicates.py
+++ b/flatisfy/filters/duplicates.py
@@ -7,12 +7,10 @@ from __future__ import absolute_import, print_function, unicode_literals
 import collections
 import itertools
 import logging
+import os
 import re
 
-from io import BytesIO
-
 import imagehash
-import PIL.Image
 import requests
 
 from flatisfy import tools
@@ -69,8 +67,7 @@ def get_or_compute_photo_hash(photo, photo_cache):
         return photo["hash"]
     except KeyError:
         # Otherwise, get the image and compute the hash
-        req = photo_cache.get(photo["url"])
-        image = PIL.Image.open(BytesIO(req.content))
+        image = photo_cache.get(photo["url"])
         photo["hash"] = imagehash.average_hash(image)
         return photo["hash"]
 
@@ -329,8 +326,13 @@ def deep_detect(flats_list, config):
         the flats objects that should be removed and considered as duplicates
         (they were already merged).
     """
-
-    photo_cache = ImageCache()
+    if config["serve_images_locally"]:
+        storage_dir = os.path.join(config["data_directory"], "images")
+    else:
+        storage_dir = None
+    photo_cache = ImageCache(
+        storage_dir=storage_dir
+    )
 
     LOGGER.info("Running deep duplicates detection.")
     matching_flats = collections.defaultdict(list)
diff --git a/flatisfy/filters/images.py b/flatisfy/filters/images.py
new file mode 100644
index 0000000..022f537
--- /dev/null
+++ b/flatisfy/filters/images.py
@@ -0,0 +1,37 @@
+# coding: utf-8
+"""
+Filtering functions to handle images.
+
+This includes functions to download images.
+"""
+from __future__ import absolute_import, print_function, unicode_literals
+
+import logging
+import os
+
+from flatisfy.filters.cache import ImageCache
+
+
+LOGGER = logging.getLogger(__name__)
+
+
+def download_images(flats_list, config):
+    """
+    Download images for all flats in the list, to serve them locally.
+
+    :param flats_list: A list of flats dicts.
+    :param config: A config dict.
+    """
+    photo_cache = ImageCache(
+        storage_dir=os.path.join(config["data_directory"], "images")
+    )
+    flats_list_length = len(flats_list)
+    for i, flat in enumerate(flats_list):
+        LOGGER.info(
+            "Downloading photos for flat %d/%d.", i + 1, flats_list_length
+        )
+        for photo in flat["photos"]:
+            # Download photo
+            photo_cache.get(photo["url"])
+            # And store the local image
+            photo["local"] = photo_cache.compute_filename(photo["url"])
diff --git a/flatisfy/tests.py b/flatisfy/tests.py
index b74e3d9..d686a36 100644
--- a/flatisfy/tests.py
+++ b/flatisfy/tests.py
@@ -9,6 +9,8 @@ import os
 import random
 import sys
 import unittest
+import tempfile
+
 import requests
 import requests_mock
 
@@ -166,9 +168,14 @@ class TestPhoneNumbers(unittest.TestCase):
 
 
 class TestPhotos(unittest.TestCase):
-    IMAGE_CACHE = LocalImageCache()  # pylint: disable=invalid-name
     HASH_THRESHOLD = 10  # pylint: disable=invalid-name
 
+    def __init__(self, *args, **kwargs):
+        self.IMAGE_CACHE = LocalImageCache(  # pylint: disable=invalid-name
+            storage_dir=tempfile.mkdtemp(prefix="flatisfy-")
+        )
+        super(TestPhotos, self).__init__(*args, **kwargs)
+
     def test_same_photo_twice(self):
         """
         Compares a photo against itself.
@@ -262,7 +269,12 @@ class TestDuplicates(unittest.TestCase):
     DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14  # pylint: disable=invalid-name
     DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15  # pylint: disable=invalid-name
     HASH_THRESHOLD = 10  # pylint: disable=invalid-name
-    IMAGE_CACHE = ImageCache()  # pylint: disable=invalid-name
+
+    def __init__(self, *args, **kwargs):
+        self.IMAGE_CACHE = ImageCache(  # pylint: disable=invalid-name
+            storage_dir=tempfile.mkdtemp(prefix="flatisfy-")
+        )
+        super(TestDuplicates, self).__init__(*args, **kwargs)
 
     @staticmethod
     def generate_fake_flat():