From 538bbe5a0511563da5d9665d59926a908a1d7e59 Mon Sep 17 00:00:00 2001 From: "Phyks (Lucas Verney)" Date: Mon, 22 Jan 2018 01:06:09 +0100 Subject: [PATCH] Add a way to download photos locally Fix for #94. --- flatisfy/config.py | 2 + flatisfy/filters/__init__.py | 4 ++ flatisfy/filters/cache.py | 29 ++++++++++++-- flatisfy/filters/duplicates.py | 16 ++++---- flatisfy/filters/images.py | 34 ++++++++++++++++ flatisfy/tests.py | 72 ++++++++++++++++++++-------------- 6 files changed, 117 insertions(+), 40 deletions(-) create mode 100644 flatisfy/filters/images.py diff --git a/flatisfy/config.py b/flatisfy/config.py index 68df028..ecce0b0 100644 --- a/flatisfy/config.py +++ b/flatisfy/config.py @@ -51,6 +51,8 @@ DEFAULT_CONFIG = { "duplicate_threshold": 15, # Score to consider two images as being duplicates through hash comparison "duplicate_image_hash_threshold": 10, + # Whether images should be downloaded and served locally + "serve_images_locally": True, # Navitia API key "navitia_api_key": None, # Number of filtering passes to run diff --git a/flatisfy/filters/__init__.py b/flatisfy/filters/__init__.py index 4efadac..2e9bf62 100644 --- a/flatisfy/filters/__init__.py +++ b/flatisfy/filters/__init__.py @@ -10,6 +10,7 @@ import logging from flatisfy import tools from flatisfy.filters import duplicates +from flatisfy.filters import images from flatisfy.filters import metadata @@ -226,6 +227,9 @@ def second_pass(flats_list, constraint, config): flats_list, ignored_list = refine_with_details_criteria(flats_list, constraint) + if config["serve_images_locally"]: + images.download_images(flats_list, config) + return { "new": flats_list, "ignored": ignored_list, diff --git a/flatisfy/filters/cache.py b/flatisfy/filters/cache.py index a98b74b..229d5f1 100644 --- a/flatisfy/filters/cache.py +++ b/flatisfy/filters/cache.py @@ -1,12 +1,16 @@ # coding: utf-8 - """ Caching function for pictures. """ from __future__ import absolute_import, print_function, unicode_literals +import hashlib +import os import requests +from io import BytesIO + +import PIL.Image class MemoryCache(object): @@ -81,8 +85,27 @@ class ImageCache(MemoryCache): A cache for images, stored in memory. """ @staticmethod - def on_miss(url): + def compute_filename(url): + return hashlib.sha1(url.encode("utf-8")).hexdigest() + + def on_miss(self, url): """ Helper to actually retrieve photos if not already cached. """ - return requests.get(url) + filepath = os.path.join( + self.storage_dir, + self.compute_filename(url) + ) + if os.path.isfile(filepath): + image = PIL.Image.open(filepath) + else: + image = PIL.Image.open(BytesIO(requests.get(url).content)) + if self.storage_dir: + image.save(filepath, format=image.format) + return image + + def __init__(self, storage_dir=None): + self.storage_dir = storage_dir + if self.storage_dir and not os.path.isdir(self.storage_dir): + os.makedirs(self.storage_dir) + super(ImageCache, self).__init__() diff --git a/flatisfy/filters/duplicates.py b/flatisfy/filters/duplicates.py index b5e2803..509b7f6 100644 --- a/flatisfy/filters/duplicates.py +++ b/flatisfy/filters/duplicates.py @@ -7,12 +7,10 @@ from __future__ import absolute_import, print_function, unicode_literals import collections import itertools import logging +import os import re -from io import BytesIO - import imagehash -import PIL.Image import requests from flatisfy import tools @@ -61,8 +59,7 @@ def get_or_compute_photo_hash(photo, photo_cache): return photo["hash"] except KeyError: # Otherwise, get the image and compute the hash - req = photo_cache.get(photo["url"]) - image = PIL.Image.open(BytesIO(req.content)) + image = photo_cache.get(photo["url"]) photo["hash"] = imagehash.average_hash(image) return photo["hash"] @@ -322,8 +319,13 @@ def deep_detect(flats_list, config): the flats objects that should be removed and considered as duplicates (they were already merged). """ - - photo_cache = ImageCache() + if config["serve_images_locally"]: + storage_dir = os.path.join(config["data_directory"], "images") + else: + storage_dir = None + photo_cache = ImageCache( + storage_dir=storage_dir + ) LOGGER.info("Running deep duplicates detection.") matching_flats = collections.defaultdict(list) diff --git a/flatisfy/filters/images.py b/flatisfy/filters/images.py new file mode 100644 index 0000000..9c21ca8 --- /dev/null +++ b/flatisfy/filters/images.py @@ -0,0 +1,34 @@ +# coding: utf-8 +""" +Filtering functions to handle images. + +This includes functions to download images. +""" +from __future__ import absolute_import, print_function, unicode_literals + +import logging +import os + +from flatisfy.filters.cache import ImageCache + + +LOGGER = logging.getLogger(__name__) + + +def download_images(flats_list, config): + """ + TODO + """ + photo_cache = ImageCache( + storage_dir=os.path.join(config["data_directory"], "images") + ) + flats_list_length = len(flats_list) + for i, flat in enumerate(flats_list): + LOGGER.info( + "Downloading photos for flat %d/%d.", i + 1, flats_list_length + ) + for photo in flat["photos"]: + # Download photo + photo_cache.get(photo["url"]) + # And store the local image + photo["local"] = photo_cache.compute_filename(photo["url"]) diff --git a/flatisfy/tests.py b/flatisfy/tests.py index bf73067..d668bba 100644 --- a/flatisfy/tests.py +++ b/flatisfy/tests.py @@ -9,6 +9,8 @@ import os import random import sys import unittest +import tempfile + import requests import requests_mock @@ -157,9 +159,14 @@ class TestPhoneNumbers(unittest.TestCase): class TestPhotos(unittest.TestCase): - IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name HASH_THRESHOLD = 10 # pylint: disable=invalid-name + def __init__(self, *args, **kwargs): + self.IMAGE_CACHE = LocalImageCache( # pylint: disable=invalid-name + storage_dir=tempfile.mkdtemp(prefix="flatisfy-") + ) + super(TestPhotos, self).__init__(*args, **kwargs) + def test_same_photo_twice(self): """ Compares a photo against itself. @@ -171,8 +178,8 @@ class TestPhotos(unittest.TestCase): self.assertTrue(duplicates.compare_photos( photo, photo, - TestPhotos.IMAGE_CACHE, - TestPhotos.HASH_THRESHOLD + self.IMAGE_CACHE, + self.HASH_THRESHOLD )) def test_different_photos(self): @@ -182,15 +189,15 @@ class TestPhotos(unittest.TestCase): self.assertFalse(duplicates.compare_photos( {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"}, - TestPhotos.IMAGE_CACHE, - TestPhotos.HASH_THRESHOLD + self.IMAGE_CACHE, + self.HASH_THRESHOLD )) self.assertFalse(duplicates.compare_photos( {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"}, - TestPhotos.IMAGE_CACHE, - TestPhotos.HASH_THRESHOLD + self.IMAGE_CACHE, + self.HASH_THRESHOLD )) def test_matching_photos(self): @@ -200,29 +207,29 @@ class TestPhotos(unittest.TestCase): self.assertTrue(duplicates.compare_photos( {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"}, {"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"}, - TestPhotos.IMAGE_CACHE, - TestPhotos.HASH_THRESHOLD + self.IMAGE_CACHE, + self.HASH_THRESHOLD )) self.assertTrue(duplicates.compare_photos( {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"}, {"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"}, - TestPhotos.IMAGE_CACHE, - TestPhotos.HASH_THRESHOLD + self.IMAGE_CACHE, + self.HASH_THRESHOLD )) self.assertTrue(duplicates.compare_photos( {"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"}, {"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"}, - TestPhotos.IMAGE_CACHE, - TestPhotos.HASH_THRESHOLD + self.IMAGE_CACHE, + self.HASH_THRESHOLD )) self.assertTrue(duplicates.compare_photos( {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"}, - TestPhotos.IMAGE_CACHE, - TestPhotos.HASH_THRESHOLD + self.IMAGE_CACHE, + self.HASH_THRESHOLD )) @@ -233,7 +240,12 @@ class TestDuplicates(unittest.TestCase): DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name HASH_THRESHOLD = 10 # pylint: disable=invalid-name - IMAGE_CACHE = ImageCache() # pylint: disable=invalid-name + + def __init__(self, *args, **kwargs): + self.IMAGE_CACHE = ImageCache( # pylint: disable=invalid-name + storage_dir=tempfile.mkdtemp(prefix="flatisfy-") + ) + super(TestDuplicates, self).__init__(*args, **kwargs) @staticmethod def generate_fake_flat(): @@ -276,10 +288,10 @@ class TestDuplicates(unittest.TestCase): flat2 = copy.deepcopy(flat1) score = duplicates.get_duplicate_score( flat1, flat2, - TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD + self.IMAGE_CACHE, self.HASH_THRESHOLD ) self.assertTrue( - score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS + score >= self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS ) def test_different_prices(self): @@ -292,10 +304,10 @@ class TestDuplicates(unittest.TestCase): score = duplicates.get_duplicate_score( flat1, flat2, - TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD + self.IMAGE_CACHE, self.HASH_THRESHOLD ) self.assertTrue( - score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS + score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS ) def test_different_rooms(self): @@ -309,10 +321,10 @@ class TestDuplicates(unittest.TestCase): score = duplicates.get_duplicate_score( flat1, flat2, - TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD + self.IMAGE_CACHE, self.HASH_THRESHOLD ) self.assertTrue( - score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS + score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS ) def test_different_areas(self): @@ -325,10 +337,10 @@ class TestDuplicates(unittest.TestCase): score = duplicates.get_duplicate_score( flat1, flat2, - TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD + self.IMAGE_CACHE, self.HASH_THRESHOLD ) self.assertTrue( - score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS + score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS ) def test_different_areas_decimals(self): @@ -343,10 +355,10 @@ class TestDuplicates(unittest.TestCase): score = duplicates.get_duplicate_score( flat1, flat2, - TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD + self.IMAGE_CACHE, self.HASH_THRESHOLD ) self.assertTrue( - score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS + score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS ) def test_different_phones(self): @@ -360,10 +372,10 @@ class TestDuplicates(unittest.TestCase): score = duplicates.get_duplicate_score( flat1, flat2, - TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD + self.IMAGE_CACHE, self.HASH_THRESHOLD ) self.assertTrue( - score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS + score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS ) def test_real_duplicates(self): @@ -378,10 +390,10 @@ class TestDuplicates(unittest.TestCase): score = duplicates.get_duplicate_score( flats[0], flats[1], - TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD + self.IMAGE_CACHE, self.HASH_THRESHOLD ) self.assertTrue( - score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS + score >= self.DUPLICATES_MIN_SCORE_WITH_PHOTOS )