Add a way to download photos locally

Fix for #94.
This commit is contained in:
Lucas Verney 2018-01-22 01:06:09 +01:00
parent d6b82b24c6
commit ee2880326c
6 changed files with 100 additions and 12 deletions

View File

@ -51,6 +51,8 @@ DEFAULT_CONFIG = {
"duplicate_threshold": 15,
# Score to consider two images as being duplicates through hash comparison
"duplicate_image_hash_threshold": 10,
# Whether images should be downloaded and served locally
"serve_images_locally": True,
# Navitia API key
"navitia_api_key": None,
# Number of filtering passes to run
@ -275,6 +277,7 @@ def load_config(args=None, check_with_data=True):
LOGGER.info("Creating data directory according to config: %s",
config_data["data_directory"])
os.makedirs(config_data["data_directory"])
os.makedirs(os.path.join(config_data["data_directory"], "images"))
if config_data["database"] is None:
config_data["database"] = "sqlite:///" + os.path.join(

View File

@ -10,6 +10,7 @@ import logging
from flatisfy import tools
from flatisfy.filters import duplicates
from flatisfy.filters import images
from flatisfy.filters import metadata
@ -226,6 +227,9 @@ def second_pass(flats_list, constraint, config):
flats_list, ignored_list = refine_with_details_criteria(flats_list,
constraint)
if config["serve_images_locally"]:
images.download_images(flats_list, config)
return {
"new": flats_list,
"ignored": ignored_list,

View File

@ -1,12 +1,16 @@
# coding: utf-8
"""
Caching function for pictures.
"""
from __future__ import absolute_import, print_function, unicode_literals
import hashlib
import os
import requests
from io import BytesIO
import PIL.Image
class MemoryCache(object):
@ -81,8 +85,34 @@ class ImageCache(MemoryCache):
A cache for images, stored in memory.
"""
@staticmethod
def on_miss(url):
def compute_filename(url):
    """
    Build the name under which the image found at ``url`` is cached.

    The name is the hexadecimal SHA-1 digest of the UTF-8 encoded URL,
    followed by a fixed ``.jpg`` extension.

    :param url: The URL of the image.
    :return: The filename, with its extension.
    """
    url_digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
    # The extension is always ".jpg", whatever the URL looks like.
    return "{}.jpg".format(url_digest)
def on_miss(self, url):
    """
    Helper to actually retrieve photos if not already cached.

    When a storage directory is configured, the image is first looked up
    on disk (under the name given by ``compute_filename``) and, if it had
    to be downloaded, it is written there afterwards. Without a storage
    directory, the image is only downloaded and the disk is never touched.

    :param url: The URL of the image.
    :return: The image, as opened by ``PIL.Image.open``.
    """
    # ``storage_dir`` defaults to ``None``: only build a path when there is
    # a directory to build it from, as ``os.path.join`` rejects ``None``.
    filepath = None
    if self.storage_dir:
        filepath = os.path.join(
            self.storage_dir,
            self.compute_filename(url)
        )
        if os.path.isfile(filepath):
            # Already stored locally, no need to fetch it again.
            return PIL.Image.open(filepath)

    image = PIL.Image.open(BytesIO(requests.get(url).content))
    if filepath is not None:
        # NOTE(review): the file is written in the downloaded image's own
        # format although the computed filename always ends in ".jpg".
        image.save(filepath, format=image.format)
    return image
def __init__(self, storage_dir=None):
    """
    Set up the cache, optionally backed by a directory on disk.

    :param storage_dir: Directory in which images are stored. It is
        created when it is set and does not exist yet. Defaults to
        ``None``.
    """
    self.storage_dir = storage_dir
    must_create = bool(storage_dir) and not os.path.isdir(storage_dir)
    if must_create:
        os.makedirs(storage_dir)
    super(ImageCache, self).__init__()

View File

@ -7,12 +7,10 @@ from __future__ import absolute_import, print_function, unicode_literals
import collections
import itertools
import logging
import os
import re
from io import BytesIO
import imagehash
import PIL.Image
import requests
from flatisfy import tools
@ -69,8 +67,7 @@ def get_or_compute_photo_hash(photo, photo_cache):
return photo["hash"]
except KeyError:
# Otherwise, get the image and compute the hash
req = photo_cache.get(photo["url"])
image = PIL.Image.open(BytesIO(req.content))
image = photo_cache.get(photo["url"])
photo["hash"] = imagehash.average_hash(image)
return photo["hash"]
@ -329,8 +326,13 @@ def deep_detect(flats_list, config):
the flats objects that should be removed and considered as duplicates
(they were already merged).
"""
photo_cache = ImageCache()
if config["serve_images_locally"]:
storage_dir = os.path.join(config["data_directory"], "images")
else:
storage_dir = None
photo_cache = ImageCache(
storage_dir=storage_dir
)
LOGGER.info("Running deep duplicates detection.")
matching_flats = collections.defaultdict(list)

View File

@ -0,0 +1,37 @@
# coding: utf-8
"""
Filtering functions to handle images.
This includes functions to download images.
"""
from __future__ import absolute_import, print_function, unicode_literals
import logging
import os
from flatisfy.filters.cache import ImageCache
LOGGER = logging.getLogger(__name__)
def download_images(flats_list, config):
    """
    Fetch the photos of every flat in the list, to serve them locally.

    Photos are requested through an ``ImageCache`` whose storage directory
    is the ``images`` subdirectory of the configured data directory. Each
    photo dict is updated in place with a ``local`` key holding the
    filename the cache computes for the photo URL.

    :param flats_list: A list of flats dicts. Each flat is read through its
        ``photos`` key, and each photo through its ``url`` key.
    :param config: A config dict, providing ``data_directory``.
    """
    images_dir = os.path.join(config["data_directory"], "images")
    cache = ImageCache(storage_dir=images_dir)

    total = len(flats_list)
    for position, flat in enumerate(flats_list, start=1):
        LOGGER.info("Downloading photos for flat %d/%d.", position, total)
        for photo in flat["photos"]:
            url = photo["url"]
            # Requesting the photo from the cache is what downloads it.
            cache.get(url)
            # Keep track of the filename the cache associates to this URL.
            photo["local"] = cache.compute_filename(url)

View File

@ -9,6 +9,8 @@ import os
import random
import sys
import unittest
import tempfile
import requests
import requests_mock
@ -166,9 +168,14 @@ class TestPhoneNumbers(unittest.TestCase):
class TestPhotos(unittest.TestCase):
IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
def __init__(self, *args, **kwargs):
    """
    Give the test case an image cache stored in a fresh temporary
    directory (prefixed with ``flatisfy-``), then run the parent
    initialisation with the received arguments.
    """
    cache_dir = tempfile.mkdtemp(prefix="flatisfy-")
    cache = LocalImageCache(storage_dir=cache_dir)
    self.IMAGE_CACHE = cache  # pylint: disable=invalid-name
    super(TestPhotos, self).__init__(*args, **kwargs)
def test_same_photo_twice(self):
"""
Compares a photo against itself.
@ -262,7 +269,12 @@ class TestDuplicates(unittest.TestCase):
DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name
DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
IMAGE_CACHE = ImageCache() # pylint: disable=invalid-name
def __init__(self, *args, **kwargs):
    """
    Give the test case an image cache stored in a fresh temporary
    directory (prefixed with ``flatisfy-``), then run the parent
    initialisation with the received arguments.
    """
    cache_dir = tempfile.mkdtemp(prefix="flatisfy-")
    cache = ImageCache(storage_dir=cache_dir)
    self.IMAGE_CACHE = cache  # pylint: disable=invalid-name
    super(TestDuplicates, self).__init__(*args, **kwargs)
@staticmethod
def generate_fake_flat():