Add a way to download photos locally

Fix for #94.
This commit is contained in:
Lucas Verney 2018-01-22 01:06:09 +01:00
parent d6b82b24c6
commit ee2880326c
6 changed files with 100 additions and 12 deletions

View File

@ -51,6 +51,8 @@ DEFAULT_CONFIG = {
"duplicate_threshold": 15, "duplicate_threshold": 15,
# Score to consider two images as being duplicates through hash comparison # Score to consider two images as being duplicates through hash comparison
"duplicate_image_hash_threshold": 10, "duplicate_image_hash_threshold": 10,
# Whether images should be downloaded and served locally
"serve_images_locally": True,
# Navitia API key # Navitia API key
"navitia_api_key": None, "navitia_api_key": None,
# Number of filtering passes to run # Number of filtering passes to run
@ -275,6 +277,7 @@ def load_config(args=None, check_with_data=True):
LOGGER.info("Creating data directory according to config: %s", LOGGER.info("Creating data directory according to config: %s",
config_data["data_directory"]) config_data["data_directory"])
os.makedirs(config_data["data_directory"]) os.makedirs(config_data["data_directory"])
os.makedirs(os.path.join(config_data["data_directory"], "images"))
if config_data["database"] is None: if config_data["database"] is None:
config_data["database"] = "sqlite:///" + os.path.join( config_data["database"] = "sqlite:///" + os.path.join(

View File

@ -10,6 +10,7 @@ import logging
from flatisfy import tools from flatisfy import tools
from flatisfy.filters import duplicates from flatisfy.filters import duplicates
from flatisfy.filters import images
from flatisfy.filters import metadata from flatisfy.filters import metadata
@ -226,6 +227,9 @@ def second_pass(flats_list, constraint, config):
flats_list, ignored_list = refine_with_details_criteria(flats_list, flats_list, ignored_list = refine_with_details_criteria(flats_list,
constraint) constraint)
if config["serve_images_locally"]:
images.download_images(flats_list, config)
return { return {
"new": flats_list, "new": flats_list,
"ignored": ignored_list, "ignored": ignored_list,

View File

@ -1,12 +1,16 @@
# coding: utf-8 # coding: utf-8
""" """
Caching function for pictures. Caching function for pictures.
""" """
from __future__ import absolute_import, print_function, unicode_literals from __future__ import absolute_import, print_function, unicode_literals
import hashlib
import os
import requests import requests
from io import BytesIO
import PIL.Image
class MemoryCache(object): class MemoryCache(object):
@ -81,8 +85,34 @@ class ImageCache(MemoryCache):
A cache for images, stored in memory. A cache for images, stored in memory.
""" """
@staticmethod @staticmethod
def compute_filename(url):
    """
    Compute the filename (a hash of the URL) under which a cached image
    is stored on disk.

    :param url: The URL of the image.
    :return: The filename, with its extension.
    """
    # Images are always stored as JPEG, hence the fixed extension.
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
    return "%s.jpg" % digest
def on_miss(self, url):
    """
    Helper to actually retrieve photos if not already cached.

    Serves the image from the on-disk store when ``storage_dir`` is set
    and the file exists; otherwise downloads it (and persists it when a
    ``storage_dir`` is configured).

    :param url: The URL of the image to fetch.
    :return: A ``PIL.Image`` object.
    """
    # Only build a file path when a storage directory is configured;
    # the original code called os.path.join(self.storage_dir, ...)
    # unconditionally, which raises TypeError when storage_dir is None
    # (the default).
    filepath = None
    if self.storage_dir:
        filepath = os.path.join(
            self.storage_dir,
            self.compute_filename(url)
        )
    if filepath and os.path.isfile(filepath):
        return PIL.Image.open(filepath)
    image = PIL.Image.open(BytesIO(requests.get(url).content))
    if filepath:
        # Persist for next time; keep the downloaded image's format.
        image.save(filepath, format=image.format)
    return image
def __init__(self, storage_dir=None):
    """
    :param storage_dir: Optional directory in which downloaded images
        are persisted. When ``None``, images are only kept in memory.
        The directory is created if it does not exist yet.
    """
    self.storage_dir = storage_dir
    if self.storage_dir and not os.path.isdir(self.storage_dir):
        os.makedirs(self.storage_dir)
    super(ImageCache, self).__init__()

View File

@ -7,12 +7,10 @@ from __future__ import absolute_import, print_function, unicode_literals
import collections import collections
import itertools import itertools
import logging import logging
import os
import re import re
from io import BytesIO
import imagehash import imagehash
import PIL.Image
import requests import requests
from flatisfy import tools from flatisfy import tools
@ -69,8 +67,7 @@ def get_or_compute_photo_hash(photo, photo_cache):
return photo["hash"] return photo["hash"]
except KeyError: except KeyError:
# Otherwise, get the image and compute the hash # Otherwise, get the image and compute the hash
req = photo_cache.get(photo["url"]) image = photo_cache.get(photo["url"])
image = PIL.Image.open(BytesIO(req.content))
photo["hash"] = imagehash.average_hash(image) photo["hash"] = imagehash.average_hash(image)
return photo["hash"] return photo["hash"]
@ -329,8 +326,13 @@ def deep_detect(flats_list, config):
the flats objects that should be removed and considered as duplicates the flats objects that should be removed and considered as duplicates
(they were already merged). (they were already merged).
""" """
if config["serve_images_locally"]:
photo_cache = ImageCache() storage_dir = os.path.join(config["data_directory"], "images")
else:
storage_dir = None
photo_cache = ImageCache(
storage_dir=storage_dir
)
LOGGER.info("Running deep duplicates detection.") LOGGER.info("Running deep duplicates detection.")
matching_flats = collections.defaultdict(list) matching_flats = collections.defaultdict(list)

View File

@ -0,0 +1,37 @@
# coding: utf-8
"""
Filtering functions to handle images.
This includes functions to download images.
"""
from __future__ import absolute_import, print_function, unicode_literals
import logging
import os
from flatisfy.filters.cache import ImageCache
LOGGER = logging.getLogger(__name__)
def download_images(flats_list, config):
    """
    Download images for all flats in the list, to serve them locally.

    Modifies the flats in place: each photo dict gains a ``"local"`` key
    holding the filename of the locally stored image.

    :param flats_list: A list of flats dicts.
    :param config: A config dict.
    :return: ``None``.
    """
    photo_cache = ImageCache(
        storage_dir=os.path.join(config["data_directory"], "images")
    )
    flats_list_length = len(flats_list)
    for i, flat in enumerate(flats_list):
        LOGGER.info(
            "Downloading photos for flat %d/%d.", i + 1, flats_list_length
        )
        # Some backends may not provide photos at all; be robust to a
        # missing or None "photos" key instead of raising KeyError.
        for photo in flat.get("photos") or []:
            # Download the photo into the cache (served from disk if
            # already fetched previously).
            photo_cache.get(photo["url"])
            # And store the local image filename on the photo dict.
            photo["local"] = photo_cache.compute_filename(photo["url"])

View File

@ -9,6 +9,8 @@ import os
import random import random
import sys import sys
import unittest import unittest
import tempfile
import requests import requests
import requests_mock import requests_mock
@ -166,9 +168,14 @@ class TestPhoneNumbers(unittest.TestCase):
class TestPhotos(unittest.TestCase): class TestPhotos(unittest.TestCase):
IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name
HASH_THRESHOLD = 10 # pylint: disable=invalid-name HASH_THRESHOLD = 10 # pylint: disable=invalid-name
def __init__(self, *args, **kwargs):
self.IMAGE_CACHE = LocalImageCache( # pylint: disable=invalid-name
storage_dir=tempfile.mkdtemp(prefix="flatisfy-")
)
super(TestPhotos, self).__init__(*args, **kwargs)
def test_same_photo_twice(self): def test_same_photo_twice(self):
""" """
Compares a photo against itself. Compares a photo against itself.
@ -262,7 +269,12 @@ class TestDuplicates(unittest.TestCase):
DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name
DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name
HASH_THRESHOLD = 10 # pylint: disable=invalid-name HASH_THRESHOLD = 10 # pylint: disable=invalid-name
IMAGE_CACHE = ImageCache() # pylint: disable=invalid-name
def __init__(self, *args, **kwargs):
self.IMAGE_CACHE = ImageCache( # pylint: disable=invalid-name
storage_dir=tempfile.mkdtemp(prefix="flatisfy-")
)
super(TestDuplicates, self).__init__(*args, **kwargs)
@staticmethod @staticmethod
def generate_fake_flat(): def generate_fake_flat():