parent
d6b82b24c6
commit
ee2880326c
@ -51,6 +51,8 @@ DEFAULT_CONFIG = {
|
|||||||
"duplicate_threshold": 15,
|
"duplicate_threshold": 15,
|
||||||
# Score to consider two images as being duplicates through hash comparison
|
# Score to consider two images as being duplicates through hash comparison
|
||||||
"duplicate_image_hash_threshold": 10,
|
"duplicate_image_hash_threshold": 10,
|
||||||
|
# Whether images should be downloaded and served locally
|
||||||
|
"serve_images_locally": True,
|
||||||
# Navitia API key
|
# Navitia API key
|
||||||
"navitia_api_key": None,
|
"navitia_api_key": None,
|
||||||
# Number of filtering passes to run
|
# Number of filtering passes to run
|
||||||
@ -275,6 +277,7 @@ def load_config(args=None, check_with_data=True):
|
|||||||
LOGGER.info("Creating data directory according to config: %s",
|
LOGGER.info("Creating data directory according to config: %s",
|
||||||
config_data["data_directory"])
|
config_data["data_directory"])
|
||||||
os.makedirs(config_data["data_directory"])
|
os.makedirs(config_data["data_directory"])
|
||||||
|
os.makedirs(os.path.join(config_data["data_directory"], "images"))
|
||||||
|
|
||||||
if config_data["database"] is None:
|
if config_data["database"] is None:
|
||||||
config_data["database"] = "sqlite:///" + os.path.join(
|
config_data["database"] = "sqlite:///" + os.path.join(
|
||||||
|
@ -10,6 +10,7 @@ import logging
|
|||||||
|
|
||||||
from flatisfy import tools
|
from flatisfy import tools
|
||||||
from flatisfy.filters import duplicates
|
from flatisfy.filters import duplicates
|
||||||
|
from flatisfy.filters import images
|
||||||
from flatisfy.filters import metadata
|
from flatisfy.filters import metadata
|
||||||
|
|
||||||
|
|
||||||
@ -226,6 +227,9 @@ def second_pass(flats_list, constraint, config):
|
|||||||
flats_list, ignored_list = refine_with_details_criteria(flats_list,
|
flats_list, ignored_list = refine_with_details_criteria(flats_list,
|
||||||
constraint)
|
constraint)
|
||||||
|
|
||||||
|
if config["serve_images_locally"]:
|
||||||
|
images.download_images(flats_list, config)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"new": flats_list,
|
"new": flats_list,
|
||||||
"ignored": ignored_list,
|
"ignored": ignored_list,
|
||||||
|
@ -1,12 +1,16 @@
|
|||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Caching function for pictures.
|
Caching function for pictures.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import absolute_import, print_function, unicode_literals
|
from __future__ import absolute_import, print_function, unicode_literals
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import os
|
||||||
import requests
|
import requests
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
import PIL.Image
|
||||||
|
|
||||||
|
|
||||||
class MemoryCache(object):
|
class MemoryCache(object):
|
||||||
@ -81,8 +85,34 @@ class ImageCache(MemoryCache):
|
|||||||
A cache for images, stored in memory.
|
A cache for images, stored in memory.
|
||||||
"""
|
"""
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def on_miss(url):
|
def compute_filename(url):
    """
    Derive the name of the cached image file from the image URL.

    The name is the hexadecimal SHA-1 digest of the UTF-8 encoded URL,
    followed by a ``.jpg`` extension.

    :param url: The URL of the image.
    :return: The filename, with its extension.
    """
    url_digest = hashlib.sha1(url.encode("utf-8"))
    # The extension is always ".jpg", whatever the URL looks like.
    return "%s.jpg" % url_digest.hexdigest()
|
||||||
|
|
||||||
|
def on_miss(self, url):
    """
    Helper to actually retrieve photos if not already cached.

    When a storage directory is configured, the image is read from that
    directory if a file for this URL already exists there; otherwise it is
    fetched over HTTP and a copy is written to the directory. When no
    storage directory is configured, the image is only fetched over HTTP
    and nothing is read from or written to disk.

    :param url: The URL of the image.
    :return: The opened ``PIL.Image`` object.
    """
    if not self.storage_dir:
        # No on-disk storage configured. Building a path from a ``None``
        # directory would raise, so skip every filesystem step.
        return PIL.Image.open(BytesIO(requests.get(url).content))

    filepath = os.path.join(
        self.storage_dir,
        self.compute_filename(url)
    )
    if os.path.isfile(filepath):
        # Already stored on disk, no need to fetch it again.
        return PIL.Image.open(filepath)

    image = PIL.Image.open(BytesIO(requests.get(url).content))
    # NOTE(review): the file is written in the image's own format while
    # compute_filename() always yields a ".jpg" name -- confirm intended.
    image.save(filepath, format=image.format)
    return image
|
||||||
|
|
||||||
|
def __init__(self, storage_dir=None):
    """
    Set up the image cache, optionally backed by a directory on disk.

    :param storage_dir: Directory in which images are stored, or ``None``
        (the default) for no on-disk storage. It is created if it does
        not exist yet.
    """
    self.storage_dir = storage_dir
    directory_missing = (
        bool(storage_dir) and not os.path.isdir(storage_dir)
    )
    if directory_missing:
        os.makedirs(storage_dir)
    super(ImageCache, self).__init__()
|
||||||
|
@ -7,12 +7,10 @@ from __future__ import absolute_import, print_function, unicode_literals
|
|||||||
import collections
|
import collections
|
||||||
import itertools
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
import imagehash
|
import imagehash
|
||||||
import PIL.Image
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from flatisfy import tools
|
from flatisfy import tools
|
||||||
@ -69,8 +67,7 @@ def get_or_compute_photo_hash(photo, photo_cache):
|
|||||||
return photo["hash"]
|
return photo["hash"]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
# Otherwise, get the image and compute the hash
|
# Otherwise, get the image and compute the hash
|
||||||
req = photo_cache.get(photo["url"])
|
image = photo_cache.get(photo["url"])
|
||||||
image = PIL.Image.open(BytesIO(req.content))
|
|
||||||
photo["hash"] = imagehash.average_hash(image)
|
photo["hash"] = imagehash.average_hash(image)
|
||||||
return photo["hash"]
|
return photo["hash"]
|
||||||
|
|
||||||
@ -329,8 +326,13 @@ def deep_detect(flats_list, config):
|
|||||||
the flats objects that should be removed and considered as duplicates
|
the flats objects that should be removed and considered as duplicates
|
||||||
(they were already merged).
|
(they were already merged).
|
||||||
"""
|
"""
|
||||||
|
if config["serve_images_locally"]:
|
||||||
photo_cache = ImageCache()
|
storage_dir = os.path.join(config["data_directory"], "images")
|
||||||
|
else:
|
||||||
|
storage_dir = None
|
||||||
|
photo_cache = ImageCache(
|
||||||
|
storage_dir=storage_dir
|
||||||
|
)
|
||||||
|
|
||||||
LOGGER.info("Running deep duplicates detection.")
|
LOGGER.info("Running deep duplicates detection.")
|
||||||
matching_flats = collections.defaultdict(list)
|
matching_flats = collections.defaultdict(list)
|
||||||
|
37
flatisfy/filters/images.py
Normal file
37
flatisfy/filters/images.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
# coding: utf-8
|
||||||
|
"""
|
||||||
|
Filtering functions to handle images.
|
||||||
|
|
||||||
|
This includes functions to download images.
|
||||||
|
"""
|
||||||
|
from __future__ import absolute_import, print_function, unicode_literals
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
from flatisfy.filters.cache import ImageCache
|
||||||
|
|
||||||
|
|
||||||
|
LOGGER = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def download_images(flats_list, config):
    """
    Download images for all flats in the list, to serve them locally.

    Images go through an ``ImageCache`` whose storage directory is the
    ``images`` subfolder of ``config["data_directory"]``. Each photo dict
    is updated in place with a ``"local"`` key holding the filename
    computed from its URL.

    :param flats_list: A list of flats dicts.
    :param config: A config dict.
    """
    images_directory = os.path.join(config["data_directory"], "images")
    cache = ImageCache(storage_dir=images_directory)
    total = len(flats_list)
    for position, flat in enumerate(flats_list, start=1):
        LOGGER.info("Downloading photos for flat %d/%d.", position, total)
        for photo in flat["photos"]:
            # Going through the cache is what downloads the photo
            cache.get(photo["url"])
            # Remember under which filename the local copy is stored
            photo["local"] = cache.compute_filename(photo["url"])
|
@ -9,6 +9,8 @@ import os
|
|||||||
import random
|
import random
|
||||||
import sys
|
import sys
|
||||||
import unittest
|
import unittest
|
||||||
|
import tempfile
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import requests_mock
|
import requests_mock
|
||||||
|
|
||||||
@ -166,9 +168,14 @@ class TestPhoneNumbers(unittest.TestCase):
|
|||||||
|
|
||||||
|
|
||||||
class TestPhotos(unittest.TestCase):
|
class TestPhotos(unittest.TestCase):
|
||||||
IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name
|
|
||||||
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
|
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
    """
    Give the test case an image cache stored in a fresh temporary
    directory, then run the regular ``TestCase`` initialisation.
    """
    # NOTE(review): the temporary directory is not removed in this
    # method -- check whether it is cleaned up elsewhere.
    cache_directory = tempfile.mkdtemp(prefix="flatisfy-")
    image_cache = LocalImageCache(storage_dir=cache_directory)
    self.IMAGE_CACHE = image_cache  # pylint: disable=invalid-name
    super(TestPhotos, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
def test_same_photo_twice(self):
|
def test_same_photo_twice(self):
|
||||||
"""
|
"""
|
||||||
Compares a photo against itself.
|
Compares a photo against itself.
|
||||||
@ -262,7 +269,12 @@ class TestDuplicates(unittest.TestCase):
|
|||||||
DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name
|
DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name
|
||||||
DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name
|
DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name
|
||||||
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
|
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
|
||||||
IMAGE_CACHE = ImageCache() # pylint: disable=invalid-name
|
|
||||||
|
def __init__(self, *args, **kwargs):
    """
    Give the test case an image cache stored in a fresh temporary
    directory, then run the regular ``TestCase`` initialisation.
    """
    # NOTE(review): the temporary directory is not removed in this
    # method -- check whether it is cleaned up elsewhere.
    cache_directory = tempfile.mkdtemp(prefix="flatisfy-")
    image_cache = ImageCache(storage_dir=cache_directory)
    self.IMAGE_CACHE = image_cache  # pylint: disable=invalid-name
    super(TestDuplicates, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def generate_fake_flat():
|
def generate_fake_flat():
|
||||||
|
Loading…
Reference in New Issue
Block a user