Compare commits

...

2 Commits

Author SHA1 Message Date
Lucas Verney 99eed82b3d Few fixes 2018-01-22 01:27:50 +01:00
Lucas Verney 538bbe5a05 Add a way to download photos locally
Fix for #94.
2018-01-22 01:06:09 +01:00
6 changed files with 128 additions and 40 deletions

View File

@ -51,6 +51,8 @@ DEFAULT_CONFIG = {
"duplicate_threshold": 15,
# Score to consider two images as being duplicates through hash comparison
"duplicate_image_hash_threshold": 10,
# Whether images should be downloaded and served locally
"serve_images_locally": True,
# Navitia API key
"navitia_api_key": None,
# Number of filtering passes to run
@ -275,6 +277,7 @@ def load_config(args=None, check_with_data=True):
LOGGER.info("Creating data directory according to config: %s",
config_data["data_directory"])
os.makedirs(config_data["data_directory"])
os.makedirs(os.path.join(config_data["data_directory"], "images"))
if config_data["database"] is None:
config_data["database"] = "sqlite:///" + os.path.join(

View File

@ -10,6 +10,7 @@ import logging
from flatisfy import tools
from flatisfy.filters import duplicates
from flatisfy.filters import images
from flatisfy.filters import metadata
@ -226,6 +227,9 @@ def second_pass(flats_list, constraint, config):
flats_list, ignored_list = refine_with_details_criteria(flats_list,
constraint)
if config["serve_images_locally"]:
images.download_images(flats_list, config)
return {
"new": flats_list,
"ignored": ignored_list,

View File

@ -1,12 +1,16 @@
# coding: utf-8
"""
Caching function for pictures.
"""
from __future__ import absolute_import, print_function, unicode_literals
import hashlib
import os
import requests
from io import BytesIO
import PIL.Image
class MemoryCache(object):
@ -81,8 +85,34 @@ class ImageCache(MemoryCache):
A cache for images, stored in memory.
"""
@staticmethod
def on_miss(url):
def compute_filename(url):
    """
    Compute filename (hash of the URL) for the cached image.

    :param url: The URL of the image, as text or as already-encoded bytes.
    :return: The filename (SHA-1 hex digest of the URL), with its extension.
    """
    # SHA-1 works on bytes: encode text URLs and pass byte strings through
    # untouched (calling ``.encode`` on bytes fails on Python 3, and on
    # Python 2 it fails for non-ASCII byte strings).
    raw_url = url if isinstance(url, bytes) else url.encode("utf-8")
    # The name always carries a ".jpg" extension
    return "%s.jpg" % hashlib.sha1(raw_url).hexdigest()
def on_miss(self, url):
    """
    Helper to actually retrieve photos if not already cached.

    When a storage directory is configured, the image is looked up there
    first and written there after being fetched.

    :param url: The URL of the image.
    :return: The image, as opened by ``PIL.Image.open``.
    """
    if not self.storage_dir:
        # No storage directory: nothing to look up or write on disk, and
        # ``os.path.join`` cannot take ``None``. Just fetch the image.
        return PIL.Image.open(BytesIO(requests.get(url).content))

    filepath = os.path.join(
        self.storage_dir,
        self.compute_filename(url)
    )
    if os.path.isfile(filepath):
        # A local copy already exists: reuse it instead of fetching again.
        return PIL.Image.open(filepath)

    image = PIL.Image.open(BytesIO(requests.get(url).content))
    # NOTE(review): the file name always ends in ".jpg" while the data is
    # written in the image's own format -- confirm this is intended.
    image.save(filepath, format=image.format)
    return image
def __init__(self, storage_dir=None):
    """
    Set up the cache and make sure its storage directory exists.

    :param storage_dir: Directory used to store images. When left to
        ``None`` (or any empty value), no directory is created.
    """
    self.storage_dir = storage_dir
    directory_missing = (
        bool(self.storage_dir) and not os.path.isdir(self.storage_dir)
    )
    if directory_missing:
        os.makedirs(self.storage_dir)
    super(ImageCache, self).__init__()

View File

@ -7,12 +7,10 @@ from __future__ import absolute_import, print_function, unicode_literals
import collections
import itertools
import logging
import os
import re
from io import BytesIO
import imagehash
import PIL.Image
import requests
from flatisfy import tools
@ -61,8 +59,7 @@ def get_or_compute_photo_hash(photo, photo_cache):
return photo["hash"]
except KeyError:
# Otherwise, get the image and compute the hash
req = photo_cache.get(photo["url"])
image = PIL.Image.open(BytesIO(req.content))
image = photo_cache.get(photo["url"])
photo["hash"] = imagehash.average_hash(image)
return photo["hash"]
@ -322,8 +319,13 @@ def deep_detect(flats_list, config):
the flats objects that should be removed and considered as duplicates
(they were already merged).
"""
photo_cache = ImageCache()
if config["serve_images_locally"]:
storage_dir = os.path.join(config["data_directory"], "images")
else:
storage_dir = None
photo_cache = ImageCache(
storage_dir=storage_dir
)
LOGGER.info("Running deep duplicates detection.")
matching_flats = collections.defaultdict(list)

View File

@ -0,0 +1,37 @@
# coding: utf-8
"""
Filtering functions to handle images.
This includes functions to download images.
"""
from __future__ import absolute_import, print_function, unicode_literals
import logging
import os
from flatisfy.filters.cache import ImageCache
LOGGER = logging.getLogger(__name__)
def download_images(flats_list, config):
    """
    Go through the photos of every flat, request each one through an image
    cache stored in the ``images`` folder of the data directory, and record
    on the photo the name of its local file.

    :param flats_list: A list of flats dicts.
    :param config: A config dict.
    """
    images_dir = os.path.join(config["data_directory"], "images")
    photo_cache = ImageCache(storage_dir=images_dir)
    total = len(flats_list)
    for position, flat in enumerate(flats_list, 1):
        LOGGER.info(
            "Downloading photos for flat %d/%d.", position, total
        )
        for photo in flat["photos"]:
            photo_url = photo["url"]
            # Download photo (through the cache)
            photo_cache.get(photo_url)
            # And keep track of the local file name on the photo itself
            photo["local"] = photo_cache.compute_filename(photo_url)

View File

@ -9,6 +9,8 @@ import os
import random
import sys
import unittest
import tempfile
import requests
import requests_mock
@ -157,9 +159,14 @@ class TestPhoneNumbers(unittest.TestCase):
class TestPhotos(unittest.TestCase):
IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
def __init__(self, *args, **kwargs):
    """
    Give this test case instance its own image cache, backed by a fresh
    temporary directory that is removed once the test has run.
    """
    # Function-scope import: only the clean-up registered below needs it.
    import shutil

    # Initialise the TestCase first: ``addCleanup`` relies on state that
    # ``unittest.TestCase.__init__`` sets up.
    super(TestPhotos, self).__init__(*args, **kwargs)
    storage_dir = tempfile.mkdtemp(prefix="flatisfy-")
    # ``mkdtemp`` never deletes what it creates; without this clean-up every
    # test that runs leaves a directory behind.
    self.addCleanup(shutil.rmtree, storage_dir, ignore_errors=True)
    self.IMAGE_CACHE = LocalImageCache(  # pylint: disable=invalid-name
        storage_dir=storage_dir
    )
def test_same_photo_twice(self):
"""
Compares a photo against itself.
@ -171,8 +178,8 @@ class TestPhotos(unittest.TestCase):
self.assertTrue(duplicates.compare_photos(
photo,
photo,
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
self.IMAGE_CACHE,
self.HASH_THRESHOLD
))
def test_different_photos(self):
@ -182,15 +189,15 @@ class TestPhotos(unittest.TestCase):
self.assertFalse(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
self.IMAGE_CACHE,
self.HASH_THRESHOLD
))
self.assertFalse(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
self.IMAGE_CACHE,
self.HASH_THRESHOLD
))
def test_matching_photos(self):
@ -200,29 +207,29 @@ class TestPhotos(unittest.TestCase):
self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
{"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"},
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
self.IMAGE_CACHE,
self.HASH_THRESHOLD
))
self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
{"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"},
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
self.IMAGE_CACHE,
self.HASH_THRESHOLD
))
self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
{"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"},
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
self.IMAGE_CACHE,
self.HASH_THRESHOLD
))
self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
{"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"},
TestPhotos.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD
self.IMAGE_CACHE,
self.HASH_THRESHOLD
))
@ -233,7 +240,12 @@ class TestDuplicates(unittest.TestCase):
DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name
DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
IMAGE_CACHE = ImageCache() # pylint: disable=invalid-name
def __init__(self, *args, **kwargs):
    """
    Give this test case instance its own image cache, backed by a fresh
    temporary directory that is removed once the test has run.
    """
    # Function-scope import: only the clean-up registered below needs it.
    import shutil

    # Initialise the TestCase first: ``addCleanup`` relies on state that
    # ``unittest.TestCase.__init__`` sets up.
    super(TestDuplicates, self).__init__(*args, **kwargs)
    storage_dir = tempfile.mkdtemp(prefix="flatisfy-")
    # ``mkdtemp`` never deletes what it creates; without this clean-up every
    # test that runs leaves a directory behind.
    self.addCleanup(shutil.rmtree, storage_dir, ignore_errors=True)
    self.IMAGE_CACHE = ImageCache(  # pylint: disable=invalid-name
        storage_dir=storage_dir
    )
@staticmethod
def generate_fake_flat():
@ -276,10 +288,10 @@ class TestDuplicates(unittest.TestCase):
flat2 = copy.deepcopy(flat1)
score = duplicates.get_duplicate_score(
flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
self.IMAGE_CACHE, self.HASH_THRESHOLD
)
self.assertTrue(
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
score >= self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
)
def test_different_prices(self):
@ -292,10 +304,10 @@ class TestDuplicates(unittest.TestCase):
score = duplicates.get_duplicate_score(
flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
self.IMAGE_CACHE, self.HASH_THRESHOLD
)
self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
)
def test_different_rooms(self):
@ -309,10 +321,10 @@ class TestDuplicates(unittest.TestCase):
score = duplicates.get_duplicate_score(
flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
self.IMAGE_CACHE, self.HASH_THRESHOLD
)
self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
)
def test_different_areas(self):
@ -325,10 +337,10 @@ class TestDuplicates(unittest.TestCase):
score = duplicates.get_duplicate_score(
flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
self.IMAGE_CACHE, self.HASH_THRESHOLD
)
self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
)
def test_different_areas_decimals(self):
@ -343,10 +355,10 @@ class TestDuplicates(unittest.TestCase):
score = duplicates.get_duplicate_score(
flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
self.IMAGE_CACHE, self.HASH_THRESHOLD
)
self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
)
def test_different_phones(self):
@ -360,10 +372,10 @@ class TestDuplicates(unittest.TestCase):
score = duplicates.get_duplicate_score(
flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
self.IMAGE_CACHE, self.HASH_THRESHOLD
)
self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
)
def test_real_duplicates(self):
@ -378,10 +390,10 @@ class TestDuplicates(unittest.TestCase):
score = duplicates.get_duplicate_score(
flats[0], flats[1],
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
self.IMAGE_CACHE, self.HASH_THRESHOLD
)
self.assertTrue(
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS
score >= self.DUPLICATES_MIN_SCORE_WITH_PHOTOS
)