Compare commits

...

2 Commits

Author SHA1 Message Date
Lucas Verney 99eed82b3d Few fixes 2018-01-22 01:27:50 +01:00
Lucas Verney 538bbe5a05 Add a way to download photos locally
Fix for #94.
2018-01-22 01:06:09 +01:00
6 changed files with 128 additions and 40 deletions

View File

@@ -51,6 +51,8 @@ DEFAULT_CONFIG = {
"duplicate_threshold": 15, "duplicate_threshold": 15,
# Score to consider two images as being duplicates through hash comparison # Score to consider two images as being duplicates through hash comparison
"duplicate_image_hash_threshold": 10, "duplicate_image_hash_threshold": 10,
# Whether images should be downloaded and served locally
"serve_images_locally": True,
# Navitia API key # Navitia API key
"navitia_api_key": None, "navitia_api_key": None,
# Number of filtering passes to run # Number of filtering passes to run
@@ -275,6 +277,7 @@ def load_config(args=None, check_with_data=True):
LOGGER.info("Creating data directory according to config: %s", LOGGER.info("Creating data directory according to config: %s",
config_data["data_directory"]) config_data["data_directory"])
os.makedirs(config_data["data_directory"]) os.makedirs(config_data["data_directory"])
os.makedirs(os.path.join(config_data["data_directory"], "images"))
if config_data["database"] is None: if config_data["database"] is None:
config_data["database"] = "sqlite:///" + os.path.join( config_data["database"] = "sqlite:///" + os.path.join(

View File

@ -10,6 +10,7 @@ import logging
from flatisfy import tools from flatisfy import tools
from flatisfy.filters import duplicates from flatisfy.filters import duplicates
from flatisfy.filters import images
from flatisfy.filters import metadata from flatisfy.filters import metadata
@ -226,6 +227,9 @@ def second_pass(flats_list, constraint, config):
flats_list, ignored_list = refine_with_details_criteria(flats_list, flats_list, ignored_list = refine_with_details_criteria(flats_list,
constraint) constraint)
if config["serve_images_locally"]:
images.download_images(flats_list, config)
return { return {
"new": flats_list, "new": flats_list,
"ignored": ignored_list, "ignored": ignored_list,

View File

@ -1,12 +1,16 @@
# coding: utf-8 # coding: utf-8
""" """
Caching function for pictures. Caching function for pictures.
""" """
from __future__ import absolute_import, print_function, unicode_literals from __future__ import absolute_import, print_function, unicode_literals
import hashlib
import os
import requests import requests
from io import BytesIO
import PIL.Image
class MemoryCache(object): class MemoryCache(object):
@ -81,8 +85,34 @@ class ImageCache(MemoryCache):
A cache for images, stored in memory. A cache for images, stored in memory.
""" """
@staticmethod @staticmethod
def on_miss(url): def compute_filename(url):
"""
Compute filename (hash of the URL) for the cached image.
:param url: The URL of the image.
:return: The filename, with its extension.
"""
# Always store as JPEG
return "%s.jpg" % hashlib.sha1(url.encode("utf-8")).hexdigest()
def on_miss(self, url):
""" """
Helper to actually retrieve photos if not already cached. Helper to actually retrieve photos if not already cached.
""" """
return requests.get(url) filepath = os.path.join(
self.storage_dir,
self.compute_filename(url)
)
if os.path.isfile(filepath):
image = PIL.Image.open(filepath)
else:
image = PIL.Image.open(BytesIO(requests.get(url).content))
if self.storage_dir:
image.save(filepath, format=image.format)
return image
def __init__(self, storage_dir=None):
    """
    Image cache constructor.

    :param storage_dir: Optional directory where fetched images are
        persisted on disk; ``None`` keeps the cache purely in memory.
    """
    if storage_dir and not os.path.isdir(storage_dir):
        # Create the on-disk storage location up front so that later
        # image saves never fail on a missing directory.
        os.makedirs(storage_dir)
    self.storage_dir = storage_dir
    super(ImageCache, self).__init__()

View File

@ -7,12 +7,10 @@ from __future__ import absolute_import, print_function, unicode_literals
import collections import collections
import itertools import itertools
import logging import logging
import os
import re import re
from io import BytesIO
import imagehash import imagehash
import PIL.Image
import requests import requests
from flatisfy import tools from flatisfy import tools
@ -61,8 +59,7 @@ def get_or_compute_photo_hash(photo, photo_cache):
return photo["hash"] return photo["hash"]
except KeyError: except KeyError:
# Otherwise, get the image and compute the hash # Otherwise, get the image and compute the hash
req = photo_cache.get(photo["url"]) image = photo_cache.get(photo["url"])
image = PIL.Image.open(BytesIO(req.content))
photo["hash"] = imagehash.average_hash(image) photo["hash"] = imagehash.average_hash(image)
return photo["hash"] return photo["hash"]
@ -322,8 +319,13 @@ def deep_detect(flats_list, config):
the flats objects that should be removed and considered as duplicates the flats objects that should be removed and considered as duplicates
(they were already merged). (they were already merged).
""" """
if config["serve_images_locally"]:
photo_cache = ImageCache() storage_dir = os.path.join(config["data_directory"], "images")
else:
storage_dir = None
photo_cache = ImageCache(
storage_dir=storage_dir
)
LOGGER.info("Running deep duplicates detection.") LOGGER.info("Running deep duplicates detection.")
matching_flats = collections.defaultdict(list) matching_flats = collections.defaultdict(list)

View File

@@ -0,0 +1,37 @@
# coding: utf-8
"""
Filtering functions to handle images.
This includes functions to download images.
"""
from __future__ import absolute_import, print_function, unicode_literals
import logging
import os
from flatisfy.filters.cache import ImageCache
LOGGER = logging.getLogger(__name__)
def download_images(flats_list, config):
    """
    Download images for all flats in the list, to serve them locally.

    Fetches each photo through an ``ImageCache`` persisted under the
    ``images`` subdirectory of the configured data directory, and records
    the local cached filename in each photo dict under the ``"local"`` key.

    :param flats_list: A list of flats dicts (modified in place).
    :param config: A config dict, providing ``data_directory``.
    :return: ``None``.
    """
    photo_cache = ImageCache(
        storage_dir=os.path.join(config["data_directory"], "images")
    )
    flats_list_length = len(flats_list)
    for i, flat in enumerate(flats_list):
        LOGGER.info(
            "Downloading photos for flat %d/%d.", i + 1, flats_list_length
        )
        # Robustness: a flat without a "photos" key must not abort the
        # whole download pass with a KeyError.
        for photo in flat.get("photos", []):
            # Download photo (the ImageCache stores it on disk).
            photo_cache.get(photo["url"])
            # And store the local image filename for later serving.
            photo["local"] = photo_cache.compute_filename(photo["url"])

View File

@ -9,6 +9,8 @@ import os
import random import random
import sys import sys
import unittest import unittest
import tempfile
import requests import requests
import requests_mock import requests_mock
@ -157,9 +159,14 @@ class TestPhoneNumbers(unittest.TestCase):
class TestPhotos(unittest.TestCase): class TestPhotos(unittest.TestCase):
IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name
HASH_THRESHOLD = 10 # pylint: disable=invalid-name HASH_THRESHOLD = 10 # pylint: disable=invalid-name
def __init__(self, *args, **kwargs):
# Build a per-instance image cache backed by a fresh temporary
# directory, so test runs never share cached images on disk.
self.IMAGE_CACHE = LocalImageCache( # pylint: disable=invalid-name
storage_dir=tempfile.mkdtemp(prefix="flatisfy-")
)
super(TestPhotos, self).__init__(*args, **kwargs)
def test_same_photo_twice(self): def test_same_photo_twice(self):
""" """
Compares a photo against itself. Compares a photo against itself.
@ -171,8 +178,8 @@ class TestPhotos(unittest.TestCase):
self.assertTrue(duplicates.compare_photos( self.assertTrue(duplicates.compare_photos(
photo, photo,
photo, photo,
TestPhotos.IMAGE_CACHE, self.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD self.HASH_THRESHOLD
)) ))
def test_different_photos(self): def test_different_photos(self):
@ -182,15 +189,15 @@ class TestPhotos(unittest.TestCase):
self.assertFalse(duplicates.compare_photos( self.assertFalse(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
TestPhotos.IMAGE_CACHE, self.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD self.HASH_THRESHOLD
)) ))
self.assertFalse(duplicates.compare_photos( self.assertFalse(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
TestPhotos.IMAGE_CACHE, self.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD self.HASH_THRESHOLD
)) ))
def test_matching_photos(self): def test_matching_photos(self):
@ -200,29 +207,29 @@ class TestPhotos(unittest.TestCase):
self.assertTrue(duplicates.compare_photos( self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
{"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"}, {"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"},
TestPhotos.IMAGE_CACHE, self.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD self.HASH_THRESHOLD
)) ))
self.assertTrue(duplicates.compare_photos( self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
{"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"}, {"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"},
TestPhotos.IMAGE_CACHE, self.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD self.HASH_THRESHOLD
)) ))
self.assertTrue(duplicates.compare_photos( self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
{"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"}, {"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"},
TestPhotos.IMAGE_CACHE, self.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD self.HASH_THRESHOLD
)) ))
self.assertTrue(duplicates.compare_photos( self.assertTrue(duplicates.compare_photos(
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
{"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"}, {"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"},
TestPhotos.IMAGE_CACHE, self.IMAGE_CACHE,
TestPhotos.HASH_THRESHOLD self.HASH_THRESHOLD
)) ))
@ -233,7 +240,12 @@ class TestDuplicates(unittest.TestCase):
DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name
DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name
HASH_THRESHOLD = 10 # pylint: disable=invalid-name HASH_THRESHOLD = 10 # pylint: disable=invalid-name
IMAGE_CACHE = ImageCache() # pylint: disable=invalid-name
def __init__(self, *args, **kwargs):
# Build a per-instance image cache backed by a fresh temporary
# directory, so test runs never share cached images on disk.
self.IMAGE_CACHE = ImageCache( # pylint: disable=invalid-name
storage_dir=tempfile.mkdtemp(prefix="flatisfy-")
)
super(TestDuplicates, self).__init__(*args, **kwargs)
@staticmethod @staticmethod
def generate_fake_flat(): def generate_fake_flat():
@ -276,10 +288,10 @@ class TestDuplicates(unittest.TestCase):
flat2 = copy.deepcopy(flat1) flat2 = copy.deepcopy(flat1)
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flat1, flat2, flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD self.IMAGE_CACHE, self.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS score >= self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
) )
def test_different_prices(self): def test_different_prices(self):
@ -292,10 +304,10 @@ class TestDuplicates(unittest.TestCase):
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flat1, flat2, flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD self.IMAGE_CACHE, self.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
) )
def test_different_rooms(self): def test_different_rooms(self):
@ -309,10 +321,10 @@ class TestDuplicates(unittest.TestCase):
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flat1, flat2, flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD self.IMAGE_CACHE, self.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
) )
def test_different_areas(self): def test_different_areas(self):
@ -325,10 +337,10 @@ class TestDuplicates(unittest.TestCase):
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flat1, flat2, flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD self.IMAGE_CACHE, self.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
) )
def test_different_areas_decimals(self): def test_different_areas_decimals(self):
@ -343,10 +355,10 @@ class TestDuplicates(unittest.TestCase):
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flat1, flat2, flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD self.IMAGE_CACHE, self.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
) )
def test_different_phones(self): def test_different_phones(self):
@ -360,10 +372,10 @@ class TestDuplicates(unittest.TestCase):
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flat1, flat2, flat1, flat2,
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD self.IMAGE_CACHE, self.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
) )
def test_real_duplicates(self): def test_real_duplicates(self):
@ -378,10 +390,10 @@ class TestDuplicates(unittest.TestCase):
score = duplicates.get_duplicate_score( score = duplicates.get_duplicate_score(
flats[0], flats[1], flats[0], flats[1],
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD self.IMAGE_CACHE, self.HASH_THRESHOLD
) )
self.assertTrue( self.assertTrue(
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS score >= self.DUPLICATES_MIN_SCORE_WITH_PHOTOS
) )