Compare commits
2 Commits
Author | SHA1 | Date | |
---|---|---|---|
99eed82b3d | |||
538bbe5a05 |
@ -51,6 +51,8 @@ DEFAULT_CONFIG = {
|
||||
"duplicate_threshold": 15,
|
||||
# Score to consider two images as being duplicates through hash comparison
|
||||
"duplicate_image_hash_threshold": 10,
|
||||
# Whether images should be downloaded and served locally
|
||||
"serve_images_locally": True,
|
||||
# Navitia API key
|
||||
"navitia_api_key": None,
|
||||
# Number of filtering passes to run
|
||||
@ -275,6 +277,7 @@ def load_config(args=None, check_with_data=True):
|
||||
LOGGER.info("Creating data directory according to config: %s",
|
||||
config_data["data_directory"])
|
||||
os.makedirs(config_data["data_directory"])
|
||||
os.makedirs(os.path.join(config_data["data_directory"], "images"))
|
||||
|
||||
if config_data["database"] is None:
|
||||
config_data["database"] = "sqlite:///" + os.path.join(
|
||||
|
@ -10,6 +10,7 @@ import logging
|
||||
|
||||
from flatisfy import tools
|
||||
from flatisfy.filters import duplicates
|
||||
from flatisfy.filters import images
|
||||
from flatisfy.filters import metadata
|
||||
|
||||
|
||||
@ -226,6 +227,9 @@ def second_pass(flats_list, constraint, config):
|
||||
flats_list, ignored_list = refine_with_details_criteria(flats_list,
|
||||
constraint)
|
||||
|
||||
if config["serve_images_locally"]:
|
||||
images.download_images(flats_list, config)
|
||||
|
||||
return {
|
||||
"new": flats_list,
|
||||
"ignored": ignored_list,
|
||||
|
@ -1,12 +1,16 @@
|
||||
# coding: utf-8
|
||||
|
||||
"""
|
||||
Caching function for pictures.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, print_function, unicode_literals
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import requests
|
||||
from io import BytesIO
|
||||
|
||||
import PIL.Image
|
||||
|
||||
|
||||
class MemoryCache(object):
|
||||
@ -81,8 +85,34 @@ class ImageCache(MemoryCache):
|
||||
A cache for images, stored in memory.
|
||||
"""
|
||||
@staticmethod
|
||||
def on_miss(url):
|
||||
def compute_filename(url):
    """
    Build the name of the file under which an image is cached.

    The name is the hexadecimal SHA-1 digest of the UTF-8 encoded URL,
    so a given URL always maps to the same file name.

    :param url: The URL of the image.
    :return: The filename, with its extension.
    """
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
    # The extension is fixed to ".jpg", whatever the URL looks like.
    return "%s.jpg" % digest
|
||||
|
||||
def on_miss(self, url):
    """
    Helper to actually retrieve photos if not already cached.

    When a storage directory is configured, the image is looked up on
    disk first (under the name given by ``compute_filename``) and, if
    absent, downloaded and written there. Without a storage directory,
    the image is simply downloaded.

    :param url: The URL of the image.
    :return: The image, as opened by ``PIL.Image.open``.
    """
    # No storage directory (the constructor default): there is no file
    # path to build, so skip the disk lookup entirely. Joining ``None``
    # with a filename would raise before anything gets downloaded.
    if not self.storage_dir:
        return PIL.Image.open(BytesIO(requests.get(url).content))

    filepath = os.path.join(
        self.storage_dir,
        self.compute_filename(url)
    )
    if os.path.isfile(filepath):
        # Already downloaded by a previous run, reuse the local copy.
        return PIL.Image.open(filepath)

    image = PIL.Image.open(BytesIO(requests.get(url).content))
    # NOTE(review): the file name always ends in ".jpg" but the image is
    # written in its source format, so a PNG would be stored under a
    # ".jpg" name. Confirm whether this is intended.
    image.save(filepath, format=image.format)
    return image
|
||||
|
||||
def __init__(self, storage_dir=None):
    """
    Set up the cache, creating the storage directory if needed.

    :param storage_dir: Directory in which images are stored on disk.
        When left to ``None`` (or empty), no directory is created.
    """
    must_create = bool(storage_dir) and not os.path.isdir(storage_dir)
    self.storage_dir = storage_dir
    if must_create:
        os.makedirs(storage_dir)
    super(ImageCache, self).__init__()
|
||||
|
@ -7,12 +7,10 @@ from __future__ import absolute_import, print_function, unicode_literals
|
||||
import collections
|
||||
import itertools
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
import imagehash
|
||||
import PIL.Image
|
||||
import requests
|
||||
|
||||
from flatisfy import tools
|
||||
@ -61,8 +59,7 @@ def get_or_compute_photo_hash(photo, photo_cache):
|
||||
return photo["hash"]
|
||||
except KeyError:
|
||||
# Otherwise, get the image and compute the hash
|
||||
req = photo_cache.get(photo["url"])
|
||||
image = PIL.Image.open(BytesIO(req.content))
|
||||
image = photo_cache.get(photo["url"])
|
||||
photo["hash"] = imagehash.average_hash(image)
|
||||
return photo["hash"]
|
||||
|
||||
@ -322,8 +319,13 @@ def deep_detect(flats_list, config):
|
||||
the flats objects that should be removed and considered as duplicates
|
||||
(they were already merged).
|
||||
"""
|
||||
|
||||
photo_cache = ImageCache()
|
||||
if config["serve_images_locally"]:
|
||||
storage_dir = os.path.join(config["data_directory"], "images")
|
||||
else:
|
||||
storage_dir = None
|
||||
photo_cache = ImageCache(
|
||||
storage_dir=storage_dir
|
||||
)
|
||||
|
||||
LOGGER.info("Running deep duplicates detection.")
|
||||
matching_flats = collections.defaultdict(list)
|
||||
|
37
flatisfy/filters/images.py
Normal file
37
flatisfy/filters/images.py
Normal file
@ -0,0 +1,37 @@
|
||||
# coding: utf-8
|
||||
"""
|
||||
Filtering functions to handle images.
|
||||
|
||||
This includes functions to download images.
|
||||
"""
|
||||
from __future__ import absolute_import, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
from flatisfy.filters.cache import ImageCache
|
||||
|
||||
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def download_images(flats_list, config):
    """
    Fetch the photos of every flat through an on-disk image cache.

    Each photo dict gets a ``local`` key holding the filename computed
    by the cache for its URL. The flats are modified in place and
    nothing is returned.

    :param flats_list: A list of flats dicts.
    :param config: A config dict.
    """
    images_dir = os.path.join(config["data_directory"], "images")
    photo_cache = ImageCache(storage_dir=images_dir)
    total = len(flats_list)
    for position, flat in enumerate(flats_list, start=1):
        LOGGER.info("Downloading photos for flat %d/%d.", position, total)
        for photo in flat["photos"]:
            photo_url = photo["url"]
            # Going through the cache is what fetches the image; the
            # returned value is not needed here.
            photo_cache.get(photo_url)
            # Record the name under which the image is stored locally.
            photo["local"] = photo_cache.compute_filename(photo_url)
|
@ -9,6 +9,8 @@ import os
|
||||
import random
|
||||
import sys
|
||||
import unittest
|
||||
import tempfile
|
||||
|
||||
import requests
|
||||
import requests_mock
|
||||
|
||||
@ -157,9 +159,14 @@ class TestPhoneNumbers(unittest.TestCase):
|
||||
|
||||
|
||||
class TestPhotos(unittest.TestCase):
|
||||
IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name
|
||||
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
|
||||
|
||||
def __init__(self, *args, **kwargs):
    """
    Give each test instance its own image cache in a temporary directory.

    The directory is removed by a cleanup registered on the test case,
    so running the tests does not leave ``flatisfy-*`` directories
    behind in the system temporary folder.
    """
    import shutil  # local import, only needed for the cleanup below

    storage_dir = tempfile.mkdtemp(prefix="flatisfy-")
    self.IMAGE_CACHE = LocalImageCache(  # pylint: disable=invalid-name
        storage_dir=storage_dir
    )
    super(TestPhotos, self).__init__(*args, **kwargs)
    # mkdtemp() leaves deletion to the caller. Cleanups can only be
    # registered once the base class has been initialised.
    self.addCleanup(shutil.rmtree, storage_dir, ignore_errors=True)
|
||||
|
||||
def test_same_photo_twice(self):
|
||||
"""
|
||||
Compares a photo against itself.
|
||||
@ -171,8 +178,8 @@ class TestPhotos(unittest.TestCase):
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
photo,
|
||||
photo,
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE,
|
||||
self.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
def test_different_photos(self):
|
||||
@ -182,15 +189,15 @@ class TestPhotos(unittest.TestCase):
|
||||
self.assertFalse(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE,
|
||||
self.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
self.assertFalse(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE,
|
||||
self.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
def test_matching_photos(self):
|
||||
@ -200,29 +207,29 @@ class TestPhotos(unittest.TestCase):
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"},
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE,
|
||||
self.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"},
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE,
|
||||
self.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"},
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE,
|
||||
self.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
self.assertTrue(duplicates.compare_photos(
|
||||
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
||||
{"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"},
|
||||
TestPhotos.IMAGE_CACHE,
|
||||
TestPhotos.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE,
|
||||
self.HASH_THRESHOLD
|
||||
))
|
||||
|
||||
|
||||
@ -233,7 +240,12 @@ class TestDuplicates(unittest.TestCase):
|
||||
DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name
|
||||
DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name
|
||||
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
|
||||
IMAGE_CACHE = ImageCache() # pylint: disable=invalid-name
|
||||
|
||||
def __init__(self, *args, **kwargs):
    """
    Give each test instance its own image cache in a temporary directory.

    The directory is removed by a cleanup registered on the test case,
    so running the tests does not leave ``flatisfy-*`` directories
    behind in the system temporary folder.
    """
    import shutil  # local import, only needed for the cleanup below

    storage_dir = tempfile.mkdtemp(prefix="flatisfy-")
    self.IMAGE_CACHE = ImageCache(  # pylint: disable=invalid-name
        storage_dir=storage_dir
    )
    super(TestDuplicates, self).__init__(*args, **kwargs)
    # mkdtemp() leaves deletion to the caller. Cleanups can only be
    # registered once the base class has been initialised.
    self.addCleanup(shutil.rmtree, storage_dir, ignore_errors=True)
|
||||
|
||||
@staticmethod
|
||||
def generate_fake_flat():
|
||||
@ -276,10 +288,10 @@ class TestDuplicates(unittest.TestCase):
|
||||
flat2 = copy.deepcopy(flat1)
|
||||
score = duplicates.get_duplicate_score(
|
||||
flat1, flat2,
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
score >= self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
)
|
||||
|
||||
def test_different_prices(self):
|
||||
@ -292,10 +304,10 @@ class TestDuplicates(unittest.TestCase):
|
||||
|
||||
score = duplicates.get_duplicate_score(
|
||||
flat1, flat2,
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
)
|
||||
|
||||
def test_different_rooms(self):
|
||||
@ -309,10 +321,10 @@ class TestDuplicates(unittest.TestCase):
|
||||
|
||||
score = duplicates.get_duplicate_score(
|
||||
flat1, flat2,
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
)
|
||||
|
||||
def test_different_areas(self):
|
||||
@ -325,10 +337,10 @@ class TestDuplicates(unittest.TestCase):
|
||||
|
||||
score = duplicates.get_duplicate_score(
|
||||
flat1, flat2,
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
)
|
||||
|
||||
def test_different_areas_decimals(self):
|
||||
@ -343,10 +355,10 @@ class TestDuplicates(unittest.TestCase):
|
||||
|
||||
score = duplicates.get_duplicate_score(
|
||||
flat1, flat2,
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
)
|
||||
|
||||
def test_different_phones(self):
|
||||
@ -360,10 +372,10 @@ class TestDuplicates(unittest.TestCase):
|
||||
|
||||
score = duplicates.get_duplicate_score(
|
||||
flat1, flat2,
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||
)
|
||||
|
||||
def test_real_duplicates(self):
|
||||
@ -378,10 +390,10 @@ class TestDuplicates(unittest.TestCase):
|
||||
|
||||
score = duplicates.get_duplicate_score(
|
||||
flats[0], flats[1],
|
||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
||||
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||
)
|
||||
self.assertTrue(
|
||||
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS
|
||||
score >= self.DUPLICATES_MIN_SCORE_WITH_PHOTOS
|
||||
)
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user