parent
27d601ca21
commit
538bbe5a05
@ -51,6 +51,8 @@ DEFAULT_CONFIG = {
|
|||||||
"duplicate_threshold": 15,
|
"duplicate_threshold": 15,
|
||||||
# Score to consider two images as being duplicates through hash comparison
|
# Score to consider two images as being duplicates through hash comparison
|
||||||
"duplicate_image_hash_threshold": 10,
|
"duplicate_image_hash_threshold": 10,
|
||||||
|
# Whether images should be downloaded and served locally
|
||||||
|
"serve_images_locally": True,
|
||||||
# Navitia API key
|
# Navitia API key
|
||||||
"navitia_api_key": None,
|
"navitia_api_key": None,
|
||||||
# Number of filtering passes to run
|
# Number of filtering passes to run
|
||||||
|
@ -10,6 +10,7 @@ import logging
|
|||||||
|
|
||||||
from flatisfy import tools
|
from flatisfy import tools
|
||||||
from flatisfy.filters import duplicates
|
from flatisfy.filters import duplicates
|
||||||
|
from flatisfy.filters import images
|
||||||
from flatisfy.filters import metadata
|
from flatisfy.filters import metadata
|
||||||
|
|
||||||
|
|
||||||
@ -226,6 +227,9 @@ def second_pass(flats_list, constraint, config):
|
|||||||
flats_list, ignored_list = refine_with_details_criteria(flats_list,
|
flats_list, ignored_list = refine_with_details_criteria(flats_list,
|
||||||
constraint)
|
constraint)
|
||||||
|
|
||||||
|
if config["serve_images_locally"]:
|
||||||
|
images.download_images(flats_list, config)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"new": flats_list,
|
"new": flats_list,
|
||||||
"ignored": ignored_list,
|
"ignored": ignored_list,
|
||||||
|
@ -1,12 +1,16 @@
|
|||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Caching function for pictures.
|
Caching function for pictures.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import absolute_import, print_function, unicode_literals
|
from __future__ import absolute_import, print_function, unicode_literals
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import os
|
||||||
import requests
|
import requests
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
import PIL.Image
|
||||||
|
|
||||||
|
|
||||||
class MemoryCache(object):
|
class MemoryCache(object):
|
||||||
@ -81,8 +85,27 @@ class ImageCache(MemoryCache):
|
|||||||
A cache for images, stored in memory.
|
A cache for images, stored in memory.
|
||||||
"""
|
"""
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def on_miss(url):
|
def compute_filename(url):
    """
    Compute the name under which the image at a given URL is stored.

    :param url: The URL of the image, as a text string.
    :return: The hexadecimal SHA-1 digest of the UTF-8 encoded URL.
    """
    digest = hashlib.sha1()
    digest.update(url.encode("utf-8"))
    return digest.hexdigest()
|
||||||
|
|
||||||
|
def on_miss(self, url):
    """
    Helper to actually retrieve photos if not already cached.

    When a storage directory is configured, the image is looked up on disk
    first (under the name given by ``compute_filename``) and, if it had to
    be downloaded, a copy is saved there. Without a storage directory the
    image is only downloaded.

    :param url: The URL of the image to retrieve.
    :return: The image, as a ``PIL.Image`` object.
    """
    # Only build a path when there is a storage directory:
    # ``os.path.join(None, ...)`` raises a TypeError.
    filepath = None
    if self.storage_dir:
        filepath = os.path.join(
            self.storage_dir,
            self.compute_filename(url)
        )
        if os.path.isfile(filepath):
            # Already stored locally, no need to hit the network.
            return PIL.Image.open(filepath)

    image = PIL.Image.open(BytesIO(requests.get(url).content))
    if filepath:
        # Keep a local copy, in the same format as the downloaded image.
        image.save(filepath, format=image.format)
    return image
|
||||||
|
|
||||||
|
def __init__(self, storage_dir=None):
    """
    Set up the cache, optionally bound to a directory on disk.

    :param storage_dir: Path of the directory used to store images. It is
        created if it does not exist yet. With a falsy value (the
        default), no directory is created.
    """
    self.storage_dir = storage_dir
    must_create = bool(storage_dir) and not os.path.isdir(storage_dir)
    if must_create:
        os.makedirs(storage_dir)
    super(ImageCache, self).__init__()
|
||||||
|
@ -7,12 +7,10 @@ from __future__ import absolute_import, print_function, unicode_literals
|
|||||||
import collections
|
import collections
|
||||||
import itertools
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
import imagehash
|
import imagehash
|
||||||
import PIL.Image
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from flatisfy import tools
|
from flatisfy import tools
|
||||||
@ -61,8 +59,7 @@ def get_or_compute_photo_hash(photo, photo_cache):
|
|||||||
return photo["hash"]
|
return photo["hash"]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
# Otherwise, get the image and compute the hash
|
# Otherwise, get the image and compute the hash
|
||||||
req = photo_cache.get(photo["url"])
|
image = photo_cache.get(photo["url"])
|
||||||
image = PIL.Image.open(BytesIO(req.content))
|
|
||||||
photo["hash"] = imagehash.average_hash(image)
|
photo["hash"] = imagehash.average_hash(image)
|
||||||
return photo["hash"]
|
return photo["hash"]
|
||||||
|
|
||||||
@ -322,8 +319,13 @@ def deep_detect(flats_list, config):
|
|||||||
the flats objects that should be removed and considered as duplicates
|
the flats objects that should be removed and considered as duplicates
|
||||||
(they were already merged).
|
(they were already merged).
|
||||||
"""
|
"""
|
||||||
|
if config["serve_images_locally"]:
|
||||||
photo_cache = ImageCache()
|
storage_dir = os.path.join(config["data_directory"], "images")
|
||||||
|
else:
|
||||||
|
storage_dir = None
|
||||||
|
photo_cache = ImageCache(
|
||||||
|
storage_dir=storage_dir
|
||||||
|
)
|
||||||
|
|
||||||
LOGGER.info("Running deep duplicates detection.")
|
LOGGER.info("Running deep duplicates detection.")
|
||||||
matching_flats = collections.defaultdict(list)
|
matching_flats = collections.defaultdict(list)
|
||||||
|
34
flatisfy/filters/images.py
Normal file
34
flatisfy/filters/images.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
# coding: utf-8
|
||||||
|
"""
|
||||||
|
Filtering functions to handle images.
|
||||||
|
|
||||||
|
This includes functions to download images.
|
||||||
|
"""
|
||||||
|
from __future__ import absolute_import, print_function, unicode_literals
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
from flatisfy.filters.cache import ImageCache
|
||||||
|
|
||||||
|
|
||||||
|
LOGGER = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def download_images(flats_list, config):
    """
    Fetch the photos of every flat through an image cache stored in the
    ``images`` subdirectory of the data directory.

    The flats are modified in place: each photo gets a ``local`` key holding
    the file name computed by the cache for its URL.

    :param flats_list: A list of flats dicts. Each one is read through its
        ``photos`` key, a list of dicts with a ``url`` key.
    :param config: A config dict, read through its ``data_directory`` key.
    """
    images_dir = os.path.join(config["data_directory"], "images")
    photo_cache = ImageCache(storage_dir=images_dir)
    total = len(flats_list)
    for position, flat in enumerate(flats_list, 1):
        LOGGER.info("Downloading photos for flat %d/%d.", position, total)
        for photo in flat["photos"]:
            photo_url = photo["url"]
            # Going through the cache triggers the download on a miss.
            photo_cache.get(photo_url)
            # Record the name of the local copy next to the URL.
            photo["local"] = photo_cache.compute_filename(photo_url)
|
@ -9,6 +9,8 @@ import os
|
|||||||
import random
|
import random
|
||||||
import sys
|
import sys
|
||||||
import unittest
|
import unittest
|
||||||
|
import tempfile
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import requests_mock
|
import requests_mock
|
||||||
|
|
||||||
@ -157,9 +159,14 @@ class TestPhoneNumbers(unittest.TestCase):
|
|||||||
|
|
||||||
|
|
||||||
class TestPhotos(unittest.TestCase):
|
class TestPhotos(unittest.TestCase):
|
||||||
IMAGE_CACHE = LocalImageCache() # pylint: disable=invalid-name
|
|
||||||
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
|
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
    """
    Build the test case with its own image cache, stored in a fresh
    temporary directory which is removed once the test has run.
    """
    # Local import, only needed to clean up the temporary directory.
    import shutil

    storage_dir = tempfile.mkdtemp(prefix="flatisfy-")
    self.IMAGE_CACHE = LocalImageCache(  # pylint: disable=invalid-name
        storage_dir=storage_dir
    )
    super(TestPhotos, self).__init__(*args, **kwargs)
    # Cleanups can only be registered once the base class is initialized.
    # Without this, every test instance leaves a directory behind.
    self.addCleanup(shutil.rmtree, storage_dir, ignore_errors=True)
|
||||||
|
|
||||||
def test_same_photo_twice(self):
|
def test_same_photo_twice(self):
|
||||||
"""
|
"""
|
||||||
Compares a photo against itself.
|
Compares a photo against itself.
|
||||||
@ -171,8 +178,8 @@ class TestPhotos(unittest.TestCase):
|
|||||||
self.assertTrue(duplicates.compare_photos(
|
self.assertTrue(duplicates.compare_photos(
|
||||||
photo,
|
photo,
|
||||||
photo,
|
photo,
|
||||||
TestPhotos.IMAGE_CACHE,
|
self.IMAGE_CACHE,
|
||||||
TestPhotos.HASH_THRESHOLD
|
self.HASH_THRESHOLD
|
||||||
))
|
))
|
||||||
|
|
||||||
def test_different_photos(self):
|
def test_different_photos(self):
|
||||||
@ -182,15 +189,15 @@ class TestPhotos(unittest.TestCase):
|
|||||||
self.assertFalse(duplicates.compare_photos(
|
self.assertFalse(duplicates.compare_photos(
|
||||||
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
||||||
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
||||||
TestPhotos.IMAGE_CACHE,
|
self.IMAGE_CACHE,
|
||||||
TestPhotos.HASH_THRESHOLD
|
self.HASH_THRESHOLD
|
||||||
))
|
))
|
||||||
|
|
||||||
self.assertFalse(duplicates.compare_photos(
|
self.assertFalse(duplicates.compare_photos(
|
||||||
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
||||||
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
|
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
|
||||||
TestPhotos.IMAGE_CACHE,
|
self.IMAGE_CACHE,
|
||||||
TestPhotos.HASH_THRESHOLD
|
self.HASH_THRESHOLD
|
||||||
))
|
))
|
||||||
|
|
||||||
def test_matching_photos(self):
|
def test_matching_photos(self):
|
||||||
@ -200,29 +207,29 @@ class TestPhotos(unittest.TestCase):
|
|||||||
self.assertTrue(duplicates.compare_photos(
|
self.assertTrue(duplicates.compare_photos(
|
||||||
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
||||||
{"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"},
|
{"url": TESTS_DATA_DIR + "14428129@explorimmo.jpg"},
|
||||||
TestPhotos.IMAGE_CACHE,
|
self.IMAGE_CACHE,
|
||||||
TestPhotos.HASH_THRESHOLD
|
self.HASH_THRESHOLD
|
||||||
))
|
))
|
||||||
|
|
||||||
self.assertTrue(duplicates.compare_photos(
|
self.assertTrue(duplicates.compare_photos(
|
||||||
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
{"url": TESTS_DATA_DIR + "127028739-2@seloger.jpg"},
|
||||||
{"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"},
|
{"url": TESTS_DATA_DIR + "14428129-2@explorimmo.jpg"},
|
||||||
TestPhotos.IMAGE_CACHE,
|
self.IMAGE_CACHE,
|
||||||
TestPhotos.HASH_THRESHOLD
|
self.HASH_THRESHOLD
|
||||||
))
|
))
|
||||||
|
|
||||||
self.assertTrue(duplicates.compare_photos(
|
self.assertTrue(duplicates.compare_photos(
|
||||||
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
|
{"url": TESTS_DATA_DIR + "127028739-3@seloger.jpg"},
|
||||||
{"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"},
|
{"url": TESTS_DATA_DIR + "14428129-3@explorimmo.jpg"},
|
||||||
TestPhotos.IMAGE_CACHE,
|
self.IMAGE_CACHE,
|
||||||
TestPhotos.HASH_THRESHOLD
|
self.HASH_THRESHOLD
|
||||||
))
|
))
|
||||||
|
|
||||||
self.assertTrue(duplicates.compare_photos(
|
self.assertTrue(duplicates.compare_photos(
|
||||||
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
{"url": TESTS_DATA_DIR + "127028739@seloger.jpg"},
|
||||||
{"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"},
|
{"url": TESTS_DATA_DIR + "127028739-watermark@seloger.jpg"},
|
||||||
TestPhotos.IMAGE_CACHE,
|
self.IMAGE_CACHE,
|
||||||
TestPhotos.HASH_THRESHOLD
|
self.HASH_THRESHOLD
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
||||||
@ -233,7 +240,12 @@ class TestDuplicates(unittest.TestCase):
|
|||||||
DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name
|
DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14 # pylint: disable=invalid-name
|
||||||
DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name
|
DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15 # pylint: disable=invalid-name
|
||||||
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
|
HASH_THRESHOLD = 10 # pylint: disable=invalid-name
|
||||||
IMAGE_CACHE = ImageCache() # pylint: disable=invalid-name
|
|
||||||
|
def __init__(self, *args, **kwargs):
    """
    Build the test case with its own image cache, stored in a fresh
    temporary directory which is removed once the test has run.
    """
    # Local import, only needed to clean up the temporary directory.
    import shutil

    storage_dir = tempfile.mkdtemp(prefix="flatisfy-")
    self.IMAGE_CACHE = ImageCache(  # pylint: disable=invalid-name
        storage_dir=storage_dir
    )
    super(TestDuplicates, self).__init__(*args, **kwargs)
    # Cleanups can only be registered once the base class is initialized.
    # Without this, every test instance leaves a directory behind.
    self.addCleanup(shutil.rmtree, storage_dir, ignore_errors=True)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def generate_fake_flat():
|
def generate_fake_flat():
|
||||||
@ -276,10 +288,10 @@ class TestDuplicates(unittest.TestCase):
|
|||||||
flat2 = copy.deepcopy(flat1)
|
flat2 = copy.deepcopy(flat1)
|
||||||
score = duplicates.get_duplicate_score(
|
score = duplicates.get_duplicate_score(
|
||||||
flat1, flat2,
|
flat1, flat2,
|
||||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||||
)
|
)
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
score >= self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_different_prices(self):
|
def test_different_prices(self):
|
||||||
@ -292,10 +304,10 @@ class TestDuplicates(unittest.TestCase):
|
|||||||
|
|
||||||
score = duplicates.get_duplicate_score(
|
score = duplicates.get_duplicate_score(
|
||||||
flat1, flat2,
|
flat1, flat2,
|
||||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||||
)
|
)
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_different_rooms(self):
|
def test_different_rooms(self):
|
||||||
@ -309,10 +321,10 @@ class TestDuplicates(unittest.TestCase):
|
|||||||
|
|
||||||
score = duplicates.get_duplicate_score(
|
score = duplicates.get_duplicate_score(
|
||||||
flat1, flat2,
|
flat1, flat2,
|
||||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||||
)
|
)
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_different_areas(self):
|
def test_different_areas(self):
|
||||||
@ -325,10 +337,10 @@ class TestDuplicates(unittest.TestCase):
|
|||||||
|
|
||||||
score = duplicates.get_duplicate_score(
|
score = duplicates.get_duplicate_score(
|
||||||
flat1, flat2,
|
flat1, flat2,
|
||||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||||
)
|
)
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_different_areas_decimals(self):
|
def test_different_areas_decimals(self):
|
||||||
@ -343,10 +355,10 @@ class TestDuplicates(unittest.TestCase):
|
|||||||
|
|
||||||
score = duplicates.get_duplicate_score(
|
score = duplicates.get_duplicate_score(
|
||||||
flat1, flat2,
|
flat1, flat2,
|
||||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||||
)
|
)
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_different_phones(self):
|
def test_different_phones(self):
|
||||||
@ -360,10 +372,10 @@ class TestDuplicates(unittest.TestCase):
|
|||||||
|
|
||||||
score = duplicates.get_duplicate_score(
|
score = duplicates.get_duplicate_score(
|
||||||
flat1, flat2,
|
flat1, flat2,
|
||||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||||
)
|
)
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
score < TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
score < self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_real_duplicates(self):
|
def test_real_duplicates(self):
|
||||||
@ -378,10 +390,10 @@ class TestDuplicates(unittest.TestCase):
|
|||||||
|
|
||||||
score = duplicates.get_duplicate_score(
|
score = duplicates.get_duplicate_score(
|
||||||
flats[0], flats[1],
|
flats[0], flats[1],
|
||||||
TestDuplicates.IMAGE_CACHE, TestDuplicates.HASH_THRESHOLD
|
self.IMAGE_CACHE, self.HASH_THRESHOLD
|
||||||
)
|
)
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
score >= TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS
|
score >= self.DUPLICATES_MIN_SCORE_WITH_PHOTOS
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user