Merge branch 'better-dedup' into 'master'
Optimize photo comparisons. Closes #41 and #59. See merge request !9.
This commit is contained in:
commit
4c07fc8ba1
@ -80,7 +80,7 @@ def refine_with_housing_criteria(flats_list, constraint):
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@tools.timeit
|
||||||
def first_pass(flats_list, constraint, config):
|
def first_pass(flats_list, constraint, config):
|
||||||
"""
|
"""
|
||||||
First filtering pass.
|
First filtering pass.
|
||||||
@ -123,7 +123,7 @@ def first_pass(flats_list, constraint, config):
|
|||||||
"duplicate": duplicates_by_id + duplicates_by_urls
|
"duplicate": duplicates_by_id + duplicates_by_urls
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@tools.timeit
|
||||||
def second_pass(flats_list, constraint, config):
|
def second_pass(flats_list, constraint, config):
|
||||||
"""
|
"""
|
||||||
Second filtering pass.
|
Second filtering pass.
|
||||||
@ -163,7 +163,7 @@ def second_pass(flats_list, constraint, config):
|
|||||||
"duplicate": []
|
"duplicate": []
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@tools.timeit
|
||||||
def third_pass(flats_list, config):
|
def third_pass(flats_list, config):
|
||||||
"""
|
"""
|
||||||
Third filtering pass.
|
Third filtering pass.
|
||||||
@ -175,6 +175,8 @@ def third_pass(flats_list, config):
|
|||||||
:param config: A config dict.
|
:param config: A config dict.
|
||||||
:return: A dict mapping flat status and list of flat objects.
|
:return: A dict mapping flat status and list of flat objects.
|
||||||
"""
|
"""
|
||||||
|
LOGGER.info("Running third filtering pass.")
|
||||||
|
|
||||||
# Deduplicate the list using every available data
|
# Deduplicate the list using every available data
|
||||||
flats_list, duplicate_flats = duplicates.deep_detect(flats_list)
|
flats_list, duplicate_flats = duplicates.deep_detect(flats_list)
|
||||||
|
|
||||||
|
45
flatisfy/filters/cache.py
Normal file
45
flatisfy/filters/cache.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
# coding: utf-8
|
||||||
|
|
||||||
|
"""
|
||||||
|
Caching function for pictures.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import, print_function, unicode_literals
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
class MemoryCache(object):
    """
    A simple in-memory cache which computes missing values on demand and keeps
    track of the number of hits and misses.
    """

    def __init__(self, on_miss):
        """
        :param on_miss: A callable taking the looked-up key as its single
            argument and returning the value to store for this key.
        """
        self.hits = 0
        self.misses = 0
        self.map = {}
        self.on_miss = on_miss

    def get(self, key):
        """
        Get the value associated with a key, computing it with ``on_miss`` and
        storing it if the key was never looked up before.

        :param key: The key to look up. It is used as a dict key.
        :return: The cached value, or the freshly computed one.
        """
        # Test for membership rather than comparing the stored value with
        # ``None``: a legitimately cached ``None`` must count as a hit and must
        # not trigger ``on_miss`` again.
        if key in self.map:
            self.hits += 1
            return self.map[key]

        # If ``on_miss`` raises, nothing is stored and the miss counter is left
        # untouched; the exception propagates to the caller.
        item = self.map[key] = self.on_miss(key)
        self.misses += 1
        return item

    def total(self):
        """
        :return: The total number of lookups counted so far (hits + misses).
        """
        return self.hits + self.misses

    def hit_rate(self):
        """
        :return: The percentage of lookups that were hits, as an integer
            rounded down. At least one lookup must have been counted.
        """
        assert self.total() > 0
        return 100 * self.hits // self.total()

    def miss_rate(self):
        """
        :return: The percentage of lookups that were misses, as an integer
            rounded down. At least one lookup must have been counted.
        """
        assert self.total() > 0
        return 100 * self.misses // self.total()
|
||||||
|
|
||||||
|
class ImageCache(MemoryCache):
    """
    A memory cache keyed by photo URL, which fetches the photo over HTTP the
    first time a URL is looked up.
    """

    @staticmethod
    def retrieve_photo(url):
        """
        Fetch the photo at the given URL.

        :param url: The URL of the photo to fetch.
        :return: The object returned by ``requests.get`` for this URL.
        """
        return requests.get(url)

    def __init__(self):
        # Name the class explicitly: ``super(self.__class__, self)`` resolves
        # to the subclass when this class is inherited from, which makes
        # ``__init__`` call itself forever.
        super(ImageCache, self).__init__(on_miss=ImageCache.retrieve_photo)
|
@ -16,6 +16,7 @@ import PIL.Image
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from flatisfy import tools
|
from flatisfy import tools
|
||||||
|
from flatisfy.filters.cache import ImageCache
|
||||||
|
|
||||||
LOGGER = logging.getLogger(__name__)
|
LOGGER = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -60,8 +61,7 @@ def homogeneize_phone_number(number):
|
|||||||
|
|
||||||
return number
|
return number
|
||||||
|
|
||||||
|
def find_number_common_photos(photo_cache, flat1_photos, flat2_photos):
|
||||||
def find_number_common_photos(flat1_photos, flat2_photos):
|
|
||||||
"""
|
"""
|
||||||
Compute the number of common photos between the two lists of photos for the
|
Compute the number of common photos between the two lists of photos for the
|
||||||
flats.
|
flats.
|
||||||
@ -75,13 +75,14 @@ def find_number_common_photos(flat1_photos, flat2_photos):
|
|||||||
:return: The found number of common photos.
|
:return: The found number of common photos.
|
||||||
"""
|
"""
|
||||||
n_common_photos = 0
|
n_common_photos = 0
|
||||||
|
|
||||||
for photo1, photo2 in itertools.product(flat1_photos, flat2_photos):
|
for photo1, photo2 in itertools.product(flat1_photos, flat2_photos):
|
||||||
try:
|
try:
|
||||||
req1 = requests.get(photo1["url"])
|
req1 = photo_cache.get(photo1["url"])
|
||||||
im1 = PIL.Image.open(BytesIO(req1.content))
|
im1 = PIL.Image.open(BytesIO(req1.content))
|
||||||
hash1 = imagehash.average_hash(im1)
|
hash1 = imagehash.average_hash(im1)
|
||||||
|
|
||||||
req2 = requests.get(photo2["url"])
|
req2 = photo_cache.get(photo2["url"])
|
||||||
im2 = PIL.Image.open(BytesIO(req2.content))
|
im2 = PIL.Image.open(BytesIO(req2.content))
|
||||||
hash2 = imagehash.average_hash(im2)
|
hash2 = imagehash.average_hash(im2)
|
||||||
|
|
||||||
@ -89,6 +90,7 @@ def find_number_common_photos(flat1_photos, flat2_photos):
|
|||||||
n_common_photos += 1
|
n_common_photos += 1
|
||||||
except (IOError, requests.exceptions.RequestException):
|
except (IOError, requests.exceptions.RequestException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return n_common_photos
|
return n_common_photos
|
||||||
|
|
||||||
|
|
||||||
@ -187,6 +189,9 @@ def deep_detect(flats_list):
|
|||||||
the flats objects that should be removed and considered as duplicates (they
|
the flats objects that should be removed and considered as duplicates (they
|
||||||
were already merged).
|
were already merged).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
photo_cache = ImageCache()
|
||||||
|
|
||||||
LOGGER.info("Running deep duplicates detection.")
|
LOGGER.info("Running deep duplicates detection.")
|
||||||
matching_flats = collections.defaultdict(list)
|
matching_flats = collections.defaultdict(list)
|
||||||
for i, flat1 in enumerate(flats_list):
|
for i, flat1 in enumerate(flats_list):
|
||||||
@ -250,16 +255,22 @@ def deep_detect(flats_list):
|
|||||||
# They should have at least one photo in common if there
|
# They should have at least one photo in common if there
|
||||||
# are some photos
|
# are some photos
|
||||||
if flat1["photos"] and flat2["photos"]:
|
if flat1["photos"] and flat2["photos"]:
|
||||||
max_number_photos = max(len(flat1["photos"]),
|
|
||||||
len(flat2["photos"]))
|
|
||||||
n_common_photos = find_number_common_photos(
|
n_common_photos = find_number_common_photos(
|
||||||
|
photo_cache,
|
||||||
flat1["photos"],
|
flat1["photos"],
|
||||||
flat2["photos"]
|
flat2["photos"]
|
||||||
)
|
)
|
||||||
assert n_common_photos > 1
|
assert n_common_photos > 1
|
||||||
n_common_items += int(
|
|
||||||
20 * n_common_photos / max_number_photos
|
min_number_photos = min(len(flat1["photos"]),
|
||||||
)
|
len(flat2["photos"]))
|
||||||
|
|
||||||
|
# Either all the photos are the same, or there are at least
|
||||||
|
# three common photos.
|
||||||
|
if n_common_photos == min_number_photos:
|
||||||
|
n_common_items += 15
|
||||||
|
else:
|
||||||
|
n_common_items += 5 * min(n_common_photos, 3)
|
||||||
|
|
||||||
# Minimal score to consider they are duplicates
|
# Minimal score to consider they are duplicates
|
||||||
assert n_common_items >= 15
|
assert n_common_items >= 15
|
||||||
@ -281,6 +292,11 @@ def deep_detect(flats_list):
|
|||||||
matching_flats[flat1["id"]].append(flat2["id"])
|
matching_flats[flat1["id"]].append(flat2["id"])
|
||||||
matching_flats[flat2["id"]].append(flat1["id"])
|
matching_flats[flat2["id"]].append(flat1["id"])
|
||||||
|
|
||||||
|
if photo_cache.total():
|
||||||
|
LOGGER.debug("Photo cache: hits: %d%% / misses: %d%%.",
|
||||||
|
photo_cache.hit_rate(),
|
||||||
|
photo_cache.miss_rate())
|
||||||
|
|
||||||
seen_ids = []
|
seen_ids = []
|
||||||
duplicate_flats = []
|
duplicate_flats = []
|
||||||
unique_flats_list = []
|
unique_flats_list = []
|
||||||
|
@ -13,6 +13,7 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import unidecode
|
import unidecode
|
||||||
@ -299,3 +300,16 @@ def get_travel_time_between(latlng_from, latlng_to, config):
|
|||||||
"sections": sections
|
"sections": sections
|
||||||
}
|
}
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def timeit(f):
    """
    A decorator that logs how much time was spent in the function.

    :param f: The function to time.
    :return: A wrapper calling ``f`` with the same arguments, logging the
        elapsed wall-clock time at INFO level and returning the result of
        ``f``. Nothing is logged if ``f`` raises.
    """
    # Imported locally so that this function does not depend on the module
    # import block being updated.
    import functools

    # Preserve the name and docstring of the decorated function, so that
    # introspection and other log messages still refer to the real function
    # and not to ``wrapped``.
    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        before = time.time()
        res = f(*args, **kwargs)
        runtime = time.time() - before
        LOGGER.info("%s -- Execution took %s seconds.", f.__name__, runtime)
        return res
    return wrapped
|
||||||
|
Loading…
Reference in New Issue
Block a user