flatisfy/flatisfy/tests.py

458 lines
14 KiB
Python
Raw Normal View History

2018-01-14 11:53:59 +01:00
# coding: utf-8
"""
This module contains unit testing functions.
"""
import copy
import json
import logging
import os
import random
import sys
import unittest
import tempfile
2018-01-30 10:36:40 +01:00
from io import BytesIO
import PIL
2018-01-19 13:53:53 +01:00
import requests
import requests_mock
2018-01-14 11:53:59 +01:00
from flatisfy import tools
from flatisfy.filters import duplicates
from flatisfy.filters.cache import ImageCache
from flatisfy.constants import BACKENDS_BY_PRECEDENCE
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
# Directory holding the test fixtures ("test_files/", next to this file). The
# trailing slash lets callers build fixture paths by plain concatenation.
TESTS_DATA_DIR = os.path.dirname(os.path.realpath(__file__)) + "/test_files/"
2018-01-19 13:53:53 +01:00
class LocalImageCache(ImageCache):
    """
    Image cache used by the tests, fed from local files instead of real URLs.
    """

    @staticmethod
    def on_miss(path):
        """
        Load the photo stored at ``path`` when it is not in the cache yet.

        The file content is registered behind a mocked ``mock://flatisfy`` URL
        and read back through ``requests``.

        :param path: Path of the image file on disk.
        :return: The image, as opened by ``PIL.Image.open``.
        """
        mocked_url = "mock://flatisfy" + path
        with open(path, "rb") as image_file:
            payload = image_file.read()

        # The request has to be issued while the mocker is active.
        with requests_mock.Mocker() as mocker:
            mocker.get(mocked_url, content=payload)
            response = requests.get(mocked_url)

        return PIL.Image.open(BytesIO(response.content))
2018-01-19 13:53:53 +01:00
2018-01-14 11:53:59 +01:00
class TestTexts(unittest.TestCase):
    """
    Checks string normalizations.
    """

    def test_roman_numbers(self):
        """
        Checks arabic numbers are replaced by roman ones.
        """
        self.assertEqual("XIV", tools.convert_arabic_to_roman("14"))
        self.assertEqual("XXXIX", tools.convert_arabic_to_roman("39"))

        # These two inputs are expected to come back unchanged.
        self.assertEqual("40", tools.convert_arabic_to_roman("40"))
        self.assertEqual("1987", tools.convert_arabic_to_roman("1987"))

        self.assertEqual(
            "Dans le XVe arrondissement",
            tools.convert_arabic_to_roman_in_text("Dans le 15e arrondissement"),
        )
        self.assertEqual(
            "XXeme arr.",
            tools.convert_arabic_to_roman_in_text("20eme arr."),
        )

        # Texts without any arabic number must be left untouched.
        self.assertEqual(
            "A AIX EN PROVENCE",
            tools.convert_arabic_to_roman_in_text("A AIX EN PROVENCE"),
        )
        self.assertEqual(
            "Montigny Le Bretonneux",
            tools.convert_arabic_to_roman_in_text("Montigny Le Bretonneux"),
        )

    def test_roman_numbers_in_text(self):
        """
        Checks conversion of arabic numbers to roman ones in string
        normalization.
        """
        self.assertEqual(
            "dans le XVe arrondissement",
            tools.normalize_string("Dans le 15e arrondissement"),
        )
        # The arabic ordinal is expected as an uppercase roman number...
        self.assertEqual(
            "paris XVe, 75005",
            tools.normalize_string("Paris 15e, 75005"),
        )
        # ... whereas an already roman ordinal is expected lowercased only.
        self.assertEqual(
            "paris xve, 75005",
            tools.normalize_string("Paris XVe, 75005"),
        )

    def test_multiple_whitespaces(self):
        """
        Checks whitespaces are collapsed.
        """
        # The input must hold a run of several spaces, otherwise the
        # assertion passes without exercising any collapsing.
        self.assertEqual(
            "avec ascenseur",
            tools.normalize_string("avec     ascenseur"),
        )

    def test_whitespace_trim(self):
        """
        Checks that trailing and beginning whitespaces are trimmed.
        """
        self.assertEqual("rennes 35000", tools.normalize_string(" Rennes 35000 "))

    def test_accents(self):
        """
        Checks accents are replaced.
        """
        self.assertEqual("eeeaui", tools.normalize_string(u"éèêàüï"))
2018-01-14 11:53:59 +01:00
2018-01-14 11:53:59 +01:00
class TestPhoneNumbers(unittest.TestCase):
    """
    Checks phone numbers normalizations.
    """

    def _assert_homogeneized(self, raw_number):
        """
        Assert that ``raw_number`` is homogeneized into ``"0605040302"``.

        :param raw_number: The phone number, as it could be written in a post.
        """
        homogeneized = duplicates.homogeneize_phone_number(raw_number)
        self.assertEqual("0605040302", homogeneized)

    def test_prefix(self):
        """
        Checks phone numbers with international prefixes.
        """
        self._assert_homogeneized("+33605040302")

    def test_dots_separators(self):
        """
        Checks phone numbers with dots.
        """
        self._assert_homogeneized("06.05.04.03.02")

    def test_spaces_separators(self):
        """
        Checks phone numbers with spaces.
        """
        self._assert_homogeneized("06 05 04 03 02")
2018-01-14 11:53:59 +01:00
2018-01-19 13:53:53 +01:00
class TestPhotos(unittest.TestCase):
    """
    Checks photos comparison (``duplicates.compare_photos``).
    """

    HASH_THRESHOLD = 10  # pylint: disable=invalid-name
    # Fixme: the image hash threshold should be 10 ideally, for cropped photos
    # as well.
    CROPPED_HASH_THRESHOLD = 20  # pylint: disable=invalid-name

    def setUp(self):
        """
        Create a fresh image cache, backed by a temporary directory which is
        removed once the test is over.
        """
        storage_dir = tempfile.TemporaryDirectory(prefix="flatisfy-")
        # Registered right away, so that the directory never outlives the test.
        self.addCleanup(storage_dir.cleanup)
        self.IMAGE_CACHE = LocalImageCache(  # pylint: disable=invalid-name
            storage_dir=storage_dir.name
        )

    @staticmethod
    def _photo(filename):
        """
        Build the photo dict of the test fixture called ``filename``.
        """
        return {"url": TESTS_DATA_DIR + filename}

    def _compare(self, photo1, photo2, hash_threshold=None):
        """
        Compare two photo dicts, using ``HASH_THRESHOLD`` unless another
        threshold is given.
        """
        if hash_threshold is None:
            hash_threshold = self.HASH_THRESHOLD
        return duplicates.compare_photos(photo1, photo2, self.IMAGE_CACHE, hash_threshold)

    def test_same_photo_twice(self):
        """
        Compares a photo against itself.
        """
        photo = self._photo("127028739@seloger.jpg")
        self.assertTrue(self._compare(photo, photo))

    def test_different_photos(self):
        """
        Compares two different photos.
        """
        self.assertFalse(
            self._compare(
                self._photo("127028739@seloger.jpg"),
                self._photo("127028739-2@seloger.jpg"),
            )
        )
        self.assertFalse(
            self._compare(
                self._photo("127028739-2@seloger.jpg"),
                self._photo("127028739-3@seloger.jpg"),
            )
        )

    def test_matching_photos(self):
        """
        Compares two matching photos with different size and source.
        """
        self.assertTrue(
            self._compare(
                self._photo("127028739@seloger.jpg"),
                self._photo("14428129@explorimmo.jpg"),
            )
        )
        self.assertTrue(
            self._compare(
                self._photo("127028739-2@seloger.jpg"),
                self._photo("14428129-2@explorimmo.jpg"),
            )
        )
        self.assertTrue(
            self._compare(
                self._photo("127028739-3@seloger.jpg"),
                self._photo("14428129-3@explorimmo.jpg"),
            )
        )
        self.assertTrue(
            self._compare(
                self._photo("127028739@seloger.jpg"),
                self._photo("127028739-watermark@seloger.jpg"),
            )
        )

    def test_matching_cropped_photos(self):
        """
        Compares two matching photos with one being cropped.
        """
        self.assertTrue(
            self._compare(
                self._photo("vertical.jpg"),
                self._photo("vertical-cropped.jpg"),
                self.CROPPED_HASH_THRESHOLD,
            )
        )
        self.assertTrue(
            self._compare(
                self._photo("13783671@explorimmo.jpg"),
                self._photo("124910113@seloger.jpg"),
                self.CROPPED_HASH_THRESHOLD,
            )
        )
2018-01-19 13:53:53 +01:00
2018-01-22 12:40:52 +01:00
class TestImageCache(unittest.TestCase):
    """
    Checks image cache is working as expected.
    """

    def setUp(self):
        """
        Create a fresh image cache, backed by a temporary directory which is
        removed once the test is over.
        """
        storage_dir = tempfile.TemporaryDirectory(prefix="flatisfy-")
        # Registered right away, so that the directory never outlives the test.
        self.addCleanup(storage_dir.cleanup)
        self.IMAGE_CACHE = ImageCache(  # pylint: disable=invalid-name
            storage_dir=storage_dir.name
        )

    # NOTE(review): the URLs below target the public httpbin.org service, so
    # these tests presumably need network access -- confirm.

    def test_invalid_url(self):
        """
        Check that it returns nothing on an invalid URL.
        """
        # See https://framagit.org/phyks/Flatisfy/issues/116.
        self.assertIsNone(self.IMAGE_CACHE.get("https://httpbin.org/status/404"))
        self.assertIsNone(self.IMAGE_CACHE.get("https://httpbin.org/status/500"))

    def test_invalid_data(self):
        """
        Check that it returns nothing on an invalid data.
        """
        # See https://framagit.org/phyks/Flatisfy/issues/116.
        self.assertIsNone(self.IMAGE_CACHE.get("https://httpbin.org/"))
2018-01-14 11:53:59 +01:00
class TestDuplicates(unittest.TestCase):
    """
    Checks duplicates detection.
    """

    DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 8  # pylint: disable=invalid-name
    DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15  # pylint: disable=invalid-name
    HASH_THRESHOLD = 10  # pylint: disable=invalid-name

    def setUp(self):
        """
        Create a fresh image cache, backed by a temporary directory which is
        removed once the test is over.
        """
        storage_dir = tempfile.TemporaryDirectory(prefix="flatisfy-")
        # Registered right away, so that the directory never outlives the test.
        self.addCleanup(storage_dir.cleanup)
        self.IMAGE_CACHE = LocalImageCache(  # pylint: disable=invalid-name
            storage_dir=storage_dir.name
        )

    @staticmethod
    def generate_fake_flat():
        """
        Generates a fake flat post.

        :return: A dict describing a flat, with a random id, backend, number
            of rooms and bedrooms, area and cost, and a fixed phone number.
        """
        backend = random.choice(BACKENDS_BY_PRECEDENCE)
        return {
            "id": str(random.randint(100000, 199999)) + "@" + backend,
            "phone": "0607080910",
            "rooms": random.randint(1, 4),
            "utilities": "",
            "area": random.randint(200, 1500) / 10,
            "cost": random.randint(100000, 300000),
            "bedrooms": random.randint(1, 4),
        }

    @staticmethod
    def load_files(file1, file2):
        """
        Load two flats from the JSON test fixtures.

        :param file1: Name of the first fixture, without the ``.json``
            extension.
        :param file2: Name of the second fixture, without the ``.json``
            extension.
        :return: A list with the two loaded flats.
        """
        flats = []
        for name in (file1, file2):
            # Explicit encoding, so that loading the fixtures does not depend
            # on the locale of the machine running the tests.
            with open(TESTS_DATA_DIR + name + ".json", "r", encoding="utf-8") as flat_file:
                flats.append(json.load(flat_file))
        return flats

    def _score(self, flat1, flat2):
        """
        Compute the duplicate score of two flats, with the image cache and
        hash threshold of this test case.
        """
        return duplicates.get_duplicate_score(flat1, flat2, self.IMAGE_CACHE, self.HASH_THRESHOLD)

    def test_duplicates(self):
        """
        Two identical flats should be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        self.assertGreaterEqual(
            self._score(flat1, flat2),
            self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS,
        )

    def test_different_prices(self):
        """
        Two flats with different prices should not be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["cost"] += 1000
        self.assertLess(
            self._score(flat1, flat2),
            self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS,
        )

    def test_different_rooms(self):
        """
        Two flats with different rooms quantity should not be detected as
        duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["rooms"] += 1
        self.assertLess(
            self._score(flat1, flat2),
            self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS,
        )

    def test_different_areas(self):
        """
        Two flats with different areas should not be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["area"] += 10
        self.assertLess(
            self._score(flat1, flat2),
            self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS,
        )

    def test_different_areas_decimals(self):
        """
        Two flats which areas integers are equal but decimals are present and
        different should not be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat1["area"] = 50.65
        flat2["area"] = 50.37
        self.assertLess(
            self._score(flat1, flat2),
            self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS,
        )

    def test_different_phones(self):
        """
        Two flats with different phone numbers should not be detected as
        duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["phone"] = "0708091011"
        self.assertLess(
            self._score(flat1, flat2),
            self.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS,
        )

    def test_real_duplicates(self):
        """
        Two flats with same price, area and rooms quantity should be detected
        as duplicates.
        """
        flats = self.load_files("127028739@seloger", "14428129@explorimmo")
        self.assertGreaterEqual(
            self._score(flats[0], flats[1]),
            self.DUPLICATES_MIN_SCORE_WITH_PHOTOS,
        )

        # TODO: fixme, find new testing examples
        # flats = self.load_files(
        #     "128358415@seloger",
        #     "14818297@explorimmo"
        # )
        # score = duplicates.get_duplicate_score(
        #     flats[0], flats[1],
        #     self.IMAGE_CACHE, 20
        # )
        # self.assertGreaterEqual(score, self.DUPLICATES_MIN_SCORE_WITH_PHOTOS)

        # # Different number of photos, and some are cropped
        # flats = self.load_files(
        #     "124910113@seloger",
        #     "13783671@explorimmo"
        # )
        # score = duplicates.get_duplicate_score(
        #     flats[0], flats[1],
        #     self.IMAGE_CACHE, 20
        # )
        # self.assertGreaterEqual(score, self.DUPLICATES_MIN_SCORE_WITH_PHOTOS)

        # # Same flat, different agencies, texts and photos
        # flats = self.load_files(
        #     "122509451@seloger",
        #     "127963747@seloger"
        # )
        # score = duplicates.get_duplicate_score(
        #     flats[0], flats[1],
        #     self.IMAGE_CACHE, self.HASH_THRESHOLD
        # )
        # # Fix me : should be TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS
        # self.assertGreaterEqual(score, 4)

        # # Really similar flats, but different
        # flats = self.load_files(
        #     "123312807@seloger",
        #     "123314207@seloger"
        # )
        # score = duplicates.get_duplicate_score(
        #     flats[0], flats[1],
        #     self.IMAGE_CACHE, self.HASH_THRESHOLD
        # )
        # self.assertLess(score, self.DUPLICATES_MIN_SCORE_WITH_PHOTOS)
2018-01-22 12:40:52 +01:00
def run():
    """
    Run all the tests.

    The test suites are run one after the other. The process exits with
    status 1 as soon as one of them is not successful.
    """
    LOGGER.info("Running tests…")
    for testsuite in [
        TestTexts,
        TestPhoneNumbers,
        TestImageCache,
        TestDuplicates,
        TestPhotos,
    ]:
        suite = unittest.TestLoader().loadTestsFromTestCase(testsuite)
        result = unittest.TextTestRunner(verbosity=2).run(suite)
        # Explicit check rather than an ``assert``: assertions are stripped
        # when Python runs with ``-O``, which would silently hide failures.
        if not result.wasSuccessful():
            sys.exit(1)