# coding: utf-8
"""
This module contains unit testing functions.
"""
import copy
import json
import logging
import os
import random
import sys
import unittest

import requests
import requests_mock

from flatisfy import tools
from flatisfy.filters import duplicates
from flatisfy.filters.cache import ImageCache
from flatisfy.constants import BACKENDS_BY_PRECEDENCE


LOGGER = logging.getLogger(__name__)
TESTS_DATA_DIR = os.path.dirname(os.path.realpath(__file__)) + "/test_files/"


class LocalImageCache(ImageCache):
    """
    An image cache whose misses are served from local files rather than
    from the network.
    """
    @staticmethod
    def on_miss(path):
        """
        Helper to actually retrieve photos if not already cached.

        The content of the local file at ``path`` is registered as the body
        of a mocked ``GET`` on ``mock://flatisfy<path>``, and that mocked URL
        is then fetched with ``requests``.

        :param path: Path of the local image file to serve.
        :return: The ``requests`` response for the mocked URL.
        """
        url = "mock://flatisfy" + path
        with requests_mock.Mocker() as mock:
            with open(path, "rb") as fh:
                mock.get(url, content=fh.read())
            # The request must be issued while the mocker is still active.
            return requests.get(url)


class TestTexts(unittest.TestCase):
    """
    Checks string normalizations.
    """
    def test_roman_numbers(self):
        """
        Checks roman numbers replacement.
        """
        self.assertEqual(
            "XIV",
            tools.convert_arabic_to_roman("14")
        )

        self.assertEqual(
            "XXXIX",
            tools.convert_arabic_to_roman("39")
        )

        # These inputs are expected to come back unchanged.
        self.assertEqual(
            "40",
            tools.convert_arabic_to_roman("40")
        )

        self.assertEqual(
            "1987",
            tools.convert_arabic_to_roman("1987")
        )

        self.assertEqual(
            "Dans le XVe arrondissement",
            tools.convert_arabic_to_roman_in_text("Dans le 15e arrondissement")
        )

        self.assertEqual(
            "XXeme arr.",
            tools.convert_arabic_to_roman_in_text("20eme arr.")
        )

        # Texts without any arabic number must be left untouched.
        self.assertEqual(
            "A AIX EN PROVENCE",
            tools.convert_arabic_to_roman_in_text("A AIX EN PROVENCE")
        )

        self.assertEqual(
            "Montigny Le Bretonneux",
            tools.convert_arabic_to_roman_in_text("Montigny Le Bretonneux")
        )

    def test_roman_numbers_in_text(self):
        """
        Checks conversion of arabic numbers to roman ones in string
        normalization.
        """
        self.assertEqual(
            "dans le XVe arrondissement",
            tools.normalize_string("Dans le 15e arrondissement")
        )

        self.assertEqual(
            "paris XVe, 75005",
            tools.normalize_string("Paris 15e, 75005")
        )

        self.assertEqual(
            "paris xve, 75005",
            tools.normalize_string("Paris XVe, 75005")
        )

    def test_multiple_whitespaces(self):
        """
        Checks whitespaces are collapsed.
        """
        # The input deliberately contains runs of several spaces; an input
        # equal to the expected output would not exercise the collapsing.
        self.assertEqual(
            "avec ascenseur",
            tools.normalize_string("avec   ascenseur")
        )

    def test_accents(self):
        """
        Checks accents are replaced.
        """
        self.assertEqual(
            "eeeaui",
            tools.normalize_string(u"éèêàüï")
        )


class TestPhoneNumbers(unittest.TestCase):
    """
    Checks phone numbers normalizations.
    """
    def test_prefix(self):
        """
        Checks phone numbers with international prefixes.
        """
        self.assertEqual(
            "0605040302",
            duplicates.homogeneize_phone_number("+33605040302")
        )

    def test_dots_separators(self):
        """
        Checks phone numbers with dots.
        """
        self.assertEqual(
            "0605040302",
            duplicates.homogeneize_phone_number("06.05.04.03.02")
        )

    def test_spaces_separators(self):
        """
        Checks phone numbers with spaces.
        """
        self.assertEqual(
            "0605040302",
            duplicates.homogeneize_phone_number("06 05 04 03 02")
        )


class TestPhotos(unittest.TestCase):
    """
    Checks photos comparison, using image files from the tests data
    directory.
    """
    IMAGE_CACHE = LocalImageCache()  # pylint: disable=invalid-name
    HASH_THRESHOLD = 10  # pylint: disable=invalid-name

    @staticmethod
    def _compare(name1, name2, hash_threshold=None):
        """
        Compare two photos from the tests data directory.

        :param name1: File name of the first photo.
        :param name2: File name of the second photo.
        :param hash_threshold: Threshold passed to ``compare_photos``.
            Defaults to ``TestPhotos.HASH_THRESHOLD``.
        :return: The result of ``duplicates.compare_photos``.
        """
        if hash_threshold is None:
            hash_threshold = TestPhotos.HASH_THRESHOLD
        return duplicates.compare_photos(
            {"url": TESTS_DATA_DIR + name1},
            {"url": TESTS_DATA_DIR + name2},
            TestPhotos.IMAGE_CACHE,
            hash_threshold
        )

    def test_same_photo_twice(self):
        """
        Compares a photo against itself.
        """
        photo = {
            "url": TESTS_DATA_DIR + "127028739@seloger.jpg"
        }

        self.assertTrue(duplicates.compare_photos(
            photo,
            photo,
            TestPhotos.IMAGE_CACHE,
            TestPhotos.HASH_THRESHOLD
        ))

    def test_different_photos(self):
        """
        Compares two different photos.
        """
        self.assertFalse(self._compare(
            "127028739@seloger.jpg",
            "127028739-2@seloger.jpg"
        ))

        self.assertFalse(self._compare(
            "127028739-2@seloger.jpg",
            "127028739-3@seloger.jpg"
        ))

    def test_matching_photos(self):
        """
        Compares two matching photos with different size and source.
        """
        self.assertTrue(self._compare(
            "127028739@seloger.jpg",
            "14428129@explorimmo.jpg"
        ))

        self.assertTrue(self._compare(
            "127028739-2@seloger.jpg",
            "14428129-2@explorimmo.jpg"
        ))

        self.assertTrue(self._compare(
            "127028739-3@seloger.jpg",
            "14428129-3@explorimmo.jpg"
        ))

        self.assertTrue(self._compare(
            "127028739@seloger.jpg",
            "127028739-watermark@seloger.jpg"
        ))

        # This pair is compared with a threshold of 20 instead of the
        # default one.
        self.assertTrue(self._compare(
            "vertical.jpg",
            "vertical-cropped.jpg",
            20
        ))


class TestDuplicates(unittest.TestCase):
    """
    Checks duplicates detection.
    """
    DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14  # pylint: disable=invalid-name
    DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15  # pylint: disable=invalid-name
    HASH_THRESHOLD = 10  # pylint: disable=invalid-name
    IMAGE_CACHE = ImageCache()  # pylint: disable=invalid-name

    @staticmethod
    def generate_fake_flat():
        """
        Generates a fake flat post.

        :return: A dict describing a flat, with a random id, backend, rooms,
            area, cost and bedrooms, and a fixed phone number.
        """
        backend = random.choice(BACKENDS_BY_PRECEDENCE)
        return {
            "id": str(random.randint(100000, 199999)) + "@" + backend,
            "phone": "0607080910",
            "rooms": random.randint(1, 4),
            "utilities": "",
            "area": random.randint(200, 1500) / 10,
            "cost": random.randint(100000, 300000),
            "bedrooms": random.randint(1, 4)
        }

    @staticmethod
    def load_files(file1, file2):
        """
        Load two flats from JSON files of the tests data directory.

        :param file1: Name of the first file, without the ``.json`` extension.
        :param file2: Name of the second file, without the ``.json``
            extension.
        :return: A list with the two loaded flats, in the same order.
        """
        with open(TESTS_DATA_DIR + file1 + ".json", "r") as flat_file:
            flat1 = json.load(flat_file)

        with open(TESTS_DATA_DIR + file2 + ".json", "r") as flat_file:
            flat2 = json.load(flat_file)

        return [flat1, flat2]

    @staticmethod
    def _score(flat1, flat2, hash_threshold=None):
        """
        Compute the duplicate score of two flats.

        :param flat1: First flat.
        :param flat2: Second flat.
        :param hash_threshold: Threshold passed to ``get_duplicate_score``.
            Defaults to ``TestDuplicates.HASH_THRESHOLD``.
        :return: The result of ``duplicates.get_duplicate_score``.
        """
        if hash_threshold is None:
            hash_threshold = TestDuplicates.HASH_THRESHOLD
        return duplicates.get_duplicate_score(
            flat1,
            flat2,
            TestDuplicates.IMAGE_CACHE,
            hash_threshold
        )

    def test_duplicates(self):
        """
        Two identical flats should be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)

        self.assertGreaterEqual(
            self._score(flat1, flat2),
            TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_prices(self):
        """
        Two flats with different prices should not be detected as
        duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["cost"] += 1000

        self.assertLess(
            self._score(flat1, flat2),
            TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_rooms(self):
        """
        Two flats with different rooms quantity should not be detected as
        duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["rooms"] += 1

        self.assertLess(
            self._score(flat1, flat2),
            TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_areas(self):
        """
        Two flats with different areas should not be detected as
        duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["area"] += 10

        self.assertLess(
            self._score(flat1, flat2),
            TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_areas_decimals(self):
        """
        Two flats which areas integers are equal but decimals are present and
        different should not be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat1["area"] = 50.65
        flat2["area"] = 50.37

        self.assertLess(
            self._score(flat1, flat2),
            TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_phones(self):
        """
        Two flats with different phone numbers should not be detected as
        duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["phone"] = "0708091011"

        self.assertLess(
            self._score(flat1, flat2),
            TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_real_duplicates(self):
        """
        Two flats with same price, area and rooms quantity should be
        detected as duplicates.
        """
        flats = self.load_files(
            "127028739@seloger",
            "14428129@explorimmo"
        )
        self.assertGreaterEqual(
            self._score(flats[0], flats[1]),
            TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS
        )

        # This pair is scored with a threshold of 20 instead of the default
        # one.
        flats = self.load_files(
            "128358415@seloger",
            "14818297@explorimmo"
        )
        self.assertGreaterEqual(
            self._score(flats[0], flats[1], 20),
            TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS
        )

        # Same flat, different agencies, texts and photos
        flats = self.load_files(
            "122509451@seloger",
            "127963747@seloger"
        )
        self.assertGreaterEqual(
            self._score(flats[0], flats[1]),
            4
        )


def run():
    """
    Run all the tests.

    Test cases are run one after the other. The process exits with status 1
    as soon as one of them is not successful.
    """
    LOGGER.info("Running tests…")
    test_cases = (TestTexts, TestPhoneNumbers, TestDuplicates, TestPhotos)
    for test_case in test_cases:
        suite = unittest.TestLoader().loadTestsFromTestCase(test_case)
        result = unittest.TextTestRunner(verbosity=2).run(suite)
        # Explicit check rather than ``assert``: assertions are stripped
        # when Python runs with -O, which would hide failures.
        if not result.wasSuccessful():
            sys.exit(1)