def get_duplicate_score(flat1, flat2, photo_cache):
    """
    Compute a duplicate score for two flats.

    Every criterion that is available for both flats and matches raises the
    score. As soon as a criterion available for both flats does not match,
    the flats are considered different and the score is ``0``.

    Weights: area, cost, bedrooms, utilities, rooms, postal code and text
    are worth 1 point each, the phone number 10 points and the photos 10 or
    15 points.

    Mismatches are handled with explicit checks rather than ``assert``
    statements, so that the result does not change when Python runs with
    assertions disabled (``-O``).

    :param flat1: First flat, as a dict. ``id``, ``area`` and ``cost`` are
        read unconditionally, the other fields are optional.
    :param flat2: Second flat, as a dict (same layout as ``flat1``).
    :param photo_cache: Photo cache, handed over as is to
        ``find_number_common_photos``.
    :return: The score, as an integer. ``0`` means the two flats are not
        considered as duplicates.
    """
    n_common_items = 0
    try:
        # They should have the same area, up to one unit. A None area raises
        # a TypeError, which is handled below.
        if not abs(flat1["area"] - flat2["area"]) < 1:
            return 0
        n_common_items += 1

        # They should be at the same price, up to one unit
        if not abs(flat1["cost"] - flat2["cost"]) < 1:
            return 0
        n_common_items += 1

        # They should have the same number of bedrooms, the same utilities
        # (included or excluded for both of them) and the same number of
        # rooms, whenever this was fetched for both. A missing key is
        # handled as a field which was not fetched.
        for field in ("bedrooms", "utilities", "rooms"):
            flat1_value = flat1.get(field)
            flat2_value = flat2.get(field)
            if flat1_value and flat2_value:
                if flat1_value != flat2_value:
                    return 0
                n_common_items += 1

        # They should have the same postal code, if available
        flat1_postal_code = (flat1.get("flatisfy") or {}).get("postal_code")
        flat2_postal_code = (flat2.get("flatisfy") or {}).get("postal_code")
        if flat1_postal_code and flat2_postal_code:
            if flat1_postal_code != flat2_postal_code:
                return 0
            n_common_items += 1

        # TODO: Better text comparison (one included in the other, fuzzymatch)
        # Different texts do not rule out duplicates, equal texts only add
        # to the score.
        flat1_text = tools.normalize_string(flat1.get("text", ""))
        flat2_text = tools.normalize_string(flat2.get("text", ""))
        if flat1_text and flat2_text and flat1_text == flat2_text:
            n_common_items += 1

        # They should have the same phone number if it was fetched for
        # both
        flat1_phone = homogeneize_phone_number(flat1.get("phone"))
        flat2_phone = homogeneize_phone_number(flat2.get("phone"))
        if flat1_phone and flat2_phone:
            if flat1_phone != flat2_phone:
                return 0
            n_common_items += 10  # Counts much more than the rest

        # They should have photos in common if there are some photos.
        # NOTE(review): the check requires strictly more than one common
        # photo, whereas the previous comment said "at least one". The check
        # is kept as is; confirm which one is intended.
        if flat1.get("photos", []) and flat2.get("photos", []):
            n_common_photos = find_number_common_photos(
                photo_cache,
                flat1["photos"],
                flat2["photos"]
            )
            if not n_common_photos > 1:
                return 0

            min_number_photos = min(len(flat1["photos"]),
                                    len(flat2["photos"]))

            # Either all the photos are the same, or there are at least
            # three common photos.
            if n_common_photos == min_number_photos:
                n_common_items += 15
            else:
                n_common_items += 5 * min(n_common_photos, 3)

        # If the two flats are from the same website and have a
        # different float part, consider they cannot be duplicates. See
        # https://framagit.org/phyks/Flatisfy/issues/100.
        both_are_from_same_backend = (
            flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]
        )
        both_have_float_part = (
            (flat1["area"] % 1) > 0 and (flat2["area"] % 1) > 0
        )
        both_have_equal_float_part = (
            (flat1["area"] % 1) == (flat2["area"] % 1)
        )
        if (
                both_have_float_part and both_are_from_same_backend and
                not both_have_equal_float_part
        ):
            return 0
    except (AssertionError, TypeError):
        # Skip and consider as not duplicates whenever the conditions
        # are not met.
        # TypeError occurs when an area or a cost is None, which should
        # not be considered as duplicates. AssertionError is still caught
        # in case one of the helpers called above raises it.
        return 0

    return n_common_items
flat1["flatisfy"]["postal_code"] == - flat2["flatisfy"]["postal_code"] - ) - n_common_items += 1 - - # TODO: Compare texts (one is included in another? fuzzymatch?) - - # They should have the same phone number if it was fetched for - # both - flat1_phone = homogeneize_phone_number(flat1["phone"]) - flat2_phone = homogeneize_phone_number(flat2["phone"]) - if flat1_phone and flat2_phone: - assert flat1_phone == flat2_phone - n_common_items += 10 # Counts much more than the rest - - # They should have at least one photo in common if there - # are some photos - if flat1["photos"] and flat2["photos"]: - n_common_photos = find_number_common_photos( - photo_cache, - flat1["photos"], - flat2["photos"] - ) - assert n_common_photos > 1 - - min_number_photos = min(len(flat1["photos"]), - len(flat2["photos"])) - - # Either all the photos are the same, or there are at least - # three common photos. - if n_common_photos == min_number_photos: - n_common_items += 15 - else: - n_common_items += 5 * min(n_common_photos, 3) - - # Minimal score to consider they are duplicates - assert n_common_items >= config["duplicate_threshold"] - - # If the two flats are from the same website and have a - # different float part, consider they cannot be duplicates. See - # https://framagit.org/phyks/Flatisfy/issues/100. - both_are_from_same_backend = ( - flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1] + # Minimal score to consider they are duplicates + if n_common_items >= config["duplicate_threshold"]: + # Mark flats as duplicates + LOGGER.info( + ("Found duplicates using deep detection: (%s, %s). 
" + "Score is %d."), + flat1["id"], + flat2["id"], + n_common_items ) - both_have_float_part = ( - (flat1["area"] % 1) > 0 and (flat2["area"] % 1) > 0 - ) - both_have_different_float_part = ( - (flat1["area"] % 1) != (flat2["area"] % 1) - ) - if(both_have_float_part and both_are_from_same_backend and - both_have_different_float_part): - continue - except (AssertionError, TypeError): - # Skip and consider as not duplicates whenever the conditions - # are not met - # TypeError occurs when an area or a cost is None, which should - # not be considered as duplicates - continue - - # Mark flats as duplicates - LOGGER.info( - ("Found duplicates using deep detection: (%s, %s). " - "Score is %d."), - flat1["id"], - flat2["id"], - n_common_items - ) - matching_flats[flat1["id"]].append(flat2["id"]) - matching_flats[flat2["id"]].append(flat1["id"]) + matching_flats[flat1["id"]].append(flat2["id"]) + matching_flats[flat2["id"]].append(flat1["id"]) if photo_cache.total(): LOGGER.debug("Photo cache: hits: %d%% / misses: %d%%.", diff --git a/flatisfy/test_files/127028739@seloger.json b/flatisfy/test_files/127028739@seloger.json new file mode 100644 index 0000000..5afa04a --- /dev/null +++ b/flatisfy/test_files/127028739@seloger.json @@ -0,0 +1,73 @@ +{ + "id": "127028739@seloger", + "url": "http://www.seloger.com/annonces/achat/appartement/rennes-35/centre/127028739.htm?p=", + "title": "Appartement 3 pièces 67m² - Rennes", + "area": 67, + "cost": 155700, + "price_per_meter": 2323.8805970149256, + "currency": "€", + "utilities": "", + "date": "2018-01-12T02:10:00", + "location": "17 PLACE MARECHAL JUIN Rennes (35000)", + "station": "", + "text": "Exclusivité Nexity Dans un immeuble de standing, en étage élevé avec ascenseur, Appartement Type 3 de 67 m² exposé Sud / Ouest, un séjour avec balcon et double exposition vue dégagée. 
Deux chambres dont une avec balcon, salle de douches, WC séparé, cave et parking en sous-sol.", + "phone": null, + "photos": [ + { + "id": "0an3yarge9y446j653dewxu0jwy33pmwar47k2qym.jpg", + "url": "https://v.seloger.com/s/width/800/visuels/0/a/n/3/0an3yarge9y446j653dewxu0jwy33pmwar47k2qym.jpg", + "data": null + }, + { + "id": "1qnz6hpffcrd1c71htbooubgb7s57d82ie1v0zyf2.jpg", + "url": "https://v.seloger.com/s/width/800/visuels/1/q/n/z/1qnz6hpffcrd1c71htbooubgb7s57d82ie1v0zyf2.jpg", + "data": null + }, + { + "id": "16bv8yqgytefa1fq57hyk6e0y6ox8t2mh8wj2dgxq.jpg", + "url": "https://v.seloger.com/s/width/800/visuels/1/6/b/v/16bv8yqgytefa1fq57hyk6e0y6ox8t2mh8wj2dgxq.jpg", + "data": null + }, + { + "id": "1o23blwk87ew95e3vcq5ygyk10z2hy82fzo5j6hha.jpg", + "url": "https://v.seloger.com/s/width/800/visuels/1/o/2/3/1o23blwk87ew95e3vcq5ygyk10z2hy82fzo5j6hha.jpg", + "data": null + }, + { + "id": "20vuxbdp160sot4ccryf6g7g4rwxrkhz3b3tmq7zy.jpg", + "url": "https://v.seloger.com/s/width/800/visuels/2/0/v/u/20vuxbdp160sot4ccryf6g7g4rwxrkhz3b3tmq7zy.jpg", + "data": null + }, + { + "id": "00d9bpezie95lqtfmoccqg1ddrld2m64c2mcod5ha.jpg", + "url": "https://v.seloger.com/s/width/800/visuels/0/0/d/9/00d9bpezie95lqtfmoccqg1ddrld2m64c2mcod5ha.jpg", + "data": null + }, + { + "id": "0lhqf881qm2j03hz5581d8ggplp1xwwchb2rtoqgu.jpg", + "url": "https://v.seloger.com/s/width/800/visuels/0/l/h/q/0lhqf881qm2j03hz5581d8ggplp1xwwchb2rtoqgu.jpg", + "data": null + }, + { + "id": "0chwbagbf8tc0qf9sd3wryzl4gm7hkswcnrtnx2bi.jpg", + "url": "https://v.seloger.com/s/width/800/visuels/0/c/h/w/0chwbagbf8tc0qf9sd3wryzl4gm7hkswcnrtnx2bi.jpg", + "data": null + } + ], + "rooms": 3, + "bedrooms": 2, + "details": { + "Vue": "", + "Pièces": "3", + "Etage": "15", + "Reference": "MT0136601", + "Chambres": "2", + "Cave": "", + "Balcon": "5 m²", + "Surface": "67 m²", + "Ascenseur": "", + "Etages": "30", + "Parking": "1", + "Salle de Séjour": "" + } +} diff --git a/flatisfy/test_files/14428129@explorimmo.json 
b/flatisfy/test_files/14428129@explorimmo.json new file mode 100644 index 0000000..6e20c75 --- /dev/null +++ b/flatisfy/test_files/14428129@explorimmo.json @@ -0,0 +1,77 @@ +{ + "id": "14428129@explorimmo", + "url": "http://www.explorimmo.com/annonce-14428129.html", + "title": "Vente appartement 3 pièces 67 m2", + "area": 67, + "cost": 155700, + "price_per_meter": 2323.8805970149256, + "currency": "EUR", + "utilities": "H.C.", + "date": "2017-12-05T07:40:00", + "location": "17 PLACE MARECHAL JUIN Rennes 35000", + "station": null, + "text": "Exclusivité Nexity Dans un immeuble de standing, en étage élevé avec\nascenseur, Appartement Type 3 de 67 m² exposé Sud / Ouest, un séjour avec\nbalcon et double exposition vue dégagée. Deux chambres dont une avec balcon,\nsalle de douches, WC séparé, cave et parking en sous-sol.\n\n", + "phone": null, + "photos": [ + { + "id": "f9b2da6dfa184759aa0c349edb1cd037.jpg", + "url": "http://thbr.figarocms.net/images/2qEDBqRV-QNlp4fHVNhSCWlt6rU=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/f9b2da6dfa184759aa0c349edb1cd037.jpg", + "data": null + }, + { + "id": "3f2cc9dc429d4e3dbb9f4216f109d224.jpg", + "url": "http://thbr.figarocms.net/images/DulZQyZkkwa0ZFBT1nYD9rUD0A4=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/3f2cc9dc429d4e3dbb9f4216f109d224.jpg", + "data": null + }, + { + "id": "56ae1db620f44af6b860df10eba55870.jpg", + "url": "http://thbr.figarocms.net/images/EpvEffLcFbBT7spEZB2dcOHaZwA=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/56ae1db620f44af6b860df10eba55870.jpg", + "data": null + }, + { + "id": "5acdef1f05314fe19111a0c3d92b8fe5.jpg", + "url": "http://thbr.figarocms.net/images/wHtDlJMwIrMC3cWXi8ASN4I6Zl4=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/5acdef1f05314fe19111a0c3d92b8fe5.jpg", + "data": null + }, + { + "id": "16c686ea91b248129fe60011d61e060b.jpg", + "url": 
"http://thbr.figarocms.net/images/SD5VT1gxRSXSlt3pAz8r_SI3rqw=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/16c686ea91b248129fe60011d61e060b.jpg", + "data": null + }, + { + "id": "e6a67d42709d443481da0feb9a7e11a1.jpg", + "url": "http://thbr.figarocms.net/images/u8PGKXqC0CL9AyEOI5T9TFeGs-Y=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/e6a67d42709d443481da0feb9a7e11a1.jpg", + "data": null + }, + { + "id": "6888cc7bc823402198205e480c8cab6c.jpg", + "url": "http://thbr.figarocms.net/images/-3AseFCRaleidG2vsDJpA5BLBa4=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/6888cc7bc823402198205e480c8cab6c.jpg", + "data": null + }, + { + "id": "d40dbeea9e424ea2a846f5683746ea9e.jpg", + "url": "http://thbr.figarocms.net/images/TMKBtBuucYge-BgCoUGRjxZjdBE=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/d40dbeea9e424ea2a846f5683746ea9e.jpg", + "data": null + } + ], + "rooms": 3, + "bedrooms": 2, + "details": { + "available": true, + "heatingType": "", + "agency": "NEXITY LAMY, 6 avenue Jean Janvier, 35000, Rennes", + "bathrooms": 0, + "exposure": "Non précisé", + "floor": "15", + "energy": "C", + "bedrooms": 2, + "greenhouseGasEmission": null, + "isFurnished": false, + "rooms": 3, + "fees": 0, + "creationDate": 1512455998000, + "agencyFees": 0, + "availabilityDate": null, + "guarantee": 0 + } +} diff --git a/flatisfy/tests.py b/flatisfy/tests.py new file mode 100644 index 0000000..c81b42d --- /dev/null +++ b/flatisfy/tests.py @@ -0,0 +1,204 @@ +# coding: utf-8 +""" +This module contains unit testing functions. 
import copy
import io
import json
import logging
import os
import random
import unittest

from flatisfy import tools
from flatisfy.constants import BACKENDS_BY_PRECEDENCE
from flatisfy.filters import duplicates
from flatisfy.filters.cache import ImageCache

LOGGER = logging.getLogger(__name__)
TESTS_DATA_DIR = os.path.dirname(os.path.realpath(__file__)) + "/test_files/"


class TestTexts(unittest.TestCase):
    """
    Checks string normalization.
    """

    def test_roman_numbers(self):
        """
        Checks roman numbers replacement.
        """
        self.assertEqual(
            "14",
            tools.normalize_string("XIV")
        )

    def test_multiple_whitespaces(self):
        """
        Checks whitespaces are collapsed.
        """
        # The input must hold several consecutive spaces, otherwise nothing
        # is actually checked.
        self.assertEqual(
            "avec ascenseur",
            tools.normalize_string("avec   ascenseur")
        )

    def test_accents(self):
        """
        Checks accents are replaced.
        """
        # Expected value first: the accented string is the input, the
        # unaccented one is the expected output.
        self.assertEqual(
            "eeeaui",
            tools.normalize_string(u"éèêàüï")
        )


class TestPhoneNumbers(unittest.TestCase):
    """
    Checks phone numbers homogeneization.
    """

    def test_prefix(self):
        """
        Checks phone numbers with international prefixes.
        """
        self.assertEqual(
            "0605040302",
            duplicates.homogeneize_phone_number("+33605040302")
        )

    def test_dots_separators(self):
        """
        Checks phone numbers with dots.
        """
        self.assertEqual(
            "0605040302",
            duplicates.homogeneize_phone_number("06.05.04.03.02")
        )

    def test_spaces_separators(self):
        """
        Checks phone numbers with spaces.
        """
        self.assertEqual(
            "0605040302",
            duplicates.homogeneize_phone_number("06 05 04 03 02")
        )


class TestDuplicates(unittest.TestCase):
    """
    Checks the duplicate score computed by
    ``duplicates.get_duplicate_score``.
    """
    # Minimal scores expected for duplicates in these tests.
    # NOTE(review): 14 presumably stands for area, cost, bedrooms and rooms
    # (one point each) plus the phone number (ten points), which are the
    # criteria a fake flat can match on; confirm against the scoring
    # function.
    DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14
    DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15
    # Single photo cache, shared by all the tests of this class
    IMAGE_CACHE = ImageCache()

    def generate_fake_flat(self):
        """
        Generates a fake flat post.

        :return: A flat dict, with a random id, backend, number of rooms and
            bedrooms, area and cost.
        """
        backend = random.choice(BACKENDS_BY_PRECEDENCE)
        return {
            "id": str(random.randint(100000, 199999)) + "@" + backend,
            "phone": "0607080910",
            "rooms": random.randint(1, 4),
            "utilities": "",
            # Divide by a float, to get one decimal place on Python 2 as well
            "area": random.randint(200, 1500) / 10.0,
            "cost": random.randint(100000, 300000),
            "bedrooms": random.randint(1, 4)
        }

    def load_files(self, file1, file2):
        """
        Load two flats from the JSON files in the tests data directory.

        :param file1: Name of the first file, without the ``.json``
            extension.
        :param file2: Name of the second file, without the ``.json``
            extension.
        :return: A list with the two flats
        """
        flats = []
        for name in (file1, file2):
            path = os.path.join(TESTS_DATA_DIR, name + ".json")
            # The test files hold non-ASCII characters, do not depend on the
            # locale to decode them.
            with io.open(path, "r", encoding="utf-8") as flat_file:
                flats.append(json.load(flat_file))
        return flats

    def test_duplicates(self):
        """
        Two identical flats should be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertGreaterEqual(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_prices(self):
        """
        Two flats with different prices should not be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["cost"] += 1000

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_rooms(self):
        """
        Two flats with different rooms quantity should not be detected as
        duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["rooms"] += 1

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_areas(self):
        """
        Two flats with different areas should not be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["area"] += 10

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_areas_decimals(self):
        """
        Two flats which areas integers are equal but decimals are present and
        different should not be detected as duplicates.
        """
        # Both flats share the same id, hence the same backend
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat1["area"] = 50.65
        flat2["area"] = 50.37

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_phones(self):
        """
        Two flats with different phone numbers should not be detected as
        duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["phone"] = "0708091011"

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_real_duplicates(self):
        """
        Two flats with same price, area and rooms quantity should be detected
        as duplicates.
        """
        flats = self.load_files(
            "127028739@seloger",
            "14428129@explorimmo"
        )

        score = duplicates.get_duplicate_score(
            flats[0], flats[1], TestDuplicates.IMAGE_CACHE
        )
        self.assertGreaterEqual(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS
        )


def run(config):
    """
    Run all the tests

    Each test case is run on its own, with its own report. All of them are
    run, even when a previous one failed.

    :param config: A config dict. It is not used by the tests so far.
    :return: ``True`` if all the tests passed, ``False`` otherwise.
    """
    LOGGER.info("Running tests…")
    all_passed = True
    for test_case in (TestTexts, TestPhoneNumbers, TestDuplicates):
        suite = unittest.TestLoader().loadTestsFromTestCase(test_case)
        result = unittest.TextTestRunner(verbosity=2).run(suite)
        all_passed = result.wasSuccessful() and all_passed
    return all_passed