Add unit tests

This commit is contained in:
nicofrand 2018-01-14 11:53:59 +01:00
parent 48835c0d83
commit 5b85ad6e59
5 changed files with 472 additions and 103 deletions

View File

@ -16,6 +16,7 @@ from flatisfy import cmds
from flatisfy import data
from flatisfy import fetch
from flatisfy import tools
from flatisfy import tests
# pylint: enable=locally-disabled,wrong-import-position
@ -113,6 +114,10 @@ def parse_args(argv=None):
parser_serve.add_argument("--port", type=int, help="Port to bind to.")
parser_serve.add_argument("--host", help="Host to listen on.")
# Test subcommand parser
subparsers.add_parser("test", parents=[parent_parser],
help="Unit testing.")
return parser.parse_args(argv)
@ -212,6 +217,10 @@ def main():
elif args.cmd == "serve":
cmds.serve(config)
return
# Tests command
elif args.cmd == "test":
tests.run(config)
return
if __name__ == "__main__":

View File

@ -167,6 +167,103 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):
return unique_flats_list, duplicate_flats
def get_duplicate_score(flat1, flat2, photo_cache):
    """
    Compute a duplicate score between two flats. The higher the score, the
    more items the two flats have in common.

    Area and cost must always match (up to one unit). Bedrooms, utilities,
    rooms, postal code, phone number and photos must match whenever they are
    available for both flats. As soon as one of these checks fails, the score
    is 0.

    The checks are explicit ``if`` tests rather than ``assert`` statements, so
    that they are still enforced when Python runs with ``-O`` (which strips
    assertions).

    :param flat1: First flat dict.
    :param flat2: Second flat dict.
    :param photo_cache: Photo cache, forwarded to
        ``find_number_common_photos``.
    :return: The duplicate score, as an integer. ``0`` means the two flats
        should not be considered as duplicates.
    """
    n_common_items = 0
    try:
        # They should have the same area, up to one unit. The negated form
        # also rejects a NaN difference.
        if not abs(flat1["area"] - flat2["area"]) < 1:
            return 0
        n_common_items += 1

        # They should be at the same price, up to one unit
        if not abs(flat1["cost"] - flat2["cost"]) < 1:
            return 0
        n_common_items += 1

        # They should have the same number of bedrooms if this was
        # fetched for both
        if flat1["bedrooms"] and flat2["bedrooms"]:
            if flat1["bedrooms"] != flat2["bedrooms"]:
                return 0
            n_common_items += 1

        # They should have the same utilities (included or excluded for
        # both of them), if this was fetched for both
        if flat1["utilities"] and flat2["utilities"]:
            if flat1["utilities"] != flat2["utilities"]:
                return 0
            n_common_items += 1

        # They should have the same number of rooms if it was fetched
        # for both of them
        if flat1["rooms"] and flat2["rooms"]:
            if flat1["rooms"] != flat2["rooms"]:
                return 0
            n_common_items += 1

        # They should have the same postal code, if available
        if (
                "flatisfy" in flat1 and "flatisfy" in flat2 and
                flat1["flatisfy"].get("postal_code", None) and
                flat2["flatisfy"].get("postal_code", None)
        ):
            if (
                    flat1["flatisfy"]["postal_code"] !=
                    flat2["flatisfy"]["postal_code"]
            ):
                return 0
            n_common_items += 1

        # TODO: Better text comparison (one included in the other, fuzzymatch)
        # Identical texts only add to the score, different texts are not
        # eliminatory.
        flat1_text = tools.normalize_string(flat1.get("text", ""))
        flat2_text = tools.normalize_string(flat2.get("text", ""))
        if flat1_text and flat2_text and flat1_text == flat2_text:
            n_common_items += 1

        # They should have the same phone number if it was fetched for
        # both
        flat1_phone = homogeneize_phone_number(flat1["phone"])
        flat2_phone = homogeneize_phone_number(flat2["phone"])
        if flat1_phone and flat2_phone:
            if flat1_phone != flat2_phone:
                return 0
            n_common_items += 10  # Counts much more than the rest

        # They should have more than one photo in common if there are some
        # photos for both of them
        if flat1.get("photos", []) and flat2.get("photos", []):
            n_common_photos = find_number_common_photos(
                photo_cache,
                flat1["photos"],
                flat2["photos"]
            )
            if n_common_photos <= 1:
                return 0

            min_number_photos = min(len(flat1["photos"]),
                                    len(flat2["photos"]))

            # Full bonus when every photo of the smaller set is shared,
            # otherwise 5 points per common photo, capped at three photos.
            if n_common_photos == min_number_photos:
                n_common_items += 15
            else:
                n_common_items += 5 * min(n_common_photos, 3)

        # If the two flats are from the same website and have a
        # different float part, consider they cannot be duplicates. See
        # https://framagit.org/phyks/Flatisfy/issues/100.
        both_are_from_same_backend = (
            flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]
        )
        both_have_float_part = (
            (flat1["area"] % 1) > 0 and (flat2["area"] % 1) > 0
        )
        both_have_equal_float_part = (
            (flat1["area"] % 1) == (flat2["area"] % 1)
        )
        if (
                both_have_float_part and both_are_from_same_backend and
                not both_have_equal_float_part
        ):
            return 0
    except (AssertionError, TypeError):
        # TypeError occurs when an area or a cost is None, which should
        # not be considered as duplicates.
        # AssertionError is still caught so that an assertion raised by one
        # of the helpers called above keeps yielding a null score.
        return 0

    return n_common_items
def deep_detect(flats_list, config):
"""
@ -192,111 +289,20 @@ def deep_detect(flats_list, config):
if flat2["id"] in matching_flats[flat1["id"]]:
continue
n_common_items = 0
try:
# They should have the same area, up to one unit
assert abs(flat1["area"] - flat2["area"]) < 1
n_common_items += 1
n_common_items = get_duplicate_score(flat1, flat2, photo_cache)
# They should be at the same price, up to one unit
assert abs(flat1["cost"] - flat2["cost"]) < 1
n_common_items += 1
# They should have the same number of bedrooms if this was
# fetched for both
if flat1["bedrooms"] and flat2["bedrooms"]:
assert flat1["bedrooms"] == flat2["bedrooms"]
n_common_items += 1
# They should have the same utilities (included or excluded for
# both of them), if this was fetched for both
if flat1["utilities"] and flat2["utilities"]:
assert flat1["utilities"] == flat2["utilities"]
n_common_items += 1
# They should have the same number of rooms if it was fetched
# for both of them
if flat1["rooms"] and flat2["rooms"]:
assert flat1["rooms"] == flat2["rooms"]
n_common_items += 1
# They should have the same postal code, if available
if (
"flatisfy" in flat1 and "flatisfy" in flat2 and
flat1["flatisfy"].get("postal_code", None) and
flat2["flatisfy"].get("postal_code", None)
):
assert (
flat1["flatisfy"]["postal_code"] ==
flat2["flatisfy"]["postal_code"]
)
n_common_items += 1
# TODO: Compare texts (one is included in another? fuzzymatch?)
# They should have the same phone number if it was fetched for
# both
flat1_phone = homogeneize_phone_number(flat1["phone"])
flat2_phone = homogeneize_phone_number(flat2["phone"])
if flat1_phone and flat2_phone:
assert flat1_phone == flat2_phone
n_common_items += 10 # Counts much more than the rest
# They should have at least one photo in common if there
# are some photos
if flat1["photos"] and flat2["photos"]:
n_common_photos = find_number_common_photos(
photo_cache,
flat1["photos"],
flat2["photos"]
)
assert n_common_photos > 1
min_number_photos = min(len(flat1["photos"]),
len(flat2["photos"]))
# Either all the photos are the same, or there are at least
# three common photos.
if n_common_photos == min_number_photos:
n_common_items += 15
else:
n_common_items += 5 * min(n_common_photos, 3)
# Minimal score to consider they are duplicates
assert n_common_items >= config["duplicate_threshold"]
# If the two flats are from the same website and have a
# different float part, consider they cannot be duplicates. See
# https://framagit.org/phyks/Flatisfy/issues/100.
both_are_from_same_backend = (
flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]
# Minimal score to consider they are duplicates
if n_common_items >= config["duplicate_threshold"]:
# Mark flats as duplicates
LOGGER.info(
("Found duplicates using deep detection: (%s, %s). "
"Score is %d."),
flat1["id"],
flat2["id"],
n_common_items
)
both_have_float_part = (
(flat1["area"] % 1) > 0 and (flat2["area"] % 1) > 0
)
both_have_different_float_part = (
(flat1["area"] % 1) != (flat2["area"] % 1)
)
if(both_have_float_part and both_are_from_same_backend and
both_have_different_float_part):
continue
except (AssertionError, TypeError):
# Skip and consider as not duplicates whenever the conditions
# are not met
# TypeError occurs when an area or a cost is None, which should
# not be considered as duplicates
continue
# Mark flats as duplicates
LOGGER.info(
("Found duplicates using deep detection: (%s, %s). "
"Score is %d."),
flat1["id"],
flat2["id"],
n_common_items
)
matching_flats[flat1["id"]].append(flat2["id"])
matching_flats[flat2["id"]].append(flat1["id"])
matching_flats[flat1["id"]].append(flat2["id"])
matching_flats[flat2["id"]].append(flat1["id"])
if photo_cache.total():
LOGGER.debug("Photo cache: hits: %d%% / misses: %d%%.",

View File

@ -0,0 +1,73 @@
{
"id": "127028739@seloger",
"url": "http://www.seloger.com/annonces/achat/appartement/rennes-35/centre/127028739.htm?p=",
"title": "Appartement 3 pièces 67m² - Rennes",
"area": 67,
"cost": 155700,
"price_per_meter": 2323.8805970149256,
"currency": "€",
"utilities": "",
"date": "2018-01-12T02:10:00",
"location": "17 PLACE MARECHAL JUIN Rennes (35000)",
"station": "",
"text": "Exclusivité Nexity Dans un immeuble de standing, en étage élevé avec ascenseur, Appartement Type 3 de 67 m² exposé Sud / Ouest, un séjour avec balcon et double exposition vue dégagée. Deux chambres dont une avec balcon, salle de douches, WC séparé, cave et parking en sous-sol.",
"phone": null,
"photos": [
{
"id": "0an3yarge9y446j653dewxu0jwy33pmwar47k2qym.jpg",
"url": "https://v.seloger.com/s/width/800/visuels/0/a/n/3/0an3yarge9y446j653dewxu0jwy33pmwar47k2qym.jpg",
"data": null
},
{
"id": "1qnz6hpffcrd1c71htbooubgb7s57d82ie1v0zyf2.jpg",
"url": "https://v.seloger.com/s/width/800/visuels/1/q/n/z/1qnz6hpffcrd1c71htbooubgb7s57d82ie1v0zyf2.jpg",
"data": null
},
{
"id": "16bv8yqgytefa1fq57hyk6e0y6ox8t2mh8wj2dgxq.jpg",
"url": "https://v.seloger.com/s/width/800/visuels/1/6/b/v/16bv8yqgytefa1fq57hyk6e0y6ox8t2mh8wj2dgxq.jpg",
"data": null
},
{
"id": "1o23blwk87ew95e3vcq5ygyk10z2hy82fzo5j6hha.jpg",
"url": "https://v.seloger.com/s/width/800/visuels/1/o/2/3/1o23blwk87ew95e3vcq5ygyk10z2hy82fzo5j6hha.jpg",
"data": null
},
{
"id": "20vuxbdp160sot4ccryf6g7g4rwxrkhz3b3tmq7zy.jpg",
"url": "https://v.seloger.com/s/width/800/visuels/2/0/v/u/20vuxbdp160sot4ccryf6g7g4rwxrkhz3b3tmq7zy.jpg",
"data": null
},
{
"id": "00d9bpezie95lqtfmoccqg1ddrld2m64c2mcod5ha.jpg",
"url": "https://v.seloger.com/s/width/800/visuels/0/0/d/9/00d9bpezie95lqtfmoccqg1ddrld2m64c2mcod5ha.jpg",
"data": null
},
{
"id": "0lhqf881qm2j03hz5581d8ggplp1xwwchb2rtoqgu.jpg",
"url": "https://v.seloger.com/s/width/800/visuels/0/l/h/q/0lhqf881qm2j03hz5581d8ggplp1xwwchb2rtoqgu.jpg",
"data": null
},
{
"id": "0chwbagbf8tc0qf9sd3wryzl4gm7hkswcnrtnx2bi.jpg",
"url": "https://v.seloger.com/s/width/800/visuels/0/c/h/w/0chwbagbf8tc0qf9sd3wryzl4gm7hkswcnrtnx2bi.jpg",
"data": null
}
],
"rooms": 3,
"bedrooms": 2,
"details": {
"Vue": "",
"Pièces": "3",
"Etage": "15",
"Reference": "MT0136601",
"Chambres": "2",
"Cave": "",
"Balcon": "5 m²",
"Surface": "67 m²",
"Ascenseur": "",
"Etages": "30",
"Parking": "1",
"Salle de Séjour": ""
}
}

View File

@ -0,0 +1,77 @@
{
"id": "14428129@explorimmo",
"url": "http://www.explorimmo.com/annonce-14428129.html",
"title": "Vente appartement 3 pièces 67 m2",
"area": 67,
"cost": 155700,
"price_per_meter": 2323.8805970149256,
"currency": "EUR",
"utilities": "H.C.",
"date": "2017-12-05T07:40:00",
"location": "17 PLACE MARECHAL JUIN Rennes 35000",
"station": null,
"text": "Exclusivité Nexity Dans un immeuble de standing, en étage élevé avec\nascenseur, Appartement Type 3 de 67 m² exposé Sud / Ouest, un séjour avec\nbalcon et double exposition vue dégagée. Deux chambres dont une avec balcon,\nsalle de douches, WC séparé, cave et parking en sous-sol.\n\n",
"phone": null,
"photos": [
{
"id": "f9b2da6dfa184759aa0c349edb1cd037.jpg",
"url": "http://thbr.figarocms.net/images/2qEDBqRV-QNlp4fHVNhSCWlt6rU=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/f9b2da6dfa184759aa0c349edb1cd037.jpg",
"data": null
},
{
"id": "3f2cc9dc429d4e3dbb9f4216f109d224.jpg",
"url": "http://thbr.figarocms.net/images/DulZQyZkkwa0ZFBT1nYD9rUD0A4=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/3f2cc9dc429d4e3dbb9f4216f109d224.jpg",
"data": null
},
{
"id": "56ae1db620f44af6b860df10eba55870.jpg",
"url": "http://thbr.figarocms.net/images/EpvEffLcFbBT7spEZB2dcOHaZwA=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/56ae1db620f44af6b860df10eba55870.jpg",
"data": null
},
{
"id": "5acdef1f05314fe19111a0c3d92b8fe5.jpg",
"url": "http://thbr.figarocms.net/images/wHtDlJMwIrMC3cWXi8ASN4I6Zl4=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/5acdef1f05314fe19111a0c3d92b8fe5.jpg",
"data": null
},
{
"id": "16c686ea91b248129fe60011d61e060b.jpg",
"url": "http://thbr.figarocms.net/images/SD5VT1gxRSXSlt3pAz8r_SI3rqw=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/16c686ea91b248129fe60011d61e060b.jpg",
"data": null
},
{
"id": "e6a67d42709d443481da0feb9a7e11a1.jpg",
"url": "http://thbr.figarocms.net/images/u8PGKXqC0CL9AyEOI5T9TFeGs-Y=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/e6a67d42709d443481da0feb9a7e11a1.jpg",
"data": null
},
{
"id": "6888cc7bc823402198205e480c8cab6c.jpg",
"url": "http://thbr.figarocms.net/images/-3AseFCRaleidG2vsDJpA5BLBa4=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/6888cc7bc823402198205e480c8cab6c.jpg",
"data": null
},
{
"id": "d40dbeea9e424ea2a846f5683746ea9e.jpg",
"url": "http://thbr.figarocms.net/images/TMKBtBuucYge-BgCoUGRjxZjdBE=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/d40dbeea9e424ea2a846f5683746ea9e.jpg",
"data": null
}
],
"rooms": 3,
"bedrooms": 2,
"details": {
"available": true,
"heatingType": "",
"agency": "NEXITY LAMY, 6 avenue Jean Janvier, 35000, Rennes",
"bathrooms": 0,
"exposure": "Non précisé",
"floor": "15",
"energy": "C",
"bedrooms": 2,
"greenhouseGasEmission": null,
"isFurnished": false,
"rooms": 3,
"fees": 0,
"creationDate": 1512455998000,
"agencyFees": 0,
"availabilityDate": null,
"guarantee": 0
}
}

204
flatisfy/tests.py Normal file
View File

@ -0,0 +1,204 @@
# coding: utf-8
"""
This module contains unit testing functions.
"""
import random
import logging
import unittest
import copy
import os
import json
from flatisfy import tools
from flatisfy.filters import duplicates
from flatisfy.filters.cache import ImageCache
from flatisfy.constants import BACKENDS_BY_PRECEDENCE
LOGGER = logging.getLogger(__name__)
TESTS_DATA_DIR = os.path.dirname(os.path.realpath(__file__)) + "/test_files/"
class TestTexts(unittest.TestCase):
    """
    Checks the string normalization done by ``tools.normalize_string``.
    """
    def test_roman_numbers(self):
        """
        Checks roman numbers replacement.
        """
        self.assertEqual(
            "14",
            tools.normalize_string("XIV")
        )

    def test_multiple_whitespaces(self):
        """
        Checks whitespaces are collapsed.
        """
        # The input must contain a run of several whitespaces, otherwise
        # nothing is collapsed and the test is vacuous.
        self.assertEqual(
            "avec ascenseur",
            tools.normalize_string("avec   ascenseur")
        )

    def test_accents(self):
        """
        Checks accents are replaced.
        """
        # The accented string is the input, the unaccented one is the
        # expected result.
        self.assertEqual(
            "eeeaui",
            tools.normalize_string("éèêàüï")
        )
class TestPhoneNumbers(unittest.TestCase):
    """
    Checks the output of ``duplicates.homogeneize_phone_number`` for several
    ways of writing the same phone number.
    """
    def test_prefix(self):
        """
        A number written with the +33 international prefix.
        """
        homogeneized = duplicates.homogeneize_phone_number("+33605040302")
        self.assertEqual("0605040302", homogeneized)

    def test_dots_separators(self):
        """
        A number whose digit pairs are separated by dots.
        """
        homogeneized = duplicates.homogeneize_phone_number("06.05.04.03.02")
        self.assertEqual("0605040302", homogeneized)

    def test_spaces_separators(self):
        """
        A number whose digit pairs are separated by spaces.
        """
        homogeneized = duplicates.homogeneize_phone_number("06 05 04 03 02")
        self.assertEqual("0605040302", homogeneized)
class TestDuplicates(unittest.TestCase):
    """
    Checks the scores returned by ``duplicates.get_duplicate_score``.
    """
    # Score bounds the tests compare against, for flats without and with
    # photos.
    DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14
    DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15
    # Single cache instance shared by every test of this class.
    IMAGE_CACHE = ImageCache()

    def generate_fake_flat(self):
        """
        Generates a fake flat post.

        :return: A flat dict with a random id, backend, number of rooms,
            area, cost and number of bedrooms, and a fixed phone number.
        """
        backend = random.choice(BACKENDS_BY_PRECEDENCE)
        return {
            "id": str(random.randint(100000, 199999)) + "@" + backend,
            "phone": "0607080910",
            "rooms": random.randint(1, 4),
            "utilities": "",
            "area": random.randint(200, 1500) / 10,
            "cost": random.randint(100000, 300000),
            "bedrooms": random.randint(1, 4)
        }

    def load_files(self, file1, file2):
        """
        Load two JSON flat files from the tests data directory.

        :param file1: Name of the first file, without the ``.json`` extension.
        :param file2: Name of the second file, without the ``.json``
            extension.
        :return: A list with the two flats, in the same order.
        """
        flats = []
        for name in (file1, file2):
            path = os.path.join(TESTS_DATA_DIR, name + ".json")
            # Read bytes and decode explicitly: the fixtures contain
            # non-ASCII characters and must not depend on the locale
            # default encoding.
            with open(path, "rb") as flat_file:
                flats.append(json.loads(flat_file.read().decode("utf-8")))
        return flats

    def test_duplicates(self):
        """
        Two identical flats should be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertGreaterEqual(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_prices(self):
        """
        Two flats with different prices should not be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["cost"] += 1000

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_rooms(self):
        """
        Two flats with different rooms quantity should not be detected as
        duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["rooms"] += 1

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_areas(self):
        """
        Two flats with different areas should not be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["area"] += 10

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_areas_decimals(self):
        """
        Two flats which areas integers are equal but decimals are present and
        different should not be detected as duplicates.
        """
        # Both flats keep the same id, hence the same backend.
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat1["area"] = 50.65
        flat2["area"] = 50.37

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_phones(self):
        """
        Two flats with different phone numbers should not be detected as
        duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["phone"] = "0708091011"

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_real_duplicates(self):
        """
        Two flats with same price, area and rooms quantity should be detected
        as duplicates.
        """
        flats = self.load_files(
            "127028739@seloger",
            "14428129@explorimmo"
        )

        score = duplicates.get_duplicate_score(
            flats[0], flats[1], TestDuplicates.IMAGE_CACHE
        )
        self.assertGreaterEqual(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS
        )
def run(config):
    """
    Run all the tests

    Every test case is run, even when a previous one failed, so that the
    output always lists all failures.

    :param config: A config dict. It is not used by the tests at the moment.
    :raises SystemExit: With exit code 1 if at least one test did not pass,
        so that the calling process can detect the failure.
    """
    LOGGER.info("Running tests…")
    all_passed = True
    for test_case in (TestTexts, TestPhoneNumbers, TestDuplicates):
        suite = unittest.TestLoader().loadTestsFromTestCase(test_case)
        result = unittest.TextTestRunner(verbosity=2).run(suite)
        if not result.wasSuccessful():
            all_passed = False

    if not all_passed:
        raise SystemExit(1)