Add unit tests
This commit is contained in:
parent
48835c0d83
commit
5b85ad6e59
@ -16,6 +16,7 @@ from flatisfy import cmds
|
|||||||
from flatisfy import data
|
from flatisfy import data
|
||||||
from flatisfy import fetch
|
from flatisfy import fetch
|
||||||
from flatisfy import tools
|
from flatisfy import tools
|
||||||
|
from flatisfy import tests
|
||||||
# pylint: enable=locally-disabled,wrong-import-position
|
# pylint: enable=locally-disabled,wrong-import-position
|
||||||
|
|
||||||
|
|
||||||
@ -113,6 +114,10 @@ def parse_args(argv=None):
|
|||||||
parser_serve.add_argument("--port", type=int, help="Port to bind to.")
|
parser_serve.add_argument("--port", type=int, help="Port to bind to.")
|
||||||
parser_serve.add_argument("--host", help="Host to listen on.")
|
parser_serve.add_argument("--host", help="Host to listen on.")
|
||||||
|
|
||||||
|
# Test subcommand parser
|
||||||
|
subparsers.add_parser("test", parents=[parent_parser],
|
||||||
|
help="Unit testing.")
|
||||||
|
|
||||||
return parser.parse_args(argv)
|
return parser.parse_args(argv)
|
||||||
|
|
||||||
|
|
||||||
@ -212,6 +217,10 @@ def main():
|
|||||||
elif args.cmd == "serve":
|
elif args.cmd == "serve":
|
||||||
cmds.serve(config)
|
cmds.serve(config)
|
||||||
return
|
return
|
||||||
|
# Tests command
|
||||||
|
elif args.cmd == "test":
|
||||||
|
tests.run(config)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -167,6 +167,103 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):
|
|||||||
|
|
||||||
return unique_flats_list, duplicate_flats
|
return unique_flats_list, duplicate_flats
|
||||||
|
|
||||||
|
def get_duplicate_score(flat1, flat2, photo_cache):
    """
    Compute the duplicate score between two flats. The higher the score, the
    more likely the two flats are to be duplicates.

    Every criterion which is available for both flats must match. As soon as
    one of them does not, the two flats are considered as not being
    duplicates and the returned score is ``0``.

    The checks are written as explicit tests rather than ``assert``
    statements, so that they are still enforced when Python runs with
    optimizations enabled (``-O`` strips ``assert`` statements).

    :param flat1: First flat dict.
    :param flat2: Second flat dict.
    :param photo_cache: Photo cache, handed over as is to
        ``find_number_common_photos``.
    :return: The duplicate score, ``0`` if the flats cannot be duplicates.
    """
    n_common_items = 0
    try:
        # They should have the same area, up to one unit
        if not abs(flat1["area"] - flat2["area"]) < 1:
            return 0
        n_common_items += 1

        # They should be at the same price, up to one unit
        if not abs(flat1["cost"] - flat2["cost"]) < 1:
            return 0
        n_common_items += 1

        # They should have the same number of bedrooms if this was
        # fetched for both
        if flat1["bedrooms"] and flat2["bedrooms"]:
            if flat1["bedrooms"] != flat2["bedrooms"]:
                return 0
            n_common_items += 1

        # They should have the same utilities (included or excluded for
        # both of them), if this was fetched for both
        if flat1["utilities"] and flat2["utilities"]:
            if flat1["utilities"] != flat2["utilities"]:
                return 0
            n_common_items += 1

        # They should have the same number of rooms if it was fetched
        # for both of them
        if flat1["rooms"] and flat2["rooms"]:
            if flat1["rooms"] != flat2["rooms"]:
                return 0
            n_common_items += 1

        # They should have the same postal code, if available
        if (
                "flatisfy" in flat1 and "flatisfy" in flat2 and
                flat1["flatisfy"].get("postal_code", None) and
                flat2["flatisfy"].get("postal_code", None)
        ):
            if (
                    flat1["flatisfy"]["postal_code"] !=
                    flat2["flatisfy"]["postal_code"]
            ):
                return 0
            n_common_items += 1

        # TODO: Better text comparison (one included in the other, fuzzymatch)
        # Identical texts only add to the score, different texts do not
        # disqualify the pair.
        flat1_text = tools.normalize_string(flat1.get("text", ""))
        flat2_text = tools.normalize_string(flat2.get("text", ""))
        if flat1_text and flat2_text and flat1_text == flat2_text:
            n_common_items += 1

        # They should have the same phone number if it was fetched for
        # both
        flat1_phone = homogeneize_phone_number(flat1["phone"])
        flat2_phone = homogeneize_phone_number(flat2["phone"])
        if flat1_phone and flat2_phone:
            if flat1_phone != flat2_phone:
                return 0
            n_common_items += 10  # Counts much more than the rest

        # They should have photos in common if there are some photos
        if flat1.get("photos", []) and flat2.get("photos", []):
            n_common_photos = find_number_common_photos(
                photo_cache,
                flat1["photos"],
                flat2["photos"]
            )
            # NOTE(review): strictly more than one common photo is required
            # here, whereas the original comment said "at least one". The
            # threshold is kept unchanged, confirm which one is intended.
            if not n_common_photos > 1:
                return 0

            min_number_photos = min(len(flat1["photos"]),
                                    len(flat2["photos"]))

            # Either all the photos are the same, or there are at least
            # three common photos.
            if n_common_photos == min_number_photos:
                n_common_items += 15
            else:
                n_common_items += 5 * min(n_common_photos, 3)

        # If the two flats are from the same website and have a
        # different float part, consider they cannot be duplicates. See
        # https://framagit.org/phyks/Flatisfy/issues/100.
        both_are_from_same_backend = (
            flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]
        )
        both_have_float_part = (
            (flat1["area"] % 1) > 0 and (flat2["area"] % 1) > 0
        )
        both_have_equal_float_part = (
            (flat1["area"] % 1) == (flat2["area"] % 1)
        )
        if both_have_float_part and both_are_from_same_backend:
            if not both_have_equal_float_part:
                return 0
    except (AssertionError, TypeError):
        # Skip and consider as not duplicates whenever the conditions
        # are not met
        # TypeError occurs when an area or a cost is None, which should
        # not be considered as duplicates
        # AssertionError is still caught in case a helper raises it.
        return 0

    return n_common_items
|
||||||
|
|
||||||
def deep_detect(flats_list, config):
|
def deep_detect(flats_list, config):
|
||||||
"""
|
"""
|
||||||
@ -192,111 +289,20 @@ def deep_detect(flats_list, config):
|
|||||||
if flat2["id"] in matching_flats[flat1["id"]]:
|
if flat2["id"] in matching_flats[flat1["id"]]:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
n_common_items = 0
|
n_common_items = get_duplicate_score(flat1, flat2, photo_cache)
|
||||||
try:
|
|
||||||
# They should have the same area, up to one unit
|
|
||||||
assert abs(flat1["area"] - flat2["area"]) < 1
|
|
||||||
n_common_items += 1
|
|
||||||
|
|
||||||
# They should be at the same price, up to one unit
|
# Minimal score to consider they are duplicates
|
||||||
assert abs(flat1["cost"] - flat2["cost"]) < 1
|
if n_common_items >= config["duplicate_threshold"]:
|
||||||
n_common_items += 1
|
# Mark flats as duplicates
|
||||||
|
LOGGER.info(
|
||||||
# They should have the same number of bedrooms if this was
|
("Found duplicates using deep detection: (%s, %s). "
|
||||||
# fetched for both
|
"Score is %d."),
|
||||||
if flat1["bedrooms"] and flat2["bedrooms"]:
|
flat1["id"],
|
||||||
assert flat1["bedrooms"] == flat2["bedrooms"]
|
flat2["id"],
|
||||||
n_common_items += 1
|
n_common_items
|
||||||
|
|
||||||
# They should have the same utilities (included or excluded for
|
|
||||||
# both of them), if this was fetched for both
|
|
||||||
if flat1["utilities"] and flat2["utilities"]:
|
|
||||||
assert flat1["utilities"] == flat2["utilities"]
|
|
||||||
n_common_items += 1
|
|
||||||
|
|
||||||
# They should have the same number of rooms if it was fetched
|
|
||||||
# for both of them
|
|
||||||
if flat1["rooms"] and flat2["rooms"]:
|
|
||||||
assert flat1["rooms"] == flat2["rooms"]
|
|
||||||
n_common_items += 1
|
|
||||||
|
|
||||||
# They should have the same postal code, if available
|
|
||||||
if (
|
|
||||||
"flatisfy" in flat1 and "flatisfy" in flat2 and
|
|
||||||
flat1["flatisfy"].get("postal_code", None) and
|
|
||||||
flat2["flatisfy"].get("postal_code", None)
|
|
||||||
):
|
|
||||||
assert (
|
|
||||||
flat1["flatisfy"]["postal_code"] ==
|
|
||||||
flat2["flatisfy"]["postal_code"]
|
|
||||||
)
|
|
||||||
n_common_items += 1
|
|
||||||
|
|
||||||
# TODO: Compare texts (one is included in another? fuzzymatch?)
|
|
||||||
|
|
||||||
# They should have the same phone number if it was fetched for
|
|
||||||
# both
|
|
||||||
flat1_phone = homogeneize_phone_number(flat1["phone"])
|
|
||||||
flat2_phone = homogeneize_phone_number(flat2["phone"])
|
|
||||||
if flat1_phone and flat2_phone:
|
|
||||||
assert flat1_phone == flat2_phone
|
|
||||||
n_common_items += 10 # Counts much more than the rest
|
|
||||||
|
|
||||||
# They should have at least one photo in common if there
|
|
||||||
# are some photos
|
|
||||||
if flat1["photos"] and flat2["photos"]:
|
|
||||||
n_common_photos = find_number_common_photos(
|
|
||||||
photo_cache,
|
|
||||||
flat1["photos"],
|
|
||||||
flat2["photos"]
|
|
||||||
)
|
|
||||||
assert n_common_photos > 1
|
|
||||||
|
|
||||||
min_number_photos = min(len(flat1["photos"]),
|
|
||||||
len(flat2["photos"]))
|
|
||||||
|
|
||||||
# Either all the photos are the same, or there are at least
|
|
||||||
# three common photos.
|
|
||||||
if n_common_photos == min_number_photos:
|
|
||||||
n_common_items += 15
|
|
||||||
else:
|
|
||||||
n_common_items += 5 * min(n_common_photos, 3)
|
|
||||||
|
|
||||||
# Minimal score to consider they are duplicates
|
|
||||||
assert n_common_items >= config["duplicate_threshold"]
|
|
||||||
|
|
||||||
# If the two flats are from the same website and have a
|
|
||||||
# different float part, consider they cannot be duplicates. See
|
|
||||||
# https://framagit.org/phyks/Flatisfy/issues/100.
|
|
||||||
both_are_from_same_backend = (
|
|
||||||
flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]
|
|
||||||
)
|
)
|
||||||
both_have_float_part = (
|
matching_flats[flat1["id"]].append(flat2["id"])
|
||||||
(flat1["area"] % 1) > 0 and (flat2["area"] % 1) > 0
|
matching_flats[flat2["id"]].append(flat1["id"])
|
||||||
)
|
|
||||||
both_have_different_float_part = (
|
|
||||||
(flat1["area"] % 1) != (flat2["area"] % 1)
|
|
||||||
)
|
|
||||||
if(both_have_float_part and both_are_from_same_backend and
|
|
||||||
both_have_different_float_part):
|
|
||||||
continue
|
|
||||||
except (AssertionError, TypeError):
|
|
||||||
# Skip and consider as not duplicates whenever the conditions
|
|
||||||
# are not met
|
|
||||||
# TypeError occurs when an area or a cost is None, which should
|
|
||||||
# not be considered as duplicates
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Mark flats as duplicates
|
|
||||||
LOGGER.info(
|
|
||||||
("Found duplicates using deep detection: (%s, %s). "
|
|
||||||
"Score is %d."),
|
|
||||||
flat1["id"],
|
|
||||||
flat2["id"],
|
|
||||||
n_common_items
|
|
||||||
)
|
|
||||||
matching_flats[flat1["id"]].append(flat2["id"])
|
|
||||||
matching_flats[flat2["id"]].append(flat1["id"])
|
|
||||||
|
|
||||||
if photo_cache.total():
|
if photo_cache.total():
|
||||||
LOGGER.debug("Photo cache: hits: %d%% / misses: %d%%.",
|
LOGGER.debug("Photo cache: hits: %d%% / misses: %d%%.",
|
||||||
|
73
flatisfy/test_files/127028739@seloger.json
Normal file
73
flatisfy/test_files/127028739@seloger.json
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
{
|
||||||
|
"id": "127028739@seloger",
|
||||||
|
"url": "http://www.seloger.com/annonces/achat/appartement/rennes-35/centre/127028739.htm?p=",
|
||||||
|
"title": "Appartement 3 pièces 67m² - Rennes",
|
||||||
|
"area": 67,
|
||||||
|
"cost": 155700,
|
||||||
|
"price_per_meter": 2323.8805970149256,
|
||||||
|
"currency": "€",
|
||||||
|
"utilities": "",
|
||||||
|
"date": "2018-01-12T02:10:00",
|
||||||
|
"location": "17 PLACE MARECHAL JUIN Rennes (35000)",
|
||||||
|
"station": "",
|
||||||
|
"text": "Exclusivité Nexity Dans un immeuble de standing, en étage élevé avec ascenseur, Appartement Type 3 de 67 m² exposé Sud / Ouest, un séjour avec balcon et double exposition vue dégagée. Deux chambres dont une avec balcon, salle de douches, WC séparé, cave et parking en sous-sol.",
|
||||||
|
"phone": null,
|
||||||
|
"photos": [
|
||||||
|
{
|
||||||
|
"id": "0an3yarge9y446j653dewxu0jwy33pmwar47k2qym.jpg",
|
||||||
|
"url": "https://v.seloger.com/s/width/800/visuels/0/a/n/3/0an3yarge9y446j653dewxu0jwy33pmwar47k2qym.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "1qnz6hpffcrd1c71htbooubgb7s57d82ie1v0zyf2.jpg",
|
||||||
|
"url": "https://v.seloger.com/s/width/800/visuels/1/q/n/z/1qnz6hpffcrd1c71htbooubgb7s57d82ie1v0zyf2.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "16bv8yqgytefa1fq57hyk6e0y6ox8t2mh8wj2dgxq.jpg",
|
||||||
|
"url": "https://v.seloger.com/s/width/800/visuels/1/6/b/v/16bv8yqgytefa1fq57hyk6e0y6ox8t2mh8wj2dgxq.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "1o23blwk87ew95e3vcq5ygyk10z2hy82fzo5j6hha.jpg",
|
||||||
|
"url": "https://v.seloger.com/s/width/800/visuels/1/o/2/3/1o23blwk87ew95e3vcq5ygyk10z2hy82fzo5j6hha.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "20vuxbdp160sot4ccryf6g7g4rwxrkhz3b3tmq7zy.jpg",
|
||||||
|
"url": "https://v.seloger.com/s/width/800/visuels/2/0/v/u/20vuxbdp160sot4ccryf6g7g4rwxrkhz3b3tmq7zy.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "00d9bpezie95lqtfmoccqg1ddrld2m64c2mcod5ha.jpg",
|
||||||
|
"url": "https://v.seloger.com/s/width/800/visuels/0/0/d/9/00d9bpezie95lqtfmoccqg1ddrld2m64c2mcod5ha.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "0lhqf881qm2j03hz5581d8ggplp1xwwchb2rtoqgu.jpg",
|
||||||
|
"url": "https://v.seloger.com/s/width/800/visuels/0/l/h/q/0lhqf881qm2j03hz5581d8ggplp1xwwchb2rtoqgu.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "0chwbagbf8tc0qf9sd3wryzl4gm7hkswcnrtnx2bi.jpg",
|
||||||
|
"url": "https://v.seloger.com/s/width/800/visuels/0/c/h/w/0chwbagbf8tc0qf9sd3wryzl4gm7hkswcnrtnx2bi.jpg",
|
||||||
|
"data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rooms": 3,
|
||||||
|
"bedrooms": 2,
|
||||||
|
"details": {
|
||||||
|
"Vue": "",
|
||||||
|
"Pièces": "3",
|
||||||
|
"Etage": "15",
|
||||||
|
"Reference": "MT0136601",
|
||||||
|
"Chambres": "2",
|
||||||
|
"Cave": "",
|
||||||
|
"Balcon": "5 m²",
|
||||||
|
"Surface": "67 m²",
|
||||||
|
"Ascenseur": "",
|
||||||
|
"Etages": "30",
|
||||||
|
"Parking": "1",
|
||||||
|
"Salle de Séjour": ""
|
||||||
|
}
|
||||||
|
}
|
77
flatisfy/test_files/14428129@explorimmo.json
Normal file
77
flatisfy/test_files/14428129@explorimmo.json
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
{
|
||||||
|
"id": "14428129@explorimmo",
|
||||||
|
"url": "http://www.explorimmo.com/annonce-14428129.html",
|
||||||
|
"title": "Vente appartement 3 pièces 67 m2",
|
||||||
|
"area": 67,
|
||||||
|
"cost": 155700,
|
||||||
|
"price_per_meter": 2323.8805970149256,
|
||||||
|
"currency": "EUR",
|
||||||
|
"utilities": "H.C.",
|
||||||
|
"date": "2017-12-05T07:40:00",
|
||||||
|
"location": "17 PLACE MARECHAL JUIN Rennes 35000",
|
||||||
|
"station": null,
|
||||||
|
"text": "Exclusivité Nexity Dans un immeuble de standing, en étage élevé avec\nascenseur, Appartement Type 3 de 67 m² exposé Sud / Ouest, un séjour avec\nbalcon et double exposition vue dégagée. Deux chambres dont une avec balcon,\nsalle de douches, WC séparé, cave et parking en sous-sol.\n\n",
|
||||||
|
"phone": null,
|
||||||
|
"photos": [
|
||||||
|
{
|
||||||
|
"id": "f9b2da6dfa184759aa0c349edb1cd037.jpg",
|
||||||
|
"url": "http://thbr.figarocms.net/images/2qEDBqRV-QNlp4fHVNhSCWlt6rU=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/f9b2da6dfa184759aa0c349edb1cd037.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "3f2cc9dc429d4e3dbb9f4216f109d224.jpg",
|
||||||
|
"url": "http://thbr.figarocms.net/images/DulZQyZkkwa0ZFBT1nYD9rUD0A4=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/3f2cc9dc429d4e3dbb9f4216f109d224.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "56ae1db620f44af6b860df10eba55870.jpg",
|
||||||
|
"url": "http://thbr.figarocms.net/images/EpvEffLcFbBT7spEZB2dcOHaZwA=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/56ae1db620f44af6b860df10eba55870.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "5acdef1f05314fe19111a0c3d92b8fe5.jpg",
|
||||||
|
"url": "http://thbr.figarocms.net/images/wHtDlJMwIrMC3cWXi8ASN4I6Zl4=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/5acdef1f05314fe19111a0c3d92b8fe5.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "16c686ea91b248129fe60011d61e060b.jpg",
|
||||||
|
"url": "http://thbr.figarocms.net/images/SD5VT1gxRSXSlt3pAz8r_SI3rqw=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/16c686ea91b248129fe60011d61e060b.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "e6a67d42709d443481da0feb9a7e11a1.jpg",
|
||||||
|
"url": "http://thbr.figarocms.net/images/u8PGKXqC0CL9AyEOI5T9TFeGs-Y=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/e6a67d42709d443481da0feb9a7e11a1.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "6888cc7bc823402198205e480c8cab6c.jpg",
|
||||||
|
"url": "http://thbr.figarocms.net/images/-3AseFCRaleidG2vsDJpA5BLBa4=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/6888cc7bc823402198205e480c8cab6c.jpg",
|
||||||
|
"data": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "d40dbeea9e424ea2a846f5683746ea9e.jpg",
|
||||||
|
"url": "http://thbr.figarocms.net/images/TMKBtBuucYge-BgCoUGRjxZjdBE=/560x420/filters:fill(f6f6f6):quality(80):strip_icc()/d40dbeea9e424ea2a846f5683746ea9e.jpg",
|
||||||
|
"data": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rooms": 3,
|
||||||
|
"bedrooms": 2,
|
||||||
|
"details": {
|
||||||
|
"available": true,
|
||||||
|
"heatingType": "",
|
||||||
|
"agency": "NEXITY LAMY, 6 avenue Jean Janvier, 35000, Rennes",
|
||||||
|
"bathrooms": 0,
|
||||||
|
"exposure": "Non précisé",
|
||||||
|
"floor": "15",
|
||||||
|
"energy": "C",
|
||||||
|
"bedrooms": 2,
|
||||||
|
"greenhouseGasEmission": null,
|
||||||
|
"isFurnished": false,
|
||||||
|
"rooms": 3,
|
||||||
|
"fees": 0,
|
||||||
|
"creationDate": 1512455998000,
|
||||||
|
"agencyFees": 0,
|
||||||
|
"availabilityDate": null,
|
||||||
|
"guarantee": 0
|
||||||
|
}
|
||||||
|
}
|
204
flatisfy/tests.py
Normal file
204
flatisfy/tests.py
Normal file
@ -0,0 +1,204 @@
|
|||||||
|
# coding: utf-8
|
||||||
|
"""
|
||||||
|
This module contains unit testing functions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import random
|
||||||
|
import logging
|
||||||
|
import unittest
|
||||||
|
import copy
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from flatisfy import tools
|
||||||
|
from flatisfy.filters import duplicates
|
||||||
|
from flatisfy.filters.cache import ImageCache
|
||||||
|
from flatisfy.constants import BACKENDS_BY_PRECEDENCE
|
||||||
|
|
||||||
|
LOGGER = logging.getLogger(__name__)
|
||||||
|
TESTS_DATA_DIR = os.path.dirname(os.path.realpath(__file__)) + "/test_files/"
|
||||||
|
|
||||||
|
class TestTexts(unittest.TestCase):
    """
    Checks the string normalization done by ``tools.normalize_string``.
    """
    def test_roman_numbers(self):
        """
        Checks roman numbers replacement.
        """
        self.assertEqual(
            "14",
            tools.normalize_string("XIV")
        )

    def test_multiple_whitespaces(self):
        """
        Checks whitespaces are collapsed.
        """
        # The input must actually contain a run of several whitespaces,
        # otherwise nothing is checked.
        self.assertEqual(
            "avec ascenseur",
            tools.normalize_string("avec   ascenseur")
        )

    def test_accents(self):
        """
        Checks accents are replaced.
        """
        # The accented string is the input, the unaccented one is the
        # expected normalized output (and not the other way around).
        self.assertEqual(
            "eeeaui",
            tools.normalize_string("éèêàüï")
        )
|
||||||
|
|
||||||
|
class TestPhoneNumbers(unittest.TestCase):
    """
    Checks the output of ``duplicates.homogeneize_phone_number`` for several
    ways of writing the same phone number.
    """
    def test_prefix(self):
        """
        A number written with the ``+33`` international prefix.
        """
        homogeneized = duplicates.homogeneize_phone_number("+33605040302")
        self.assertEqual("0605040302", homogeneized)

    def test_dots_separators(self):
        """
        A number whose pairs of digits are separated by dots.
        """
        homogeneized = duplicates.homogeneize_phone_number("06.05.04.03.02")
        self.assertEqual("0605040302", homogeneized)

    def test_spaces_separators(self):
        """
        A number whose pairs of digits are separated by spaces.
        """
        homogeneized = duplicates.homogeneize_phone_number("06 05 04 03 02")
        self.assertEqual("0605040302", homogeneized)
|
||||||
|
|
||||||
|
class TestDuplicates(unittest.TestCase):
    """
    Checks the scores returned by ``duplicates.get_duplicate_score``.
    """
    # Thresholds the computed scores are compared against in the tests below
    DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS = 14
    DUPLICATES_MIN_SCORE_WITH_PHOTOS = 15
    # A single photo cache instance is shared by all the tests of this class
    IMAGE_CACHE = ImageCache()

    def generate_fake_flat(self):
        """
        Generates a fake flat post.

        :return: A flat dict with a random id, backend, number of rooms,
            area, cost and number of bedrooms. The phone number is fixed and
            the utilities are empty.
        """
        backend = random.choice(BACKENDS_BY_PRECEDENCE)
        return {
            "id": str(random.randint(100000, 199999)) + "@" + backend,
            "phone": "0607080910",
            "rooms": random.randint(1, 4),
            "utilities": "",
            # Divide by a float so the result is not truncated to an integer
            # under Python 2.
            "area": random.randint(200, 1500) / 10.0,
            "cost": random.randint(100000, 300000),
            "bedrooms": random.randint(1, 4)
        }

    def load_files(self, file1, file2):
        """
        Load two flats from the JSON files in the tests data directory.

        :param file1: Name of the first file, without the ``.json`` extension.
        :param file2: Name of the second file, without the ``.json``
            extension.
        :return: A list with the two loaded flats, in the same order.
        """
        flats = []
        for name in (file1, file2):
            path = os.path.join(TESTS_DATA_DIR, name + ".json")
            # The test files contain non-ASCII characters. Read bytes and
            # decode them explicitly so that loading does not depend on the
            # locale encoding.
            with open(path, "rb") as flat_file:
                flats.append(json.loads(flat_file.read().decode("utf-8")))
        return flats

    def test_duplicates(self):
        """
        Two identical flats should be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertGreaterEqual(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_prices(self):
        """
        Two flats with different prices should not be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["cost"] += 1000

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_rooms(self):
        """
        Two flats with different rooms quantity should not be detected as
        duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["rooms"] += 1

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_areas(self):
        """
        Two flats with different areas should not be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["area"] += 10

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_areas_decimals(self):
        """
        Two flats which areas integers are equal but decimals are present and
        different should not be detected as duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat1["area"] = 50.65
        flat2["area"] = 50.37

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_different_phones(self):
        """
        Two flats with different phone numbers should not be detected as
        duplicates.
        """
        flat1 = self.generate_fake_flat()
        flat2 = copy.deepcopy(flat1)
        flat2["phone"] = "0708091011"

        score = duplicates.get_duplicate_score(
            flat1, flat2, TestDuplicates.IMAGE_CACHE
        )
        self.assertLess(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITHOUT_PHOTOS
        )

    def test_real_duplicates(self):
        """
        Two flats with same price, area and rooms quantity should be detected
        as duplicates.
        """
        flats = self.load_files(
            "127028739@seloger",
            "14428129@explorimmo"
        )

        score = duplicates.get_duplicate_score(
            flats[0], flats[1], TestDuplicates.IMAGE_CACHE
        )
        self.assertGreaterEqual(
            score, TestDuplicates.DUPLICATES_MIN_SCORE_WITH_PHOTOS
        )
|
||||||
|
|
||||||
|
def run(config):
    """
    Run all the tests, one test case class after the other, and print a
    verbose report for each of them.

    :param config: A config dict. It is not read by this function.
    """
    LOGGER.info("Running tests…")
    for test_case in (TestTexts, TestPhoneNumbers, TestDuplicates):
        # A dedicated loader and runner are built for every test case, so
        # that each one gets its own separate report.
        case_suite = unittest.TestLoader().loadTestsFromTestCase(test_case)
        unittest.TextTestRunner(verbosity=2).run(case_suite)
|
Loading…
Reference in New Issue
Block a user