Better deduplication
* Improve deduplication on URLs (match sets).
* Keep track of duplicates and update their status on refiltering.
parent 1d98c631e0, commit 5f2f4d0ccf
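
The URL deduplication below works on match sets: each posting carries a `urls` list, and two postings are considered duplicates as soon as those lists intersect. A minimal standalone sketch of that grouping idea, with made-up sample data (this is not the Flatisfy implementation, which lives in `duplicates.detect` below):

    import collections

    def group_by_common_url(flats):
        """Bucket flat postings that share at least one URL."""
        buckets = collections.defaultdict(list)
        for flat in flats:
            # Index the same posting under every URL it advertises, so
            # postings from different backends meet in a common bucket.
            for url in flat.get("urls", []):
                buckets[url].append(flat)
        return buckets

    # Hypothetical postings: two backends exposing the same housing post.
    flats = [
        {"id": "a@backend1", "urls": ["http://example.com/post/1"]},
        {"id": "b@backend2", "urls": ["http://example.com/post/1",
                                      "http://example.org/mirror/1"]},
    ]
    for url, matching in group_by_common_url(flats).items():
        if len(matching) > 1:
            print("Duplicates on %s: %s" % (url, [f["id"] for f in matching]))

The real implementation additionally deduplicates the resulting buckets on `id`, since a posting that carries several URLs is indexed more than once.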
@@ -68,6 +68,8 @@ List of configuration options:
 * `webserver` is a server to use instead of the default Bottle built-in
 webserver, see [Bottle deployment
 doc](http://bottlepy.org/docs/dev/deployment.html).
+* `backends` is a list of Weboob backends to enable. It defaults to any
+available and supported Weboob backend.
 
 _Note:_ In production, you can either use the `serve` command with a reliable
 webserver instead of the default Bottle webserver (specifying a `webserver`
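
For context, these options end up in the configuration dict passed around as `config`. A hedged sketch of what the relevant part might look like (the backend and server names below are illustrative, not values taken from this commit):

    config = {
        # Any available and supported Weboob backends; names are examples.
        "backends": ["leboncoin", "seloger"],
        # Optional Bottle-compatible server to use for the `serve` command.
        "webserver": "cherrypy",
    }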
@@ -155,8 +155,8 @@ def main():
     if args.cmd == "fetch":
         # Fetch and filter flats list
         flats_list = fetch.fetch_flats_list(config)
-        flats_list, _ = cmds.filter_flats(config, flats_list=flats_list,
-                                          fetch_details=True)
+        flats_list = cmds.filter_flats(config, flats_list=flats_list,
+                                       fetch_details=True)["new"]
         # Sort by cost
         flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
 
@@ -169,8 +169,8 @@ def main():
         if args.input:
             flats_list = fetch.load_flats_list_from_file(args.input)
 
-            flats_list, _ = cmds.filter_flats(config, flats_list=flats_list,
-                                              fetch_details=False)
+            flats_list = cmds.filter_flats(config, flats_list=flats_list,
+                                           fetch_details=False)["new"]
 
             # Sort by cost
             flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
@@ -4,6 +4,7 @@ Main commands available for flatisfy.
 """
 from __future__ import absolute_import, print_function, unicode_literals
 
+import collections
 import logging
 
 import flatisfy.filters
@@ -11,13 +12,14 @@ from flatisfy import database
 from flatisfy.models import flat as flat_model
 from flatisfy import fetch
 from flatisfy import tools
+from flatisfy.filters import metadata
 from flatisfy.web import app as web_app
 
 
 LOGGER = logging.getLogger(__name__)
 
 
-def filter_flats(config, flats_list=None, fetch_details=True):
+def filter_flats(config, flats_list, fetch_details=True):
     """
     Filter the available flats list. Then, filter it according to criteria.
 
@@ -25,30 +27,43 @@ def filter_flats(config, flats_list=None, fetch_details=True):
     :param fetch_details: Whether additional details should be fetched between
     the two passes.
     :param flats_list: The initial list of flat objects to filter.
-    :return: A tuple of the list of all matching flats and the list of ignored
-    flats.
+    :return: A dict mapping flat status and list of flat objects.
     """
+    # Add the flatisfy metadata entry and prepare the flat objects
+    flats_list = metadata.init(flats_list)
+
+    first_pass_result = collections.defaultdict(list)
+    second_pass_result = collections.defaultdict(list)
     # Do a first pass with the available infos to try to remove as much
     # unwanted postings as possible
     if config["passes"] > 0:
-        flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list,
-                                                                config)
+        first_pass_result = flatisfy.filters.first_pass(flats_list,
+                                                        config)
+    else:
+        first_pass_result["new"] = flats_list
+
+    # Load additional infos
+    if fetch_details:
+        for i, flat in enumerate(first_pass_result["new"]):
+            details = fetch.fetch_details(config, flat["id"])
+            first_pass_result["new"][i] = tools.merge_dicts(flat, details)
 
     # Do a second pass to consolidate all the infos we found and make use of
     # additional infos
     if config["passes"] > 1:
-        # Load additional infos
-        if fetch_details:
-            for i, flat in enumerate(flats_list):
-                details = fetch.fetch_details(config, flat["id"])
-                flats_list[i] = tools.merge_dicts(flat, details)
-
-        flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
-            flats_list, config
+        second_pass_result = flatisfy.filters.second_pass(
+            first_pass_result["new"], config
         )
-        ignored_flats.extend(extra_ignored_flats)
+    else:
+        second_pass_result["new"] = first_pass_result["new"]
 
-    return flats_list, ignored_flats
+    return {
+        "new": second_pass_result["new"],
+        "duplicate": first_pass_result["duplicate"],
+        "ignored": (
+            first_pass_result["ignored"] + second_pass_result["ignored"]
+        )
+    }
 
 
 def import_and_filter(config, load_from_db=False):
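
With this change `filter_flats` no longer returns a `(flats, ignored)` tuple but a dict keyed by status, which is what the `__main__` hunks above unpack with `["new"]`. A short usage sketch, assuming a `config` and a `flats_list` are already available:

    result = filter_flats(config, flats_list=flats_list, fetch_details=True)
    new_flats = result["new"]              # kept after both filtering passes
    duplicate_flats = result["duplicate"]  # postings folded into another one
    ignored_flats = result["ignored"]      # rejected by the housing criteria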
@@ -66,19 +81,16 @@ def import_and_filter(config, load_from_db=False):
         flats_list = fetch.load_flats_list_from_db(config)
     else:
         flats_list = fetch.fetch_flats_list(config)
-    flats_list, ignored_list = filter_flats(config, flats_list=flats_list,
-                                            fetch_details=True)
+    flats_list_by_status = filter_flats(config, flats_list=flats_list,
+                                        fetch_details=True)
     # Create database connection
     get_session = database.init_db(config["database"])
 
     with get_session() as session:
-        for flat_dict in flats_list:
-            flat = flat_model.Flat.from_dict(flat_dict)
-            session.merge(flat)
-
-        for flat_dict in ignored_list:
-            flat = flat_model.Flat.from_dict(flat_dict)
-            flat.status = flat_model.FlatStatus.ignored
-            session.merge(flat)
+        for status, flats_list in flats_list_by_status.items():
+            for flat_dict in flats_list:
+                flat = flat_model.Flat.from_dict(flat_dict)
+                flat.status = getattr(flat_model.FlatStatus, status)
+                session.merge(flat)
 
 
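
The keys of that dict ("new", "duplicate", "ignored") are assumed to match the member names of `FlatStatus`, so `getattr(flat_model.FlatStatus, status)` resolves each one directly to an enum member; this is also why the enum gains a `duplicate` entry further down. A tiny self-contained sketch of the lookup:

    import enum

    class FlatStatus(enum.Enum):
        """Reduced copy of the model enum, for illustration only."""
        duplicate = -20
        ignored = -10
        new = 0

    # A status key coming out of filter_flats() resolves by name.
    print(getattr(FlatStatus, "duplicate"))  # FlatStatus.duplicate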
@@ -89,25 +89,23 @@ def first_pass(flats_list, config):
 
     :param flats_list: A list of flats dict to filter.
     :param config: A config dict.
-    :return: A tuple of processed flats and ignored flats.
+    :return: A dict mapping flat status and list of flat objects.
     """
     LOGGER.info("Running first filtering pass.")
 
     # Handle duplicates based on ids
     # Just remove them (no merge) as they should be the exact same object.
-    flats_list = duplicates.detect(
-        flats_list, key="id", merge=False
+    flats_list, duplicates_by_id = duplicates.detect(
+        flats_list, key="id", merge=False, should_intersect=False
     )
-    # Also merge duplicates based on url (these may come from different
+    # Also merge duplicates based on urls (these may come from different
     # flatboob backends)
     # This is especially useful as some websites such as entreparticuliers
     # contains a lot of leboncoin housings posts.
-    flats_list = duplicates.detect(
-        flats_list, key="url", merge=True
+    flats_list, duplicates_by_urls = duplicates.detect(
+        flats_list, key="urls", merge=True, should_intersect=True
     )
 
-    # Add the flatisfy metadata entry and prepare the flat objects
-    flats_list = metadata.init(flats_list)
     # Guess the postal codes
     flats_list = metadata.guess_postal_code(flats_list, config)
     # Try to match with stations
@@ -115,7 +113,11 @@ def first_pass(flats_list, config):
     # Remove returned housing posts that do not match criteria
     flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
 
-    return (flats_list, ignored_list)
+    return {
+        "new": flats_list,
+        "ignored": ignored_list,
+        "duplicate": duplicates_by_id + duplicates_by_urls
+    }
 
 
 def second_pass(flats_list, config):
@@ -131,7 +133,7 @@ def second_pass(flats_list, config):
 
     :param flats_list: A list of flats dict to filter.
     :param config: A config dict.
-    :return: A tuple of processed flats and ignored flats.
+    :return: A dict mapping flat status and list of flat objects.
     """
     LOGGER.info("Running second filtering pass.")
     # Assumed to run after first pass, so there should be no obvious duplicates
@@ -151,4 +153,7 @@ def second_pass(flats_list, config):
     # Remove returned housing posts that do not match criteria
     flats_list, ignored_list = refine_with_housing_criteria(flats_list, config)
 
-    return (flats_list, ignored_list)
+    return {
+        "new": flats_list,
+        "ignored": ignored_list
+    }
@@ -23,7 +23,7 @@ BACKENDS_PRECEDENCE = [
 ]
 
 
-def detect(flats_list, key="id", merge=True):
+def detect(flats_list, key="id", merge=True, should_intersect=False):
     """
     Detect obvious duplicates within a given list of flats.
 
@@ -38,18 +38,32 @@ def detect(flats_list, key="id", merge=True):
     done.
     :param merge: Whether the found duplicates should be merged or we should
     only keep one of them.
-    :return: A deduplicated list of flat dicts.
+    :param should_intersect: Set to ``True`` if the values in the flat dicts
+    are lists and you want to deduplicate on non-empty intersection (typically
+    if they have a common url).
+
+    :return: A tuple of the deduplicated list of flat dicts and the list of all
+    the flats objects that should be removed and considered as duplicates (they
+    were already merged).
     """
     # ``seen`` is a dict mapping aggregating the flats by the deduplication
     # keys. We basically make buckets of flats for every key value. Flats in
     # the same bucket should be merged together afterwards.
     seen = collections.defaultdict(list)
     for flat in flats_list:
-        seen[flat.get(key, None)].append(flat)
+        if should_intersect:
+            # We add each value separately. We will add some flats multiple
+            # times, but we deduplicate again on id below to compensate.
+            for value in flat.get(key, []):
+                seen[value].append(flat)
+        else:
+            seen[flat.get(key, None)].append(flat)
 
     # Generate the unique flats list based on these buckets
     unique_flats_list = []
+    # Keep track of all the flats that were removed by deduplication
+    duplicate_flats = []
 
     for flat_key, matching_flats in seen.items():
         if flat_key is None:
             # If the key is None, it means Weboob could not load the data. In
@@ -67,7 +81,8 @@ def detect(flats_list, key="id", merge=True):
             )
 
         if len(matching_flats) > 1:
-            LOGGER.info("Found duplicates: %s.",
+            LOGGER.info("Found duplicates using key \"%s\": %s.",
+                        key,
                         [flat["id"] for flat in matching_flats])
         # Otherwise, check the policy
         if merge:
@@ -76,6 +91,19 @@ def detect(flats_list, key="id", merge=True):
                 tools.merge_dicts(*matching_flats)
             )
         else:
-            # Otherwise, just keep any of them
-            unique_flats_list.append(matching_flats[0])
-    return unique_flats_list
+            # Otherwise, just keep the most important of them
+            unique_flats_list.append(matching_flats[-1])
+
+        # The ID of the added merged flat will be the one of the last item
+        # in ``matching_flats``. Then, any flat object that was before in
+        # the ``matching_flats`` list is to be considered as a duplicate
+        # and should have a ``duplicate`` status.
+        duplicate_flats.extend(matching_flats[:-1])
+
+    if should_intersect:
+        # We added some flats twice with the above method, let's deduplicate on
+        # id.
+        unique_flats_list, _ = detect(unique_flats_list, key="id", merge=True,
+                                      should_intersect=False)
+
+    return unique_flats_list, duplicate_flats
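
Taken together, `detect` now reports both outputs: the deduplicated list and the postings that were absorbed into another one. A hedged usage sketch, assuming a `flats_list` as used in `first_pass`:

    unique_flats_list, duplicate_flats = detect(
        flats_list, key="urls", merge=True, should_intersect=True
    )
    # ``unique_flats_list`` keeps one merged posting per URL match set;
    # ``duplicate_flats`` holds the absorbed postings, which later get
    # the FlatStatus.duplicate status when stored in the database.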
@@ -33,6 +33,7 @@ class FlatStatus(enum.Enum):
     An enum of the possible status for a flat entry.
     """
     user_deleted = -100
+    duplicate = -20
     ignored = -10
     new = 0
     followed = 10
@@ -83,6 +84,7 @@ class Flat(BASE):
         """
         # Handle flatisfy metadata
         flat_dict = flat_dict.copy()
+        if "flatisfy" in flat_dict:
             flat_dict["flatisfy_stations"] = (
                 flat_dict["flatisfy"].get("matched_stations", [])
             )
@@ -55,7 +55,7 @@
             </th>
             <td>
                 <template v-if="flat.flatisfy_postal_code.postal_code">
-                    {{ flat.flatisfy_postal_code.name }} ( {{ flat.flatisfy_postal_code.postal_code }} )
+                    {{ flat.flatisfy_postal_code.name }} ({{ flat.flatisfy_postal_code.postal_code }})
                 </template>
                 <template v-else>
                     ?