From a45eba65c7546afdae0d8890173c98245208d36e Mon Sep 17 00:00:00 2001 From: "Phyks (Lucas Verney)" Date: Fri, 19 Jan 2018 11:50:11 +0100 Subject: [PATCH] Improve cities fuzzy matching * Convert arabic numerals to roman ones in fuzzy comparison, to ensure there are no more discrepancies between "Paris 20" and "Paris XX" for instance. Fix #112 and improve on top of #110. * Improve handling of opendata postal codes (no more duplicates, better capitalization). Note: You should `pip install -r requirements.txt` and rebuild the database (`python -m flatisfy build-data --config config.json`) after this commit. Thanks @nicofrand for building the basic blocks for this! --- flatisfy/data.py | 2 +- flatisfy/data_files/__init__.py | 26 ++++++- flatisfy/exceptions.py | 2 +- flatisfy/tests.py | 28 +++---- flatisfy/tools.py | 129 +++++++++++++------------------- requirements.txt | 1 + 6 files changed, 93 insertions(+), 95 deletions(-) diff --git a/flatisfy/data.py b/flatisfy/data.py index 7f3209f..74e3828 100644 --- a/flatisfy/data.py +++ b/flatisfy/data.py @@ -1,4 +1,4 @@ -# coding : utf-8 +# coding: utf-8 """ This module contains all the code related to building necessary data files from the source opendata files. diff --git a/flatisfy/data_files/__init__.py b/flatisfy/data_files/__init__.py index f85d25b..4e848cf 100644 --- a/flatisfy/data_files/__init__.py +++ b/flatisfy/data_files/__init__.py @@ -1,16 +1,20 @@ -# coding : utf-8 +# coding: utf-8 """ Preprocessing functions to convert input opendata files into SQLAlchemy objects ready to be stored in the database. 
""" +from __future__ import absolute_import, print_function, unicode_literals import io import json import logging import os import sys +import titlecase + from flatisfy.models.postal_code import PostalCode from flatisfy.models.public_transport import PublicTransport +from flatisfy.tools import normalize_string if sys.version_info >= (3, 0): import csv @@ -21,6 +25,12 @@ else: LOGGER = logging.getLogger(__name__) MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) +titlecase.set_small_word_list( + # Add French small words + r"l|d|un|une|et|à|a|sur|ou|le|la|de|lès|les|" + + titlecase.SMALL +) + TRANSPORT_DATA_FILES = { "FR-IDF": "stops_fr-idf.txt", "FR-NW": "stops_fr-nw.txt", @@ -109,6 +119,9 @@ def _preprocess_laposte(): # Build postal codes to other infos file postal_codes_data = [] + # Keep track of seen (postal_codes, names) to avoid inserting useless + # duplicates (already in the OpenData file) + seen_postal_codes = [] for item in raw_laposte_data: fields = item["fields"] try: @@ -120,10 +133,19 @@ def _preprocess_laposte(): ) continue + name = normalize_string( + titlecase.titlecase(fields["nom_de_la_commune"]), + lowercase=False + ) + + if (fields["code_postal"], name) in seen_postal_codes: + continue + + seen_postal_codes.append((fields["code_postal"], name)) postal_codes_data.append(PostalCode( area=area, postal_code=fields["code_postal"], - name=fields["nom_de_la_commune"].title(), + name=name, lat=fields["coordonnees_gps"][0], lng=fields["coordonnees_gps"][1] )) diff --git a/flatisfy/exceptions.py b/flatisfy/exceptions.py index 07489ca..7df69be 100644 --- a/flatisfy/exceptions.py +++ b/flatisfy/exceptions.py @@ -1,4 +1,4 @@ -# coding : utf-8 +# coding: utf-8 """ This module contains all the exceptions definitions for the Flatisfy-specific exceptions. 
diff --git a/flatisfy/tests.py b/flatisfy/tests.py index 131ce21..df3e48e 100644 --- a/flatisfy/tests.py +++ b/flatisfy/tests.py @@ -27,38 +27,34 @@ class TestTexts(unittest.TestCase): """ Checks roman numbers replacement. """ - tester = tools.RomanNumbers() - self.assertTrue(tester.check_valid("XIV")) - self.assertTrue(not tester.check_valid("ABC")) - self.assertEqual( - "14", - tester.convert_to_arabic("XIV") + "XIV", + tools.convert_arabic_to_roman("14") ) self.assertEqual( - "1987", - tester.convert_to_arabic("MCMLXXXVII") + "MCMLXXXVII", + tools.convert_arabic_to_roman("1987") ) self.assertEqual( - "Dans le 15e arrondissement", - tester.convert_to_arabic_in_text("Dans le XVe arrondissement") + "Dans le XVe arrondissement", + tools.convert_arabic_to_roman_in_text("Dans le 15e arrondissement") ) self.assertEqual( - "20eme arr.", - tester.convert_to_arabic_in_text("XXeme arr.") + "XXeme arr.", + tools.convert_arabic_to_roman_in_text("20eme arr.") ) self.assertEqual( "A AIX EN PROVENCE", - tester.convert_to_arabic_in_text("A AIX EN PROVENCE") + tools.convert_arabic_to_roman_in_text("A AIX EN PROVENCE") ) self.assertEqual( "Montigny Le Bretonneux", - tester.convert_to_arabic_in_text("Montigny Le Bretonneux") + tools.convert_arabic_to_roman_in_text("Montigny Le Bretonneux") ) def test_roman_numbers_in_text(self): @@ -67,8 +63,8 @@ class TestTexts(unittest.TestCase): normalization. 
""" self.assertEqual( - "dans le 15e arrondissement", - tools.normalize_string("Dans le XVe arrondissement") + "dans le XVe arrondissement", + tools.normalize_string("Dans le 15e arrondissement") ) def test_multiple_whitespaces(self): diff --git a/flatisfy/tools.py b/flatisfy/tools.py index ec62f09..f50d7c7 100644 --- a/flatisfy/tools.py +++ b/flatisfy/tools.py @@ -25,80 +25,48 @@ LOGGER = logging.getLogger(__name__) NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys" -class RomanNumbers(object): +def convert_arabic_to_roman(arabic): """ - Utilities to check and convert roman numbers. + Convert an arabic literal to a roman one. - Part of the conversions is based on - https://gist.github.com/riverrun/ac91218bb1678b857c12 + .. note:: + Based on https://gist.github.com/riverrun/ac91218bb1678b857c12. + + :param arabic: An arabic number, as string. + :returns: The corresponding roman one, as string. """ - @staticmethod - def check_valid(roman): - """ - Check whether a roman literal is a valid roman literal. + to_roman = { + 1: 'I', 2: 'II', 3: 'III', 4: 'IV', 5: 'V', 6: 'VI', 7: 'VII', + 8: 'VIII', 9: 'IX', 10: 'X', + 20: 'XX', 30: 'XXX', 40: 'XL', 50: 'L', 60: 'LX', 70: 'LXX', + 80: 'LXXX', 90: 'XC', + 100: 'C', 200: 'CC', 300: 'CCC', 400: 'CD', 500: 'D', 600: 'DC', + 700: 'DCC', 800: 'DCCC', 900: 'CM', + 1000: 'M', 2000: 'MM', 3000: 'MMM' + } + roman_chars_list = [] + count = 1 + for digit in arabic[::-1]: + digit = int(digit) + if digit != 0: + roman_chars_list.append(to_roman[digit * count]) + count *= 10 + return ''.join(roman_chars_list[::-1]) - :param roman: A roman literal, as string. - :returns: ``True`` if it is a valid roman literal, ``False`` otherwise. - """ - if not re.match('^[MDCLXVI]+$', roman): - return False - invalid = ['IIII', 'VV', 'XXXX', 'LL', 'CCCC', 'DD', 'MMMM'] - if any(sub in roman for sub in invalid): - return False +def convert_arabic_to_roman_in_text(text): + """ + Convert arabic literals to roman ones in a text. 
- # TODO: check M does not appear after any other, etc. - return True - - @staticmethod - def convert_to_arabic(roman): - """ - Convert a roman literal to arabic one. - - :param roman: A roman number, as string. - :returns: The corresponding arabic one, as string. - """ - if not RomanNumbers.check_valid(roman): - return roman - - keys = [ - 'IV', 'IX', 'XL', 'XC', 'CD', 'CM', 'I', 'V', - 'X', 'L', 'C', 'D', 'M' - ] - to_arabic = { - 'IV': '4', - 'IX': '9', - 'XL': '40', - 'XC': '90', - 'CD': '400', - 'CM': '900', - 'I': '1', - 'V': '5', - 'X': '10', - 'L': '50', - 'C': '100', - 'D': '500', - 'M': '1000' - } - for key in keys: - if key in roman: - roman = roman.replace(key, ' {}'.format(to_arabic.get(key))) - return str(sum(int(num) for num in roman.split())) - - @staticmethod - def convert_to_arabic_in_text(text): - """ - Convert roman literals to arabic one in a text. - - :param text: Some text to convert roman literals from. - :returns: The corresponding text with roman literals converted to - arabic. - """ - return re.sub( - r'(?>> normalize_string("tétéà 14ème-XIV, foobar") + 'tetea XIVeme xiv, foobar' + + >>> normalize_string("tétéà 14ème-XIV, foobar", convert_arabic_numerals=False) 'tetea 14eme xiv, foobar' + + :param string: The string to normalize. + :param lowercase: Whether to convert string to lowercase or not. Defaults + to ``True``. + :param convert_arabic_numerals: Whether to convert arabic numerals to roman + ones. Defaults to ``True``. + :return: The normalized string. 
""" # ASCIIfy the string string = unidecode.unidecode(string) @@ -237,12 +215,13 @@ def normalize_string(string): # Keep some basic punctuation to keep syntaxic units string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string) - # Convert roman numbers to arabic numbers - # TODO: Fix this :) - # string = RomanNumbers.convert_to_arabic_in_text(string) - # Convert to lowercase - string = string.lower() + if lowercase: + string = string.lower() + + # Convert arabic numbers to roman numbers + if convert_arabic_numerals: + string = convert_arabic_to_roman_in_text(string) # Collapse multiple spaces, replace tabulations and newlines by space string = re.sub(r"\s+", " ", string) diff --git a/requirements.txt b/requirements.txt index 78fd4be..ca1cda6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ imagehash pillow requests sqlalchemy +titlecase unidecode vobject whoosh