Improve cities fuzzy matching

* Convert arabic numerals to roman ones in fuzzy comparison, to ensure there are no more discrepancies between "Paris 20" and "Paris XX" for instance. Fixes #112 and improves on #110. * Improve handling of opendata postal codes (no more duplicates, better capitalization). Note: You should `pip install -r requirements.txt` and rebuild the database (`python -m flatisfy build-data --config config.json`) after this commit. Thanks @nicofrand for building the basic building blocks for this!
2018-01-19 11:50:11 +01:00 · 2018-01-19 11:50:11 +01:00 · a45eba65c7
parent 7bf08adbce
commit a45eba65c7
6 changed files with 93 additions and 95 deletions
--- a/flatisfy/data.py
+++ b/flatisfy/data.py
@ -1,4 +1,4 @@
-# coding : utf-8
+# coding: utf-8
 """
 This module contains all the code related to building necessary data files from
 the source opendata files.
--- a/flatisfy/data_files/init.py
+++ b/flatisfy/data_files/init.py
@ -1,16 +1,20 @@
-# coding : utf-8
+# coding: utf-8
 """
 Preprocessing functions to convert input opendata files into SQLAlchemy objects
 ready to be stored in the database.
 """
 from __future__ import absolute_import, print_function, unicode_literals
 import io
 import json
 import logging
 import os
 import sys
 import titlecase
 from flatisfy.models.postal_code import PostalCode
 from flatisfy.models.public_transport import PublicTransport
 from flatisfy.tools import normalize_string
 if sys.version_info >= (3, 0):
    import csv
@ -21,6 +25,12 @@ else:
 LOGGER = logging.getLogger(__name__)
 MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
 titlecase.set_small_word_list(
    # Add French small words
    r"l|d|un|une|et|à|a|sur|ou|le|la|de|lès|les|" +
    titlecase.SMALL
 )
 TRANSPORT_DATA_FILES = {
    "FR-IDF": "stops_fr-idf.txt",
    "FR-NW": "stops_fr-nw.txt",
@ -109,6 +119,9 @@ def _preprocess_laposte():
    # Build postal codes to other infos file
    postal_codes_data = []
    # Keep track of seen (postal_codes, names) to avoid inserting useless
    # duplicates (already in the OpenData file)
    seen_postal_codes = []
    for item in raw_laposte_data:
        fields = item["fields"]
        try:
@ -120,10 +133,19 @@ def _preprocess_laposte():
                )
                continue
            name = normalize_string(
                titlecase.titlecase(fields["nom_de_la_commune"]),
                lowercase=False
            )
            if (fields["code_postal"], name) in seen_postal_codes:
                continue
            seen_postal_codes.append((fields["code_postal"], name))
            postal_codes_data.append(PostalCode(
                area=area,
                postal_code=fields["code_postal"],
-                name=fields["nom_de_la_commune"].title(),
+                name=name,
                lat=fields["coordonnees_gps"][0],
                lng=fields["coordonnees_gps"][1]
            ))
--- a/flatisfy/exceptions.py
+++ b/flatisfy/exceptions.py
@ -1,4 +1,4 @@
-# coding : utf-8
+# coding: utf-8
 """
 This module contains all the exceptions definitions for the Flatisfy-specific
 exceptions.
--- a/flatisfy/tests.py
+++ b/flatisfy/tests.py
@ -27,38 +27,34 @@ class TestTexts(unittest.TestCase):
        """
        Checks roman numbers replacement.
        """
        tester = tools.RomanNumbers()
        self.assertTrue(tester.check_valid("XIV"))
        self.assertTrue(not tester.check_valid("ABC"))
        self.assertEqual(
-            "14",
+            "XIV",
-            tester.convert_to_arabic("XIV")
+            tools.convert_arabic_to_roman("14")
        )
        self.assertEqual(
-            "1987",
+            "MCMLXXXVII",
-            tester.convert_to_arabic("MCMLXXXVII")
+            tools.convert_arabic_to_roman("1987")
        )
        self.assertEqual(
-            "Dans le 15e arrondissement",
+            "Dans le XVe arrondissement",
-            tester.convert_to_arabic_in_text("Dans le XVe arrondissement")
+            tools.convert_arabic_to_roman_in_text("Dans le 15e arrondissement")
        )
        self.assertEqual(
-            "20eme arr.",
+            "XXeme arr.",
-            tester.convert_to_arabic_in_text("XXeme arr.")
+            tools.convert_arabic_to_roman_in_text("20eme arr.")
        )
        self.assertEqual(
            "A AIX EN PROVENCE",
-            tester.convert_to_arabic_in_text("A AIX EN PROVENCE")
+            tools.convert_arabic_to_roman_in_text("A AIX EN PROVENCE")
        )
        self.assertEqual(
            "Montigny Le Bretonneux",
-            tester.convert_to_arabic_in_text("Montigny Le Bretonneux")
+            tools.convert_arabic_to_roman_in_text("Montigny Le Bretonneux")
        )
    def test_roman_numbers_in_text(self):
@ -67,8 +63,8 @@ class TestTexts(unittest.TestCase):
        normalization.
        """
        self.assertEqual(
-            "dans le 15e arrondissement",
+            "dans le XVe arrondissement",
-            tools.normalize_string("Dans le XVe arrondissement")
+            tools.normalize_string("Dans le 15e arrondissement")
        )
    def test_multiple_whitespaces(self):
--- a/flatisfy/tools.py
+++ b/flatisfy/tools.py
@ -25,68 +25,36 @@ LOGGER = logging.getLogger(__name__)
 NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys"
-class RomanNumbers(object):
+def convert_arabic_to_roman(arabic):
    """
-    Utilities to check and convert roman numbers.
+    Convert an arabic literal to a roman one.
-    Part of the conversions is based on
+    ..note::
-    https://gist.github.com/riverrun/ac91218bb1678b857c12
+        Based on https://gist.github.com/riverrun/ac91218bb1678b857c12.
    :param arabic: An arabic number, as string.
    :returns: The corresponding roman one, as string.
    """
-    @staticmethod
+    to_roman = {
-    def check_valid(roman):
+        1: 'I', 2: 'II', 3: 'III', 4: 'IV', 5: 'V', 6: 'VI', 7: 'VII',
-        """
+        8: 'VIII', 9: 'IX', 10: 'X',
-        Check whether a roman literal is a valid roman literal.
+        20: 'XX', 30: 'XXX', 40: 'XL', 50: 'L', 60: 'LX', 70: 'LXX',
-
+        80: 'LXXX', 90: 'XC',
-        :param roman: A roman literal, as string.
+        100: 'C', 200: 'CC', 300: 'CCC', 400: 'CD', 500: 'D', 600: 'DC',
-        :returns: ``True`` if it is a valid roman literal, ``False`` otherwise.
+        700: 'DCC', 800: 'DCCC', 900: 'CM',
-        """
+        1000: 'M', 2000: 'MM', 3000: 'MMM'
        if not re.match('^[MDCLXVI]+$', roman):
            return False
        invalid = ['IIII', 'VV', 'XXXX', 'LL', 'CCCC', 'DD', 'MMMM']
        if any(sub in roman for sub in invalid):
            return False
        # TODO: check M does not appear after any other, etc.
        return True
    @staticmethod
    def convert_to_arabic(roman):
        """
        Convert a roman literal to arabic one.
        :param roman: A roman number, as string.
        :returns: The corresponding arabic one, as string.
        """
        if not RomanNumbers.check_valid(roman):
            return roman
        keys = [
            'IV', 'IX', 'XL', 'XC', 'CD', 'CM', 'I', 'V',
            'X', 'L', 'C', 'D', 'M'
        ]
        to_arabic = {
            'IV': '4',
            'IX': '9',
            'XL': '40',
            'XC': '90',
            'CD': '400',
            'CM': '900',
            'I': '1',
            'V': '5',
            'X': '10',
            'L': '50',
            'C': '100',
            'D': '500',
            'M': '1000'
    }
-        for key in keys:
+    roman_chars_list = []
-            if key in roman:
+    count = 1
-                roman = roman.replace(key, ' {}'.format(to_arabic.get(key)))
+    for digit in arabic[::-1]:
-        return str(sum(int(num) for num in roman.split()))
+        digit = int(digit)
        if digit != 0:
            roman_chars_list.append(to_roman[digit * count])
        count *= 10
    return ''.join(roman_chars_list[::-1])
-    @staticmethod
+
-    def convert_to_arabic_in_text(text):
+def convert_arabic_to_roman_in_text(text):
    """
    Convert roman literals to arabic one in a text.
@ -95,8 +63,8 @@ class RomanNumbers(object):
        arabic.
    """
    return re.sub(
-            r'(?<![\S])+([MDCLXVI]+)(?=[eè\s$])',
+        r'(\d+)',
-            lambda matchobj: RomanNumbers.convert_to_arabic(matchobj.group(0)),
+        lambda matchobj: convert_arabic_to_roman(matchobj.group(0)),
        text
    )
@ -221,14 +189,24 @@ def is_within_interval(value, min_value=None, max_value=None):
    return all(checks)
-def normalize_string(string):
+def normalize_string(string, lowercase=True, convert_arabic_numerals=True):
    """
    Normalize the given string for matching.
-    :Example:
+    Example::
        >>> normalize_string("tétéà 14ème-XIV,  foobar")
        'tetea XIVeme xiv, foobar'
        >>> normalize_string("tétéà 14ème-XIV,  foobar", False)
        'tetea 14eme xiv, foobar'
    :param string: The string to normalize.
    :param lowercase: Whether to convert string to lowercase or not. Defaults
        to ``True``.
    :param convert_arabic_numerals: Whether to convert arabic numerals to roman
        ones. Defaults to ``True``.
    :return: The normalized string.
    """
    # ASCIIfy the string
    string = unidecode.unidecode(string)
@ -237,13 +215,14 @@ def normalize_string(string):
    # Keep some basic punctuation to keep syntaxic units
    string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string)
    # Convert roman numbers to arabic numbers
    # TODO: Fix this :)
    # string = RomanNumbers.convert_to_arabic_in_text(string)
    # Convert to lowercase
    if lowercase:
        string = string.lower()
    # Convert arabic numbers to roman numbers
    if convert_arabic_numerals:
        string = convert_arabic_to_roman_in_text(string)
    # Collapse multiple spaces, replace tabulations and newlines by space
    string = re.sub(r"\s+", " ", string)
--- a/requirements.txt
+++ b/requirements.txt
@ -11,6 +11,7 @@ imagehash
 pillow
 requests
 sqlalchemy
 titlecase
 unidecode
 vobject
 whoosh