Improve cities fuzzy matching

* Convert arabic numerals to roman ones in fuzzy comparison, to ensure there are no more discrepancies between "Paris 20" and "Paris XX" for instance. Fix #112 and improve on top of #110. * Improve handling of opendata postal codes (no more duplicates, better capitalization). Note: You should `pip install -r requirements.txt` and rebuild the database (`python -m flatisfy build-data --config config.json`) after this commit. Thanks @nicofrand for building the basic blocks for this!
2018-01-19 11:50:11 +01:00 · 2018-01-19 11:50:11 +01:00 · a45eba65c7
commit a45eba65c7
parent 7bf08adbce
6 changed files with 93 additions and 95 deletions
--- a/flatisfy/data.py
+++ b/flatisfy/data.py
@ -1,4 +1,4 @@
-# coding : utf-8
+# coding: utf-8
 """
 This module contains all the code related to building necessary data files from
 the source opendata files.
--- a/flatisfy/data_files/init.py
+++ b/flatisfy/data_files/init.py
@ -1,16 +1,20 @@
-# coding : utf-8
+# coding: utf-8
 """
 Preprocessing functions to convert input opendata files into SQLAlchemy objects
 ready to be stored in the database.
 """
+from __future__ import absolute_import, print_function, unicode_literals
 import io
 import json
 import logging
 import os
 import sys

+import titlecase
+
 from flatisfy.models.postal_code import PostalCode
 from flatisfy.models.public_transport import PublicTransport
+from flatisfy.tools import normalize_string

 if sys.version_info >= (3, 0):
    import csv
@ -21,6 +25,12 @@ else:
 LOGGER = logging.getLogger(__name__)
 MODULE_DIR = os.path.dirname(os.path.realpath(__file__))

+titlecase.set_small_word_list(
+    # Add French small words
+    r"l|d|un|une|et|à|a|sur|ou|le|la|de|lès|les|" +
+    titlecase.SMALL
+)
+
 TRANSPORT_DATA_FILES = {
    "FR-IDF": "stops_fr-idf.txt",
    "FR-NW": "stops_fr-nw.txt",
@ -109,6 +119,9 @@ def _preprocess_laposte():

    # Build postal codes to other infos file
    postal_codes_data = []
+    # Keep track of seen (postal_codes, names) to avoid inserting useless
+    # duplicates (already in the OpenData file)
+    seen_postal_codes = []
    for item in raw_laposte_data:
        fields = item["fields"]
        try:
@ -120,10 +133,19 @@ def _preprocess_laposte():
                )
                continue

+            name = normalize_string(
+                titlecase.titlecase(fields["nom_de_la_commune"]),
+                lowercase=False
+            )
+
+            if (fields["code_postal"], name) in seen_postal_codes:
+                continue
+
+            seen_postal_codes.append((fields["code_postal"], name))
            postal_codes_data.append(PostalCode(
                area=area,
                postal_code=fields["code_postal"],
-                name=fields["nom_de_la_commune"].title(),
+                name=name,
                lat=fields["coordonnees_gps"][0],
                lng=fields["coordonnees_gps"][1]
            ))
--- a/flatisfy/exceptions.py
+++ b/flatisfy/exceptions.py
@ -1,4 +1,4 @@
-# coding : utf-8
+# coding: utf-8
 """
 This module contains all the exceptions definitions for the Flatisfy-specific
 exceptions.
--- a/flatisfy/tests.py
+++ b/flatisfy/tests.py
@ -27,38 +27,34 @@ class TestTexts(unittest.TestCase):
        """
        Checks roman numbers replacement.
        """
-        tester = tools.RomanNumbers()
-        self.assertTrue(tester.check_valid("XIV"))
-        self.assertTrue(not tester.check_valid("ABC"))
-
        self.assertEqual(
-            "14",
-            tester.convert_to_arabic("XIV")
+            "XIV",
+            tools.convert_arabic_to_roman("14")
        )

        self.assertEqual(
-            "1987",
-            tester.convert_to_arabic("MCMLXXXVII")
+            "MCMLXXXVII",
+            tools.convert_arabic_to_roman("1987")
        )

        self.assertEqual(
-            "Dans le 15e arrondissement",
-            tester.convert_to_arabic_in_text("Dans le XVe arrondissement")
+            "Dans le XVe arrondissement",
+            tools.convert_arabic_to_roman_in_text("Dans le 15e arrondissement")
        )

        self.assertEqual(
-            "20eme arr.",
-            tester.convert_to_arabic_in_text("XXeme arr.")
+            "XXeme arr.",
+            tools.convert_arabic_to_roman_in_text("20eme arr.")
        )

        self.assertEqual(
            "A AIX EN PROVENCE",
-            tester.convert_to_arabic_in_text("A AIX EN PROVENCE")
+            tools.convert_arabic_to_roman_in_text("A AIX EN PROVENCE")
        )

        self.assertEqual(
            "Montigny Le Bretonneux",
-            tester.convert_to_arabic_in_text("Montigny Le Bretonneux")
+            tools.convert_arabic_to_roman_in_text("Montigny Le Bretonneux")
        )

    def test_roman_numbers_in_text(self):
@ -67,8 +63,8 @@ class TestTexts(unittest.TestCase):
        normalization.
        """
        self.assertEqual(
-            "dans le 15e arrondissement",
-            tools.normalize_string("Dans le XVe arrondissement")
+            "dans le XVe arrondissement",
+            tools.normalize_string("Dans le 15e arrondissement")
        )

    def test_multiple_whitespaces(self):
--- a/flatisfy/tools.py
+++ b/flatisfy/tools.py
@ -25,68 +25,36 @@ LOGGER = logging.getLogger(__name__)
 NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys"


-class RomanNumbers(object):
+def convert_arabic_to_roman(arabic):
    """
-    Utilities to check and convert roman numbers.
+    Convert an arabic literal to a roman one.

-    Part of the conversions is based on
-    https://gist.github.com/riverrun/ac91218bb1678b857c12
+    ..note::
+        Based on https://gist.github.com/riverrun/ac91218bb1678b857c12.
+
+    :param arabic: An arabic number, as string.
+    :returns: The corresponding roman one, as string.
    """
-    @staticmethod
-    def check_valid(roman):
-        """
-        Check whether a roman literal is a valid roman literal.
-
-        :param roman: A roman literal, as string.
-        :returns: ``True`` if it is a valid roman literal, ``False`` otherwise.
-        """
-        if not re.match('^[MDCLXVI]+$', roman):
-            return False
-
-        invalid = ['IIII', 'VV', 'XXXX', 'LL', 'CCCC', 'DD', 'MMMM']
-        if any(sub in roman for sub in invalid):
-            return False
-
-        # TODO: check M does not appear after any other, etc.
-        return True
-
-    @staticmethod
-    def convert_to_arabic(roman):
-        """
-        Convert a roman literal to arabic one.
-
-        :param roman: A roman number, as string.
-        :returns: The corresponding arabic one, as string.
-        """
-        if not RomanNumbers.check_valid(roman):
-            return roman
-
-        keys = [
-            'IV', 'IX', 'XL', 'XC', 'CD', 'CM', 'I', 'V',
-            'X', 'L', 'C', 'D', 'M'
-        ]
-        to_arabic = {
-            'IV': '4',
-            'IX': '9',
-            'XL': '40',
-            'XC': '90',
-            'CD': '400',
-            'CM': '900',
-            'I': '1',
-            'V': '5',
-            'X': '10',
-            'L': '50',
-            'C': '100',
-            'D': '500',
-            'M': '1000'
+    to_roman = {
+        1: 'I', 2: 'II', 3: 'III', 4: 'IV', 5: 'V', 6: 'VI', 7: 'VII',
+        8: 'VIII', 9: 'IX', 10: 'X',
+        20: 'XX', 30: 'XXX', 40: 'XL', 50: 'L', 60: 'LX', 70: 'LXX',
+        80: 'LXXX', 90: 'XC',
+        100: 'C', 200: 'CC', 300: 'CCC', 400: 'CD', 500: 'D', 600: 'DC',
+        700: 'DCC', 800: 'DCCC', 900: 'CM',
+        1000: 'M', 2000: 'MM', 3000: 'MMM'
    }
-        for key in keys:
-            if key in roman:
-                roman = roman.replace(key, ' {}'.format(to_arabic.get(key)))
-        return str(sum(int(num) for num in roman.split()))
+    roman_chars_list = []
+    count = 1
+    for digit in arabic[::-1]:
+        digit = int(digit)
+        if digit != 0:
+            roman_chars_list.append(to_roman[digit * count])
+        count *= 10
+    return ''.join(roman_chars_list[::-1])

-    @staticmethod
-    def convert_to_arabic_in_text(text):
+
+def convert_arabic_to_roman_in_text(text):
    """
    Convert roman literals to arabic one in a text.

@ -95,8 +63,8 @@ class RomanNumbers(object):
        arabic.
    """
    return re.sub(
-            r'(?<![\S])+([MDCLXVI]+)(?=[eè\s$])',
-            lambda matchobj: RomanNumbers.convert_to_arabic(matchobj.group(0)),
+        r'(\d+)',
+        lambda matchobj: convert_arabic_to_roman(matchobj.group(0)),
        text
    )

@ -221,14 +189,24 @@ def is_within_interval(value, min_value=None, max_value=None):
    return all(checks)


-def normalize_string(string):
+def normalize_string(string, lowercase=True, convert_arabic_numerals=True):
    """
    Normalize the given string for matching.

-    :Example:
+    Example::

        >>> normalize_string("tétéà 14ème-XIV,  foobar")
+        'tetea XIVeme xiv, foobar'
+
+        >>> normalize_string("tétéà 14ème-XIV,  foobar", False)
        'tetea 14eme xiv, foobar'
+
+    :param string: The string to normalize.
+    :param lowercase: Whether to convert string to lowercase or not. Defaults
+        to ``True``.
+    :param convert_arabic_numerals: Whether to convert arabic numerals to roman
+        ones. Defaults to ``True``.
+    :return: The normalized string.
    """
    # ASCIIfy the string
    string = unidecode.unidecode(string)
@ -237,13 +215,14 @@ def normalize_string(string):
    # Keep some basic punctuation to keep syntaxic units
    string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string)

-    # Convert roman numbers to arabic numbers
-    # TODO: Fix this :)
-    # string = RomanNumbers.convert_to_arabic_in_text(string)
-
    # Convert to lowercase
+    if lowercase:
        string = string.lower()

+    # Convert arabic numbers to roman numbers
+    if convert_arabic_numerals:
+        string = convert_arabic_to_roman_in_text(string)
+
    # Collapse multiple spaces, replace tabulations and newlines by space
    string = re.sub(r"\s+", " ", string)

--- a/requirements.txt
+++ b/requirements.txt
@ -11,6 +11,7 @@ imagehash
 pillow
 requests
 sqlalchemy
+titlecase
 unidecode
 vobject
 whoosh