Merge branch 'fix-roman-numbers' into 'master'

Fix roman numbers. See merge request phyks/Flatisfy!18
2018-01-18 14:17:57 +01:00 · 2018-01-18 14:17:57 +01:00 · 6bcfb62e8d
commit 6bcfb62e8d
parent caa79f245b 62907a621c
2 changed files with 75 additions and 5 deletions
--- a/flatisfy/tests.py
+++ b/flatisfy/tests.py
@ -22,9 +22,39 @@ class TestTexts(unittest.TestCase):
        """
        Checks roman numbers replacement.
        """
        tester = tools.RomanNumbers()
        self.assertTrue(tester.check_valid("XIV"))
        self.assertTrue(not tester.check_valid("ABC"))
        self.assertEqual(
            "14",
-            tools.normalize_string("XIV")
+            tester.convert_to_arabic("XIV")
        )
        self.assertEqual(
            "1987",
            tester.convert_to_arabic("MCMLXXXVII")
        )
        self.assertEqual(
            "Dans le 15e arrondissement",
            tester.convert_to_arabic_in_text("Dans le XVe arrondissement")
        )
        self.assertEqual(
            "20eme arr.",
            tester.convert_to_arabic_in_text("XXeme arr.")
        )
        self.assertEqual(
            "A AIX EN PROVENCE",
            tester.convert_to_arabic_in_text("A AIX EN PROVENCE")
        )
    def test_roman_numbers_in_text(self):
        """
        Checks roman numbers found in a text are replaced on normalization.
        """
        normalized = tools.normalize_string("Dans le XVe arrondissement")
        self.assertEqual("dans le 15e arrondissement", normalized)
    def test_multiple_whitespaces(self):
@ -32,8 +62,8 @@ class TestTexts(unittest.TestCase):
        Checks whitespaces are collapsed.
        """
        self.assertEqual(
-            "avec   ascenseur",
+            "avec ascenseur",
-            tools.normalize_string("avec ascenseur")
+            tools.normalize_string("avec   ascenseur")
        )
    def test_accents(self):
@ -41,8 +71,8 @@ class TestTexts(unittest.TestCase):
        Checks accents are replaced.
        """
        self.assertEqual(
-            "éèêàüï",
+            "eeeaui",
-            tools.normalize_string("eeeaui")
+            tools.normalize_string(u"éèêàüï")
        )
 class TestPhoneNumbers(unittest.TestCase):
--- a/flatisfy/tools.py
+++ b/flatisfy/tools.py
@ -24,6 +24,42 @@ LOGGER = logging.getLogger(__name__)
 # Constants
 NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys"
 class RomanNumbers():
    """
    Utilities to check and convert roman numbers.

    Part of the conversions are based on
    https://gist.github.com/riverrun/ac91218bb1678b857c12
    """
    # Repetitions of a same symbol which are never allowed in a roman number.
    _INVALID_SEQUENCES = ('IIII', 'VV', 'XXXX', 'LL', 'CCCC', 'DD', 'MMMM')

    # Arabic value of every token a roman number is made of. Subtractive
    # pairs (such as "IV") are tokens on their own.
    _TOKEN_VALUES = {
        'IV': 4, 'IX': 9, 'XL': 40, 'XC': 90, 'CD': 400, 'CM': 900,
        'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000,
    }

    # Subtractive pairs are listed first so that they take precedence over
    # the single symbols they are made of.
    _TOKEN_RE = re.compile(r'IV|IX|XL|XC|CD|CM|[IVXLCDM]')

    # A roman number inside a text: not preceded by a non-whitespace
    # character, and followed by "e", "è", a whitespace or the end of the
    # text. The end of the text has to be an alternative outside of the
    # character class, as "$" is a literal dollar sign inside "[...]".
    _IN_TEXT_RE = re.compile(r'(?<!\S)[MDCLXVI]+(?=[eè\s]|$)')

    def check_valid(self, roman):
        """
        Check whether a string looks like a roman number.

        :param roman: The string to check.
        :return: ``True`` if the string is only made of roman symbols and
            does not contain any forbidden repetition, ``False`` otherwise.
        """
        if not re.match(r'^[MDCLXVI]+$', roman):
            return False

        if any(sub in roman for sub in self._INVALID_SEQUENCES):
            return False

        # TODO: check M does not appear after any other, etc.
        return True

    def convert_to_arabic(self, roman):
        """
        Convert a roman number to its arabic representation.

        :param roman: The roman number, as a string.
        :return: The arabic number, as a string. The input is returned
            unchanged if it is not accepted by ``check_valid``.
        """
        if not self.check_valid(roman):
            return roman

        # Split the number into tokens in a single left to right pass and
        # add their values up.
        return str(sum(
            self._TOKEN_VALUES[token]
            for token in self._TOKEN_RE.findall(roman)
        ))

    def convert_to_arabic_in_text(self, text):
        """
        Replace the roman numbers found in a text by arabic numbers.

        :param text: The text to process.
        :return: The text, with every standalone roman number (optionally
            followed by an "e" or "è" suffix) converted to arabic.
        """
        return self._IN_TEXT_RE.sub(
            lambda matchobj: self.convert_to_arabic(matchobj.group(0)),
            text
        )
 def hash_dict(func):
    """
@ -162,6 +198,10 @@ def normalize_string(string):
    # Keep some basic punctuation to keep syntactic units
    string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string)
    # Convert roman numbers to arabic numbers
    converter = RomanNumbers()
    string = converter.convert_to_arabic_in_text(string)
    # Convert to lowercase
    string = string.lower()