Fix roman numbers conversion

This commit is contained in:
nicofrand 2018-01-18 13:15:09 +01:00
parent 02420d7a1b
commit 07955af574
2 changed files with 65 additions and 1 deletions

View File

@ -22,9 +22,39 @@ class TestTexts(unittest.TestCase):
"""
Checks roman numbers replacement.
"""
tester = tools.RomanNumbers()
self.assertTrue(tester.check_valid("XIV"))
self.assertTrue(not tester.check_valid("ABC"))
self.assertEqual(
"14",
tools.normalize_string("XIV")
tester.convert_to_arabic("XIV")
)
self.assertEqual(
"1987",
tester.convert_to_arabic("MCMLXXXVII")
)
self.assertEqual(
"Dans le 15e arrondissement",
tester.convert_to_arabic_in_text("Dans le XVe arrondissement")
)
self.assertEqual(
"20eme arr.",
tester.convert_to_arabic_in_text("XXeme arr.")
)
self.assertEqual(
"A AIX EN PROVENCE",
tester.convert_to_arabic_in_text("A AIX EN PROVENCE")
)
def test_roman_numbers_in_text(self):
    """
    Checks that string normalization converts a roman number found in a
    sentence to its arabic form (the result being lowercased as well).
    """
    normalized = tools.normalize_string("Dans le XVe arrondissement")
    self.assertEqual("dans le 15e arrondissement", normalized)
def test_multiple_whitespaces(self):

View File

@ -24,6 +24,36 @@ LOGGER = logging.getLogger(__name__)
# Constants
NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys"
class RomanNumbers:
    """
    Detect roman numerals and convert them to arabic numbers.

    Only canonically written numerals, from ``I`` (1) to ``MMMCMXCIX``
    (3999), are recognised. Any other string is considered invalid and is
    left untouched by the conversion methods.
    """

    # Canonical layout: thousands, then hundreds, tens and units, each group
    # being written either subtractively (CM, CD, ...) or additively.
    # Every group is optional, so the empty string has to be rejected apart.
    _VALID_RE = re.compile(
        r"M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})\Z"
    )

    _VALUES = {
        "I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000,
    }

    # A numeral inside a text must start a word (start of text or preceded
    # by whitespace) and must be followed by one of:
    #   * an ordinal suffix (e, è, er, ere, ère, eme, ème) which itself ends
    #     the word, so that "XVe" matches but "Vente" does not;
    #   * a whitespace;
    #   * the end of the text.
    # NOTE: a capitalised word made of a roman letter and such a suffix
    # ("Le", "De", "Mer", ...) cannot be told apart from an ordinal and is
    # therefore still converted.
    _IN_TEXT_RE = re.compile(
        r"(?<!\S)([MDCLXVI]+)(?=[eè](?:r|re|me)?(?!\w)|\s|$)",
        re.UNICODE
    )

    def check_valid(self, roman):
        """
        Check whether a string is a canonically written roman numeral.

        :param roman: The string to check.
        :return: ``True`` if ``roman`` is a valid roman numeral, ``False``
            otherwise (including for an empty string).
        """
        return bool(roman) and self._VALID_RE.match(roman) is not None

    def convert_to_arabic(self, roman):
        """
        Convert a roman numeral to an arabic number.

        :param roman: The roman numeral to convert.
        :return: The arabic number, as a string. If ``roman`` is not a valid
            roman numeral, it is returned unchanged.
        """
        if not self.check_valid(roman):
            return roman

        total = 0
        previous = 0
        # Read from right to left: a symbol smaller than the one on its
        # right is subtractive (the I of IV), any other one is additive.
        for letter in reversed(roman):
            value = self._VALUES[letter]
            if value < previous:
                total -= value
            else:
                total += value
            previous = value
        return str(total)

    def convert_to_arabic_in_text(self, text):
        """
        Convert the roman numerals found in a text to arabic numbers.

        :param text: The text to process.
        :return: The text, with every standalone valid roman numeral
            (optionally followed by an ordinal suffix, which is kept)
            replaced by the matching arabic number.
        """
        return self._IN_TEXT_RE.sub(
            lambda matchobj: self.convert_to_arabic(matchobj.group(1)),
            text
        )
def hash_dict(func):
"""
@ -162,6 +192,10 @@ def normalize_string(string):
# Keep some basic punctuation to keep syntactic units
string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string)
# Convert roman numbers to arabic numbers
converter = RomanNumbers()
string = converter.convert_to_arabic_in_text(string)
# Convert to lowercase
string = string.lower()