Merge branch 'fix-roman-numbers' into 'master'
Fix roman numbers See merge request phyks/Flatisfy!18
This commit is contained in:
commit
6bcfb62e8d
@ -22,9 +22,39 @@ class TestTexts(unittest.TestCase):
|
||||
"""
|
||||
Checks roman numbers replacement.
|
||||
"""
|
||||
tester = tools.RomanNumbers()
|
||||
self.assertTrue(tester.check_valid("XIV"))
|
||||
self.assertTrue(not tester.check_valid("ABC"))
|
||||
|
||||
self.assertEqual(
|
||||
"14",
|
||||
tools.normalize_string("XIV")
|
||||
tester.convert_to_arabic("XIV")
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
"1987",
|
||||
tester.convert_to_arabic("MCMLXXXVII")
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
"Dans le 15e arrondissement",
|
||||
tester.convert_to_arabic_in_text("Dans le XVe arrondissement")
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
"20eme arr.",
|
||||
tester.convert_to_arabic_in_text("XXeme arr.")
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
"A AIX EN PROVENCE",
|
||||
tester.convert_to_arabic_in_text("A AIX EN PROVENCE")
|
||||
)
|
||||
|
||||
def test_roman_numbers_in_text(self):
    """
    Checks that normalizing a sentence replaces the roman number it
    contains by its arabic form (and lowercases the result).
    """
    normalized = tools.normalize_string("Dans le XVe arrondissement")
    self.assertEqual("dans le 15e arrondissement", normalized)
|
||||
|
||||
def test_multiple_whitespaces(self):
|
||||
@ -41,8 +71,8 @@ class TestTexts(unittest.TestCase):
|
||||
Checks accents are replaced.
|
||||
"""
|
||||
self.assertEqual(
|
||||
"éèêàüï",
|
||||
tools.normalize_string("eeeaui")
|
||||
"eeeaui",
|
||||
tools.normalize_string(u"éèêàüï")
|
||||
)
|
||||
|
||||
class TestPhoneNumbers(unittest.TestCase):
|
||||
|
@ -24,6 +24,42 @@ LOGGER = logging.getLogger(__name__)
|
||||
# Constants
|
||||
NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys"
|
||||
|
||||
class RomanNumbers(object):
    """
    Utilities to check and convert roman numbers.

    Only canonical (subtractive notation) roman numbers between 1 and 3999
    are considered valid, e.g. ``XIV`` or ``MCMLXXXVII``. Anything else is
    left untouched by the conversion helpers.

    Part of the conversions are based on
    https://gist.github.com/riverrun/ac91218bb1678b857c12
    """

    # Canonical roman number: thousands, hundreds, tens, then units. This
    # rejects repeated or misplaced symbols ("IIII", "VX", "IM", ...).
    # ``\Z`` is used instead of ``$`` so that a trailing newline is rejected.
    _VALID_ROMAN = re.compile(
        r'^M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})\Z'
    )

    # Candidate roman number inside a text: it must start a word and be
    # followed by an ordinal suffix ("e", "è"), a whitespace or the end of
    # the text. The end-of-text alternative has to live outside of the
    # character class, where "$" would only match a literal dollar sign.
    _ROMAN_IN_TEXT = re.compile(r'(?<!\S)([MDCLXVI]+)(?=[eè\s]|$)')

    # Value of each individual roman symbol.
    _SYMBOL_VALUES = {
        'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000,
    }

    def check_valid(self, roman):
        """
        Check whether a string is a well-formed roman number.

        :param roman: The candidate string (uppercase roman symbols).
        :return: ``True`` if ``roman`` is a canonical roman number,
            ``False`` otherwise (including for an empty string).
        """
        # The pattern alone would accept the empty string, as every part of
        # it is optional.
        if not roman:
            return False
        return self._VALID_ROMAN.match(roman) is not None

    def convert_to_arabic(self, roman):
        """
        Convert a roman number to its arabic representation.

        :param roman: The roman number to convert, as a string.
        :return: The arabic number, as a string. If ``roman`` is not a
            valid roman number, it is returned unchanged.
        """
        if not self.check_valid(roman):
            return roman

        total = 0
        previous = 0
        # Read from right to left: a symbol smaller than the one on its
        # right is subtracted (the "I" of "IV"), any other one is added.
        for symbol in reversed(roman):
            value = self._SYMBOL_VALUES[symbol]
            if value < previous:
                total -= value
            else:
                total += value
            previous = value
        return str(total)

    def convert_to_arabic_in_text(self, text):
        """
        Replace the roman numbers found in a text by arabic numbers.

        :param text: The text to process.
        :return: The text, with every standalone valid roman number
            (optionally followed by an ordinal suffix such as ``e``)
            replaced by its arabic form. Other words are left unchanged.
        """
        return self._ROMAN_IN_TEXT.sub(
            lambda matchobj: self.convert_to_arabic(matchobj.group(1)),
            text
        )
|
||||
|
||||
def hash_dict(func):
|
||||
"""
|
||||
@ -162,6 +198,10 @@ def normalize_string(string):
|
||||
# Keep some basic punctuation to keep syntactic units
|
||||
string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string)
|
||||
|
||||
# Convert roman numbers to arabic numbers
|
||||
converter = RomanNumbers()
|
||||
string = converter.convert_to_arabic_in_text(string)
|
||||
|
||||
# Convert to lowercase
|
||||
string = string.lower()
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user