Improve cities fuzzy matching
* Convert arabic numerals to roman ones in fuzzy comparison, to ensure there are no more discrepancies between "Paris 20" and "Paris XX" for instance. Fix #112 and improve on top of #110. * Improve handling of opendata postal codes (no more duplicates, better capitalization). Note: You should `pip install -r requirements.txt` and rebuild the database (`python -m flatisfy build-data --config config.json`) after this commit. Thanks @nicofrand for building the basic blocks for this!
This commit is contained in:
parent
7bf08adbce
commit
a45eba65c7
@ -3,14 +3,18 @@
|
||||
Preprocessing functions to convert input opendata files into SQLAlchemy objects
|
||||
ready to be stored in the database.
|
||||
"""
|
||||
from __future__ import absolute_import, print_function, unicode_literals
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
import titlecase
|
||||
|
||||
from flatisfy.models.postal_code import PostalCode
|
||||
from flatisfy.models.public_transport import PublicTransport
|
||||
from flatisfy.tools import normalize_string
|
||||
|
||||
if sys.version_info >= (3, 0):
|
||||
import csv
|
||||
@ -21,6 +25,12 @@ else:
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
titlecase.set_small_word_list(
|
||||
# Add French small words
|
||||
r"l|d|un|une|et|à|a|sur|ou|le|la|de|lès|les|" +
|
||||
titlecase.SMALL
|
||||
)
|
||||
|
||||
TRANSPORT_DATA_FILES = {
|
||||
"FR-IDF": "stops_fr-idf.txt",
|
||||
"FR-NW": "stops_fr-nw.txt",
|
||||
@ -109,6 +119,9 @@ def _preprocess_laposte():
|
||||
|
||||
# Build postal codes to other infos file
|
||||
postal_codes_data = []
|
||||
# Keep track of seen (postal_codes, names) to avoid inserting useless
|
||||
# duplicates (already in the OpenData file)
|
||||
seen_postal_codes = []
|
||||
for item in raw_laposte_data:
|
||||
fields = item["fields"]
|
||||
try:
|
||||
@ -120,10 +133,19 @@ def _preprocess_laposte():
|
||||
)
|
||||
continue
|
||||
|
||||
name = normalize_string(
|
||||
titlecase.titlecase(fields["nom_de_la_commune"]),
|
||||
lowercase=False
|
||||
)
|
||||
|
||||
if (fields["code_postal"], name) in seen_postal_codes:
|
||||
continue
|
||||
|
||||
seen_postal_codes.append((fields["code_postal"], name))
|
||||
postal_codes_data.append(PostalCode(
|
||||
area=area,
|
||||
postal_code=fields["code_postal"],
|
||||
name=fields["nom_de_la_commune"].title(),
|
||||
name=name,
|
||||
lat=fields["coordonnees_gps"][0],
|
||||
lng=fields["coordonnees_gps"][1]
|
||||
))
|
||||
|
@ -27,38 +27,34 @@ class TestTexts(unittest.TestCase):
|
||||
"""
|
||||
Checks roman numbers replacement.
|
||||
"""
|
||||
tester = tools.RomanNumbers()
|
||||
self.assertTrue(tester.check_valid("XIV"))
|
||||
self.assertTrue(not tester.check_valid("ABC"))
|
||||
|
||||
self.assertEqual(
|
||||
"14",
|
||||
tester.convert_to_arabic("XIV")
|
||||
"XIV",
|
||||
tools.convert_arabic_to_roman("14")
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
"1987",
|
||||
tester.convert_to_arabic("MCMLXXXVII")
|
||||
"MCMLXXXVII",
|
||||
tools.convert_arabic_to_roman("1987")
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
"Dans le 15e arrondissement",
|
||||
tester.convert_to_arabic_in_text("Dans le XVe arrondissement")
|
||||
"Dans le XVe arrondissement",
|
||||
tools.convert_arabic_to_roman_in_text("Dans le 15e arrondissement")
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
"20eme arr.",
|
||||
tester.convert_to_arabic_in_text("XXeme arr.")
|
||||
"XXeme arr.",
|
||||
tools.convert_arabic_to_roman_in_text("20eme arr.")
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
"A AIX EN PROVENCE",
|
||||
tester.convert_to_arabic_in_text("A AIX EN PROVENCE")
|
||||
tools.convert_arabic_to_roman_in_text("A AIX EN PROVENCE")
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
"Montigny Le Bretonneux",
|
||||
tester.convert_to_arabic_in_text("Montigny Le Bretonneux")
|
||||
tools.convert_arabic_to_roman_in_text("Montigny Le Bretonneux")
|
||||
)
|
||||
|
||||
def test_roman_numbers_in_text(self):
|
||||
@ -67,8 +63,8 @@ class TestTexts(unittest.TestCase):
|
||||
normalization.
|
||||
"""
|
||||
self.assertEqual(
|
||||
"dans le 15e arrondissement",
|
||||
tools.normalize_string("Dans le XVe arrondissement")
|
||||
"dans le XVe arrondissement",
|
||||
tools.normalize_string("Dans le 15e arrondissement")
|
||||
)
|
||||
|
||||
def test_multiple_whitespaces(self):
|
||||
|
@ -25,68 +25,36 @@ LOGGER = logging.getLogger(__name__)
|
||||
NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys"
|
||||
|
||||
|
||||
class RomanNumbers(object):
|
||||
def convert_arabic_to_roman(arabic):
|
||||
"""
|
||||
Utilities to check and convert roman numbers.
|
||||
Convert an arabic literal to a roman one.
|
||||
|
||||
Part of the conversions is based on
|
||||
https://gist.github.com/riverrun/ac91218bb1678b857c12
|
||||
.. note::
|
||||
Based on https://gist.github.com/riverrun/ac91218bb1678b857c12.
|
||||
|
||||
:param arabic: An arabic number, as string.
|
||||
:returns: The corresponding roman one, as string.
|
||||
"""
|
||||
@staticmethod
|
||||
def check_valid(roman):
|
||||
"""
|
||||
Check whether a roman literal is a valid roman literal.
|
||||
|
||||
:param roman: A roman literal, as string.
|
||||
:returns: ``True`` if it is a valid roman literal, ``False`` otherwise.
|
||||
"""
|
||||
if not re.match('^[MDCLXVI]+$', roman):
|
||||
return False
|
||||
|
||||
invalid = ['IIII', 'VV', 'XXXX', 'LL', 'CCCC', 'DD', 'MMMM']
|
||||
if any(sub in roman for sub in invalid):
|
||||
return False
|
||||
|
||||
# TODO: check M does not appear after any other, etc.
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def convert_to_arabic(roman):
|
||||
"""
|
||||
Convert a roman literal to arabic one.
|
||||
|
||||
:param roman: A roman number, as string.
|
||||
:returns: The corresponding arabic one, as string.
|
||||
"""
|
||||
if not RomanNumbers.check_valid(roman):
|
||||
return roman
|
||||
|
||||
keys = [
|
||||
'IV', 'IX', 'XL', 'XC', 'CD', 'CM', 'I', 'V',
|
||||
'X', 'L', 'C', 'D', 'M'
|
||||
]
|
||||
to_arabic = {
|
||||
'IV': '4',
|
||||
'IX': '9',
|
||||
'XL': '40',
|
||||
'XC': '90',
|
||||
'CD': '400',
|
||||
'CM': '900',
|
||||
'I': '1',
|
||||
'V': '5',
|
||||
'X': '10',
|
||||
'L': '50',
|
||||
'C': '100',
|
||||
'D': '500',
|
||||
'M': '1000'
|
||||
to_roman = {
|
||||
1: 'I', 2: 'II', 3: 'III', 4: 'IV', 5: 'V', 6: 'VI', 7: 'VII',
|
||||
8: 'VIII', 9: 'IX', 10: 'X',
|
||||
20: 'XX', 30: 'XXX', 40: 'XL', 50: 'L', 60: 'LX', 70: 'LXX',
|
||||
80: 'LXXX', 90: 'XC',
|
||||
100: 'C', 200: 'CC', 300: 'CCC', 400: 'CD', 500: 'D', 600: 'DC',
|
||||
700: 'DCC', 800: 'DCCC', 900: 'CM',
|
||||
1000: 'M', 2000: 'MM', 3000: 'MMM'
|
||||
}
|
||||
for key in keys:
|
||||
if key in roman:
|
||||
roman = roman.replace(key, ' {}'.format(to_arabic.get(key)))
|
||||
return str(sum(int(num) for num in roman.split()))
|
||||
roman_chars_list = []
|
||||
count = 1
|
||||
for digit in arabic[::-1]:
|
||||
digit = int(digit)
|
||||
if digit != 0:
|
||||
roman_chars_list.append(to_roman[digit * count])
|
||||
count *= 10
|
||||
return ''.join(roman_chars_list[::-1])
|
||||
|
||||
@staticmethod
|
||||
def convert_to_arabic_in_text(text):
|
||||
|
||||
def convert_arabic_to_roman_in_text(text):
|
||||
"""
|
||||
Convert arabic literals to roman ones in a text.
|
||||
|
||||
@ -95,8 +63,8 @@ class RomanNumbers(object):
|
||||
arabic.
|
||||
"""
|
||||
return re.sub(
|
||||
r'(?<![\S])+([MDCLXVI]+)(?=[eè\s$])',
|
||||
lambda matchobj: RomanNumbers.convert_to_arabic(matchobj.group(0)),
|
||||
r'(\d+)',
|
||||
lambda matchobj: convert_arabic_to_roman(matchobj.group(0)),
|
||||
text
|
||||
)
|
||||
|
||||
@ -221,14 +189,24 @@ def is_within_interval(value, min_value=None, max_value=None):
|
||||
return all(checks)
|
||||
|
||||
|
||||
def normalize_string(string):
|
||||
def normalize_string(string, lowercase=True, convert_arabic_numerals=True):
|
||||
"""
|
||||
Normalize the given string for matching.
|
||||
|
||||
:Example:
|
||||
Example::
|
||||
|
||||
>>> normalize_string("tétéà 14ème-XIV, foobar")
|
||||
'tetea XIVeme xiv, foobar'
|
||||
|
||||
>>> normalize_string("tétéà 14ème-XIV, foobar", convert_arabic_numerals=False)
|
||||
'tetea 14eme xiv, foobar'
|
||||
|
||||
:param string: The string to normalize.
|
||||
:param lowercase: Whether to convert string to lowercase or not. Defaults
|
||||
to ``True``.
|
||||
:param convert_arabic_numerals: Whether to convert arabic numerals to roman
|
||||
ones. Defaults to ``True``.
|
||||
:return: The normalized string.
|
||||
"""
|
||||
# ASCIIfy the string
|
||||
string = unidecode.unidecode(string)
|
||||
@ -237,13 +215,14 @@ def normalize_string(string):
|
||||
# Keep some basic punctuation to preserve syntactic units
|
||||
string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string)
|
||||
|
||||
# Convert roman numbers to arabic numbers
|
||||
# TODO: Fix this :)
|
||||
# string = RomanNumbers.convert_to_arabic_in_text(string)
|
||||
|
||||
# Convert to lowercase
|
||||
if lowercase:
|
||||
string = string.lower()
|
||||
|
||||
# Convert arabic numbers to roman numbers
|
||||
if convert_arabic_numerals:
|
||||
string = convert_arabic_to_roman_in_text(string)
|
||||
|
||||
# Collapse multiple spaces, replace tabulations and newlines by space
|
||||
string = re.sub(r"\s+", " ", string)
|
||||
|
||||
|
@ -11,6 +11,7 @@ imagehash
|
||||
pillow
|
||||
requests
|
||||
sqlalchemy
|
||||
titlecase
|
||||
unidecode
|
||||
vobject
|
||||
whoosh
|
||||
|
Loading…
Reference in New Issue
Block a user