Improve cities fuzzy matching
* Convert arabic numerals to roman ones in fuzzy comparison, to ensure there are no more discrepancies between "Paris 20" and "Paris XX", for instance. Fix #112 and improve on top of #110.
* Improve handling of opendata postal codes (no more duplicates, better capitalization).
Note: You should run `pip install -r requirements.txt` and rebuild the database (`python -m flatisfy build-data --config config.json`) after this commit.
Thanks @nicofrand for building the basic blocks for this!
This commit is contained in:
parent
7bf08adbce
commit
a45eba65c7
@ -3,14 +3,18 @@
|
|||||||
Preprocessing functions to convert input opendata files into SQLAlchemy objects
|
Preprocessing functions to convert input opendata files into SQLAlchemy objects
|
||||||
ready to be stored in the database.
|
ready to be stored in the database.
|
||||||
"""
|
"""
|
||||||
|
from __future__ import absolute_import, print_function, unicode_literals
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
import titlecase
|
||||||
|
|
||||||
from flatisfy.models.postal_code import PostalCode
|
from flatisfy.models.postal_code import PostalCode
|
||||||
from flatisfy.models.public_transport import PublicTransport
|
from flatisfy.models.public_transport import PublicTransport
|
||||||
|
from flatisfy.tools import normalize_string
|
||||||
|
|
||||||
if sys.version_info >= (3, 0):
|
if sys.version_info >= (3, 0):
|
||||||
import csv
|
import csv
|
||||||
@ -21,6 +25,12 @@ else:
|
|||||||
LOGGER = logging.getLogger(__name__)
|
LOGGER = logging.getLogger(__name__)
|
||||||
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
|
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
|
||||||
|
titlecase.set_small_word_list(
|
||||||
|
# Add French small words
|
||||||
|
r"l|d|un|une|et|à|a|sur|ou|le|la|de|lès|les|" +
|
||||||
|
titlecase.SMALL
|
||||||
|
)
|
||||||
|
|
||||||
TRANSPORT_DATA_FILES = {
|
TRANSPORT_DATA_FILES = {
|
||||||
"FR-IDF": "stops_fr-idf.txt",
|
"FR-IDF": "stops_fr-idf.txt",
|
||||||
"FR-NW": "stops_fr-nw.txt",
|
"FR-NW": "stops_fr-nw.txt",
|
||||||
@ -109,6 +119,9 @@ def _preprocess_laposte():
|
|||||||
|
|
||||||
# Build postal codes to other infos file
|
# Build postal codes to other infos file
|
||||||
postal_codes_data = []
|
postal_codes_data = []
|
||||||
|
# Keep track of seen (postal_codes, names) to avoid inserting useless
|
||||||
|
# duplicates (already in the OpenData file)
|
||||||
|
seen_postal_codes = []
|
||||||
for item in raw_laposte_data:
|
for item in raw_laposte_data:
|
||||||
fields = item["fields"]
|
fields = item["fields"]
|
||||||
try:
|
try:
|
||||||
@ -120,10 +133,19 @@ def _preprocess_laposte():
|
|||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
name = normalize_string(
|
||||||
|
titlecase.titlecase(fields["nom_de_la_commune"]),
|
||||||
|
lowercase=False
|
||||||
|
)
|
||||||
|
|
||||||
|
if (fields["code_postal"], name) in seen_postal_codes:
|
||||||
|
continue
|
||||||
|
|
||||||
|
seen_postal_codes.append((fields["code_postal"], name))
|
||||||
postal_codes_data.append(PostalCode(
|
postal_codes_data.append(PostalCode(
|
||||||
area=area,
|
area=area,
|
||||||
postal_code=fields["code_postal"],
|
postal_code=fields["code_postal"],
|
||||||
name=fields["nom_de_la_commune"].title(),
|
name=name,
|
||||||
lat=fields["coordonnees_gps"][0],
|
lat=fields["coordonnees_gps"][0],
|
||||||
lng=fields["coordonnees_gps"][1]
|
lng=fields["coordonnees_gps"][1]
|
||||||
))
|
))
|
||||||
|
@ -27,38 +27,34 @@ class TestTexts(unittest.TestCase):
|
|||||||
"""
|
"""
|
||||||
Checks roman numbers replacement.
|
Checks roman numbers replacement.
|
||||||
"""
|
"""
|
||||||
tester = tools.RomanNumbers()
|
|
||||||
self.assertTrue(tester.check_valid("XIV"))
|
|
||||||
self.assertTrue(not tester.check_valid("ABC"))
|
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
"14",
|
"XIV",
|
||||||
tester.convert_to_arabic("XIV")
|
tools.convert_arabic_to_roman("14")
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
"1987",
|
"MCMLXXXVII",
|
||||||
tester.convert_to_arabic("MCMLXXXVII")
|
tools.convert_arabic_to_roman("1987")
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
"Dans le 15e arrondissement",
|
"Dans le XVe arrondissement",
|
||||||
tester.convert_to_arabic_in_text("Dans le XVe arrondissement")
|
tools.convert_arabic_to_roman_in_text("Dans le 15e arrondissement")
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
"20eme arr.",
|
"XXeme arr.",
|
||||||
tester.convert_to_arabic_in_text("XXeme arr.")
|
tools.convert_arabic_to_roman_in_text("20eme arr.")
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
"A AIX EN PROVENCE",
|
"A AIX EN PROVENCE",
|
||||||
tester.convert_to_arabic_in_text("A AIX EN PROVENCE")
|
tools.convert_arabic_to_roman_in_text("A AIX EN PROVENCE")
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
"Montigny Le Bretonneux",
|
"Montigny Le Bretonneux",
|
||||||
tester.convert_to_arabic_in_text("Montigny Le Bretonneux")
|
tools.convert_arabic_to_roman_in_text("Montigny Le Bretonneux")
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_roman_numbers_in_text(self):
|
def test_roman_numbers_in_text(self):
|
||||||
@ -67,8 +63,8 @@ class TestTexts(unittest.TestCase):
|
|||||||
normalization.
|
normalization.
|
||||||
"""
|
"""
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
"dans le 15e arrondissement",
|
"dans le XVe arrondissement",
|
||||||
tools.normalize_string("Dans le XVe arrondissement")
|
tools.normalize_string("Dans le 15e arrondissement")
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_multiple_whitespaces(self):
|
def test_multiple_whitespaces(self):
|
||||||
|
@ -25,68 +25,36 @@ LOGGER = logging.getLogger(__name__)
|
|||||||
NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys"
|
NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys"
|
||||||
|
|
||||||
|
|
||||||
class RomanNumbers(object):
|
def convert_arabic_to_roman(arabic):
|
||||||
"""
|
"""
|
||||||
Utilities to check and convert roman numbers.
|
Convert an arabic literal to a roman one.
|
||||||
|
|
||||||
Part of the conversions is based on
|
.. note::
|
||||||
https://gist.github.com/riverrun/ac91218bb1678b857c12
|
Based on https://gist.github.com/riverrun/ac91218bb1678b857c12.
|
||||||
|
|
||||||
|
:param arabic: An arabic number, as string.
|
||||||
|
:returns: The corresponding roman one, as string.
|
||||||
"""
|
"""
|
||||||
@staticmethod
|
to_roman = {
|
||||||
def check_valid(roman):
|
1: 'I', 2: 'II', 3: 'III', 4: 'IV', 5: 'V', 6: 'VI', 7: 'VII',
|
||||||
"""
|
8: 'VIII', 9: 'IX', 10: 'X',
|
||||||
Check whether a roman literal is a valid roman literal.
|
20: 'XX', 30: 'XXX', 40: 'XL', 50: 'L', 60: 'LX', 70: 'LXX',
|
||||||
|
80: 'LXXX', 90: 'XC',
|
||||||
:param roman: A roman literal, as string.
|
100: 'C', 200: 'CC', 300: 'CCC', 400: 'CD', 500: 'D', 600: 'DC',
|
||||||
:returns: ``True`` if it is a valid roman literal, ``False`` otherwise.
|
700: 'DCC', 800: 'DCCC', 900: 'CM',
|
||||||
"""
|
1000: 'M', 2000: 'MM', 3000: 'MMM'
|
||||||
if not re.match('^[MDCLXVI]+$', roman):
|
|
||||||
return False
|
|
||||||
|
|
||||||
invalid = ['IIII', 'VV', 'XXXX', 'LL', 'CCCC', 'DD', 'MMMM']
|
|
||||||
if any(sub in roman for sub in invalid):
|
|
||||||
return False
|
|
||||||
|
|
||||||
# TODO: check M does not appear after any other, etc.
|
|
||||||
return True
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def convert_to_arabic(roman):
|
|
||||||
"""
|
|
||||||
Convert a roman literal to arabic one.
|
|
||||||
|
|
||||||
:param roman: A roman number, as string.
|
|
||||||
:returns: The corresponding arabic one, as string.
|
|
||||||
"""
|
|
||||||
if not RomanNumbers.check_valid(roman):
|
|
||||||
return roman
|
|
||||||
|
|
||||||
keys = [
|
|
||||||
'IV', 'IX', 'XL', 'XC', 'CD', 'CM', 'I', 'V',
|
|
||||||
'X', 'L', 'C', 'D', 'M'
|
|
||||||
]
|
|
||||||
to_arabic = {
|
|
||||||
'IV': '4',
|
|
||||||
'IX': '9',
|
|
||||||
'XL': '40',
|
|
||||||
'XC': '90',
|
|
||||||
'CD': '400',
|
|
||||||
'CM': '900',
|
|
||||||
'I': '1',
|
|
||||||
'V': '5',
|
|
||||||
'X': '10',
|
|
||||||
'L': '50',
|
|
||||||
'C': '100',
|
|
||||||
'D': '500',
|
|
||||||
'M': '1000'
|
|
||||||
}
|
}
|
||||||
for key in keys:
|
roman_chars_list = []
|
||||||
if key in roman:
|
count = 1
|
||||||
roman = roman.replace(key, ' {}'.format(to_arabic.get(key)))
|
for digit in arabic[::-1]:
|
||||||
return str(sum(int(num) for num in roman.split()))
|
digit = int(digit)
|
||||||
|
if digit != 0:
|
||||||
|
roman_chars_list.append(to_roman[digit * count])
|
||||||
|
count *= 10
|
||||||
|
return ''.join(roman_chars_list[::-1])
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def convert_to_arabic_in_text(text):
|
def convert_arabic_to_roman_in_text(text):
|
||||||
"""
|
"""
|
||||||
Convert roman literals to arabic one in a text.
|
Convert arabic literals to roman ones in a text.
|
||||||
|
|
||||||
@ -95,8 +63,8 @@ class RomanNumbers(object):
|
|||||||
arabic.
|
arabic.
|
||||||
"""
|
"""
|
||||||
return re.sub(
|
return re.sub(
|
||||||
r'(?<![\S])+([MDCLXVI]+)(?=[eè\s$])',
|
r'(\d+)',
|
||||||
lambda matchobj: RomanNumbers.convert_to_arabic(matchobj.group(0)),
|
lambda matchobj: convert_arabic_to_roman(matchobj.group(0)),
|
||||||
text
|
text
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -221,14 +189,24 @@ def is_within_interval(value, min_value=None, max_value=None):
|
|||||||
return all(checks)
|
return all(checks)
|
||||||
|
|
||||||
|
|
||||||
def normalize_string(string):
|
def normalize_string(string, lowercase=True, convert_arabic_numerals=True):
|
||||||
"""
|
"""
|
||||||
Normalize the given string for matching.
|
Normalize the given string for matching.
|
||||||
|
|
||||||
:Example:
|
Example::
|
||||||
|
|
||||||
>>> normalize_string("tétéà 14ème-XIV, foobar")
|
>>> normalize_string("tétéà 14ème-XIV, foobar")
|
||||||
|
'tetea XIVeme xiv, foobar'
|
||||||
|
|
||||||
|
>>> normalize_string("tétéà 14ème-XIV, foobar", convert_arabic_numerals=False)
|
||||||
'tetea 14eme xiv, foobar'
|
'tetea 14eme xiv, foobar'
|
||||||
|
|
||||||
|
:param string: The string to normalize.
|
||||||
|
:param lowercase: Whether to convert string to lowercase or not. Defaults
|
||||||
|
to ``True``.
|
||||||
|
:param convert_arabic_numerals: Whether to convert arabic numerals to roman
|
||||||
|
ones. Defaults to ``True``.
|
||||||
|
:return: The normalized string.
|
||||||
"""
|
"""
|
||||||
# ASCIIfy the string
|
# ASCIIfy the string
|
||||||
string = unidecode.unidecode(string)
|
string = unidecode.unidecode(string)
|
||||||
@ -237,13 +215,14 @@ def normalize_string(string):
|
|||||||
# Keep some basic punctuation to keep syntactic units
|
# Keep some basic punctuation to keep syntactic units
|
||||||
string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string)
|
string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string)
|
||||||
|
|
||||||
# Convert roman numbers to arabic numbers
|
|
||||||
# TODO: Fix this :)
|
|
||||||
# string = RomanNumbers.convert_to_arabic_in_text(string)
|
|
||||||
|
|
||||||
# Convert to lowercase
|
# Convert to lowercase
|
||||||
|
if lowercase:
|
||||||
string = string.lower()
|
string = string.lower()
|
||||||
|
|
||||||
|
# Convert arabic numbers to roman numbers
|
||||||
|
if convert_arabic_numerals:
|
||||||
|
string = convert_arabic_to_roman_in_text(string)
|
||||||
|
|
||||||
# Collapse multiple spaces, replace tabulations and newlines by space
|
# Collapse multiple spaces, replace tabulations and newlines by space
|
||||||
string = re.sub(r"\s+", " ", string)
|
string = re.sub(r"\s+", " ", string)
|
||||||
|
|
||||||
|
@ -11,6 +11,7 @@ imagehash
|
|||||||
pillow
|
pillow
|
||||||
requests
|
requests
|
||||||
sqlalchemy
|
sqlalchemy
|
||||||
|
titlecase
|
||||||
unidecode
|
unidecode
|
||||||
vobject
|
vobject
|
||||||
whoosh
|
whoosh
|
||||||
|
Loading…
Reference in New Issue
Block a user