Improve cities fuzzy matching

* Convert Arabic numerals to Roman ones in fuzzy comparison, to ensure
there are no more discrepancies between "Paris 20" and "Paris XX", for
instance. Fixes #112 and improves on top of #110.
* Improve handling of opendata postal codes (no more duplicates, better
capitalization).

Note: You should run `pip install -r requirements.txt` (this commit adds
the `titlecase` dependency) and rebuild the database
(`python -m flatisfy build-data --config config.json`) after this commit.

Thanks @nicofrand for building the basic blocks for this!
This commit is contained in:
Lucas Verney 2018-01-19 11:50:11 +01:00
parent 7bf08adbce
commit a45eba65c7
6 changed files with 93 additions and 95 deletions

View File

@ -1,4 +1,4 @@
# coding : utf-8
# coding: utf-8
"""
This module contains all the code related to building necessary data files from
the source opendata files.

View File

@ -1,16 +1,20 @@
# coding : utf-8
# coding: utf-8
"""
Preprocessing functions to convert input opendata files into SQLAlchemy objects
ready to be stored in the database.
"""
from __future__ import absolute_import, print_function, unicode_literals
import io
import json
import logging
import os
import sys
import titlecase
from flatisfy.models.postal_code import PostalCode
from flatisfy.models.public_transport import PublicTransport
from flatisfy.tools import normalize_string
if sys.version_info >= (3, 0):
import csv
@ -21,6 +25,12 @@ else:
LOGGER = logging.getLogger(__name__)
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
titlecase.set_small_word_list(
# Add French small words
r"l|d|un|une|et|à|a|sur|ou|le|la|de|lès|les|" +
titlecase.SMALL
)
TRANSPORT_DATA_FILES = {
"FR-IDF": "stops_fr-idf.txt",
"FR-NW": "stops_fr-nw.txt",
@ -109,6 +119,9 @@ def _preprocess_laposte():
# Build postal codes to other infos file
postal_codes_data = []
# Keep track of seen (postal_codes, names) to avoid inserting useless
# duplicates (already in the OpenData file)
seen_postal_codes = []
for item in raw_laposte_data:
fields = item["fields"]
try:
@ -120,10 +133,19 @@ def _preprocess_laposte():
)
continue
name = normalize_string(
titlecase.titlecase(fields["nom_de_la_commune"]),
lowercase=False
)
if (fields["code_postal"], name) in seen_postal_codes:
continue
seen_postal_codes.append((fields["code_postal"], name))
postal_codes_data.append(PostalCode(
area=area,
postal_code=fields["code_postal"],
name=fields["nom_de_la_commune"].title(),
name=name,
lat=fields["coordonnees_gps"][0],
lng=fields["coordonnees_gps"][1]
))

View File

@ -1,4 +1,4 @@
# coding : utf-8
# coding: utf-8
"""
This module contains all the exceptions definitions for the Flatisfy-specific
exceptions.

View File

@ -27,38 +27,34 @@ class TestTexts(unittest.TestCase):
"""
Checks roman numbers replacement.
"""
tester = tools.RomanNumbers()
self.assertTrue(tester.check_valid("XIV"))
self.assertTrue(not tester.check_valid("ABC"))
self.assertEqual(
"14",
tester.convert_to_arabic("XIV")
"XIV",
tools.convert_arabic_to_roman("14")
)
self.assertEqual(
"1987",
tester.convert_to_arabic("MCMLXXXVII")
"MCMLXXXVII",
tools.convert_arabic_to_roman("1987")
)
self.assertEqual(
"Dans le 15e arrondissement",
tester.convert_to_arabic_in_text("Dans le XVe arrondissement")
"Dans le XVe arrondissement",
tools.convert_arabic_to_roman_in_text("Dans le 15e arrondissement")
)
self.assertEqual(
"20eme arr.",
tester.convert_to_arabic_in_text("XXeme arr.")
"XXeme arr.",
tools.convert_arabic_to_roman_in_text("20eme arr.")
)
self.assertEqual(
"A AIX EN PROVENCE",
tester.convert_to_arabic_in_text("A AIX EN PROVENCE")
tools.convert_arabic_to_roman_in_text("A AIX EN PROVENCE")
)
self.assertEqual(
"Montigny Le Bretonneux",
tester.convert_to_arabic_in_text("Montigny Le Bretonneux")
tools.convert_arabic_to_roman_in_text("Montigny Le Bretonneux")
)
def test_roman_numbers_in_text(self):
@ -67,8 +63,8 @@ class TestTexts(unittest.TestCase):
normalization.
"""
self.assertEqual(
"dans le 15e arrondissement",
tools.normalize_string("Dans le XVe arrondissement")
"dans le XVe arrondissement",
tools.normalize_string("Dans le 15e arrondissement")
)
def test_multiple_whitespaces(self):

View File

@ -25,68 +25,36 @@ LOGGER = logging.getLogger(__name__)
NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys"
class RomanNumbers(object):
def convert_arabic_to_roman(arabic):
"""
Utilities to check and convert roman numbers.
Convert an arabic literal to a roman one.
Part of the conversions is based on
https://gist.github.com/riverrun/ac91218bb1678b857c12
..note::
Based on https://gist.github.com/riverrun/ac91218bb1678b857c12.
:param arabic: An arabic number, as string.
:returns: The corresponding roman one, as string.
"""
@staticmethod
def check_valid(roman):
"""
Check whether a roman literal is a valid roman literal.
:param roman: A roman literal, as string.
:returns: ``True`` if it is a valid roman literal, ``False`` otherwise.
"""
if not re.match('^[MDCLXVI]+$', roman):
return False
invalid = ['IIII', 'VV', 'XXXX', 'LL', 'CCCC', 'DD', 'MMMM']
if any(sub in roman for sub in invalid):
return False
# TODO: check M does not appear after any other, etc.
return True
@staticmethod
def convert_to_arabic(roman):
"""
Convert a roman literal to arabic one.
:param roman: A roman number, as string.
:returns: The corresponding arabic one, as string.
"""
if not RomanNumbers.check_valid(roman):
return roman
keys = [
'IV', 'IX', 'XL', 'XC', 'CD', 'CM', 'I', 'V',
'X', 'L', 'C', 'D', 'M'
]
to_arabic = {
'IV': '4',
'IX': '9',
'XL': '40',
'XC': '90',
'CD': '400',
'CM': '900',
'I': '1',
'V': '5',
'X': '10',
'L': '50',
'C': '100',
'D': '500',
'M': '1000'
to_roman = {
1: 'I', 2: 'II', 3: 'III', 4: 'IV', 5: 'V', 6: 'VI', 7: 'VII',
8: 'VIII', 9: 'IX', 10: 'X',
20: 'XX', 30: 'XXX', 40: 'XL', 50: 'L', 60: 'LX', 70: 'LXX',
80: 'LXXX', 90: 'XC',
100: 'C', 200: 'CC', 300: 'CCC', 400: 'CD', 500: 'D', 600: 'DC',
700: 'DCC', 800: 'DCCC', 900: 'CM',
1000: 'M', 2000: 'MM', 3000: 'MMM'
}
for key in keys:
if key in roman:
roman = roman.replace(key, ' {}'.format(to_arabic.get(key)))
return str(sum(int(num) for num in roman.split()))
roman_chars_list = []
count = 1
for digit in arabic[::-1]:
digit = int(digit)
if digit != 0:
roman_chars_list.append(to_roman[digit * count])
count *= 10
return ''.join(roman_chars_list[::-1])
@staticmethod
def convert_to_arabic_in_text(text):
def convert_arabic_to_roman_in_text(text):
"""
Convert roman literals to arabic one in a text.
@ -95,8 +63,8 @@ class RomanNumbers(object):
arabic.
"""
return re.sub(
r'(?<![\S])+([MDCLXVI]+)(?=[eè\s$])',
lambda matchobj: RomanNumbers.convert_to_arabic(matchobj.group(0)),
r'(\d+)',
lambda matchobj: convert_arabic_to_roman(matchobj.group(0)),
text
)
@ -221,14 +189,24 @@ def is_within_interval(value, min_value=None, max_value=None):
return all(checks)
def normalize_string(string):
def normalize_string(string, lowercase=True, convert_arabic_numerals=True):
"""
Normalize the given string for matching.
:Example:
Example::
>>> normalize_string("tétéà 14ème-XIV, foobar")
'tetea XIVeme xiv, foobar'
>>> normalize_string("tétéà 14ème-XIV, foobar", False)
'tetea 14eme xiv, foobar'
:param string: The string to normalize.
:param lowercase: Whether to convert string to lowercase or not. Defaults
to ``True``.
:param convert_arabic_numerals: Whether to convert arabic numerals to roman
ones. Defaults to ``True``.
:return: The normalized string.
"""
# ASCIIfy the string
string = unidecode.unidecode(string)
@ -237,13 +215,14 @@ def normalize_string(string):
# Keep some basic punctuation to keep syntaxic units
string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string)
# Convert roman numbers to arabic numbers
# TODO: Fix this :)
# string = RomanNumbers.convert_to_arabic_in_text(string)
# Convert to lowercase
if lowercase:
string = string.lower()
# Convert arabic numbers to roman numbers
if convert_arabic_numerals:
string = convert_arabic_to_roman_in_text(string)
# Collapse multiple spaces, replace tabulations and newlines by space
string = re.sub(r"\s+", " ", string)

View File

@ -11,6 +11,7 @@ imagehash
pillow
requests
sqlalchemy
titlecase
unidecode
vobject
whoosh