Improve cities fuzzy matching

* Convert arabic numerals to roman ones in fuzzy comparison, to ensure
there are no more discrepancies between "Paris 20" and "Paris XX", for
instance. Fix #112 and improve on top of #110.
* Improve handling of opendata postal codes (no more duplicates, better
capitalization).

Note: You should `pip install -r requirements.txt` and rebuild the
database (`python -m flatisfy build-data --config config.json`) after
this commit.

Thanks @nicofrand for providing the building blocks for this!
This commit is contained in:
Lucas Verney 2018-01-19 11:50:11 +01:00
parent 7bf08adbce
commit a45eba65c7
6 changed files with 93 additions and 95 deletions

View File

@ -1,4 +1,4 @@
# coding : utf-8 # coding: utf-8
""" """
This module contains all the code related to building necessary data files from This module contains all the code related to building necessary data files from
the source opendata files. the source opendata files.

View File

@ -1,16 +1,20 @@
# coding : utf-8 # coding: utf-8
""" """
Preprocessing functions to convert input opendata files into SQLAlchemy objects Preprocessing functions to convert input opendata files into SQLAlchemy objects
ready to be stored in the database. ready to be stored in the database.
""" """
from __future__ import absolute_import, print_function, unicode_literals
import io import io
import json import json
import logging import logging
import os import os
import sys import sys
import titlecase
from flatisfy.models.postal_code import PostalCode from flatisfy.models.postal_code import PostalCode
from flatisfy.models.public_transport import PublicTransport from flatisfy.models.public_transport import PublicTransport
from flatisfy.tools import normalize_string
if sys.version_info >= (3, 0): if sys.version_info >= (3, 0):
import csv import csv
@ -21,6 +25,12 @@ else:
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
titlecase.set_small_word_list(
# Add French small words
r"l|d|un|une|et|à|a|sur|ou|le|la|de|lès|les|" +
titlecase.SMALL
)
TRANSPORT_DATA_FILES = { TRANSPORT_DATA_FILES = {
"FR-IDF": "stops_fr-idf.txt", "FR-IDF": "stops_fr-idf.txt",
"FR-NW": "stops_fr-nw.txt", "FR-NW": "stops_fr-nw.txt",
@ -109,6 +119,9 @@ def _preprocess_laposte():
# Build postal codes to other infos file # Build postal codes to other infos file
postal_codes_data = [] postal_codes_data = []
# Keep track of seen (postal_codes, names) to avoid inserting useless
# duplicates (already in the OpenData file)
seen_postal_codes = []
for item in raw_laposte_data: for item in raw_laposte_data:
fields = item["fields"] fields = item["fields"]
try: try:
@ -120,10 +133,19 @@ def _preprocess_laposte():
) )
continue continue
name = normalize_string(
titlecase.titlecase(fields["nom_de_la_commune"]),
lowercase=False
)
if (fields["code_postal"], name) in seen_postal_codes:
continue
seen_postal_codes.append((fields["code_postal"], name))
postal_codes_data.append(PostalCode( postal_codes_data.append(PostalCode(
area=area, area=area,
postal_code=fields["code_postal"], postal_code=fields["code_postal"],
name=fields["nom_de_la_commune"].title(), name=name,
lat=fields["coordonnees_gps"][0], lat=fields["coordonnees_gps"][0],
lng=fields["coordonnees_gps"][1] lng=fields["coordonnees_gps"][1]
)) ))

View File

@ -1,4 +1,4 @@
# coding : utf-8 # coding: utf-8
""" """
This module contains all the exceptions definitions for the Flatisfy-specific This module contains all the exceptions definitions for the Flatisfy-specific
exceptions. exceptions.

View File

@ -27,38 +27,34 @@ class TestTexts(unittest.TestCase):
""" """
Checks roman numbers replacement. Checks roman numbers replacement.
""" """
tester = tools.RomanNumbers()
self.assertTrue(tester.check_valid("XIV"))
self.assertTrue(not tester.check_valid("ABC"))
self.assertEqual( self.assertEqual(
"14", "XIV",
tester.convert_to_arabic("XIV") tools.convert_arabic_to_roman("14")
) )
self.assertEqual( self.assertEqual(
"1987", "MCMLXXXVII",
tester.convert_to_arabic("MCMLXXXVII") tools.convert_arabic_to_roman("1987")
) )
self.assertEqual( self.assertEqual(
"Dans le 15e arrondissement", "Dans le XVe arrondissement",
tester.convert_to_arabic_in_text("Dans le XVe arrondissement") tools.convert_arabic_to_roman_in_text("Dans le 15e arrondissement")
) )
self.assertEqual( self.assertEqual(
"20eme arr.", "XXeme arr.",
tester.convert_to_arabic_in_text("XXeme arr.") tools.convert_arabic_to_roman_in_text("20eme arr.")
) )
self.assertEqual( self.assertEqual(
"A AIX EN PROVENCE", "A AIX EN PROVENCE",
tester.convert_to_arabic_in_text("A AIX EN PROVENCE") tools.convert_arabic_to_roman_in_text("A AIX EN PROVENCE")
) )
self.assertEqual( self.assertEqual(
"Montigny Le Bretonneux", "Montigny Le Bretonneux",
tester.convert_to_arabic_in_text("Montigny Le Bretonneux") tools.convert_arabic_to_roman_in_text("Montigny Le Bretonneux")
) )
def test_roman_numbers_in_text(self): def test_roman_numbers_in_text(self):
@ -67,8 +63,8 @@ class TestTexts(unittest.TestCase):
normalization. normalization.
""" """
self.assertEqual( self.assertEqual(
"dans le 15e arrondissement", "dans le XVe arrondissement",
tools.normalize_string("Dans le XVe arrondissement") tools.normalize_string("Dans le 15e arrondissement")
) )
def test_multiple_whitespaces(self): def test_multiple_whitespaces(self):

View File

@ -25,80 +25,48 @@ LOGGER = logging.getLogger(__name__)
NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys" NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys"
class RomanNumbers(object): def convert_arabic_to_roman(arabic):
""" """
Utilities to check and convert roman numbers. Convert an arabic literal to a roman one.
Part of the conversions is based on ..note::
https://gist.github.com/riverrun/ac91218bb1678b857c12 Based on https://gist.github.com/riverrun/ac91218bb1678b857c12.
:param arabic: An arabic number, as string.
:returns: The corresponding roman one, as string.
""" """
@staticmethod to_roman = {
def check_valid(roman): 1: 'I', 2: 'II', 3: 'III', 4: 'IV', 5: 'V', 6: 'VI', 7: 'VII',
""" 8: 'VIII', 9: 'IX', 10: 'X',
Check whether a roman literal is a valid roman literal. 20: 'XX', 30: 'XXX', 40: 'XL', 50: 'L', 60: 'LX', 70: 'LXX',
80: 'LXXX', 90: 'XC',
100: 'C', 200: 'CC', 300: 'CCC', 400: 'CD', 500: 'D', 600: 'DC',
700: 'DCC', 800: 'DCCC', 900: 'CM',
1000: 'M', 2000: 'MM', 3000: 'MMM'
}
roman_chars_list = []
count = 1
for digit in arabic[::-1]:
digit = int(digit)
if digit != 0:
roman_chars_list.append(to_roman[digit * count])
count *= 10
return ''.join(roman_chars_list[::-1])
:param roman: A roman literal, as string.
:returns: ``True`` if it is a valid roman literal, ``False`` otherwise.
"""
if not re.match('^[MDCLXVI]+$', roman):
return False
invalid = ['IIII', 'VV', 'XXXX', 'LL', 'CCCC', 'DD', 'MMMM'] def convert_arabic_to_roman_in_text(text):
if any(sub in roman for sub in invalid): """
return False Convert roman literals to arabic one in a text.
# TODO: check M does not appear after any other, etc. :param text: Some text to convert roman literals from.
return True :returns: The corresponding text with roman literals converted to
arabic.
@staticmethod """
def convert_to_arabic(roman): return re.sub(
""" r'(\d+)',
Convert a roman literal to arabic one. lambda matchobj: convert_arabic_to_roman(matchobj.group(0)),
text
:param roman: A roman number, as string. )
:returns: The corresponding arabic one, as string.
"""
if not RomanNumbers.check_valid(roman):
return roman
keys = [
'IV', 'IX', 'XL', 'XC', 'CD', 'CM', 'I', 'V',
'X', 'L', 'C', 'D', 'M'
]
to_arabic = {
'IV': '4',
'IX': '9',
'XL': '40',
'XC': '90',
'CD': '400',
'CM': '900',
'I': '1',
'V': '5',
'X': '10',
'L': '50',
'C': '100',
'D': '500',
'M': '1000'
}
for key in keys:
if key in roman:
roman = roman.replace(key, ' {}'.format(to_arabic.get(key)))
return str(sum(int(num) for num in roman.split()))
@staticmethod
def convert_to_arabic_in_text(text):
"""
Convert roman literals to arabic one in a text.
:param text: Some text to convert roman literals from.
:returns: The corresponding text with roman literals converted to
arabic.
"""
return re.sub(
r'(?<![\S])+([MDCLXVI]+)(?=[eè\s$])',
lambda matchobj: RomanNumbers.convert_to_arabic(matchobj.group(0)),
text
)
def hash_dict(func): def hash_dict(func):
@ -221,14 +189,24 @@ def is_within_interval(value, min_value=None, max_value=None):
return all(checks) return all(checks)
def normalize_string(string): def normalize_string(string, lowercase=True, convert_arabic_numerals=True):
""" """
Normalize the given string for matching. Normalize the given string for matching.
:Example: Example::
>>> normalize_string("tétéà 14ème-XIV, foobar") >>> normalize_string("tétéà 14ème-XIV, foobar")
'tetea XIVeme xiv, foobar'
>>> normalize_string("tétéà 14ème-XIV, foobar", False)
'tetea 14eme xiv, foobar' 'tetea 14eme xiv, foobar'
:param string: The string to normalize.
:param lowercase: Whether to convert string to lowercase or not. Defaults
to ``True``.
:param convert_arabic_numerals: Whether to convert arabic numerals to roman
ones. Defaults to ``True``.
:return: The normalized string.
""" """
# ASCIIfy the string # ASCIIfy the string
string = unidecode.unidecode(string) string = unidecode.unidecode(string)
@ -237,12 +215,13 @@ def normalize_string(string):
# Keep some basic punctuation to keep syntaxic units # Keep some basic punctuation to keep syntaxic units
string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string) string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string)
# Convert roman numbers to arabic numbers
# TODO: Fix this :)
# string = RomanNumbers.convert_to_arabic_in_text(string)
# Convert to lowercase # Convert to lowercase
string = string.lower() if lowercase:
string = string.lower()
# Convert arabic numbers to roman numbers
if convert_arabic_numerals:
string = convert_arabic_to_roman_in_text(string)
# Collapse multiple spaces, replace tabulations and newlines by space # Collapse multiple spaces, replace tabulations and newlines by space
string = re.sub(r"\s+", " ", string) string = re.sub(r"\s+", " ", string)

View File

@ -11,6 +11,7 @@ imagehash
pillow pillow
requests requests
sqlalchemy sqlalchemy
titlecase
unidecode unidecode
vobject vobject
whoosh whoosh