In [75]:
import json
import os

import requests
import sklearn

In [88]:
# Fetch the Open Food Facts category list once and cache it on disk.
# On later runs (including a fresh kernel) the cached copy is loaded instead,
# so `categories` is always defined after this cell.
os.makedirs('data', exist_ok=True)  # the cache file lives in data/
if not os.path.isfile('data/categories.json'):
    categories = requests.get('https://fr.openfoodfacts.org/categories.json').json()
    # Only the value stored under 'tags' is kept and cached.
    categories = categories['tags']
    with open('data/categories.json', 'w') as fh:
        json.dump(categories, fh)
else:
    with open('data/categories.json', 'r') as fh:
        categories = json.load(fh)

In [91]:
# Download the product list of every category into one JSON file per category.
# Files already on disk are kept, so an interrupted run can simply be resumed.
os.makedirs('data/categories', exist_ok=True)
for category in categories:
    category_path = 'data/categories/%s.json' % category['id']
    if os.path.isfile(category_path):
        continue
    try:
        # A timeout keeps one stalled request from blocking the whole crawl.
        response = requests.get('%s.json' % category['url'], timeout=60)
        payload = response.json()
    except ValueError:
        # The body is not valid JSON. ValueError covers json.JSONDecodeError
        # as well as the decode errors raised by other JSON backends.
        continue
    except requests.RequestException as error:
        # Network failure or timeout: report it and write no file, so the
        # category is retried on the next run.
        print('Skipping %s: %s' % (category['id'], error))
        continue
    products = []
    for p in payload['products']:
        try:
            products.append({
                'id': p['_id'],
                'categories': p['categories'],
                'name': p['product_name'],
                'url': p['url'],
            })
        except KeyError:
            # Skip records missing a field the later cells need, instead of
            # aborting the whole download on one incomplete product.
            continue
    with open(category_path, 'w') as fh:
        json.dump(products, fh)

KeyboardInterrupt: 

In [162]:
import numpy as np
import random
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [126]:
# Gather every cached per-category product list into one flat list.
products = []
for filename in os.listdir('data/categories'):
    with open('data/categories/%s' % filename, 'r') as fh:
        category_products = json.load(fh)
    products.extend(category_products)

In [184]:
# Samples are the product names; the raw label for each sample is the
# product's 'categories' value, kept in the same order as X.
X, Y = [], []
for product in products:
    X.append(product['name'])
    Y.append(product['categories'])

In [185]:
# Turn each product's comma-separated category names into a list of known
# category ids. Names that match no known category are dropped.

# Index the known categories by name once, rather than scanning the whole
# category list for every single name. setdefault keeps the first id when
# several categories share a name.
category_id_by_name = {}
for known in categories:
    category_id_by_name.setdefault(known['name'].strip(), known['id'])

Y = []
for p in products:
    product_categories = []
    for category in p['categories'].split(','):
        # Strip whitespace left around each name by the split, so that
        # "a, b" yields "b" and not " b".
        category_id = category_id_by_name.get(category.strip())
        if category_id is not None:
            product_categories.append(category_id)
    Y.append(product_categories)

In [186]:
# Keep only the samples that ended up with at least one label, then split the
# surviving (name, labels) pairs back into the parallel X / Y lists.
XY = [(name, labels) for name, labels in zip(X, Y) if len(labels) > 0]
X = [name for name, _labels in XY]
Y = [labels for _name, labels in XY]

In [187]:
# Split the samples: a random half is used for training, the rest for testing.
# A dedicated seeded generator makes the split reproducible across runs
# without touching the global `random` state.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)
train = split_rng.sample(range(len(X)), len(X) * 50 // 100)
X_train = [X[i] for i in train]
y_train_text = [Y[i] for i in train]

# Test names are those never seen in training, each kept once. Walking X in
# order (instead of iterating a set) gives a stable order and lets the
# matching labels be kept alongside for evaluation.
seen_names = set(X_train)
X_test = []
y_test_text = []
for i, name in enumerate(X):
    if name in seen_names:
        continue
    seen_names.add(name)
    X_test.append(name)
    y_test_text.append(Y[i])

In [188]:
# Encode the per-sample label lists with MultiLabelBinarizer.
mlb = MultiLabelBinarizer()
Y_train = mlb.fit_transform(y_train_text)

# Text pipeline: token counts -> TF-IDF weighting -> one-vs-rest linear SVMs.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC())),
]
classifier = Pipeline(pipeline_steps)

classifier.fit(X_train, Y_train)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])

In [196]:
# Predict labels for the held-out names and print one "name => labels" line
# per sample, after mapping the predictions back to label names.
predicted = classifier.predict(X_test)
all_labels = mlb.inverse_transform(predicted)
for item, labels in zip(X_test, all_labels):
    joined_labels = ', '.join(labels)
    line = '{0} => {1}'.format(item, joined_labels)
    print(line)

Foie Gras de Canard entier en gelée => en:fish-and-meat-and-eggs, fr:foies-gras, fr:foies-gras-de-canard, fr:foies-gras-entiers
Pain grillé brioché => en:toasts
Le viennois Le fouetté Chocolat => 
Kania Basilikum gerebelt => 
Capellini => 
thé glacé pêche => en:beverages
Chicorée Café => en:beverages, en:coffees, en:hot-beverages
Pomme Nature => 
Fumet Poisson pour Sauces et Cuissons => en:dehydrated-broths
Grattons de Canard à l'Échalote => en:duck-dishes
Pomme Violette => en:desserts, en:fruits-based-foods, en:plant-based-foods-and-beverages
Couscous aux falafels => en:cereal-semolinas, en:cereals-and-potatoes, en:cereals-and-their-products, en:durum-wheat-semolinas, en:durum-wheat-semolinas-for-couscous, en:plant-based-foods, en:plant-based-foods-and-beverages, en:wheat-semolinas
Sauce Végétale Ail & Fines herbes => 
Tortils Quinoa => 
La Salvetat => 
mini gâteaux pépites chocolat => en:biscuits-and-cakes, en:cakes, en:chocolate-cakes, en:sugary-snacks
Saumon fumé supérieur  => en:w