In [2]:
import collections
import itertools
import json
import os

import numpy as np
import pandas
import random

from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC

MIN_NUMBER_PRODUCTS_PER_CATEGORY = 25
TRAINING_DATASET_SIZE = 50 / 100

## Load data from CSV dump

In [26]:
csv = pandas.read_csv(
    'en.openfoodfacts.org.products.csv',
    sep='\t',
    usecols=[0, 7, 15],
    dtype={'code': 'str', 'product_name': 'str'},
    converters={'categories_tags': lambda x: x.split(',') if x else np.NaN}
)
# Filter products with and without categories in two different DataFrames
products_with_categories = csv[pandas.notnull(csv['categories_tags'])]
products_without_categories = csv[pandas.isnull(csv['categories_tags'])]

## Fitting on dataset

In [27]:
# Let's build vectors of products and categories, for training purpose.
categories = [
    category
    for category, count in collections.Counter(
        category for category_list in products_with_categories['categories_tags'] for category in category_list
    ).items()
    if count > MIN_NUMBER_PRODUCTS_PER_CATEGORY and category != ''
]
# Filter out empty lists of categories
XY = products_with_categories.copy()
XY['categories_tags'] = XY['categories_tags'].map(lambda c_list: [c for c in c_list if c in categories])
mask = XY['categories_tags'].str.len() > 0
XY = XY[mask]
# TODO: We should ensure each category is sufficiently represented

In [28]:
# Select training sample
XY_training = XY.sample(frac=0.5)
X_train = XY_training['product_name'].values.astype('U')
Y_train = [np.array(c).astype('U') for c in XY_training['categories_tags'].values]

In [29]:
mlb = MultiLabelBinarizer()
Y_train_transformed = mlb.fit_transform(Y_train)

In [335]:
# Fit our classifier
classifier = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])

classifier.fit(X_train, Y_train_transformed)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])

In [380]:
# Check score on a testing set
training_indices = np.in1d(XY['product_name'].values.astype('U'), X_train, invert=True)
X_test = XY['product_name'][training_indices]
Y_test = [np.array(c).astype('U') for c in XY['categories_tags'][training_indices].values]

predicted = classifier.predict(X_test)
all_labels = mlb.inverse_transform(predicted)

In [394]:
testing_dataframe = pandas.DataFrame({
    'product_name': X_test,
    'original_labels': Y_test,
    'guessed_labels': all_labels
}, columns=['product_name', 'original_labels', 'guessed_labels'])
testing_dataframe.to_csv('testing.csv', index=False)
testing_dataframe

Unnamed: 0,product_name,original_labels,guessed_labels
176,Salade Cesar,"[en:plant-based-foods-and-beverages, en:plant-...","(en:meals, fr:salades-composees)"
184,lentilles vertes,"[en:plant-based-foods-and-beverages, en:plant-...","(en:green-lentils, en:legume-seeds, en:legumes..."
185,Root Beer,"[en:beverages, en:carbonated-drinks, en:sodas,...","(en:alcoholic-beverages, en:beers, en:beverage..."
186,Biscuits sablés fourrage au cacao,"[en:sugary-snacks, en:biscuits-and-cakes, en:b...","(en:biscuits, en:biscuits-and-cakes, en:chocol..."
238,Blle Pet 50CL Coca Cola Cherry,"[en:beverages, en:sugared-beverages]","(en:beverages, en:non-sugared-beverages, en:so..."
244,Cauliflower,"[en:plant-based-foods-and-beverages, en:plant-...","(en:fruits-and-vegetables-based-foods, en:leaf..."
247,Salsa de mostaza,"[en:groceries, en:condiments, en:sauces, en:mu...","(en:condiments, en:groceries, en:mustards, en:..."
249,7Up,"[en:plant-based-foods-and-beverages, en:bevera...","(en:beverages, en:non-sugared-beverages)"
276,Mehrkomponeneten Protein 90 C6 Haselnuß,"[en:dietary-supplements, en:bodybuilding-suppl...","(en:bodybuilding-supplements, en:dietary-suppl..."
292,Cakes aux Fruits,"[en:sugary-snacks, en:biscuits-and-cakes, en:d...","(en:biscuits-and-cakes, en:cakes, en:desserts,..."


In [30]:
# Dump the classifier
joblib.dump((mlb, classifier), 'offClassifier.pkl')

['offClassifier.pkl']

## Predict!

In [15]:
def batch(iterable, size):
    """
    Get items from a sequence a batch at a time.

    :param iterable: The iterable to get the items from.
    :param size: The size of the batches.
    :return: A new iterable.
    """
    sourceiter = iter(iterable)
    while True:
        batchiter = itertools.islice(sourceiter, size)
        yield itertools.chain([next(batchiter)], batchiter)

In [31]:
# Load the classifier
mlb, classifier = joblib.load('offClassifier.pkl')

In [32]:
X_predicted = products_without_categories['product_name'].values.astype('U')

all_labels = []

for i in batch(X_predicted, 30000):
    predicted = classifier.predict(list(i))
    all_labels.extend(mlb.inverse_transform(predicted))

  """


In [33]:
prediction_dataframe = pandas.DataFrame({
    'product_name': products_without_categories['product_name'].values.astype('U'),
    'guessed_labels': all_labels
}, columns=['product_name', 'guessed_labels'])
prediction_dataframe.to_csv('prediction.csv', index=False)
prediction_dataframe

Unnamed: 0,product_name,guessed_labels
0,Farine de blé noir,"(en:cereal-flours, en:cereals-and-potatoes, en..."
1,Banana Chips Sweetened (Whole),()
2,Peanuts,"(en:legumes, en:legumes-and-their-products, en..."
3,Organic Salted Nut Mix,()
4,Organic Polenta,"(en:cereals-and-potatoes, en:cereals-and-their..."
5,Breadshop Honey Gone Nuts Granola,"(en:breakfasts, en:sugary-snacks)"
6,Organic Long Grain White Rice,"(en:cereal-grains, en:cereals-and-potatoes, en..."
7,Organic Muesli,"(en:breakfast-cereals, en:breakfasts, en:cerea..."
8,Organic Dark Chocolate Minis,"(en:chocolates, en:dark-chocolates, en:sugary-..."
9,Organic Sunflower Oil,"(en:fats, en:plant-based-foods, en:plant-based..."


## Port to JS

In [3]:
from sklearn_porter import Porter

In [4]:
# Load the classifier
mlb, classifier = joblib.load('offClassifier.pkl')

In [5]:
porter = Porter(classifier, language='javascript')
output = porter.export()

ValueError: The given model 'Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])' isn't supported.

In [8]:
algorithm_name = str(type(classifier).__name__)
print(algorithm_name)

Pipeline
