offclassification/notebook.ipynb

2233 lines
96 KiB
Plaintext
Raw Normal View History

2017-09-23 02:16:39 +02:00
{
"cells": [
{
"cell_type": "code",
2017-09-26 21:28:44 +02:00
"execution_count": 90,
2017-09-23 02:16:39 +02:00
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"import itertools\n",
2017-09-23 02:16:39 +02:00
"import json\n",
"import os\n",
"\n",
"import numpy as np\n",
"import pandas\n",
"import random\n",
"\n",
"from sklearn.externals import joblib\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
2017-09-26 21:28:44 +02:00
"from sklearn.metrics import accuracy_score\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.multiclass import OneVsRestClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import MultiLabelBinarizer\n",
"from sklearn.svm import LinearSVC\n",
"\n",
2017-09-26 21:28:44 +02:00
"MIN_NUMBER_PRODUCTS_PER_CATEGORY = 3000\n",
"TRAINING_DATASET_SIZE = 25 / 100"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data from CSV dump"
2017-09-23 02:16:39 +02:00
]
},
{
"cell_type": "code",
2017-09-26 21:28:44 +02:00
"execution_count": 91,
2017-09-23 02:16:39 +02:00
"metadata": {},
"outputs": [],
"source": [
"csv = pandas.read_csv(\n",
" 'en.openfoodfacts.org.products.csv',\n",
" sep='\\t',\n",
" usecols=[0, 7, 15],\n",
" dtype={'code': 'str', 'product_name': 'str'},\n",
" converters={'categories_tags': lambda x: x.split(',') if x else np.NaN}\n",
")\n",
"# Filter products with and without categories in two different DataFrames\n",
"products_with_categories = csv[pandas.notnull(csv['categories_tags'])]\n",
"products_without_categories = csv[pandas.isnull(csv['categories_tags'])]"
2017-09-23 02:16:39 +02:00
]
},
{
"cell_type": "markdown",
"metadata": {},
2017-09-23 02:16:39 +02:00
"source": [
"## Fitting on dataset"
2017-09-23 02:16:39 +02:00
]
},
{
"cell_type": "code",
2017-09-26 21:28:44 +02:00
"execution_count": 92,
2017-09-23 02:16:39 +02:00
"metadata": {},
2017-09-26 21:28:44 +02:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Keeping 33 categories.\n"
]
}
],
2017-09-23 02:16:39 +02:00
"source": [
"# Let's build vectors of products and categories, for training purpose.\n",
"categories = [\n",
" category\n",
" for category, count in collections.Counter(\n",
" category for category_list in products_with_categories['categories_tags'] for category in category_list\n",
" ).items()\n",
2017-09-26 21:28:44 +02:00
" # Filter out categories without enough products\n",
" if count > MIN_NUMBER_PRODUCTS_PER_CATEGORY and category != ''\n",
"]\n",
2017-09-26 21:28:44 +02:00
"print('Keeping %d categories.' % len(categories))\n",
"# Filter out empty lists of categories\n",
"XY = products_with_categories.copy()\n",
"XY['categories_tags'] = XY['categories_tags'].map(lambda c_list: [c for c in c_list if c in categories])\n",
"mask = XY['categories_tags'].str.len() > 0\n",
2017-09-26 21:28:44 +02:00
"XY = XY[mask]"
2017-09-23 02:16:39 +02:00
]
},
{
"cell_type": "code",
2017-09-26 21:28:44 +02:00
"execution_count": 93,
2017-09-23 02:16:39 +02:00
"metadata": {},
2017-09-26 21:28:44 +02:00
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/lverney/.local/share/virtualenvs/machine_learning/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2010: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
" FutureWarning)\n"
]
},
{
"data": {
"text/plain": [
"741"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
2017-09-23 02:16:39 +02:00
"source": [
2017-09-26 21:28:44 +02:00
"# Select training and testing sample\n",
"X = XY['product_name'].values.astype('U')\n",
"Y = [np.array(c).astype('U') for c in XY['categories_tags'].values]\n",
"\n",
"X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, train_size=TRAINING_DATASET_SIZE)\n",
"\n",
"# Check each category is sufficiently represented\n",
"min([\n",
" count\n",
" for _, count in collections.Counter(\n",
" category for category_list in Y_train for category in category_list\n",
" ).items()\n",
"])"
2017-09-23 02:16:39 +02:00
]
},
{
"cell_type": "code",
2017-09-26 21:28:44 +02:00
"execution_count": 94,
2017-09-23 02:16:39 +02:00
"metadata": {},
"outputs": [],
"source": [
"mlb = MultiLabelBinarizer()\n",
"Y_train_transformed = mlb.fit_transform(Y_train)"
2017-09-23 02:16:39 +02:00
]
},
{
"cell_type": "code",
2017-09-26 21:28:44 +02:00
"execution_count": 95,
2017-09-23 02:16:39 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(memory=None,\n",
" steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
" ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
" verbose=0),\n",
" n_jobs=1))])"
]
},
2017-09-26 21:28:44 +02:00
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
2017-09-23 02:16:39 +02:00
"source": [
"# Fit our classifier\n",
"classifier = Pipeline([\n",
" ('vectorizer', CountVectorizer()),\n",
" ('tfidf', TfidfTransformer()),\n",
2017-09-26 21:28:44 +02:00
" ('clf', OneVsRestClassifier(LinearSVC()))\n",
"])\n",
"\n",
"classifier.fit(X_train, Y_train_transformed)"
2017-09-23 02:16:39 +02:00
]
},
{
"cell_type": "code",
2017-09-26 21:28:44 +02:00
"execution_count": 96,
2017-09-23 02:16:39 +02:00
"metadata": {},
2017-09-26 21:28:44 +02:00
"outputs": [
{
"data": {
"text/plain": [
"0.52062784115275751"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
2017-09-23 02:16:39 +02:00
"source": [
2017-09-26 21:28:44 +02:00
"# Compute predictions for testing set\n",
"predicted = classifier.predict(X_test)\n",
2017-09-26 21:28:44 +02:00
"all_labels = mlb.inverse_transform(predicted)\n",
"\n",
"accuracy_score(mlb.fit_transform(Y_test), predicted)"
2017-09-23 02:16:39 +02:00
]
},
{
"cell_type": "code",
2017-09-26 21:28:44 +02:00
"execution_count": 97,
2017-09-23 02:16:39 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>product_name</th>\n",
" <th>original_labels</th>\n",
" <th>guessed_labels</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>0</th>\n",
" <td>Fernandes Cherry Soda - Ga Mee Naar Suruname &amp;...</td>\n",
" <td>[en:beverages, en:sugared-beverages]</td>\n",
" <td>(en:beverages, en:plant-based-foods-and-bevera...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Compotes allégée en sucres Pommes Carrefour</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:desserts, en:fruits-and-vegetables-based-f...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>2</th>\n",
" <td>Tortellini 4 Fromages, LunchBox</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:cereals-and-potatoes, en:cereals-and-their...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>3</th>\n",
" <td>Orange and mango squash</td>\n",
" <td>[en:beverages]</td>\n",
" <td>(en:beverages, en:plant-based-foods-and-bevera...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>4</th>\n",
" <td>Crackers toast nature</td>\n",
" <td>[en:salty-snacks]</td>\n",
" <td>(en:cereals-and-potatoes, en:salty-snacks)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>5</th>\n",
" <td>Miellats du Maquis cru d'été 2016</td>\n",
" <td>[en:spreads, en:breakfasts]</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>6</th>\n",
" <td>Cebollas &amp;quot;Juan de Dios&amp;quot;</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:fresh-foods, en:fruits-and-vegetables-base...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>7</th>\n",
" <td>Moules fraîches de Hollande</td>\n",
" <td>[en:seafood]</td>\n",
" <td>(en:seafood,)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>8</th>\n",
" <td>Thé citron</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:bevera...</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:beverages, en:non-sugared-beverages, en:pl...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>9</th>\n",
" <td>Rêves de chocolat Assortiment de chocolats fin...</td>\n",
" <td>[en:sugary-snacks, en:confectioneries, en:dess...</td>\n",
" <td>(en:chocolates, en:desserts, en:sugary-snacks)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>10</th>\n",
" <td>Noir aux éclats de noisettes</td>\n",
" <td>[en:sugary-snacks, en:chocolates]</td>\n",
" <td>(en:chocolates, en:sugary-snacks)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>11</th>\n",
" <td>Couscous royal poulet merguez</td>\n",
" <td>[en:canned-foods, en:meals]</td>\n",
" <td>(en:meals,)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>12</th>\n",
" <td>Baguette Céréales Carrefour</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:cereals-and-potatoes, en:plant-based-foods...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>13</th>\n",
" <td>Pomme Fraise</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:desserts, en:fruits-and-vegetables-based-f...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>14</th>\n",
" <td>Risotto Champignons</td>\n",
" <td>[en:meals]</td>\n",
" <td>(en:meals, en:plant-based-foods, en:plant-base...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>15</th>\n",
" <td>Bio Chocolat Noir aux Éclats de Noisettes</td>\n",
" <td>[en:sugary-snacks, en:chocolates]</td>\n",
" <td>(en:chocolates, en:sugary-snacks)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>16</th>\n",
" <td>Angeliter Zitronenlimonade</td>\n",
" <td>[en:beverages, en:sugared-beverages]</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>17</th>\n",
" <td>Agar-Agar</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>18</th>\n",
" <td>Préparation à l'huile de thym, romarin et laur...</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>19</th>\n",
" <td>Velamints</td>\n",
" <td>[en:sugary-snacks, en:confectioneries]</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>20</th>\n",
" <td>Vegemil Black Bean Soymilk</td>\n",
" <td>[en:dairies]</td>\n",
" <td>(en:plant-based-foods-and-beverages,)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>21</th>\n",
" <td>Carottes Râpées</td>\n",
" <td>[en:meals]</td>\n",
" <td>(en:meals,)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>22</th>\n",
" <td>Budweiser Budvar</td>\n",
" <td>[en:beverages, en:alcoholic-beverages]</td>\n",
" <td>(en:alcoholic-beverages, en:beverages)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>23</th>\n",
" <td>Mayonnaise aux oeufs frais</td>\n",
" <td>[en:groceries, en:sauces]</td>\n",
" <td>(en:groceries, en:sauces)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>24</th>\n",
" <td>Kremsi krem sir</td>\n",
" <td>[en:dairies, en:cheeses]</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>25</th>\n",
" <td>Domaine de l'Echauguette - 2010</td>\n",
" <td>[en:beverages, en:alcoholic-beverages]</td>\n",
" <td>(en:alcoholic-beverages, en:beverages)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>26</th>\n",
" <td>Strathmore Still Spring Water</td>\n",
" <td>[en:beverages, en:non-sugared-beverages]</td>\n",
" <td>(en:beverages, en:non-sugared-beverages)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>27</th>\n",
" <td>Mousse de Viennois (8 Chocolat)</td>\n",
" <td>[en:fresh-foods, en:sugary-snacks, en:dairies,...</td>\n",
" <td>(en:chocolates, en:dairies, en:desserts, en:fr...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>28</th>\n",
" <td>Poulet &amp; Riz basquaise</td>\n",
" <td>[en:meals]</td>\n",
" <td>(en:meals,)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>29</th>\n",
" <td>Le Norvège -25% sel</td>\n",
" <td>[en:seafood]</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>69924</th>\n",
" <td>Cake Release Spray</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:biscuits-and-cakes, en:desserts, en:sugary...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>69925</th>\n",
" <td>Maxi Quenelles à Gratiner, Jambon sauce béchamel</td>\n",
" <td>[en:canned-foods, en:meals]</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69926</th>\n",
" <td>Beurre gastronomique demi-sel</td>\n",
" <td>[en:spreads, en:dairies]</td>\n",
" <td>(en:dairies, en:spreads)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69927</th>\n",
" <td>Jambon à Griller</td>\n",
" <td>[en:meats, en:prepared-meats]</td>\n",
" <td>(en:meats, en:prepared-meats)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69928</th>\n",
" <td>Alpro Oat Almond</td>\n",
" <td>[en:beverages, en:dairies, en:non-sugared-beve...</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69929</th>\n",
" <td>Bière trappiste</td>\n",
" <td>[en:beverages, en:alcoholic-beverages]</td>\n",
" <td>(en:alcoholic-beverages, en:beverages)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69930</th>\n",
" <td>Viré-Clessé 2010</td>\n",
" <td>[en:beverages, en:alcoholic-beverages]</td>\n",
" <td>(en:alcoholic-beverages, en:beverages)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69931</th>\n",
" <td>Cuillers aux œufs frais</td>\n",
" <td>[en:sugary-snacks, en:biscuits-and-cakes, en:b...</td>\n",
" <td>(en:biscuits, en:biscuits-and-cakes, en:sugary...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>69932</th>\n",
" <td>Super Smoothie Antioxidant Innocent</td>\n",
" <td>[en:beverages, en:non-sugared-beverages]</td>\n",
" <td>(en:beverages, en:non-sugared-beverages, en:pl...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69933</th>\n",
" <td>6 petits pains précuits</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:cereals-and-potatoes, en:plant-based-foods...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>69934</th>\n",
" <td>Indian tonic</td>\n",
" <td>[en:beverages]</td>\n",
" <td>(en:beverages, en:sugared-beverages)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69935</th>\n",
" <td>Merlot 2015</td>\n",
" <td>[en:beverages, en:alcoholic-beverages]</td>\n",
" <td>(en:alcoholic-beverages, en:beverages)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>69936</th>\n",
" <td>Jus multivitaminé 11 fruits</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:bevera...</td>\n",
" <td>(en:beverages, en:fruit-based-beverages, en:pl...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69937</th>\n",
" <td>Pain de mie complet</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:cereals-and-potatoes, en:plant-based-foods...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>69938</th>\n",
" <td>Farfalle Zebra</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:cereals-and-potatoes, en:cereals-and-their...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>69939</th>\n",
" <td>Nectarinas</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>69940</th>\n",
" <td>Ail et fines herbes fromage à tartiner (27% MG)</td>\n",
" <td>[en:spreads, en:dairies, en:cheeses]</td>\n",
" <td>(en:cheeses, en:dairies, en:spreads)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>69941</th>\n",
" <td>Ratatouille cuisinée à la Provençale</td>\n",
" <td>[en:canned-foods, en:meals]</td>\n",
" <td>(en:meals,)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>69942</th>\n",
" <td>Confiture De Cerises Noires De Bâle</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:breakfasts, en:fruits-and-vegetables-based...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69943</th>\n",
" <td>Madeleines Marbrees chocolat</td>\n",
" <td>[en:sugary-snacks, en:biscuits-and-cakes, en:d...</td>\n",
" <td>(en:biscuits-and-cakes, en:desserts, en:sugary...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>69944</th>\n",
" <td>Filet de poulet (-25% de sel) (4+2 gratuites)</td>\n",
" <td>[en:meats, en:prepared-meats]</td>\n",
" <td>(en:meats,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69945</th>\n",
" <td>Boisson gazeuse goût framboise</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:bevera...</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:beverages, en:non-sugared-beverages)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69946</th>\n",
" <td>Mini billes Mozzarella di Bufala Campana AOP</td>\n",
" <td>[en:dairies, en:cheeses]</td>\n",
" <td>(en:cheeses, en:dairies)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69947</th>\n",
" <td>Crème fraîche épaisse de Montagne</td>\n",
" <td>[en:dairies]</td>\n",
" <td>(en:dairies,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69948</th>\n",
" <td>Cacahuètes grillées et salées</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69949</th>\n",
" <td>Potato Wedges</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:salty-snacks,)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>69950</th>\n",
" <td>Jus d'orange à base de jus d'orange concentré</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:bevera...</td>\n",
" <td>(en:beverages, en:fruit-based-beverages, en:pl...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>69951</th>\n",
" <td>Fleur de Sel aux Épices Grillées</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69952</th>\n",
" <td>Pétales à la crevette</td>\n",
" <td>[en:salty-snacks]</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69953</th>\n",
" <td>Pommes rissolées Bio</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:frozen-foods, en:fruits-and-vegetables-bas...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>69954 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" product_name \\\n",
"0 Fernandes Cherry Soda - Ga Mee Naar Suruname &... \n",
"1 Compotes allégée en sucres Pommes Carrefour \n",
"2 Tortellini 4 Fromages, LunchBox \n",
"3 Orange and mango squash \n",
"4 Crackers toast nature \n",
"5 Miellats du Maquis cru d'été 2016 \n",
"6 Cebollas &quot;Juan de Dios&quot; \n",
"7 Moules fraîches de Hollande \n",
"8 Thé citron \n",
"9 Rêves de chocolat Assortiment de chocolats fin... \n",
"10 Noir aux éclats de noisettes \n",
"11 Couscous royal poulet merguez \n",
"12 Baguette Céréales Carrefour \n",
"13 Pomme Fraise \n",
"14 Risotto Champignons \n",
"15 Bio Chocolat Noir aux Éclats de Noisettes \n",
"16 Angeliter Zitronenlimonade \n",
"17 Agar-Agar \n",
"18 Préparation à l'huile de thym, romarin et laur... \n",
"19 Velamints \n",
"20 Vegemil Black Bean Soymilk \n",
"21 Carottes Râpées \n",
"22 Budweiser Budvar \n",
"23 Mayonnaise aux oeufs frais \n",
"24 Kremsi krem sir \n",
"25 Domaine de l'Echauguette - 2010 \n",
"26 Strathmore Still Spring Water \n",
"27 Mousse de Viennois (8 Chocolat) \n",
"28 Poulet & Riz basquaise \n",
"29 Le Norvège -25% sel \n",
"... ... \n",
"69924 Cake Release Spray \n",
"69925 Maxi Quenelles à Gratiner, Jambon sauce béchamel \n",
"69926 Beurre gastronomique demi-sel \n",
"69927 Jambon à Griller \n",
"69928 Alpro Oat Almond \n",
"69929 Bière trappiste \n",
"69930 Viré-Clessé 2010 \n",
"69931 Cuillers aux œufs frais \n",
"69932 Super Smoothie Antioxidant Innocent \n",
"69933 6 petits pains précuits \n",
"69934 Indian tonic \n",
"69935 Merlot 2015 \n",
"69936 Jus multivitaminé 11 fruits \n",
"69937 Pain de mie complet \n",
"69938 Farfalle Zebra \n",
"69939 Nectarinas \n",
"69940 Ail et fines herbes fromage à tartiner (27% MG) \n",
"69941 Ratatouille cuisinée à la Provençale \n",
"69942 Confiture De Cerises Noires De Bâle \n",
"69943 Madeleines Marbrees chocolat \n",
"69944 Filet de poulet (-25% de sel) (4+2 gratuites) \n",
"69945 Boisson gazeuse goût framboise \n",
"69946 Mini billes Mozzarella di Bufala Campana AOP \n",
"69947 Crème fraîche épaisse de Montagne \n",
"69948 Cacahuètes grillées et salées \n",
"69949 Potato Wedges \n",
"69950 Jus d'orange à base de jus d'orange concentré \n",
"69951 Fleur de Sel aux Épices Grillées \n",
"69952 Pétales à la crevette \n",
"69953 Pommes rissolées Bio \n",
"\n",
" original_labels \\\n",
"0 [en:beverages, en:sugared-beverages] \n",
"1 [en:plant-based-foods-and-beverages, en:plant-... \n",
"2 [en:plant-based-foods-and-beverages, en:plant-... \n",
"3 [en:beverages] \n",
"4 [en:salty-snacks] \n",
"5 [en:spreads, en:breakfasts] \n",
"6 [en:plant-based-foods-and-beverages, en:plant-... \n",
"7 [en:seafood] \n",
"8 [en:plant-based-foods-and-beverages, en:bevera... \n",
"9 [en:sugary-snacks, en:confectioneries, en:dess... \n",
"10 [en:sugary-snacks, en:chocolates] \n",
"11 [en:canned-foods, en:meals] \n",
"12 [en:plant-based-foods-and-beverages, en:plant-... \n",
"13 [en:plant-based-foods-and-beverages, en:plant-... \n",
"14 [en:meals] \n",
"15 [en:sugary-snacks, en:chocolates] \n",
"16 [en:beverages, en:sugared-beverages] \n",
"17 [en:plant-based-foods-and-beverages, en:plant-... \n",
"18 [en:plant-based-foods-and-beverages, en:plant-... \n",
"19 [en:sugary-snacks, en:confectioneries] \n",
"20 [en:dairies] \n",
"21 [en:meals] \n",
"22 [en:beverages, en:alcoholic-beverages] \n",
"23 [en:groceries, en:sauces] \n",
"24 [en:dairies, en:cheeses] \n",
"25 [en:beverages, en:alcoholic-beverages] \n",
"26 [en:beverages, en:non-sugared-beverages] \n",
"27 [en:fresh-foods, en:sugary-snacks, en:dairies,... \n",
"28 [en:meals] \n",
"29 [en:seafood] \n",
"... ... \n",
"69924 [en:plant-based-foods-and-beverages, en:plant-... \n",
"69925 [en:canned-foods, en:meals] \n",
"69926 [en:spreads, en:dairies] \n",
"69927 [en:meats, en:prepared-meats] \n",
"69928 [en:beverages, en:dairies, en:non-sugared-beve... \n",
"69929 [en:beverages, en:alcoholic-beverages] \n",
"69930 [en:beverages, en:alcoholic-beverages] \n",
"69931 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n",
"69932 [en:beverages, en:non-sugared-beverages] \n",
"69933 [en:plant-based-foods-and-beverages, en:plant-... \n",
"69934 [en:beverages] \n",
"69935 [en:beverages, en:alcoholic-beverages] \n",
"69936 [en:plant-based-foods-and-beverages, en:bevera... \n",
"69937 [en:plant-based-foods-and-beverages, en:plant-... \n",
"69938 [en:plant-based-foods-and-beverages, en:plant-... \n",
"69939 [en:plant-based-foods-and-beverages, en:plant-... \n",
"69940 [en:spreads, en:dairies, en:cheeses] \n",
"69941 [en:canned-foods, en:meals] \n",
"69942 [en:plant-based-foods-and-beverages, en:plant-... \n",
"69943 [en:sugary-snacks, en:biscuits-and-cakes, en:d... \n",
"69944 [en:meats, en:prepared-meats] \n",
"69945 [en:plant-based-foods-and-beverages, en:bevera... \n",
"69946 [en:dairies, en:cheeses] \n",
"69947 [en:dairies] \n",
"69948 [en:plant-based-foods-and-beverages, en:plant-... \n",
"69949 [en:plant-based-foods-and-beverages, en:plant-... \n",
"69950 [en:plant-based-foods-and-beverages, en:bevera... \n",
"69951 [en:plant-based-foods-and-beverages, en:plant-... \n",
"69952 [en:salty-snacks] \n",
"69953 [en:plant-based-foods-and-beverages, en:plant-... \n",
"\n",
" guessed_labels \n",
"0 (en:beverages, en:plant-based-foods-and-bevera... \n",
"1 (en:desserts, en:fruits-and-vegetables-based-f... \n",
"2 (en:cereals-and-potatoes, en:cereals-and-their... \n",
"3 (en:beverages, en:plant-based-foods-and-bevera... \n",
"4 (en:cereals-and-potatoes, en:salty-snacks) \n",
"5 () \n",
"6 (en:fresh-foods, en:fruits-and-vegetables-base... \n",
"7 (en:seafood,) \n",
"8 (en:beverages, en:non-sugared-beverages, en:pl... \n",
"9 (en:chocolates, en:desserts, en:sugary-snacks) \n",
"10 (en:chocolates, en:sugary-snacks) \n",
"11 (en:meals,) \n",
"12 (en:cereals-and-potatoes, en:plant-based-foods... \n",
"13 (en:desserts, en:fruits-and-vegetables-based-f... \n",
"14 (en:meals, en:plant-based-foods, en:plant-base... \n",
"15 (en:chocolates, en:sugary-snacks) \n",
"16 () \n",
"17 () \n",
"18 (en:plant-based-foods, en:plant-based-foods-an... \n",
"19 () \n",
"20 (en:plant-based-foods-and-beverages,) \n",
"21 (en:meals,) \n",
"22 (en:alcoholic-beverages, en:beverages) \n",
"23 (en:groceries, en:sauces) \n",
"24 () \n",
"25 (en:alcoholic-beverages, en:beverages) \n",
"26 (en:beverages, en:non-sugared-beverages) \n",
"27 (en:chocolates, en:dairies, en:desserts, en:fr... \n",
"28 (en:meals,) \n",
"29 () \n",
"... ... \n",
"69924 (en:biscuits-and-cakes, en:desserts, en:sugary... \n",
"69925 () \n",
"69926 (en:dairies, en:spreads) \n",
"69927 (en:meats, en:prepared-meats) \n",
"69928 () \n",
"69929 (en:alcoholic-beverages, en:beverages) \n",
"69930 (en:alcoholic-beverages, en:beverages) \n",
"69931 (en:biscuits, en:biscuits-and-cakes, en:sugary... \n",
"69932 (en:beverages, en:non-sugared-beverages, en:pl... \n",
"69933 (en:cereals-and-potatoes, en:plant-based-foods... \n",
"69934 (en:beverages, en:sugared-beverages) \n",
"69935 (en:alcoholic-beverages, en:beverages) \n",
"69936 (en:beverages, en:fruit-based-beverages, en:pl... \n",
"69937 (en:cereals-and-potatoes, en:plant-based-foods... \n",
"69938 (en:cereals-and-potatoes, en:cereals-and-their... \n",
"69939 () \n",
"69940 (en:cheeses, en:dairies, en:spreads) \n",
"69941 (en:meals,) \n",
"69942 (en:breakfasts, en:fruits-and-vegetables-based... \n",
"69943 (en:biscuits-and-cakes, en:desserts, en:sugary... \n",
"69944 (en:meats,) \n",
"69945 (en:beverages, en:non-sugared-beverages) \n",
"69946 (en:cheeses, en:dairies) \n",
"69947 (en:dairies,) \n",
"69948 (en:plant-based-foods, en:plant-based-foods-an... \n",
"69949 (en:salty-snacks,) \n",
"69950 (en:beverages, en:fruit-based-beverages, en:pl... \n",
"69951 () \n",
"69952 () \n",
"69953 (en:frozen-foods, en:fruits-and-vegetables-bas... \n",
"\n",
"[69954 rows x 3 columns]"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"testing_dataframe = pandas.DataFrame({\n",
" 'product_name': X_test,\n",
" 'original_labels': Y_test,\n",
" 'guessed_labels': all_labels\n",
"}, columns=['product_name', 'original_labels', 'guessed_labels'])\n",
"testing_dataframe.to_csv('testing.csv', index=False)\n",
"testing_dataframe"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>product_name</th>\n",
" <th>good_guessed_labels</th>\n",
" <th>extra_guessed_labels</th>\n",
" <th>missing_guessed_labels</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Fernandes Cherry Soda - Ga Mee Naar Suruname &amp;...</td>\n",
" <td>{en:beverages}</td>\n",
" <td>[en:plant-based-foods-and-beverages]</td>\n",
" <td>[en:sugared-beverages]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Miellats du Maquis cru d'été 2016</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:spreads, en:breakfasts]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Thé citron</td>\n",
" <td>{en:non-sugared-beverages, en:plant-based-food...</td>\n",
" <td>[]</td>\n",
" <td>[en:plant-based-foods]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Rêves de chocolat Assortiment de chocolats fin...</td>\n",
" <td>{en:chocolates, en:desserts, en:sugary-snacks}</td>\n",
" <td>[]</td>\n",
" <td>[en:confectioneries]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Couscous royal poulet merguez</td>\n",
" <td>{en:meals}</td>\n",
" <td>[]</td>\n",
" <td>[en:canned-foods]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Baguette Céréales Carrefour</td>\n",
" <td>{en:plant-based-foods, en:cereals-and-potatoes...</td>\n",
" <td>[]</td>\n",
" <td>[en:cereals-and-their-products]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Angeliter Zitronenlimonade</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:beverages, en:sugared-beverages]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Agar-Agar</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Velamints</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:sugary-snacks, en:confectioneries]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Vegemil Black Bean Soymilk</td>\n",
" <td>{}</td>\n",
" <td>[en:plant-based-foods-and-beverages]</td>\n",
" <td>[en:dairies]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Kremsi krem sir</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:dairies, en:cheeses]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Le Norvège -25% sel</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:seafood]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Petit Munster Géromé</td>\n",
" <td>{en:cheeses}</td>\n",
" <td>[]</td>\n",
" <td>[en:dairies]</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>13</th>\n",
" <td>Nectarines jaunes</td>\n",
" <td>{en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" <td>[]</td>\n",
" <td>[en:fruits-and-vegetables-based-foods, en:frui...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Carotte nouvelle</td>\n",
" <td>{en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" <td>[]</td>\n",
" <td>[en:fresh-foods, en:fruits-and-vegetables-base...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Apple Real Fruit Pie</td>\n",
" <td>{}</td>\n",
" <td>[en:plant-based-foods-and-beverages]</td>\n",
" <td>[en:sugary-snacks]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Jus de pomme</td>\n",
" <td>{en:plant-based-foods-and-beverages, en:fruit-...</td>\n",
" <td>[]</td>\n",
" <td>[en:non-sugared-beverages]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Roquefort Premium</td>\n",
" <td>{en:dairies}</td>\n",
" <td>[]</td>\n",
" <td>[en:cheeses]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Antésite concentré de Réglisse Menthe</td>\n",
" <td>{}</td>\n",
" <td>[en:plant-based-foods-and-beverages]</td>\n",
" <td>[en:beverages, en:non-sugared-beverages]</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>19</th>\n",
" <td>Menta piperita en bolsitas</td>\n",
" <td>{en:non-sugared-beverages, en:plant-based-food...</td>\n",
" <td>[]</td>\n",
" <td>[en:plant-based-foods, en:groceries, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Demi-Lunes Cèpes aux oeufs frais</td>\n",
" <td>{en:plant-based-foods, en:cereals-and-potatoes...</td>\n",
" <td>[]</td>\n",
" <td>[en:fresh-foods, en:meals]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>Sirop de Grenadine Bio</td>\n",
" <td>{en:beverages}</td>\n",
" <td>[en:sugared-beverages]</td>\n",
" <td>[en:non-sugared-beverages]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>Gulaschsuppe</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:meals]</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>23</th>\n",
" <td>Déli'Pocket Montagnard</td>\n",
" <td>{}</td>\n",
" <td>[en:biscuits, en:biscuits-and-cakes, en:sugary...</td>\n",
" <td>[en:frozen-foods]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>Orzo by Sainsbury's</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>25</th>\n",
" <td>Vive soy Vainilla</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:bevera...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>26</th>\n",
" <td>Excellence 99% Cacao Noir Absolu</td>\n",
" <td>{en:chocolates, en:sugary-snacks}</td>\n",
" <td>[]</td>\n",
" <td>[en:confectioneries]</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>27</th>\n",
" <td>nan</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:sugary-snacks, en:chocolates]</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>28</th>\n",
" <td>jupiter</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:sugary-snacks, en:confectioneries]</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>29</th>\n",
" <td>Faux-Filet</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:meats]</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>29197</th>\n",
" <td>Noix d'épaule cuite choix désossée</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:canned-foods]</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>29198</th>\n",
" <td>Lait de coco</td>\n",
" <td>{en:non-sugared-beverages, en:beverages, en:pl...</td>\n",
" <td>[]</td>\n",
" <td>[en:plant-based-foods-and-beverages]</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>29199</th>\n",
" <td>Tagliatelles au poulet</td>\n",
" <td>{en:meals}</td>\n",
" <td>[en:cereals-and-potatoes, en:cereals-and-their...</td>\n",
" <td>[en:fresh-foods]</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>29200</th>\n",
" <td>Dim Sum Shao Mai</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>29201</th>\n",
" <td>Waffel Blätter mit Vollmilchschokolade</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:sugary-snacks, en:biscuits-and-cakes, en:b...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>29202</th>\n",
" <td>Génépi de Savoie L'Ancienne</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:beverages, en:alcoholic-beverages]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29203</th>\n",
" <td>6 carrés fourrés, saveur amande</td>\n",
" <td>{en:biscuits-and-cakes, en:sugary-snacks}</td>\n",
" <td>[]</td>\n",
" <td>[en:desserts]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29204</th>\n",
" <td>Multivitamins</td>\n",
" <td>{en:plant-based-foods-and-beverages}</td>\n",
" <td>[]</td>\n",
" <td>[en:beverages, en:plant-based-beverages, en:no...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29205</th>\n",
" <td>Instantané aux châtaignes</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>29206</th>\n",
" <td>Nuggets de Poulet</td>\n",
" <td>{en:meats}</td>\n",
" <td>[]</td>\n",
" <td>[en:fresh-foods, en:meals]</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>29207</th>\n",
" <td>Jus d'orange</td>\n",
" <td>{en:beverages}</td>\n",
" <td>[en:fruit-based-beverages, en:plant-based-beve...</td>\n",
" <td>[en:non-sugared-beverages]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29208</th>\n",
" <td>Зефир с ароматом крем-брюле</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:sugary-snacks]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29209</th>\n",
" <td>Coca-cola</td>\n",
" <td>{en:beverages}</td>\n",
" <td>[en:sugared-beverages]</td>\n",
" <td>[en:non-sugared-beverages]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29210</th>\n",
" <td>Citron vert bio</td>\n",
" <td>{}</td>\n",
" <td>[en:beverages]</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29211</th>\n",
" <td>Weiße Riesenbohnen</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29212</th>\n",
" <td>Haricots Verts à la Périgourdine</td>\n",
" <td>{en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" <td>[en:fruits-and-vegetables-based-foods, en:vege...</td>\n",
" <td>[en:frozen-foods]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29213</th>\n",
" <td>Steaks Hachés Charolais Façon Bouchère</td>\n",
" <td>{en:meats}</td>\n",
" <td>[]</td>\n",
" <td>[en:frozen-foods]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29214</th>\n",
" <td>Tortilla Wraps Plain</td>\n",
" <td>{}</td>\n",
" <td>[en:salty-snacks]</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29215</th>\n",
" <td>nan</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:bevera...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29216</th>\n",
" <td>Korma de légumes BIO</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:meals]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29217</th>\n",
" <td>Cake Release Spray</td>\n",
" <td>{}</td>\n",
" <td>[en:biscuits-and-cakes, en:desserts, en:sugary...</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29218</th>\n",
" <td>Maxi Quenelles à Gratiner, Jambon sauce béchamel</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:canned-foods, en:meals]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29219</th>\n",
" <td>Alpro Oat Almond</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:beverages, en:dairies, en:non-sugared-beve...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29220</th>\n",
" <td>Nectarinas</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29221</th>\n",
" <td>Ratatouille cuisinée à la Provençale</td>\n",
" <td>{en:meals}</td>\n",
" <td>[]</td>\n",
" <td>[en:canned-foods]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29222</th>\n",
" <td>Filet de poulet (-25% de sel) (4+2 gratuites)</td>\n",
" <td>{en:meats}</td>\n",
" <td>[]</td>\n",
" <td>[en:prepared-meats]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29223</th>\n",
" <td>Boisson gazeuse goût framboise</td>\n",
" <td>{en:beverages}</td>\n",
" <td>[en:non-sugared-beverages]</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29224</th>\n",
" <td>Potato Wedges</td>\n",
" <td>{}</td>\n",
" <td>[en:salty-snacks]</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29225</th>\n",
" <td>Fleur de Sel aux Épices Grillées</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29226</th>\n",
" <td>Pétales à la crevette</td>\n",
" <td>{}</td>\n",
" <td>[]</td>\n",
" <td>[en:salty-snacks]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
2017-09-26 21:28:44 +02:00
"<p>29227 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
2017-09-26 21:28:44 +02:00
" product_name \\\n",
"0 Fernandes Cherry Soda - Ga Mee Naar Suruname &... \n",
"1 Miellats du Maquis cru d'été 2016 \n",
"2 Thé citron \n",
"3 Rêves de chocolat Assortiment de chocolats fin... \n",
"4 Couscous royal poulet merguez \n",
"5 Baguette Céréales Carrefour \n",
"6 Angeliter Zitronenlimonade \n",
"7 Agar-Agar \n",
"8 Velamints \n",
"9 Vegemil Black Bean Soymilk \n",
"10 Kremsi krem sir \n",
"11 Le Norvège -25% sel \n",
"12 Petit Munster Géromé \n",
"13 Nectarines jaunes \n",
"14 Carotte nouvelle \n",
"15 Apple Real Fruit Pie \n",
"16 Jus de pomme \n",
"17 Roquefort Premium \n",
"18 Antésite concentré de Réglisse Menthe \n",
"19 Menta piperita en bolsitas \n",
"20 Demi-Lunes Cèpes aux oeufs frais \n",
"21 Sirop de Grenadine Bio \n",
"22 Gulaschsuppe \n",
"23 Déli'Pocket Montagnard \n",
"24 Orzo by Sainsbury's \n",
"25 Vive soy Vainilla \n",
"26 Excellence 99% Cacao Noir Absolu \n",
"27 nan \n",
"28 jupiter \n",
"29 Faux-Filet \n",
"... ... \n",
"29197 Noix d'épaule cuite choix désossée \n",
"29198 Lait de coco \n",
"29199 Tagliatelles au poulet \n",
"29200 Dim Sum Shao Mai \n",
"29201 Waffel Blätter mit Vollmilchschokolade \n",
"29202 Génépi de Savoie L'Ancienne \n",
"29203 6 carrés fourrés, saveur amande \n",
"29204 Multivitamins \n",
"29205 Instantané aux châtaignes \n",
"29206 Nuggets de Poulet \n",
"29207 Jus d'orange \n",
"29208 Зефир с ароматом крем-брюле \n",
"29209 Coca-cola \n",
"29210 Citron vert bio \n",
"29211 Weiße Riesenbohnen \n",
"29212 Haricots Verts à la Périgourdine \n",
"29213 Steaks Hachés Charolais Façon Bouchère \n",
"29214 Tortilla Wraps Plain \n",
"29215 nan \n",
"29216 Korma de légumes BIO \n",
"29217 Cake Release Spray \n",
"29218 Maxi Quenelles à Gratiner, Jambon sauce béchamel \n",
"29219 Alpro Oat Almond \n",
"29220 Nectarinas \n",
"29221 Ratatouille cuisinée à la Provençale \n",
"29222 Filet de poulet (-25% de sel) (4+2 gratuites) \n",
"29223 Boisson gazeuse goût framboise \n",
"29224 Potato Wedges \n",
"29225 Fleur de Sel aux Épices Grillées \n",
"29226 Pétales à la crevette \n",
"\n",
2017-09-26 21:28:44 +02:00
" good_guessed_labels \\\n",
"0 {en:beverages} \n",
"1 {} \n",
"2 {en:non-sugared-beverages, en:plant-based-food... \n",
"3 {en:chocolates, en:desserts, en:sugary-snacks} \n",
"4 {en:meals} \n",
"5 {en:plant-based-foods, en:cereals-and-potatoes... \n",
"6 {} \n",
"7 {} \n",
"8 {} \n",
"9 {} \n",
"10 {} \n",
"11 {} \n",
"12 {en:cheeses} \n",
"13 {en:plant-based-foods, en:plant-based-foods-an... \n",
"14 {en:plant-based-foods, en:plant-based-foods-an... \n",
"15 {} \n",
"16 {en:plant-based-foods-and-beverages, en:fruit-... \n",
"17 {en:dairies} \n",
"18 {} \n",
"19 {en:non-sugared-beverages, en:plant-based-food... \n",
"20 {en:plant-based-foods, en:cereals-and-potatoes... \n",
"21 {en:beverages} \n",
"22 {} \n",
"23 {} \n",
"24 {} \n",
"25 {} \n",
"26 {en:chocolates, en:sugary-snacks} \n",
"27 {} \n",
"28 {} \n",
"29 {} \n",
"... ... \n",
"29197 {} \n",
"29198 {en:non-sugared-beverages, en:beverages, en:pl... \n",
"29199 {en:meals} \n",
"29200 {} \n",
"29201 {} \n",
"29202 {} \n",
"29203 {en:biscuits-and-cakes, en:sugary-snacks} \n",
"29204 {en:plant-based-foods-and-beverages} \n",
"29205 {} \n",
"29206 {en:meats} \n",
"29207 {en:beverages} \n",
"29208 {} \n",
"29209 {en:beverages} \n",
"29210 {} \n",
"29211 {} \n",
"29212 {en:plant-based-foods, en:plant-based-foods-an... \n",
"29213 {en:meats} \n",
"29214 {} \n",
"29215 {} \n",
"29216 {} \n",
"29217 {} \n",
"29218 {} \n",
"29219 {} \n",
"29220 {} \n",
"29221 {en:meals} \n",
"29222 {en:meats} \n",
"29223 {en:beverages} \n",
"29224 {} \n",
"29225 {} \n",
"29226 {} \n",
"\n",
2017-09-26 21:28:44 +02:00
" extra_guessed_labels \\\n",
"0 [en:plant-based-foods-and-beverages] \n",
"1 [] \n",
"2 [] \n",
"3 [] \n",
"4 [] \n",
"5 [] \n",
"6 [] \n",
"7 [] \n",
"8 [] \n",
"9 [en:plant-based-foods-and-beverages] \n",
"10 [] \n",
"11 [] \n",
"12 [] \n",
"13 [] \n",
"14 [] \n",
"15 [en:plant-based-foods-and-beverages] \n",
"16 [] \n",
"17 [] \n",
"18 [en:plant-based-foods-and-beverages] \n",
"19 [] \n",
"20 [] \n",
"21 [en:sugared-beverages] \n",
"22 [] \n",
"23 [en:biscuits, en:biscuits-and-cakes, en:sugary... \n",
"24 [] \n",
"25 [] \n",
"26 [] \n",
"27 [] \n",
"28 [] \n",
"29 [] \n",
"... ... \n",
"29197 [] \n",
"29198 [] \n",
"29199 [en:cereals-and-potatoes, en:cereals-and-their... \n",
"29200 [] \n",
"29201 [] \n",
"29202 [] \n",
"29203 [] \n",
"29204 [] \n",
"29205 [] \n",
"29206 [] \n",
"29207 [en:fruit-based-beverages, en:plant-based-beve... \n",
"29208 [] \n",
"29209 [en:sugared-beverages] \n",
"29210 [en:beverages] \n",
"29211 [] \n",
"29212 [en:fruits-and-vegetables-based-foods, en:vege... \n",
"29213 [] \n",
"29214 [en:salty-snacks] \n",
"29215 [] \n",
"29216 [] \n",
"29217 [en:biscuits-and-cakes, en:desserts, en:sugary... \n",
"29218 [] \n",
"29219 [] \n",
"29220 [] \n",
"29221 [] \n",
"29222 [] \n",
"29223 [en:non-sugared-beverages] \n",
"29224 [en:salty-snacks] \n",
"29225 [] \n",
"29226 [] \n",
"\n",
" missing_guessed_labels \n",
"0 [en:sugared-beverages] \n",
"1 [en:spreads, en:breakfasts] \n",
"2 [en:plant-based-foods] \n",
"3 [en:confectioneries] \n",
"4 [en:canned-foods] \n",
"5 [en:cereals-and-their-products] \n",
"6 [en:beverages, en:sugared-beverages] \n",
"7 [en:plant-based-foods-and-beverages, en:plant-... \n",
"8 [en:sugary-snacks, en:confectioneries] \n",
"9 [en:dairies] \n",
"10 [en:dairies, en:cheeses] \n",
"11 [en:seafood] \n",
"12 [en:dairies] \n",
"13 [en:fruits-and-vegetables-based-foods, en:frui... \n",
"14 [en:fresh-foods, en:fruits-and-vegetables-base... \n",
"15 [en:sugary-snacks] \n",
"16 [en:non-sugared-beverages] \n",
"17 [en:cheeses] \n",
"18 [en:beverages, en:non-sugared-beverages] \n",
"19 [en:plant-based-foods, en:groceries, en:plant-... \n",
"20 [en:fresh-foods, en:meals] \n",
"21 [en:non-sugared-beverages] \n",
"22 [en:meals] \n",
"23 [en:frozen-foods] \n",
"24 [en:plant-based-foods-and-beverages, en:plant-... \n",
"25 [en:plant-based-foods-and-beverages, en:bevera... \n",
"26 [en:confectioneries] \n",
"27 [en:sugary-snacks, en:chocolates] \n",
"28 [en:sugary-snacks, en:confectioneries] \n",
"29 [en:meats] \n",
"... ... \n",
"29197 [en:canned-foods] \n",
"29198 [en:plant-based-foods-and-beverages] \n",
"29199 [en:fresh-foods] \n",
"29200 [en:plant-based-foods-and-beverages, en:plant-... \n",
"29201 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n",
"29202 [en:beverages, en:alcoholic-beverages] \n",
"29203 [en:desserts] \n",
"29204 [en:beverages, en:plant-based-beverages, en:no... \n",
"29205 [en:plant-based-foods-and-beverages, en:plant-... \n",
"29206 [en:fresh-foods, en:meals] \n",
"29207 [en:non-sugared-beverages] \n",
"29208 [en:sugary-snacks] \n",
"29209 [en:non-sugared-beverages] \n",
"29210 [en:plant-based-foods-and-beverages, en:plant-... \n",
"29211 [en:plant-based-foods-and-beverages, en:plant-... \n",
"29212 [en:frozen-foods] \n",
"29213 [en:frozen-foods] \n",
"29214 [en:plant-based-foods-and-beverages, en:plant-... \n",
"29215 [en:plant-based-foods-and-beverages, en:bevera... \n",
"29216 [en:meals] \n",
"29217 [en:plant-based-foods-and-beverages, en:plant-... \n",
"29218 [en:canned-foods, en:meals] \n",
"29219 [en:beverages, en:dairies, en:non-sugared-beve... \n",
"29220 [en:plant-based-foods-and-beverages, en:plant-... \n",
"29221 [en:canned-foods] \n",
"29222 [en:prepared-meats] \n",
"29223 [en:plant-based-foods-and-beverages, en:plant-... \n",
"29224 [en:plant-based-foods-and-beverages, en:plant-... \n",
"29225 [en:plant-based-foods-and-beverages, en:plant-... \n",
"29226 [en:salty-snacks] \n",
"\n",
2017-09-26 21:28:44 +02:00
"[29227 rows x 4 columns]"
]
},
2017-09-26 21:28:44 +02:00
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
2017-09-23 02:16:39 +02:00
"source": [
2017-09-26 21:28:44 +02:00
"testing_diffs = []\n",
"for _, row in testing_dataframe.iterrows():\n",
" diff = set(row.original_labels) - set(row.guessed_labels)\n",
" if len(diff) > 0:\n",
" testing_diffs.append({\n",
" 'product_name': row.product_name,\n",
" 'good_guessed_labels': set(row.original_labels) - diff,\n",
" 'extra_guessed_labels': [label for label in row.guessed_labels if label not in row.original_labels],\n",
" 'missing_guessed_labels': [label for label in row.original_labels if label not in row.guessed_labels]\n",
" })\n",
"pandas.DataFrame(testing_diffs,\n",
" columns=['product_name', 'good_guessed_labels', 'extra_guessed_labels', 'missing_guessed_labels'])"
2017-09-23 02:16:39 +02:00
]
},
{
"cell_type": "code",
2017-09-26 21:28:44 +02:00
"execution_count": 87,
2017-09-23 02:16:39 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['offClassifier.pkl']"
2017-09-23 02:16:39 +02:00
]
},
2017-09-26 21:28:44 +02:00
"execution_count": 87,
2017-09-23 02:16:39 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Dump the classifier\n",
"joblib.dump((mlb, classifier), 'offClassifier.pkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Predict!"
]
},
{
"cell_type": "code",
2017-09-26 21:28:44 +02:00
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"def batch(iterable, size):\n",
" \"\"\"\n",
" Get items from a sequence a batch at a time.\n",
2017-09-23 02:16:39 +02:00
"\n",
" :param iterable: The iterable to get the items from.\n",
" :param size: The size of the batches.\n",
" :return: A new iterable.\n",
" \"\"\"\n",
" sourceiter = iter(iterable)\n",
" while True:\n",
" batchiter = itertools.islice(sourceiter, size)\n",
" yield itertools.chain([next(batchiter)], batchiter)"
2017-09-23 02:16:39 +02:00
]
},
{
"cell_type": "code",
2017-09-26 21:28:44 +02:00
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"# Load the classifier\n",
"mlb, classifier = joblib.load('offClassifier.pkl')"
]
},
{
"cell_type": "code",
2017-09-26 21:28:44 +02:00
"execution_count": 101,
2017-09-23 02:16:39 +02:00
"metadata": {},
"outputs": [
{
"name": "stderr",
2017-09-23 02:16:39 +02:00
"output_type": "stream",
"text": [
2017-09-26 21:28:44 +02:00
"/Users/lverney/.local/share/virtualenvs/machine_learning/lib/python3.6/site-packages/ipykernel_launcher.py:5: DeprecationWarning: generator 'batch' raised StopIteration\n",
" \"\"\"\n"
2017-09-23 02:16:39 +02:00
]
}
],
"source": [
"X_predicted = products_without_categories['product_name'].values.astype('U')\n",
"\n",
"all_labels = []\n",
"\n",
"for i in batch(X_predicted, 30000):\n",
" predicted = classifier.predict(list(i))\n",
" all_labels.extend(mlb.inverse_transform(predicted))"
2017-09-23 02:16:39 +02:00
]
},
{
"cell_type": "code",
2017-09-26 21:28:44 +02:00
"execution_count": 102,
2017-09-23 02:16:39 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>product_name</th>\n",
" <th>guessed_labels</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Farine de blé noir</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:cereals-and-potatoes, en:cereals-and-their...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Banana Chips Sweetened (Whole)</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:appetizers, en:chips-and-fries, en:crisps,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Peanuts</td>\n",
" <td>(en:legumes, en:legumes-and-their-products, en...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Organic Salted Nut Mix</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Organic Polenta</td>\n",
" <td>(en:cereals-and-potatoes, en:cereals-and-their...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Breadshop Honey Gone Nuts Granola</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:breakfasts, en:cereals-and-potatoes, en:ce...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Organic Long Grain White Rice</td>\n",
" <td>(en:cereal-grains, en:cereals-and-potatoes, en...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Organic Muesli</td>\n",
" <td>(en:breakfast-cereals, en:breakfasts, en:cerea...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Organic Dark Chocolate Minis</td>\n",
" <td>(en:chocolates, en:dark-chocolates, en:sugary-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Organic Sunflower Oil</td>\n",
" <td>(en:fats, en:plant-based-foods, en:plant-based...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Organic Adzuki Beans</td>\n",
" <td>(en:legumes-and-their-products, en:plant-based...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Organic Penne Pasta</td>\n",
" <td>(en:cereals-and-potatoes, en:cereals-and-their...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Zen Party Mix</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Organic Golden Flax Seeds</td>\n",
" <td>(en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Organic Spicy Punks</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Cinnamon Nut Granola</td>\n",
" <td>(en:breakfast-cereals, en:breakfasts, en:cerea...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Organic Hazelnuts</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:sugary-snacks,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Organic Sweetened Banana Chips</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:appetizers, en:chips-and-fries, en:crisps,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Lotus Organic Brown Jasmine Rice</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:cereals-and-potatoes, en:cereals-and-their...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Organic Oat Groats</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Energy Power Mix</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:beverages,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>Antioxidant Mix - Berries &amp; Chocolate</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:sugary-snacks,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>Organic Quinoa Coconut Granola With Mango</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:cereals-and-potatoes, en:plant-based-foods...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>Fire Roasted Hatch Green Chile Almonds</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>Peanut Butter Power Chews</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:plant-based-foods, en:spreads)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>Real Salt Granular</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:groceries,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>Organic Unswt Berry Coconut Granola</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:plant-based-foods-and-beverages,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>Roasted Salted Black Pepper Cashews</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:appetizers, en:plant-based-foods-and-bever...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>Thai Curry Roasted Cashews</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>Wasabi Tamari Almonds</td>\n",
2017-09-26 21:28:44 +02:00
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254702</th>\n",
" <td>Fairy Tail</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254703</th>\n",
" <td>Biscuits aux céréales, aux pépites de chocolat...</td>\n",
" <td>(en:biscuits, en:biscuits-and-cakes, en:sugary...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254704</th>\n",
" <td>Dico anglais</td>\n",
" <td>(en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254705</th>\n",
" <td>Neszt Cochon Con</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254706</th>\n",
" <td>Drid apricot the queen</td>\n",
" <td>(en:beverages,)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254707</th>\n",
" <td>Natural Cassava</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:beverages,)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254708</th>\n",
" <td>nan</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254709</th>\n",
" <td>nan</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254710</th>\n",
" <td>Soda 1</td>\n",
" <td>(en:beverages, en:carbonated-drinks, en:sodas,...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254711</th>\n",
" <td>Merci 1</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:candies, en:chocolates, en:confectioneries...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254712</th>\n",
" <td>Merci2</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254713</th>\n",
" <td>Merci3</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254714</th>\n",
" <td>Libro parachute3</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254715</th>\n",
" <td>nan</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254716</th>\n",
" <td>Vegan easy</td>\n",
2017-09-26 21:28:44 +02:00
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254717</th>\n",
" <td>Tarifs djoghrafia</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254718</th>\n",
" <td>nan</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254719</th>\n",
" <td>Ferrero Rocher</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:chocolates,)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254720</th>\n",
" <td>nan</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254721</th>\n",
" <td>Raspados Ice Bars</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:sugary-snacks,)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254722</th>\n",
" <td>nf test</td>\n",
2017-09-26 21:28:44 +02:00
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254723</th>\n",
" <td>Amandes</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:nuts-and-their-products, en:plant-based-fo...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254724</th>\n",
" <td>Mleko wiejskie</td>\n",
" <td>(en:dairies, en:milks)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254725</th>\n",
" <td>Poireaux</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:fruits-and-vegetables-based-foods, en:plan...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254726</th>\n",
" <td>Cheese cake thé vert, yuzu</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:herbal-teas,)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254727</th>\n",
" <td>Tomato &amp; ricotta</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254728</th>\n",
" <td>Mint Melange Tea A Blend Of Peppermint, Lemon ...</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:beverages, en:non-sugared-beverages)</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254729</th>\n",
" <td>Biscottes bio</td>\n",
" <td>(en:breads, en:cereals-and-potatoes, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254730</th>\n",
" <td>Tomates aux Vermicelles</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:fruits-and-vegetables-based-foods, en:plan...</td>\n",
" </tr>\n",
" <tr>\n",
2017-09-26 21:28:44 +02:00
" <th>254731</th>\n",
" <td>Sugar Free Drink Mix, Peach Tea</td>\n",
2017-09-26 21:28:44 +02:00
" <td>(en:beverages, en:plant-based-beverages, en:su...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
2017-09-26 21:28:44 +02:00
"<p>254732 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" product_name \\\n",
"0 Farine de blé noir \n",
"1 Banana Chips Sweetened (Whole) \n",
"2 Peanuts \n",
"3 Organic Salted Nut Mix \n",
"4 Organic Polenta \n",
"5 Breadshop Honey Gone Nuts Granola \n",
"6 Organic Long Grain White Rice \n",
"7 Organic Muesli \n",
"8 Organic Dark Chocolate Minis \n",
"9 Organic Sunflower Oil \n",
"10 Organic Adzuki Beans \n",
"11 Organic Penne Pasta \n",
"12 Zen Party Mix \n",
"13 Organic Golden Flax Seeds \n",
"14 Organic Spicy Punks \n",
"15 Cinnamon Nut Granola \n",
"16 Organic Hazelnuts \n",
"17 Organic Sweetened Banana Chips \n",
"18 Lotus Organic Brown Jasmine Rice \n",
"19 Organic Oat Groats \n",
"20 Energy Power Mix \n",
"21 Antioxidant Mix - Berries & Chocolate \n",
"22 Organic Quinoa Coconut Granola With Mango \n",
"23 Fire Roasted Hatch Green Chile Almonds \n",
"24 Peanut Butter Power Chews \n",
"25 Real Salt Granular \n",
"26 Organic Unswt Berry Coconut Granola \n",
"27 Roasted Salted Black Pepper Cashews \n",
"28 Thai Curry Roasted Cashews \n",
"29 Wasabi Tamari Almonds \n",
"... ... \n",
2017-09-26 21:28:44 +02:00
"254702 Fairy Tail \n",
"254703 Biscuits aux céréales, aux pépites de chocolat... \n",
"254704 Dico anglais \n",
"254705 Neszt Cochon Con \n",
"254706 Drid apricot the queen \n",
"254707 Natural Cassava \n",
"254708 nan \n",
"254709 nan \n",
"254710 Soda 1 \n",
"254711 Merci 1 \n",
"254712 Merci2 \n",
"254713 Merci3 \n",
"254714 Libro parachute3 \n",
"254715 nan \n",
"254716 Vegan easy \n",
"254717 Tarifs djoghrafia \n",
"254718 nan \n",
"254719 Ferrero Rocher \n",
"254720 nan \n",
"254721 Raspados Ice Bars \n",
"254722 nf test \n",
"254723 Amandes \n",
"254724 Mleko wiejskie \n",
"254725 Poireaux \n",
"254726 Cheese cake thé vert, yuzu \n",
"254727 Tomato & ricotta \n",
"254728 Mint Melange Tea A Blend Of Peppermint, Lemon ... \n",
"254729 Biscottes bio \n",
"254730 Tomates aux Vermicelles \n",
"254731 Sugar Free Drink Mix, Peach Tea \n",
"\n",
" guessed_labels \n",
2017-09-26 21:28:44 +02:00
"0 (en:cereals-and-potatoes, en:cereals-and-their... \n",
"1 (en:appetizers, en:chips-and-fries, en:crisps,... \n",
"2 (en:legumes, en:legumes-and-their-products, en... \n",
"3 () \n",
"4 (en:cereals-and-potatoes, en:cereals-and-their... \n",
2017-09-26 21:28:44 +02:00
"5 (en:breakfasts, en:cereals-and-potatoes, en:ce... \n",
"6 (en:cereal-grains, en:cereals-and-potatoes, en... \n",
"7 (en:breakfast-cereals, en:breakfasts, en:cerea... \n",
"8 (en:chocolates, en:dark-chocolates, en:sugary-... \n",
"9 (en:fats, en:plant-based-foods, en:plant-based... \n",
"10 (en:legumes-and-their-products, en:plant-based... \n",
"11 (en:cereals-and-potatoes, en:cereals-and-their... \n",
"12 () \n",
"13 (en:plant-based-foods, en:plant-based-foods-an... \n",
"14 () \n",
"15 (en:breakfast-cereals, en:breakfasts, en:cerea... \n",
2017-09-26 21:28:44 +02:00
"16 (en:sugary-snacks,) \n",
"17 (en:appetizers, en:chips-and-fries, en:crisps,... \n",
"18 (en:cereals-and-potatoes, en:cereals-and-their... \n",
"19 () \n",
2017-09-26 21:28:44 +02:00
"20 (en:beverages,) \n",
"21 (en:sugary-snacks,) \n",
"22 (en:cereals-and-potatoes, en:plant-based-foods... \n",
"23 (en:plant-based-foods, en:plant-based-foods-an... \n",
"24 (en:plant-based-foods, en:spreads) \n",
"25 (en:groceries,) \n",
"26 (en:plant-based-foods-and-beverages,) \n",
"27 (en:appetizers, en:plant-based-foods-and-bever... \n",
"28 (en:plant-based-foods, en:plant-based-foods-an... \n",
"29 () \n",
"... ... \n",
2017-09-26 21:28:44 +02:00
"254702 () \n",
"254703 (en:biscuits, en:biscuits-and-cakes, en:sugary... \n",
"254704 (en:plant-based-foods, en:plant-based-foods-an... \n",
"254705 () \n",
"254706 (en:beverages,) \n",
"254707 (en:beverages,) \n",
"254708 () \n",
"254709 () \n",
"254710 (en:beverages, en:carbonated-drinks, en:sodas,... \n",
"254711 (en:candies, en:chocolates, en:confectioneries... \n",
"254712 () \n",
"254713 () \n",
"254714 () \n",
"254715 () \n",
"254716 () \n",
"254717 () \n",
"254718 () \n",
"254719 (en:chocolates,) \n",
"254720 () \n",
"254721 (en:sugary-snacks,) \n",
"254722 () \n",
"254723 (en:nuts-and-their-products, en:plant-based-fo... \n",
"254724 (en:dairies, en:milks) \n",
"254725 (en:fruits-and-vegetables-based-foods, en:plan... \n",
"254726 (en:herbal-teas,) \n",
"254727 (en:plant-based-foods, en:plant-based-foods-an... \n",
"254728 (en:beverages, en:non-sugared-beverages) \n",
"254729 (en:breads, en:cereals-and-potatoes, en:plant-... \n",
"254730 (en:fruits-and-vegetables-based-foods, en:plan... \n",
"254731 (en:beverages, en:plant-based-beverages, en:su... \n",
"\n",
2017-09-26 21:28:44 +02:00
"[254732 rows x 2 columns]"
]
},
2017-09-26 21:28:44 +02:00
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prediction_dataframe = pandas.DataFrame({\n",
" 'product_name': products_without_categories['product_name'].values.astype('U'),\n",
" 'guessed_labels': all_labels\n",
"}, columns=['product_name', 'guessed_labels'])\n",
"prediction_dataframe.to_csv('prediction.csv', index=False)\n",
"prediction_dataframe"
]
2017-09-26 21:28:44 +02:00
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 True\n",
"1 True\n",
"2 True\n",
"3 False\n",
"4 True\n",
"5 True\n",
"6 True\n",
"7 True\n",
"8 True\n",
"9 True\n",
"10 True\n",
"11 True\n",
"12 False\n",
"13 True\n",
"14 False\n",
"15 True\n",
"16 True\n",
"17 True\n",
"18 True\n",
"19 False\n",
"20 True\n",
"21 True\n",
"22 True\n",
"23 True\n",
"24 True\n",
"25 True\n",
"26 True\n",
"27 True\n",
"28 True\n",
"29 False\n",
" ... \n",
"254702 False\n",
"254703 True\n",
"254704 True\n",
"254705 False\n",
"254706 True\n",
"254707 True\n",
"254708 False\n",
"254709 False\n",
"254710 True\n",
"254711 True\n",
"254712 False\n",
"254713 False\n",
"254714 False\n",
"254715 False\n",
"254716 False\n",
"254717 False\n",
"254718 False\n",
"254719 True\n",
"254720 False\n",
"254721 True\n",
"254722 False\n",
"254723 True\n",
"254724 True\n",
"254725 True\n",
"254726 True\n",
"254727 True\n",
"254728 True\n",
"254729 True\n",
"254730 True\n",
"254731 True\n",
"Name: guessed_labels, Length: 254732, dtype: bool"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prediction_dataframe['guessed_labels'].str.len() > 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
2017-09-23 02:16:39 +02:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
2017-09-23 02:16:39 +02:00
}
},
"nbformat": 4,
"nbformat_minor": 2
}