offclassification/notebook.ipynb

1429 lines
62 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"import itertools\n",
"import json\n",
"import os\n",
"\n",
"import numpy as np\n",
"import pandas\n",
"import random\n",
"\n",
"from sklearn.externals import joblib\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"from sklearn.multiclass import OneVsRestClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import MultiLabelBinarizer\n",
"from sklearn.svm import LinearSVC\n",
"\n",
"MIN_NUMBER_PRODUCTS_PER_CATEGORY = 25\n",
"TRAINING_DATASET_SIZE = 50 / 100"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data from CSV dump"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"csv = pandas.read_csv(\n",
" 'en.openfoodfacts.org.products.csv',\n",
" sep='\\t',\n",
" usecols=[0, 7, 15],\n",
" dtype={'code': 'str', 'product_name': 'str'},\n",
" converters={'categories_tags': lambda x: x.split(',') if x else np.NaN}\n",
")\n",
"# Filter products with and without categories in two different DataFrames\n",
"products_with_categories = csv[pandas.notnull(csv['categories_tags'])]\n",
"products_without_categories = csv[pandas.isnull(csv['categories_tags'])]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fitting on dataset"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# Let's build vectors of products and categories, for training purpose.\n",
"categories = [\n",
" category\n",
" for category, count in collections.Counter(\n",
" category for category_list in products_with_categories['categories_tags'] for category in category_list\n",
" ).items()\n",
" if count > MIN_NUMBER_PRODUCTS_PER_CATEGORY and category != ''\n",
"]\n",
"# Filter out empty lists of categories\n",
"XY = products_with_categories.copy()\n",
"XY['categories_tags'] = XY['categories_tags'].map(lambda c_list: [c for c in c_list if c in categories])\n",
"mask = XY['categories_tags'].str.len() > 0\n",
"XY = XY[mask]\n",
"# TODO: We should ensure each category is sufficiently represented"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Select training sample\n",
"XY_training = XY.sample(frac=0.5)\n",
"X_train = XY_training['product_name'].values.astype('U')\n",
"Y_train = [np.array(c).astype('U') for c in XY_training['categories_tags'].values]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"mlb = MultiLabelBinarizer()\n",
"Y_train_transformed = mlb.fit_transform(Y_train)"
]
},
{
"cell_type": "code",
"execution_count": 335,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(memory=None,\n",
" steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
" ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
" verbose=0),\n",
" n_jobs=1))])"
]
},
"execution_count": 335,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit our classifier\n",
"classifier = Pipeline([\n",
" ('vectorizer', CountVectorizer()),\n",
" ('tfidf', TfidfTransformer()),\n",
" ('clf', OneVsRestClassifier(LinearSVC()))])\n",
"\n",
"classifier.fit(X_train, Y_train_transformed)"
]
},
{
"cell_type": "code",
"execution_count": 380,
"metadata": {},
"outputs": [],
"source": [
"# Check score on a testing set\n",
"training_indices = np.in1d(XY['product_name'].values.astype('U'), X_train, invert=True)\n",
"X_test = XY['product_name'][training_indices]\n",
"Y_test = [np.array(c).astype('U') for c in XY['categories_tags'][training_indices].values]\n",
"\n",
"predicted = classifier.predict(X_test)\n",
"all_labels = mlb.inverse_transform(predicted)"
]
},
{
"cell_type": "code",
"execution_count": 394,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>product_name</th>\n",
" <th>original_labels</th>\n",
" <th>guessed_labels</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>176</th>\n",
" <td>Salade Cesar</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:meals, fr:salades-composees)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>184</th>\n",
" <td>lentilles vertes</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:green-lentils, en:legume-seeds, en:legumes...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>185</th>\n",
" <td>Root Beer</td>\n",
" <td>[en:beverages, en:carbonated-drinks, en:sodas,...</td>\n",
" <td>(en:alcoholic-beverages, en:beers, en:beverage...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>186</th>\n",
" <td>Biscuits sablés fourrage au cacao</td>\n",
" <td>[en:sugary-snacks, en:biscuits-and-cakes, en:b...</td>\n",
" <td>(en:biscuits, en:biscuits-and-cakes, en:chocol...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>238</th>\n",
" <td>Blle Pet 50CL Coca Cola Cherry</td>\n",
" <td>[en:beverages, en:sugared-beverages]</td>\n",
" <td>(en:beverages, en:non-sugared-beverages, en:so...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>244</th>\n",
" <td>Cauliflower</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:fruits-and-vegetables-based-foods, en:leaf...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>247</th>\n",
" <td>Salsa de mostaza</td>\n",
" <td>[en:groceries, en:condiments, en:sauces, en:mu...</td>\n",
" <td>(en:condiments, en:groceries, en:mustards, en:...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>249</th>\n",
" <td>7Up</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:bevera...</td>\n",
" <td>(en:beverages, en:non-sugared-beverages)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>276</th>\n",
" <td>Mehrkomponeneten Protein 90 C6 Haselnuß</td>\n",
" <td>[en:dietary-supplements, en:bodybuilding-suppl...</td>\n",
" <td>(en:bodybuilding-supplements, en:dietary-suppl...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>292</th>\n",
" <td>Cakes aux Fruits</td>\n",
" <td>[en:sugary-snacks, en:biscuits-and-cakes, en:d...</td>\n",
" <td>(en:biscuits-and-cakes, en:cakes, en:desserts,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>309</th>\n",
" <td>Whey Protein aus Molke 1000 Gramm Vanilla</td>\n",
" <td>[en:dietary-supplements, en:bodybuilding-suppl...</td>\n",
" <td>(en:bodybuilding-supplements, en:dietary-suppl...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>310</th>\n",
" <td>Fondants Citron</td>\n",
" <td>[en:sugary-snacks, en:biscuits-and-cakes, en:d...</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>369</th>\n",
" <td>Sour Fruit Gummies</td>\n",
" <td>[en:sugary-snacks, en:confectioneries, en:cand...</td>\n",
" <td>(en:candies, en:confectioneries, en:sugary-sna...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>421</th>\n",
" <td>Mixed peppers</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:fruits-and-vegetables-based-foods, en:plan...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>437</th>\n",
" <td>30 Panach' Fruits</td>\n",
" <td>[en:sugary-snacks, en:biscuits-and-cakes, en:d...</td>\n",
" <td>(en:fruits-and-vegetables-based-foods, en:frui...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>440</th>\n",
" <td>Foie gras de canard du Périgord</td>\n",
" <td>[en:fish-and-meat-and-eggs, fr:foies-gras, fr:...</td>\n",
" <td>(en:fish-and-meat-and-eggs, fr:foies-gras, fr:...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>454</th>\n",
" <td>Terrine de caille aux pruneaux d'Agen</td>\n",
" <td>[en:terrine, fr:terrines-de-volailles]</td>\n",
" <td>(en:terrine,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>455</th>\n",
" <td>Foie de canard aux figues</td>\n",
" <td>[en:fish-and-meat-and-eggs, fr:foies-gras, fr:...</td>\n",
" <td>(en:fish-and-meat-and-eggs, en:meats)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>458</th>\n",
" <td>Foie gras d'oie Périgord</td>\n",
" <td>[en:fish-and-meat-and-eggs, fr:foies-gras]</td>\n",
" <td>(en:fish-and-meat-and-eggs, fr:foies-gras)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>459</th>\n",
" <td>Foie gras d'oie du Périgord</td>\n",
" <td>[en:fish-and-meat-and-eggs, fr:foies-gras]</td>\n",
" <td>(en:fish-and-meat-and-eggs, fr:foies-gras)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>464</th>\n",
" <td>All Butter Belgian White Chocolate Chunk Cookies</td>\n",
" <td>[en:sugary-snacks, en:biscuits-and-cakes, en:b...</td>\n",
" <td>(en:biscuits, en:biscuits-and-cakes, en:chocol...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>465</th>\n",
" <td>All Butter Fruity Flapjack Cookies</td>\n",
" <td>[en:sugary-snacks, en:biscuits-and-cakes, en:b...</td>\n",
" <td>(en:biscuits, en:biscuits-and-cakes, en:cookie...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>467</th>\n",
" <td>All butter Cranberry &amp; Orange Cookies</td>\n",
" <td>[en:sugary-snacks, en:biscuits-and-cakes, en:b...</td>\n",
" <td>(en:biscuits, en:biscuits-and-cakes, en:cookie...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>468</th>\n",
" <td>All Butter Triple Belgian Chocolate Chunk Cookies</td>\n",
" <td>[en:sugary-snacks, en:biscuits-and-cakes, en:b...</td>\n",
" <td>(en:biscuits, en:biscuits-and-cakes, en:cookie...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>469</th>\n",
" <td>Cookies Stem Ginger</td>\n",
" <td>[en:sugary-snacks, en:biscuits-and-cakes, en:b...</td>\n",
" <td>(en:biscuits, en:biscuits-and-cakes, en:cookie...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>472</th>\n",
" <td>Stem Ginger Dunkers</td>\n",
" <td>[en:sugary-snacks, en:biscuits-and-cakes, en:b...</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>484</th>\n",
" <td>Reduced Fat Mayonnaise</td>\n",
" <td>[en:groceries, en:sauces, en:mayonnaises]</td>\n",
" <td>(en:dairies, en:groceries, en:milks, en:sauces)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>492</th>\n",
" <td>mostly mesquite honey</td>\n",
" <td>[en:spreads, en:breakfasts, en:sweet-spreads, ...</td>\n",
" <td>(en:breakfasts,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>496</th>\n",
" <td>Clam Chowder A Condensed Soup</td>\n",
" <td>[en:meals, en:soups]</td>\n",
" <td>(en:meals, en:soups)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>509</th>\n",
" <td>Salade Mac</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358216</th>\n",
" <td>Sauc Nuoc Mam</td>\n",
" <td>[en:groceries, en:sauces]</td>\n",
" <td>(en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358217</th>\n",
" <td>Sauce piment doux Thaï</td>\n",
" <td>[en:groceries, en:sauces, en:pimented-sauces]</td>\n",
" <td>(en:groceries, en:sauces)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358220</th>\n",
" <td>Sauce Piment Sriracha</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:groceries, en:pimented-sauces, en:sauces)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358222</th>\n",
" <td>Sauce Thaï Satay</td>\n",
" <td>[en:groceries, en:sauces, en:pimented-sauces]</td>\n",
" <td>(en:groceries, en:sauces)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358225</th>\n",
" <td>Crème de coco allégée</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:plant-based-creams, en:plant-based-foods-a...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358226</th>\n",
" <td>Nouilles instantanées</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:cereals-and-potatoes, en:cereals-and-their...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358231</th>\n",
" <td>Pad Thaï Sauce Wok</td>\n",
" <td>[en:groceries, en:sauces]</td>\n",
" <td>(en:sauces,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358233</th>\n",
" <td>Pâte de Curry Vert</td>\n",
" <td>[en:groceries, en:sauces]</td>\n",
" <td>(en:curry-pastes, en:groceries)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358234</th>\n",
" <td>Pâte de curry rouge</td>\n",
" <td>[en:groceries, en:sauces, en:curry-pastes]</td>\n",
" <td>(en:groceries,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358242</th>\n",
" <td>Gula Gula Durian</td>\n",
" <td>[en:sugary-snacks, en:confectioneries, en:cand...</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358254</th>\n",
" <td>Boisson au chrysanthème</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:bevera...</td>\n",
" <td>(en:beverages, en:non-sugared-beverages, en:pl...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358255</th>\n",
" <td>Lychee Drink</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:bevera...</td>\n",
" <td>(en:beverages, en:fruit-based-beverages, en:pl...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358279</th>\n",
" <td>Pudding nata de coco</td>\n",
" <td>[en:desserts, en:puddings]</td>\n",
" <td>(en:desserts, en:puddings)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358282</th>\n",
" <td>Glinter Soft Drink Orange</td>\n",
" <td>[en:beverages, en:non-sugared-beverages]</td>\n",
" <td>(en:beverages, en:plant-based-foods-and-bevera...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358283</th>\n",
" <td>Soft Drink</td>\n",
" <td>[en:beverages, en:artificially-sweetened-bever...</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358293</th>\n",
" <td>Healtier palm oil (L'huile de palme)</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:fats, en:plant-based-foods, en:plant-based...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358306</th>\n",
" <td>100% Pur Jus 4 agrumes</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:bevera...</td>\n",
" <td>(en:beverages, en:fruit-based-beverages, en:fr...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358325</th>\n",
" <td>tuna chunks in spring water</td>\n",
" <td>[en:canned-foods, en:seafood, en:fishes, en:ca...</td>\n",
" <td>(en:canned-fishes, en:canned-foods, en:canned-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358335</th>\n",
" <td>Santa Cruz Chilli &amp; Lime Dressing</td>\n",
" <td>[en:groceries, en:sauces, en:salad-dressings]</td>\n",
" <td>(en:groceries, en:salad-dressings, en:sauces)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358343</th>\n",
" <td>Fisherman's Friend Miel-Citron</td>\n",
" <td>[en:sugary-snacks, en:confectioneries, en:cand...</td>\n",
" <td>(en:candies, en:confectioneries, en:sugary-sna...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358347</th>\n",
" <td>Dessert Noir (lot de 2)</td>\n",
" <td>[en:sugary-snacks, en:chocolates, en:dark-choc...</td>\n",
" <td>(en:chocolates, en:dark-chocolates, en:dessert...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358351</th>\n",
" <td>Kirkland Purified Drinking Water</td>\n",
" <td>[en:beverages, en:waters, en:non-sugared-bever...</td>\n",
" <td>(en:beverages, en:non-sugared-beverages, en:wa...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358372</th>\n",
" <td>Boîte de saumon frais trouvée à l'extérieur d'...</td>\n",
" <td>[en:seafood, en:fishes, en:salmons]</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358389</th>\n",
" <td>Cervoise Mexicaine</td>\n",
" <td>[en:beverages, en:alcoholic-beverages, en:arti...</td>\n",
" <td>(en:alcoholic-beverages, en:amber-beers, en:be...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358399</th>\n",
" <td>Ma bite</td>\n",
" <td>[en:beverages, en:non-sugared-beverages]</td>\n",
" <td>(en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358400</th>\n",
" <td>Les schtroumpfs &amp; le village des fille</td>\n",
" <td>[en:fats]</td>\n",
" <td>(en:confectioneries, en:sugary-snacks)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358426</th>\n",
" <td>Les Belles Tranches Bacon fumé</td>\n",
" <td>[en:meats, en:pork, en:bacon, en:sliced-bacon]</td>\n",
" <td>(en:bacon, en:meats, en:pork, en:sliced-bacon)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358428</th>\n",
" <td>Tartines craquantes bio au sarrasin</td>\n",
" <td>[en:plant-based-foods-and-beverages, en:plant-...</td>\n",
" <td>(en:breads, en:cereals-and-potatoes, en:crispb...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358438</th>\n",
" <td>Roussette du Bugey (2011)</td>\n",
" <td>[en:beverages, en:alcoholic-beverages, en:wine...</td>\n",
" <td>(en:alcoholic-beverages, en:beverages, en:fren...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358446</th>\n",
" <td>乐吧泡菜味薯片</td>\n",
" <td>[en:salty-snacks, en:appetizers, en:chips-and-...</td>\n",
" <td>()</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>38381 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" product_name \\\n",
"176 Salade Cesar \n",
"184 lentilles vertes \n",
"185 Root Beer \n",
"186 Biscuits sablés fourrage au cacao \n",
"238 Blle Pet 50CL Coca Cola Cherry \n",
"244 Cauliflower \n",
"247 Salsa de mostaza \n",
"249 7Up \n",
"276 Mehrkomponeneten Protein 90 C6 Haselnuß \n",
"292 Cakes aux Fruits \n",
"309 Whey Protein aus Molke 1000 Gramm Vanilla \n",
"310 Fondants Citron \n",
"369 Sour Fruit Gummies \n",
"421 Mixed peppers \n",
"437 30 Panach' Fruits \n",
"440 Foie gras de canard du Périgord \n",
"454 Terrine de caille aux pruneaux d'Agen \n",
"455 Foie de canard aux figues \n",
"458 Foie gras d'oie Périgord \n",
"459 Foie gras d'oie du Périgord \n",
"464 All Butter Belgian White Chocolate Chunk Cookies \n",
"465 All Butter Fruity Flapjack Cookies \n",
"467 All butter Cranberry & Orange Cookies \n",
"468 All Butter Triple Belgian Chocolate Chunk Cookies \n",
"469 Cookies Stem Ginger \n",
"472 Stem Ginger Dunkers \n",
"484 Reduced Fat Mayonnaise \n",
"492 mostly mesquite honey \n",
"496 Clam Chowder A Condensed Soup \n",
"509 Salade Mac \n",
"... ... \n",
"358216 Sauc Nuoc Mam \n",
"358217 Sauce piment doux Thaï \n",
"358220 Sauce Piment Sriracha \n",
"358222 Sauce Thaï Satay \n",
"358225 Crème de coco allégée \n",
"358226 Nouilles instantanées \n",
"358231 Pad Thaï Sauce Wok \n",
"358233 Pâte de Curry Vert \n",
"358234 Pâte de curry rouge \n",
"358242 Gula Gula Durian \n",
"358254 Boisson au chrysanthème \n",
"358255 Lychee Drink \n",
"358279 Pudding nata de coco \n",
"358282 Glinter Soft Drink Orange \n",
"358283 Soft Drink \n",
"358293 Healtier palm oil (L'huile de palme) \n",
"358306 100% Pur Jus 4 agrumes \n",
"358325 tuna chunks in spring water \n",
"358335 Santa Cruz Chilli & Lime Dressing \n",
"358343 Fisherman's Friend Miel-Citron \n",
"358347 Dessert Noir (lot de 2) \n",
"358351 Kirkland Purified Drinking Water \n",
"358372 Boîte de saumon frais trouvée à l'extérieur d'... \n",
"358389 Cervoise Mexicaine \n",
"358399 Ma bite \n",
"358400 Les schtroumpfs & le village des fille \n",
"358426 Les Belles Tranches Bacon fumé \n",
"358428 Tartines craquantes bio au sarrasin \n",
"358438 Roussette du Bugey (2011) \n",
"358446 乐吧泡菜味薯片 \n",
"\n",
" original_labels \\\n",
"176 [en:plant-based-foods-and-beverages, en:plant-... \n",
"184 [en:plant-based-foods-and-beverages, en:plant-... \n",
"185 [en:beverages, en:carbonated-drinks, en:sodas,... \n",
"186 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n",
"238 [en:beverages, en:sugared-beverages] \n",
"244 [en:plant-based-foods-and-beverages, en:plant-... \n",
"247 [en:groceries, en:condiments, en:sauces, en:mu... \n",
"249 [en:plant-based-foods-and-beverages, en:bevera... \n",
"276 [en:dietary-supplements, en:bodybuilding-suppl... \n",
"292 [en:sugary-snacks, en:biscuits-and-cakes, en:d... \n",
"309 [en:dietary-supplements, en:bodybuilding-suppl... \n",
"310 [en:sugary-snacks, en:biscuits-and-cakes, en:d... \n",
"369 [en:sugary-snacks, en:confectioneries, en:cand... \n",
"421 [en:plant-based-foods-and-beverages, en:plant-... \n",
"437 [en:sugary-snacks, en:biscuits-and-cakes, en:d... \n",
"440 [en:fish-and-meat-and-eggs, fr:foies-gras, fr:... \n",
"454 [en:terrine, fr:terrines-de-volailles] \n",
"455 [en:fish-and-meat-and-eggs, fr:foies-gras, fr:... \n",
"458 [en:fish-and-meat-and-eggs, fr:foies-gras] \n",
"459 [en:fish-and-meat-and-eggs, fr:foies-gras] \n",
"464 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n",
"465 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n",
"467 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n",
"468 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n",
"469 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n",
"472 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n",
"484 [en:groceries, en:sauces, en:mayonnaises] \n",
"492 [en:spreads, en:breakfasts, en:sweet-spreads, ... \n",
"496 [en:meals, en:soups] \n",
"509 [en:plant-based-foods-and-beverages, en:plant-... \n",
"... ... \n",
"358216 [en:groceries, en:sauces] \n",
"358217 [en:groceries, en:sauces, en:pimented-sauces] \n",
"358220 [en:plant-based-foods-and-beverages, en:plant-... \n",
"358222 [en:groceries, en:sauces, en:pimented-sauces] \n",
"358225 [en:plant-based-foods-and-beverages, en:plant-... \n",
"358226 [en:plant-based-foods-and-beverages, en:plant-... \n",
"358231 [en:groceries, en:sauces] \n",
"358233 [en:groceries, en:sauces] \n",
"358234 [en:groceries, en:sauces, en:curry-pastes] \n",
"358242 [en:sugary-snacks, en:confectioneries, en:cand... \n",
"358254 [en:plant-based-foods-and-beverages, en:bevera... \n",
"358255 [en:plant-based-foods-and-beverages, en:bevera... \n",
"358279 [en:desserts, en:puddings] \n",
"358282 [en:beverages, en:non-sugared-beverages] \n",
"358283 [en:beverages, en:artificially-sweetened-bever... \n",
"358293 [en:plant-based-foods-and-beverages, en:plant-... \n",
"358306 [en:plant-based-foods-and-beverages, en:bevera... \n",
"358325 [en:canned-foods, en:seafood, en:fishes, en:ca... \n",
"358335 [en:groceries, en:sauces, en:salad-dressings] \n",
"358343 [en:sugary-snacks, en:confectioneries, en:cand... \n",
"358347 [en:sugary-snacks, en:chocolates, en:dark-choc... \n",
"358351 [en:beverages, en:waters, en:non-sugared-bever... \n",
"358372 [en:seafood, en:fishes, en:salmons] \n",
"358389 [en:beverages, en:alcoholic-beverages, en:arti... \n",
"358399 [en:beverages, en:non-sugared-beverages] \n",
"358400 [en:fats] \n",
"358426 [en:meats, en:pork, en:bacon, en:sliced-bacon] \n",
"358428 [en:plant-based-foods-and-beverages, en:plant-... \n",
"358438 [en:beverages, en:alcoholic-beverages, en:wine... \n",
"358446 [en:salty-snacks, en:appetizers, en:chips-and-... \n",
"\n",
" guessed_labels \n",
"176 (en:meals, fr:salades-composees) \n",
"184 (en:green-lentils, en:legume-seeds, en:legumes... \n",
"185 (en:alcoholic-beverages, en:beers, en:beverage... \n",
"186 (en:biscuits, en:biscuits-and-cakes, en:chocol... \n",
"238 (en:beverages, en:non-sugared-beverages, en:so... \n",
"244 (en:fruits-and-vegetables-based-foods, en:leaf... \n",
"247 (en:condiments, en:groceries, en:mustards, en:... \n",
"249 (en:beverages, en:non-sugared-beverages) \n",
"276 (en:bodybuilding-supplements, en:dietary-suppl... \n",
"292 (en:biscuits-and-cakes, en:cakes, en:desserts,... \n",
"309 (en:bodybuilding-supplements, en:dietary-suppl... \n",
"310 () \n",
"369 (en:candies, en:confectioneries, en:sugary-sna... \n",
"421 (en:fruits-and-vegetables-based-foods, en:plan... \n",
"437 (en:fruits-and-vegetables-based-foods, en:frui... \n",
"440 (en:fish-and-meat-and-eggs, fr:foies-gras, fr:... \n",
"454 (en:terrine,) \n",
"455 (en:fish-and-meat-and-eggs, en:meats) \n",
"458 (en:fish-and-meat-and-eggs, fr:foies-gras) \n",
"459 (en:fish-and-meat-and-eggs, fr:foies-gras) \n",
"464 (en:biscuits, en:biscuits-and-cakes, en:chocol... \n",
"465 (en:biscuits, en:biscuits-and-cakes, en:cookie... \n",
"467 (en:biscuits, en:biscuits-and-cakes, en:cookie... \n",
"468 (en:biscuits, en:biscuits-and-cakes, en:cookie... \n",
"469 (en:biscuits, en:biscuits-and-cakes, en:cookie... \n",
"472 () \n",
"484 (en:dairies, en:groceries, en:milks, en:sauces) \n",
"492 (en:breakfasts,) \n",
"496 (en:meals, en:soups) \n",
"509 () \n",
"... ... \n",
"358216 (en:plant-based-foods, en:plant-based-foods-an... \n",
"358217 (en:groceries, en:sauces) \n",
"358220 (en:groceries, en:pimented-sauces, en:sauces) \n",
"358222 (en:groceries, en:sauces) \n",
"358225 (en:plant-based-creams, en:plant-based-foods-a... \n",
"358226 (en:cereals-and-potatoes, en:cereals-and-their... \n",
"358231 (en:sauces,) \n",
"358233 (en:curry-pastes, en:groceries) \n",
"358234 (en:groceries,) \n",
"358242 () \n",
"358254 (en:beverages, en:non-sugared-beverages, en:pl... \n",
"358255 (en:beverages, en:fruit-based-beverages, en:pl... \n",
"358279 (en:desserts, en:puddings) \n",
"358282 (en:beverages, en:plant-based-foods-and-bevera... \n",
"358283 () \n",
"358293 (en:fats, en:plant-based-foods, en:plant-based... \n",
"358306 (en:beverages, en:fruit-based-beverages, en:fr... \n",
"358325 (en:canned-fishes, en:canned-foods, en:canned-... \n",
"358335 (en:groceries, en:salad-dressings, en:sauces) \n",
"358343 (en:candies, en:confectioneries, en:sugary-sna... \n",
"358347 (en:chocolates, en:dark-chocolates, en:dessert... \n",
"358351 (en:beverages, en:non-sugared-beverages, en:wa... \n",
"358372 () \n",
"358389 (en:alcoholic-beverages, en:amber-beers, en:be... \n",
"358399 (en:plant-based-foods, en:plant-based-foods-an... \n",
"358400 (en:confectioneries, en:sugary-snacks) \n",
"358426 (en:bacon, en:meats, en:pork, en:sliced-bacon) \n",
"358428 (en:breads, en:cereals-and-potatoes, en:crispb... \n",
"358438 (en:alcoholic-beverages, en:beverages, en:fren... \n",
"358446 () \n",
"\n",
"[38381 rows x 3 columns]"
]
},
"execution_count": 394,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"testing_dataframe = pandas.DataFrame({\n",
" 'product_name': X_test,\n",
" 'original_labels': Y_test,\n",
" 'guessed_labels': all_labels\n",
"}, columns=['product_name', 'original_labels', 'guessed_labels'])\n",
"testing_dataframe.to_csv('testing.csv', index=False)\n",
"testing_dataframe"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['offClassifier.pkl']"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Dump the classifier\n",
"joblib.dump((mlb, classifier), 'offClassifier.pkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Predict!"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def batch(iterable, size):\n",
" \"\"\"\n",
" Get items from a sequence a batch at a time.\n",
"\n",
" :param iterable: The iterable to get the items from.\n",
" :param size: The size of the batches.\n",
" :return: A new iterable.\n",
" \"\"\"\n",
" sourceiter = iter(iterable)\n",
" while True:\n",
" batchiter = itertools.islice(sourceiter, size)\n",
" yield itertools.chain([next(batchiter)], batchiter)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# Load the classifier\n",
"mlb, classifier = joblib.load('offClassifier.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/phyks/.local/share/virtualenvs/machine_learning/lib/python3.6/site-packages/ipykernel_launcher.py:5: DeprecationWarning: generator 'batch' raised StopIteration\n",
" \"\"\"\n"
]
}
],
"source": [
"X_predicted = products_without_categories['product_name'].values.astype('U')\n",
"\n",
"all_labels = []\n",
"\n",
"for i in batch(X_predicted, 30000):\n",
" predicted = classifier.predict(list(i))\n",
" all_labels.extend(mlb.inverse_transform(predicted))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>product_name</th>\n",
" <th>guessed_labels</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Farine de blé noir</td>\n",
" <td>(en:cereal-flours, en:cereals-and-potatoes, en...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Banana Chips Sweetened (Whole)</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Peanuts</td>\n",
" <td>(en:legumes, en:legumes-and-their-products, en...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Organic Salted Nut Mix</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Organic Polenta</td>\n",
" <td>(en:cereals-and-potatoes, en:cereals-and-their...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Breadshop Honey Gone Nuts Granola</td>\n",
" <td>(en:breakfasts, en:sugary-snacks)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Organic Long Grain White Rice</td>\n",
" <td>(en:cereal-grains, en:cereals-and-potatoes, en...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Organic Muesli</td>\n",
" <td>(en:breakfast-cereals, en:breakfasts, en:cerea...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Organic Dark Chocolate Minis</td>\n",
" <td>(en:chocolates, en:dark-chocolates, en:sugary-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Organic Sunflower Oil</td>\n",
" <td>(en:fats, en:plant-based-foods, en:plant-based...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Organic Adzuki Beans</td>\n",
" <td>(en:legumes-and-their-products, en:plant-based...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Organic Penne Pasta</td>\n",
" <td>(en:cereals-and-potatoes, en:cereals-and-their...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Zen Party Mix</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Organic Golden Flax Seeds</td>\n",
" <td>(en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Organic Spicy Punks</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Cinnamon Nut Granola</td>\n",
" <td>(en:breakfast-cereals, en:breakfasts, en:cerea...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Organic Hazelnuts</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Organic Sweetened Banana Chips</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Lotus Organic Brown Jasmine Rice</td>\n",
" <td>(en:cereal-grains, en:cereals-and-their-produc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Organic Oat Groats</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Energy Power Mix</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>Antioxidant Mix - Berries &amp; Chocolate</td>\n",
" <td>(en:plant-based-foods-and-beverages,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>Organic Quinoa Coconut Granola With Mango</td>\n",
" <td>(en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>Fire Roasted Hatch Green Chile Almonds</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>Peanut Butter Power Chews</td>\n",
" <td>(en:plant-based-foods,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>Real Salt Granular</td>\n",
" <td>(en:chips-and-fries, en:crisps, en:groceries, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>Organic Unswt Berry Coconut Granola</td>\n",
" <td>(en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>Roasted Salted Black Pepper Cashews</td>\n",
" <td>(en:cashew-nuts, en:nuts, en:plant-based-foods...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>Thai Curry Roasted Cashews</td>\n",
" <td>(en:cashew-nuts, en:plant-based-foods, en:plan...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>Wasabi Tamari Almonds</td>\n",
" <td>(en:condiments, en:groceries, en:plant-based-f...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254346</th>\n",
" <td>Fairy Tail</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254347</th>\n",
" <td>Biscuits aux céréales, aux pépites de chocolat...</td>\n",
" <td>(en:biscuits, en:biscuits-and-cakes, en:sugary...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254348</th>\n",
" <td>Neszt Cochon Con</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254349</th>\n",
" <td>nan</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254350</th>\n",
" <td>Drid apricot the queen</td>\n",
" <td>(en:beverages,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254351</th>\n",
" <td>Natural Cassava</td>\n",
" <td>(en:plant-based-foods-and-beverages,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254352</th>\n",
" <td>nan</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254353</th>\n",
" <td>nan</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254354</th>\n",
" <td>Soda 1</td>\n",
" <td>(en:beverages, en:carbonated-drinks, en:sodas,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254355</th>\n",
" <td>Merci 1</td>\n",
" <td>(en:bonbons, en:candies, en:chocolates, en:cho...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254356</th>\n",
" <td>Merci2</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254357</th>\n",
" <td>Merci3</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254358</th>\n",
" <td>Libro parachute3</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254359</th>\n",
" <td>nan</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254360</th>\n",
" <td>Vegan easy</td>\n",
" <td>(en:plant-based-foods, en:plant-based-foods-an...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254361</th>\n",
" <td>Tarifs djoghrafia</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254362</th>\n",
" <td>nan</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254363</th>\n",
" <td>Ferrero Rocher</td>\n",
" <td>(en:chocolates, en:sugary-snacks)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254364</th>\n",
" <td>nan</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254365</th>\n",
" <td>Raspados Ice Bars</td>\n",
" <td>(en:bars, en:cereal-bars, en:sugary-snacks)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254366</th>\n",
" <td>nf test</td>\n",
" <td>(en:beverages,)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254367</th>\n",
" <td>Amandes</td>\n",
" <td>(en:almonds, en:chocolates-with-almonds, en:nu...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254368</th>\n",
" <td>Mleko wiejskie</td>\n",
" <td>(en:dairies, en:milks)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254369</th>\n",
" <td>Poireaux</td>\n",
" <td>(en:fruits-and-vegetables-based-foods, en:leek...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254370</th>\n",
" <td>Cheese cake thé vert, yuzu</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254371</th>\n",
" <td>Tomato &amp; ricotta</td>\n",
" <td>()</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254372</th>\n",
" <td>Mint Melange Tea A Blend Of Peppermint, Lemon ...</td>\n",
" <td>(en:beverages, en:non-sugared-beverages, en:su...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254373</th>\n",
" <td>Biscottes bio</td>\n",
" <td>(en:breads, en:cereals-and-potatoes, en:plant-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254374</th>\n",
" <td>Tomates aux Vermicelles</td>\n",
" <td>(en:fruits-and-vegetables-based-foods, en:meal...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254375</th>\n",
" <td>Sugar Free Drink Mix, Peach Tea</td>\n",
" <td>(en:artificially-sweetened-beverages, en:bever...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>254376 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" product_name \\\n",
"0 Farine de blé noir \n",
"1 Banana Chips Sweetened (Whole) \n",
"2 Peanuts \n",
"3 Organic Salted Nut Mix \n",
"4 Organic Polenta \n",
"5 Breadshop Honey Gone Nuts Granola \n",
"6 Organic Long Grain White Rice \n",
"7 Organic Muesli \n",
"8 Organic Dark Chocolate Minis \n",
"9 Organic Sunflower Oil \n",
"10 Organic Adzuki Beans \n",
"11 Organic Penne Pasta \n",
"12 Zen Party Mix \n",
"13 Organic Golden Flax Seeds \n",
"14 Organic Spicy Punks \n",
"15 Cinnamon Nut Granola \n",
"16 Organic Hazelnuts \n",
"17 Organic Sweetened Banana Chips \n",
"18 Lotus Organic Brown Jasmine Rice \n",
"19 Organic Oat Groats \n",
"20 Energy Power Mix \n",
"21 Antioxidant Mix - Berries & Chocolate \n",
"22 Organic Quinoa Coconut Granola With Mango \n",
"23 Fire Roasted Hatch Green Chile Almonds \n",
"24 Peanut Butter Power Chews \n",
"25 Real Salt Granular \n",
"26 Organic Unswt Berry Coconut Granola \n",
"27 Roasted Salted Black Pepper Cashews \n",
"28 Thai Curry Roasted Cashews \n",
"29 Wasabi Tamari Almonds \n",
"... ... \n",
"254346 Fairy Tail \n",
"254347 Biscuits aux céréales, aux pépites de chocolat... \n",
"254348 Neszt Cochon Con \n",
"254349 nan \n",
"254350 Drid apricot the queen \n",
"254351 Natural Cassava \n",
"254352 nan \n",
"254353 nan \n",
"254354 Soda 1 \n",
"254355 Merci 1 \n",
"254356 Merci2 \n",
"254357 Merci3 \n",
"254358 Libro parachute3 \n",
"254359 nan \n",
"254360 Vegan easy \n",
"254361 Tarifs djoghrafia \n",
"254362 nan \n",
"254363 Ferrero Rocher \n",
"254364 nan \n",
"254365 Raspados Ice Bars \n",
"254366 nf test \n",
"254367 Amandes \n",
"254368 Mleko wiejskie \n",
"254369 Poireaux \n",
"254370 Cheese cake thé vert, yuzu \n",
"254371 Tomato & ricotta \n",
"254372 Mint Melange Tea A Blend Of Peppermint, Lemon ... \n",
"254373 Biscottes bio \n",
"254374 Tomates aux Vermicelles \n",
"254375 Sugar Free Drink Mix, Peach Tea \n",
"\n",
" guessed_labels \n",
"0 (en:cereal-flours, en:cereals-and-potatoes, en... \n",
"1 () \n",
"2 (en:legumes, en:legumes-and-their-products, en... \n",
"3 () \n",
"4 (en:cereals-and-potatoes, en:cereals-and-their... \n",
"5 (en:breakfasts, en:sugary-snacks) \n",
"6 (en:cereal-grains, en:cereals-and-potatoes, en... \n",
"7 (en:breakfast-cereals, en:breakfasts, en:cerea... \n",
"8 (en:chocolates, en:dark-chocolates, en:sugary-... \n",
"9 (en:fats, en:plant-based-foods, en:plant-based... \n",
"10 (en:legumes-and-their-products, en:plant-based... \n",
"11 (en:cereals-and-potatoes, en:cereals-and-their... \n",
"12 () \n",
"13 (en:plant-based-foods, en:plant-based-foods-an... \n",
"14 () \n",
"15 (en:breakfast-cereals, en:breakfasts, en:cerea... \n",
"16 () \n",
"17 () \n",
"18 (en:cereal-grains, en:cereals-and-their-produc... \n",
"19 () \n",
"20 () \n",
"21 (en:plant-based-foods-and-beverages,) \n",
"22 (en:plant-based-foods, en:plant-based-foods-an... \n",
"23 () \n",
"24 (en:plant-based-foods,) \n",
"25 (en:chips-and-fries, en:crisps, en:groceries, ... \n",
"26 (en:plant-based-foods, en:plant-based-foods-an... \n",
"27 (en:cashew-nuts, en:nuts, en:plant-based-foods... \n",
"28 (en:cashew-nuts, en:plant-based-foods, en:plan... \n",
"29 (en:condiments, en:groceries, en:plant-based-f... \n",
"... ... \n",
"254346 () \n",
"254347 (en:biscuits, en:biscuits-and-cakes, en:sugary... \n",
"254348 () \n",
"254349 () \n",
"254350 (en:beverages,) \n",
"254351 (en:plant-based-foods-and-beverages,) \n",
"254352 () \n",
"254353 () \n",
"254354 (en:beverages, en:carbonated-drinks, en:sodas,... \n",
"254355 (en:bonbons, en:candies, en:chocolates, en:cho... \n",
"254356 () \n",
"254357 () \n",
"254358 () \n",
"254359 () \n",
"254360 (en:plant-based-foods, en:plant-based-foods-an... \n",
"254361 () \n",
"254362 () \n",
"254363 (en:chocolates, en:sugary-snacks) \n",
"254364 () \n",
"254365 (en:bars, en:cereal-bars, en:sugary-snacks) \n",
"254366 (en:beverages,) \n",
"254367 (en:almonds, en:chocolates-with-almonds, en:nu... \n",
"254368 (en:dairies, en:milks) \n",
"254369 (en:fruits-and-vegetables-based-foods, en:leek... \n",
"254370 () \n",
"254371 () \n",
"254372 (en:beverages, en:non-sugared-beverages, en:su... \n",
"254373 (en:breads, en:cereals-and-potatoes, en:plant-... \n",
"254374 (en:fruits-and-vegetables-based-foods, en:meal... \n",
"254375 (en:artificially-sweetened-beverages, en:bever... \n",
"\n",
"[254376 rows x 2 columns]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pair each uncategorized product's name with the labels guessed for it,\n",
"# save the table as CSV, then display it.\n",
"# astype('U') stringifies the names, so a missing name shows up as 'nan'.\n",
"unlabeled_names = products_without_categories['product_name'].values.astype('U')\n",
"prediction_dataframe = pandas.DataFrame(\n",
"    {'product_name': unlabeled_names, 'guessed_labels': all_labels},\n",
"    columns=['product_name', 'guessed_labels'],\n",
")\n",
"prediction_dataframe.to_csv('prediction.csv', index=False)\n",
"prediction_dataframe"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Port to JS"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from sklearn_porter import Porter"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Load the pickled (mlb, classifier) pair from disk.\n",
"saved_objects = joblib.load('offClassifier.pkl')\n",
"mlb, classifier = saved_objects"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "The given model 'Pipeline(memory=None,\n steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n lowercase=True, max_df=1.0, max_features=None, min_df=1,\n ngram_range=(1, 1), preprocessor=None, stop_words=None,\n ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0),\n n_jobs=1))])' isn't supported.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-5-4f2b5247caa7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mporter\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPorter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclassifier\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlanguage\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'javascript'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mporter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexport\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/share/virtualenvs/machine_learning/lib/python3.6/site-packages/sklearn_porter/Porter.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, model, language, method, **kwargs)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0merror\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"The given model '{model}' isn't\"\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0;34m\" supported.\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__dict__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 77\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;31m# Import model class:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: The given model 'Pipeline(memory=None,\n steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n lowercase=True, max_df=1.0, max_features=None, min_df=1,\n ngram_range=(1, 1), preprocessor=None, stop_words=None,\n ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0),\n n_jobs=1))])' isn't supported."
]
}
],
"source": [
"# sklearn-porter raises ValueError for a model it cannot transpile (the\n",
"# recorded run shows it rejecting this Pipeline). Catch and report the error\n",
"# so that Restart & Run All can continue past this cell; `porter` and\n",
"# `output` are None when the export is not possible.\n",
"try:\n",
"    porter = Porter(classifier, language='javascript')\n",
"    output = porter.export()\n",
"except ValueError as error:\n",
"    porter = output = None\n",
"    print('JavaScript export failed: {}'.format(error))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline\n"
]
}
],
"source": [
"# Print the class name of the loaded estimator.\n",
"algorithm_name = type(classifier).__name__\n",
"print(algorithm_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}