{ "cells": [ { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "import collections\n", "import itertools\n", "import json\n", "import os\n", "\n", "import numpy as np\n", "import pandas\n", "import random\n", "\n", "from sklearn.externals import joblib\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.feature_extraction.text import TfidfTransformer\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.multiclass import OneVsRestClassifier\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import MultiLabelBinarizer\n", "from sklearn.svm import LinearSVC\n", "\n", "MIN_NUMBER_PRODUCTS_PER_CATEGORY = 3000\n", "TRAINING_DATASET_SIZE = 25 / 100" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load data from CSV dump" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "csv = pandas.read_csv(\n", " 'en.openfoodfacts.org.products.csv',\n", " sep='\\t',\n", " usecols=[0, 7, 15],\n", " dtype={'code': 'str', 'product_name': 'str'},\n", " converters={'categories_tags': lambda x: x.split(',') if x else np.NaN}\n", ")\n", "# Filter products with and without categories in two different DataFrames\n", "products_with_categories = csv[pandas.notnull(csv['categories_tags'])]\n", "products_without_categories = csv[pandas.isnull(csv['categories_tags'])]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fitting on dataset" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Keeping 33 categories.\n" ] } ], "source": [ "# Let's build vectors of products and categories, for training purpose.\n", "categories = [\n", " category\n", " for category, count in collections.Counter(\n", " category for category_list in products_with_categories['categories_tags'] for category in category_list\n", " ).items()\n", " # Filter out categories without enough products\n", " if count > MIN_NUMBER_PRODUCTS_PER_CATEGORY and category != ''\n", "]\n", "print('Keeping %d categories.' % len(categories))\n", "# Filter out empty lists of categories\n", "XY = products_with_categories.copy()\n", "XY['categories_tags'] = XY['categories_tags'].map(lambda c_list: [c for c in c_list if c in categories])\n", "mask = XY['categories_tags'].str.len() > 0\n", "XY = XY[mask]" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/lverney/.local/share/virtualenvs/machine_learning/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2010: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", " FutureWarning)\n" ] }, { "data": { "text/plain": [ "741" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Select training and testing sample\n", "X = XY['product_name'].values.astype('U')\n", "Y = [np.array(c).astype('U') for c in XY['categories_tags'].values]\n", "\n", "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, train_size=TRAINING_DATASET_SIZE)\n", "\n", "# Check each category is sufficiently represented\n", "min([\n", " count\n", " for _, count in collections.Counter(\n", " category for category_list in Y_train for category in category_list\n", " ).items()\n", "])" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "mlb = MultiLabelBinarizer()\n", "Y_train_transformed = mlb.fit_transform(Y_train)" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(memory=None,\n", " steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", " dtype=, encoding='utf-8', input='content',\n", " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", " ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", " verbose=0),\n", " n_jobs=1))])" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit our classifier\n", "classifier = Pipeline([\n", " ('vectorizer', CountVectorizer()),\n", " ('tfidf', TfidfTransformer()),\n", " ('clf', OneVsRestClassifier(LinearSVC()))\n", "])\n", "\n", "classifier.fit(X_train, Y_train_transformed)" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.52062784115275751" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Compute predictions for testing set\n", "predicted = classifier.predict(X_test)\n", "all_labels = mlb.inverse_transform(predicted)\n", "\n", "accuracy_score(mlb.fit_transform(Y_test), predicted)" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
product_nameoriginal_labelsguessed_labels
0Fernandes Cherry Soda - Ga Mee Naar Suruname &...[en:beverages, en:sugared-beverages](en:beverages, en:plant-based-foods-and-bevera...
1Compotes allégée en sucres Pommes Carrefour[en:plant-based-foods-and-beverages, en:plant-...(en:desserts, en:fruits-and-vegetables-based-f...
2Tortellini 4 Fromages, LunchBox[en:plant-based-foods-and-beverages, en:plant-...(en:cereals-and-potatoes, en:cereals-and-their...
3Orange and mango squash[en:beverages](en:beverages, en:plant-based-foods-and-bevera...
4Crackers toast nature[en:salty-snacks](en:cereals-and-potatoes, en:salty-snacks)
5Miellats du Maquis cru d'été 2016[en:spreads, en:breakfasts]()
6Cebollas "Juan de Dios"[en:plant-based-foods-and-beverages, en:plant-...(en:fresh-foods, en:fruits-and-vegetables-base...
7Moules fraîches de Hollande[en:seafood](en:seafood,)
8Thé citron[en:plant-based-foods-and-beverages, en:bevera...(en:beverages, en:non-sugared-beverages, en:pl...
9Rêves de chocolat Assortiment de chocolats fin...[en:sugary-snacks, en:confectioneries, en:dess...(en:chocolates, en:desserts, en:sugary-snacks)
10Noir aux éclats de noisettes[en:sugary-snacks, en:chocolates](en:chocolates, en:sugary-snacks)
11Couscous royal poulet merguez[en:canned-foods, en:meals](en:meals,)
12Baguette Céréales Carrefour[en:plant-based-foods-and-beverages, en:plant-...(en:cereals-and-potatoes, en:plant-based-foods...
13Pomme Fraise[en:plant-based-foods-and-beverages, en:plant-...(en:desserts, en:fruits-and-vegetables-based-f...
14Risotto Champignons[en:meals](en:meals, en:plant-based-foods, en:plant-base...
15Bio Chocolat Noir aux Éclats de Noisettes[en:sugary-snacks, en:chocolates](en:chocolates, en:sugary-snacks)
16Angeliter Zitronenlimonade[en:beverages, en:sugared-beverages]()
17Agar-Agar[en:plant-based-foods-and-beverages, en:plant-...()
18Préparation à l'huile de thym, romarin et laur...[en:plant-based-foods-and-beverages, en:plant-...(en:plant-based-foods, en:plant-based-foods-an...
19Velamints[en:sugary-snacks, en:confectioneries]()
20Vegemil Black Bean Soymilk[en:dairies](en:plant-based-foods-and-beverages,)
21Carottes Râpées[en:meals](en:meals,)
22Budweiser Budvar[en:beverages, en:alcoholic-beverages](en:alcoholic-beverages, en:beverages)
23Mayonnaise aux oeufs frais[en:groceries, en:sauces](en:groceries, en:sauces)
24Kremsi krem sir[en:dairies, en:cheeses]()
25Domaine de l'Echauguette - 2010[en:beverages, en:alcoholic-beverages](en:alcoholic-beverages, en:beverages)
26Strathmore Still Spring Water[en:beverages, en:non-sugared-beverages](en:beverages, en:non-sugared-beverages)
27Mousse de Viennois (8 Chocolat)[en:fresh-foods, en:sugary-snacks, en:dairies,...(en:chocolates, en:dairies, en:desserts, en:fr...
28Poulet & Riz basquaise[en:meals](en:meals,)
29Le Norvège -25% sel[en:seafood]()
............
69924Cake Release Spray[en:plant-based-foods-and-beverages, en:plant-...(en:biscuits-and-cakes, en:desserts, en:sugary...
69925Maxi Quenelles à Gratiner, Jambon sauce béchamel[en:canned-foods, en:meals]()
69926Beurre gastronomique demi-sel[en:spreads, en:dairies](en:dairies, en:spreads)
69927Jambon à Griller[en:meats, en:prepared-meats](en:meats, en:prepared-meats)
69928Alpro Oat Almond[en:beverages, en:dairies, en:non-sugared-beve...()
69929Bière trappiste[en:beverages, en:alcoholic-beverages](en:alcoholic-beverages, en:beverages)
69930Viré-Clessé 2010[en:beverages, en:alcoholic-beverages](en:alcoholic-beverages, en:beverages)
69931Cuillers aux œufs frais[en:sugary-snacks, en:biscuits-and-cakes, en:b...(en:biscuits, en:biscuits-and-cakes, en:sugary...
69932Super Smoothie Antioxidant Innocent[en:beverages, en:non-sugared-beverages](en:beverages, en:non-sugared-beverages, en:pl...
699336 petits pains précuits[en:plant-based-foods-and-beverages, en:plant-...(en:cereals-and-potatoes, en:plant-based-foods...
69934Indian tonic[en:beverages](en:beverages, en:sugared-beverages)
69935Merlot 2015[en:beverages, en:alcoholic-beverages](en:alcoholic-beverages, en:beverages)
69936Jus multivitaminé 11 fruits[en:plant-based-foods-and-beverages, en:bevera...(en:beverages, en:fruit-based-beverages, en:pl...
69937Pain de mie complet[en:plant-based-foods-and-beverages, en:plant-...(en:cereals-and-potatoes, en:plant-based-foods...
69938Farfalle Zebra[en:plant-based-foods-and-beverages, en:plant-...(en:cereals-and-potatoes, en:cereals-and-their...
69939Nectarinas[en:plant-based-foods-and-beverages, en:plant-...()
69940Ail et fines herbes fromage à tartiner (27% MG)[en:spreads, en:dairies, en:cheeses](en:cheeses, en:dairies, en:spreads)
69941Ratatouille cuisinée à la Provençale[en:canned-foods, en:meals](en:meals,)
69942Confiture De Cerises Noires De Bâle[en:plant-based-foods-and-beverages, en:plant-...(en:breakfasts, en:fruits-and-vegetables-based...
69943Madeleines Marbrees chocolat[en:sugary-snacks, en:biscuits-and-cakes, en:d...(en:biscuits-and-cakes, en:desserts, en:sugary...
69944Filet de poulet (-25% de sel) (4+2 gratuites)[en:meats, en:prepared-meats](en:meats,)
69945Boisson gazeuse goût framboise[en:plant-based-foods-and-beverages, en:bevera...(en:beverages, en:non-sugared-beverages)
69946Mini billes Mozzarella di Bufala Campana AOP[en:dairies, en:cheeses](en:cheeses, en:dairies)
69947Crème fraîche épaisse de Montagne[en:dairies](en:dairies,)
69948Cacahuètes grillées et salées[en:plant-based-foods-and-beverages, en:plant-...(en:plant-based-foods, en:plant-based-foods-an...
69949Potato Wedges[en:plant-based-foods-and-beverages, en:plant-...(en:salty-snacks,)
69950Jus d'orange à base de jus d'orange concentré[en:plant-based-foods-and-beverages, en:bevera...(en:beverages, en:fruit-based-beverages, en:pl...
69951Fleur de Sel aux Épices Grillées[en:plant-based-foods-and-beverages, en:plant-...()
69952Pétales à la crevette[en:salty-snacks]()
69953Pommes rissolées Bio[en:plant-based-foods-and-beverages, en:plant-...(en:frozen-foods, en:fruits-and-vegetables-bas...
\n", "

69954 rows × 3 columns

\n", "
" ], "text/plain": [ " product_name \\\n", "0 Fernandes Cherry Soda - Ga Mee Naar Suruname &... \n", "1 Compotes allégée en sucres Pommes Carrefour \n", "2 Tortellini 4 Fromages, LunchBox \n", "3 Orange and mango squash \n", "4 Crackers toast nature \n", "5 Miellats du Maquis cru d'été 2016 \n", "6 Cebollas "Juan de Dios" \n", "7 Moules fraîches de Hollande \n", "8 Thé citron \n", "9 Rêves de chocolat Assortiment de chocolats fin... \n", "10 Noir aux éclats de noisettes \n", "11 Couscous royal poulet merguez \n", "12 Baguette Céréales Carrefour \n", "13 Pomme Fraise \n", "14 Risotto Champignons \n", "15 Bio Chocolat Noir aux Éclats de Noisettes \n", "16 Angeliter Zitronenlimonade \n", "17 Agar-Agar \n", "18 Préparation à l'huile de thym, romarin et laur... \n", "19 Velamints \n", "20 Vegemil Black Bean Soymilk \n", "21 Carottes Râpées \n", "22 Budweiser Budvar \n", "23 Mayonnaise aux oeufs frais \n", "24 Kremsi krem sir \n", "25 Domaine de l'Echauguette - 2010 \n", "26 Strathmore Still Spring Water \n", "27 Mousse de Viennois (8 Chocolat) \n", "28 Poulet & Riz basquaise \n", "29 Le Norvège -25% sel \n", "... ... \n", "69924 Cake Release Spray \n", "69925 Maxi Quenelles à Gratiner, Jambon sauce béchamel \n", "69926 Beurre gastronomique demi-sel \n", "69927 Jambon à Griller \n", "69928 Alpro Oat Almond \n", "69929 Bière trappiste \n", "69930 Viré-Clessé 2010 \n", "69931 Cuillers aux œufs frais \n", "69932 Super Smoothie Antioxidant Innocent \n", "69933 6 petits pains précuits \n", "69934 Indian tonic \n", "69935 Merlot 2015 \n", "69936 Jus multivitaminé 11 fruits \n", "69937 Pain de mie complet \n", "69938 Farfalle Zebra \n", "69939 Nectarinas \n", "69940 Ail et fines herbes fromage à tartiner (27% MG) \n", "69941 Ratatouille cuisinée à la Provençale \n", "69942 Confiture De Cerises Noires De Bâle \n", "69943 Madeleines Marbrees chocolat \n", "69944 Filet de poulet (-25% de sel) (4+2 gratuites) \n", "69945 Boisson gazeuse goût framboise \n", "69946 Mini billes Mozzarella di Bufala Campana AOP \n", "69947 Crème fraîche épaisse de Montagne \n", "69948 Cacahuètes grillées et salées \n", "69949 Potato Wedges \n", "69950 Jus d'orange à base de jus d'orange concentré \n", "69951 Fleur de Sel aux Épices Grillées \n", "69952 Pétales à la crevette \n", "69953 Pommes rissolées Bio \n", "\n", " original_labels \\\n", "0 [en:beverages, en:sugared-beverages] \n", "1 [en:plant-based-foods-and-beverages, en:plant-... \n", "2 [en:plant-based-foods-and-beverages, en:plant-... \n", "3 [en:beverages] \n", "4 [en:salty-snacks] \n", "5 [en:spreads, en:breakfasts] \n", "6 [en:plant-based-foods-and-beverages, en:plant-... \n", "7 [en:seafood] \n", "8 [en:plant-based-foods-and-beverages, en:bevera... \n", "9 [en:sugary-snacks, en:confectioneries, en:dess... \n", "10 [en:sugary-snacks, en:chocolates] \n", "11 [en:canned-foods, en:meals] \n", "12 [en:plant-based-foods-and-beverages, en:plant-... \n", "13 [en:plant-based-foods-and-beverages, en:plant-... \n", "14 [en:meals] \n", "15 [en:sugary-snacks, en:chocolates] \n", "16 [en:beverages, en:sugared-beverages] \n", "17 [en:plant-based-foods-and-beverages, en:plant-... \n", "18 [en:plant-based-foods-and-beverages, en:plant-... \n", "19 [en:sugary-snacks, en:confectioneries] \n", "20 [en:dairies] \n", "21 [en:meals] \n", "22 [en:beverages, en:alcoholic-beverages] \n", "23 [en:groceries, en:sauces] \n", "24 [en:dairies, en:cheeses] \n", "25 [en:beverages, en:alcoholic-beverages] \n", "26 [en:beverages, en:non-sugared-beverages] \n", "27 [en:fresh-foods, en:sugary-snacks, en:dairies,... \n", "28 [en:meals] \n", "29 [en:seafood] \n", "... ... \n", "69924 [en:plant-based-foods-and-beverages, en:plant-... \n", "69925 [en:canned-foods, en:meals] \n", "69926 [en:spreads, en:dairies] \n", "69927 [en:meats, en:prepared-meats] \n", "69928 [en:beverages, en:dairies, en:non-sugared-beve... \n", "69929 [en:beverages, en:alcoholic-beverages] \n", "69930 [en:beverages, en:alcoholic-beverages] \n", "69931 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n", "69932 [en:beverages, en:non-sugared-beverages] \n", "69933 [en:plant-based-foods-and-beverages, en:plant-... \n", "69934 [en:beverages] \n", "69935 [en:beverages, en:alcoholic-beverages] \n", "69936 [en:plant-based-foods-and-beverages, en:bevera... \n", "69937 [en:plant-based-foods-and-beverages, en:plant-... \n", "69938 [en:plant-based-foods-and-beverages, en:plant-... \n", "69939 [en:plant-based-foods-and-beverages, en:plant-... \n", "69940 [en:spreads, en:dairies, en:cheeses] \n", "69941 [en:canned-foods, en:meals] \n", "69942 [en:plant-based-foods-and-beverages, en:plant-... \n", "69943 [en:sugary-snacks, en:biscuits-and-cakes, en:d... \n", "69944 [en:meats, en:prepared-meats] \n", "69945 [en:plant-based-foods-and-beverages, en:bevera... \n", "69946 [en:dairies, en:cheeses] \n", "69947 [en:dairies] \n", "69948 [en:plant-based-foods-and-beverages, en:plant-... \n", "69949 [en:plant-based-foods-and-beverages, en:plant-... \n", "69950 [en:plant-based-foods-and-beverages, en:bevera... \n", "69951 [en:plant-based-foods-and-beverages, en:plant-... \n", "69952 [en:salty-snacks] \n", "69953 [en:plant-based-foods-and-beverages, en:plant-... \n", "\n", " guessed_labels \n", "0 (en:beverages, en:plant-based-foods-and-bevera... \n", "1 (en:desserts, en:fruits-and-vegetables-based-f... \n", "2 (en:cereals-and-potatoes, en:cereals-and-their... \n", "3 (en:beverages, en:plant-based-foods-and-bevera... \n", "4 (en:cereals-and-potatoes, en:salty-snacks) \n", "5 () \n", "6 (en:fresh-foods, en:fruits-and-vegetables-base... \n", "7 (en:seafood,) \n", "8 (en:beverages, en:non-sugared-beverages, en:pl... \n", "9 (en:chocolates, en:desserts, en:sugary-snacks) \n", "10 (en:chocolates, en:sugary-snacks) \n", "11 (en:meals,) \n", "12 (en:cereals-and-potatoes, en:plant-based-foods... \n", "13 (en:desserts, en:fruits-and-vegetables-based-f... \n", "14 (en:meals, en:plant-based-foods, en:plant-base... \n", "15 (en:chocolates, en:sugary-snacks) \n", "16 () \n", "17 () \n", "18 (en:plant-based-foods, en:plant-based-foods-an... \n", "19 () \n", "20 (en:plant-based-foods-and-beverages,) \n", "21 (en:meals,) \n", "22 (en:alcoholic-beverages, en:beverages) \n", "23 (en:groceries, en:sauces) \n", "24 () \n", "25 (en:alcoholic-beverages, en:beverages) \n", "26 (en:beverages, en:non-sugared-beverages) \n", "27 (en:chocolates, en:dairies, en:desserts, en:fr... \n", "28 (en:meals,) \n", "29 () \n", "... ... \n", "69924 (en:biscuits-and-cakes, en:desserts, en:sugary... \n", "69925 () \n", "69926 (en:dairies, en:spreads) \n", "69927 (en:meats, en:prepared-meats) \n", "69928 () \n", "69929 (en:alcoholic-beverages, en:beverages) \n", "69930 (en:alcoholic-beverages, en:beverages) \n", "69931 (en:biscuits, en:biscuits-and-cakes, en:sugary... \n", "69932 (en:beverages, en:non-sugared-beverages, en:pl... \n", "69933 (en:cereals-and-potatoes, en:plant-based-foods... \n", "69934 (en:beverages, en:sugared-beverages) \n", "69935 (en:alcoholic-beverages, en:beverages) \n", "69936 (en:beverages, en:fruit-based-beverages, en:pl... \n", "69937 (en:cereals-and-potatoes, en:plant-based-foods... \n", "69938 (en:cereals-and-potatoes, en:cereals-and-their... \n", "69939 () \n", "69940 (en:cheeses, en:dairies, en:spreads) \n", "69941 (en:meals,) \n", "69942 (en:breakfasts, en:fruits-and-vegetables-based... \n", "69943 (en:biscuits-and-cakes, en:desserts, en:sugary... \n", "69944 (en:meats,) \n", "69945 (en:beverages, en:non-sugared-beverages) \n", "69946 (en:cheeses, en:dairies) \n", "69947 (en:dairies,) \n", "69948 (en:plant-based-foods, en:plant-based-foods-an... \n", "69949 (en:salty-snacks,) \n", "69950 (en:beverages, en:fruit-based-beverages, en:pl... \n", "69951 () \n", "69952 () \n", "69953 (en:frozen-foods, en:fruits-and-vegetables-bas... \n", "\n", "[69954 rows x 3 columns]" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "testing_dataframe = pandas.DataFrame({\n", " 'product_name': X_test,\n", " 'original_labels': Y_test,\n", " 'guessed_labels': all_labels\n", "}, columns=['product_name', 'original_labels', 'guessed_labels'])\n", "testing_dataframe.to_csv('testing.csv', index=False)\n", "testing_dataframe" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
product_namegood_guessed_labelsextra_guessed_labelsmissing_guessed_labels
0Fernandes Cherry Soda - Ga Mee Naar Suruname &...{en:beverages}[en:plant-based-foods-and-beverages][en:sugared-beverages]
1Miellats du Maquis cru d'été 2016{}[][en:spreads, en:breakfasts]
2Thé citron{en:non-sugared-beverages, en:plant-based-food...[][en:plant-based-foods]
3Rêves de chocolat Assortiment de chocolats fin...{en:chocolates, en:desserts, en:sugary-snacks}[][en:confectioneries]
4Couscous royal poulet merguez{en:meals}[][en:canned-foods]
5Baguette Céréales Carrefour{en:plant-based-foods, en:cereals-and-potatoes...[][en:cereals-and-their-products]
6Angeliter Zitronenlimonade{}[][en:beverages, en:sugared-beverages]
7Agar-Agar{}[][en:plant-based-foods-and-beverages, en:plant-...
8Velamints{}[][en:sugary-snacks, en:confectioneries]
9Vegemil Black Bean Soymilk{}[en:plant-based-foods-and-beverages][en:dairies]
10Kremsi krem sir{}[][en:dairies, en:cheeses]
11Le Norvège -25% sel{}[][en:seafood]
12Petit Munster Géromé{en:cheeses}[][en:dairies]
13Nectarines jaunes{en:plant-based-foods, en:plant-based-foods-an...[][en:fruits-and-vegetables-based-foods, en:frui...
14Carotte nouvelle{en:plant-based-foods, en:plant-based-foods-an...[][en:fresh-foods, en:fruits-and-vegetables-base...
15Apple Real Fruit Pie{}[en:plant-based-foods-and-beverages][en:sugary-snacks]
16Jus de pomme{en:plant-based-foods-and-beverages, en:fruit-...[][en:non-sugared-beverages]
17Roquefort Premium{en:dairies}[][en:cheeses]
18Antésite concentré de Réglisse Menthe{}[en:plant-based-foods-and-beverages][en:beverages, en:non-sugared-beverages]
19Menta piperita en bolsitas{en:non-sugared-beverages, en:plant-based-food...[][en:plant-based-foods, en:groceries, en:plant-...
20Demi-Lunes Cèpes aux oeufs frais{en:plant-based-foods, en:cereals-and-potatoes...[][en:fresh-foods, en:meals]
21Sirop de Grenadine Bio{en:beverages}[en:sugared-beverages][en:non-sugared-beverages]
22Gulaschsuppe{}[][en:meals]
23Déli'Pocket Montagnard{}[en:biscuits, en:biscuits-and-cakes, en:sugary...[en:frozen-foods]
24Orzo by Sainsbury's{}[][en:plant-based-foods-and-beverages, en:plant-...
25Vive soy Vainilla{}[][en:plant-based-foods-and-beverages, en:bevera...
26Excellence 99% Cacao Noir Absolu{en:chocolates, en:sugary-snacks}[][en:confectioneries]
27nan{}[][en:sugary-snacks, en:chocolates]
28jupiter{}[][en:sugary-snacks, en:confectioneries]
29Faux-Filet{}[][en:meats]
...............
29197Noix d'épaule cuite choix désossée{}[][en:canned-foods]
29198Lait de coco{en:non-sugared-beverages, en:beverages, en:pl...[][en:plant-based-foods-and-beverages]
29199Tagliatelles au poulet{en:meals}[en:cereals-and-potatoes, en:cereals-and-their...[en:fresh-foods]
29200Dim Sum Shao Mai{}[][en:plant-based-foods-and-beverages, en:plant-...
29201Waffel Blätter mit Vollmilchschokolade{}[][en:sugary-snacks, en:biscuits-and-cakes, en:b...
29202Génépi de Savoie L'Ancienne{}[][en:beverages, en:alcoholic-beverages]
292036 carrés fourrés, saveur amande{en:biscuits-and-cakes, en:sugary-snacks}[][en:desserts]
29204Multivitamins{en:plant-based-foods-and-beverages}[][en:beverages, en:plant-based-beverages, en:no...
29205Instantané aux châtaignes{}[][en:plant-based-foods-and-beverages, en:plant-...
29206Nuggets de Poulet{en:meats}[][en:fresh-foods, en:meals]
29207Jus d'orange{en:beverages}[en:fruit-based-beverages, en:plant-based-beve...[en:non-sugared-beverages]
29208Зефир с ароматом крем-брюле{}[][en:sugary-snacks]
29209Coca-cola{en:beverages}[en:sugared-beverages][en:non-sugared-beverages]
29210Citron vert bio{}[en:beverages][en:plant-based-foods-and-beverages, en:plant-...
29211Weiße Riesenbohnen{}[][en:plant-based-foods-and-beverages, en:plant-...
29212Haricots Verts à la Périgourdine{en:plant-based-foods, en:plant-based-foods-an...[en:fruits-and-vegetables-based-foods, en:vege...[en:frozen-foods]
29213Steaks Hachés Charolais Façon Bouchère{en:meats}[][en:frozen-foods]
29214Tortilla Wraps Plain{}[en:salty-snacks][en:plant-based-foods-and-beverages, en:plant-...
29215nan{}[][en:plant-based-foods-and-beverages, en:bevera...
29216Korma de légumes BIO{}[][en:meals]
29217Cake Release Spray{}[en:biscuits-and-cakes, en:desserts, en:sugary...[en:plant-based-foods-and-beverages, en:plant-...
29218Maxi Quenelles à Gratiner, Jambon sauce béchamel{}[][en:canned-foods, en:meals]
29219Alpro Oat Almond{}[][en:beverages, en:dairies, en:non-sugared-beve...
29220Nectarinas{}[][en:plant-based-foods-and-beverages, en:plant-...
29221Ratatouille cuisinée à la Provençale{en:meals}[][en:canned-foods]
29222Filet de poulet (-25% de sel) (4+2 gratuites){en:meats}[][en:prepared-meats]
29223Boisson gazeuse goût framboise{en:beverages}[en:non-sugared-beverages][en:plant-based-foods-and-beverages, en:plant-...
29224Potato Wedges{}[en:salty-snacks][en:plant-based-foods-and-beverages, en:plant-...
29225Fleur de Sel aux Épices Grillées{}[][en:plant-based-foods-and-beverages, en:plant-...
29226Pétales à la crevette{}[][en:salty-snacks]
\n", "

29227 rows × 4 columns

\n", "
" ], "text/plain": [ " product_name \\\n", "0 Fernandes Cherry Soda - Ga Mee Naar Suruname &... \n", "1 Miellats du Maquis cru d'été 2016 \n", "2 Thé citron \n", "3 Rêves de chocolat Assortiment de chocolats fin... \n", "4 Couscous royal poulet merguez \n", "5 Baguette Céréales Carrefour \n", "6 Angeliter Zitronenlimonade \n", "7 Agar-Agar \n", "8 Velamints \n", "9 Vegemil Black Bean Soymilk \n", "10 Kremsi krem sir \n", "11 Le Norvège -25% sel \n", "12 Petit Munster Géromé \n", "13 Nectarines jaunes \n", "14 Carotte nouvelle \n", "15 Apple Real Fruit Pie \n", "16 Jus de pomme \n", "17 Roquefort Premium \n", "18 Antésite concentré de Réglisse Menthe \n", "19 Menta piperita en bolsitas \n", "20 Demi-Lunes Cèpes aux oeufs frais \n", "21 Sirop de Grenadine Bio \n", "22 Gulaschsuppe \n", "23 Déli'Pocket Montagnard \n", "24 Orzo by Sainsbury's \n", "25 Vive soy Vainilla \n", "26 Excellence 99% Cacao Noir Absolu \n", "27 nan \n", "28 jupiter \n", "29 Faux-Filet \n", "... ... \n", "29197 Noix d'épaule cuite choix désossée \n", "29198 Lait de coco \n", "29199 Tagliatelles au poulet \n", "29200 Dim Sum Shao Mai \n", "29201 Waffel Blätter mit Vollmilchschokolade \n", "29202 Génépi de Savoie L'Ancienne \n", "29203 6 carrés fourrés, saveur amande \n", "29204 Multivitamins \n", "29205 Instantané aux châtaignes \n", "29206 Nuggets de Poulet \n", "29207 Jus d'orange \n", "29208 Зефир с ароматом крем-брюле \n", "29209 Coca-cola \n", "29210 Citron vert bio \n", "29211 Weiße Riesenbohnen \n", "29212 Haricots Verts à la Périgourdine \n", "29213 Steaks Hachés Charolais Façon Bouchère \n", "29214 Tortilla Wraps Plain \n", "29215 nan \n", "29216 Korma de légumes BIO \n", "29217 Cake Release Spray \n", "29218 Maxi Quenelles à Gratiner, Jambon sauce béchamel \n", "29219 Alpro Oat Almond \n", "29220 Nectarinas \n", "29221 Ratatouille cuisinée à la Provençale \n", "29222 Filet de poulet (-25% de sel) (4+2 gratuites) \n", "29223 Boisson gazeuse goût framboise \n", "29224 Potato Wedges \n", "29225 Fleur de Sel aux Épices Grillées \n", "29226 Pétales à la crevette \n", "\n", " good_guessed_labels \\\n", "0 {en:beverages} \n", "1 {} \n", "2 {en:non-sugared-beverages, en:plant-based-food... \n", "3 {en:chocolates, en:desserts, en:sugary-snacks} \n", "4 {en:meals} \n", "5 {en:plant-based-foods, en:cereals-and-potatoes... \n", "6 {} \n", "7 {} \n", "8 {} \n", "9 {} \n", "10 {} \n", "11 {} \n", "12 {en:cheeses} \n", "13 {en:plant-based-foods, en:plant-based-foods-an... \n", "14 {en:plant-based-foods, en:plant-based-foods-an... \n", "15 {} \n", "16 {en:plant-based-foods-and-beverages, en:fruit-... \n", "17 {en:dairies} \n", "18 {} \n", "19 {en:non-sugared-beverages, en:plant-based-food... \n", "20 {en:plant-based-foods, en:cereals-and-potatoes... \n", "21 {en:beverages} \n", "22 {} \n", "23 {} \n", "24 {} \n", "25 {} \n", "26 {en:chocolates, en:sugary-snacks} \n", "27 {} \n", "28 {} \n", "29 {} \n", "... ... \n", "29197 {} \n", "29198 {en:non-sugared-beverages, en:beverages, en:pl... \n", "29199 {en:meals} \n", "29200 {} \n", "29201 {} \n", "29202 {} \n", "29203 {en:biscuits-and-cakes, en:sugary-snacks} \n", "29204 {en:plant-based-foods-and-beverages} \n", "29205 {} \n", "29206 {en:meats} \n", "29207 {en:beverages} \n", "29208 {} \n", "29209 {en:beverages} \n", "29210 {} \n", "29211 {} \n", "29212 {en:plant-based-foods, en:plant-based-foods-an... \n", "29213 {en:meats} \n", "29214 {} \n", "29215 {} \n", "29216 {} \n", "29217 {} \n", "29218 {} \n", "29219 {} \n", "29220 {} \n", "29221 {en:meals} \n", "29222 {en:meats} \n", "29223 {en:beverages} \n", "29224 {} \n", "29225 {} \n", "29226 {} \n", "\n", " extra_guessed_labels \\\n", "0 [en:plant-based-foods-and-beverages] \n", "1 [] \n", "2 [] \n", "3 [] \n", "4 [] \n", "5 [] \n", "6 [] \n", "7 [] \n", "8 [] \n", "9 [en:plant-based-foods-and-beverages] \n", "10 [] \n", "11 [] \n", "12 [] \n", "13 [] \n", "14 [] \n", "15 [en:plant-based-foods-and-beverages] \n", "16 [] \n", "17 [] \n", "18 [en:plant-based-foods-and-beverages] \n", "19 [] \n", "20 [] \n", "21 [en:sugared-beverages] \n", "22 [] \n", "23 [en:biscuits, en:biscuits-and-cakes, en:sugary... \n", "24 [] \n", "25 [] \n", "26 [] \n", "27 [] \n", "28 [] \n", "29 [] \n", "... ... \n", "29197 [] \n", "29198 [] \n", "29199 [en:cereals-and-potatoes, en:cereals-and-their... \n", "29200 [] \n", "29201 [] \n", "29202 [] \n", "29203 [] \n", "29204 [] \n", "29205 [] \n", "29206 [] \n", "29207 [en:fruit-based-beverages, en:plant-based-beve... \n", "29208 [] \n", "29209 [en:sugared-beverages] \n", "29210 [en:beverages] \n", "29211 [] \n", "29212 [en:fruits-and-vegetables-based-foods, en:vege... \n", "29213 [] \n", "29214 [en:salty-snacks] \n", "29215 [] \n", "29216 [] \n", "29217 [en:biscuits-and-cakes, en:desserts, en:sugary... \n", "29218 [] \n", "29219 [] \n", "29220 [] \n", "29221 [] \n", "29222 [] \n", "29223 [en:non-sugared-beverages] \n", "29224 [en:salty-snacks] \n", "29225 [] \n", "29226 [] \n", "\n", " missing_guessed_labels \n", "0 [en:sugared-beverages] \n", "1 [en:spreads, en:breakfasts] \n", "2 [en:plant-based-foods] \n", "3 [en:confectioneries] \n", "4 [en:canned-foods] \n", "5 [en:cereals-and-their-products] \n", "6 [en:beverages, en:sugared-beverages] \n", "7 [en:plant-based-foods-and-beverages, en:plant-... \n", "8 [en:sugary-snacks, en:confectioneries] \n", "9 [en:dairies] \n", "10 [en:dairies, en:cheeses] \n", "11 [en:seafood] \n", "12 [en:dairies] \n", "13 [en:fruits-and-vegetables-based-foods, en:frui... \n", "14 [en:fresh-foods, en:fruits-and-vegetables-base... \n", "15 [en:sugary-snacks] \n", "16 [en:non-sugared-beverages] \n", "17 [en:cheeses] \n", "18 [en:beverages, en:non-sugared-beverages] \n", "19 [en:plant-based-foods, en:groceries, en:plant-... \n", "20 [en:fresh-foods, en:meals] \n", "21 [en:non-sugared-beverages] \n", "22 [en:meals] \n", "23 [en:frozen-foods] \n", "24 [en:plant-based-foods-and-beverages, en:plant-... \n", "25 [en:plant-based-foods-and-beverages, en:bevera... \n", "26 [en:confectioneries] \n", "27 [en:sugary-snacks, en:chocolates] \n", "28 [en:sugary-snacks, en:confectioneries] \n", "29 [en:meats] \n", "... ... \n", "29197 [en:canned-foods] \n", "29198 [en:plant-based-foods-and-beverages] \n", "29199 [en:fresh-foods] \n", "29200 [en:plant-based-foods-and-beverages, en:plant-... \n", "29201 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n", "29202 [en:beverages, en:alcoholic-beverages] \n", "29203 [en:desserts] \n", "29204 [en:beverages, en:plant-based-beverages, en:no... \n", "29205 [en:plant-based-foods-and-beverages, en:plant-... \n", "29206 [en:fresh-foods, en:meals] \n", "29207 [en:non-sugared-beverages] \n", "29208 [en:sugary-snacks] \n", "29209 [en:non-sugared-beverages] \n", "29210 [en:plant-based-foods-and-beverages, en:plant-... \n", "29211 [en:plant-based-foods-and-beverages, en:plant-... \n", "29212 [en:frozen-foods] \n", "29213 [en:frozen-foods] \n", "29214 [en:plant-based-foods-and-beverages, en:plant-... \n", "29215 [en:plant-based-foods-and-beverages, en:bevera... \n", "29216 [en:meals] \n", "29217 [en:plant-based-foods-and-beverages, en:plant-... \n", "29218 [en:canned-foods, en:meals] \n", "29219 [en:beverages, en:dairies, en:non-sugared-beve... \n", "29220 [en:plant-based-foods-and-beverages, en:plant-... \n", "29221 [en:canned-foods] \n", "29222 [en:prepared-meats] \n", "29223 [en:plant-based-foods-and-beverages, en:plant-... \n", "29224 [en:plant-based-foods-and-beverages, en:plant-... \n", "29225 [en:plant-based-foods-and-beverages, en:plant-... \n", "29226 [en:salty-snacks] \n", "\n", "[29227 rows x 4 columns]" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "testing_diffs = []\n", "for _, row in testing_dataframe.iterrows():\n", " diff = set(row.original_labels) - set(row.guessed_labels)\n", " if len(diff) > 0:\n", " testing_diffs.append({\n", " 'product_name': row.product_name,\n", " 'good_guessed_labels': set(row.original_labels) - diff,\n", " 'extra_guessed_labels': [label for label in row.guessed_labels if label not in row.original_labels],\n", " 'missing_guessed_labels': [label for label in row.original_labels if label not in row.guessed_labels]\n", " })\n", "pandas.DataFrame(testing_diffs,\n", " columns=['product_name', 'good_guessed_labels', 'extra_guessed_labels', 'missing_guessed_labels'])" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['offClassifier.pkl']" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Dump the classifier\n", "joblib.dump((mlb, classifier), 'offClassifier.pkl')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict!" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "def batch(iterable, size):\n", " \"\"\"\n", " Get items from a sequence a batch at a time.\n", "\n", " :param iterable: The iterable to get the items from.\n", " :param size: The size of the batches.\n", " :return: A new iterable.\n", " \"\"\"\n", " sourceiter = iter(iterable)\n", " while True:\n", " batchiter = itertools.islice(sourceiter, size)\n", " yield itertools.chain([next(batchiter)], batchiter)" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [ "# Load the classifier\n", "mlb, classifier = joblib.load('offClassifier.pkl')" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/lverney/.local/share/virtualenvs/machine_learning/lib/python3.6/site-packages/ipykernel_launcher.py:5: DeprecationWarning: generator 'batch' raised StopIteration\n", " \"\"\"\n" ] } ], "source": [ "X_predicted = products_without_categories['product_name'].values.astype('U')\n", "\n", "all_labels = []\n", "\n", "for i in batch(X_predicted, 30000):\n", " predicted = classifier.predict(list(i))\n", " all_labels.extend(mlb.inverse_transform(predicted))" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
product_nameguessed_labels
0Farine de blé noir(en:cereals-and-potatoes, en:cereals-and-their...
1Banana Chips Sweetened (Whole)(en:appetizers, en:chips-and-fries, en:crisps,...
2Peanuts(en:legumes, en:legumes-and-their-products, en...
3Organic Salted Nut Mix()
4Organic Polenta(en:cereals-and-potatoes, en:cereals-and-their...
5Breadshop Honey Gone Nuts Granola(en:breakfasts, en:cereals-and-potatoes, en:ce...
6Organic Long Grain White Rice(en:cereal-grains, en:cereals-and-potatoes, en...
7Organic Muesli(en:breakfast-cereals, en:breakfasts, en:cerea...
8Organic Dark Chocolate Minis(en:chocolates, en:dark-chocolates, en:sugary-...
9Organic Sunflower Oil(en:fats, en:plant-based-foods, en:plant-based...
10Organic Adzuki Beans(en:legumes-and-their-products, en:plant-based...
11Organic Penne Pasta(en:cereals-and-potatoes, en:cereals-and-their...
12Zen Party Mix()
13Organic Golden Flax Seeds(en:plant-based-foods, en:plant-based-foods-an...
14Organic Spicy Punks()
15Cinnamon Nut Granola(en:breakfast-cereals, en:breakfasts, en:cerea...
16Organic Hazelnuts(en:sugary-snacks,)
17Organic Sweetened Banana Chips(en:appetizers, en:chips-and-fries, en:crisps,...
18Lotus Organic Brown Jasmine Rice(en:cereals-and-potatoes, en:cereals-and-their...
19Organic Oat Groats()
20Energy Power Mix(en:beverages,)
21Antioxidant Mix - Berries & Chocolate(en:sugary-snacks,)
22Organic Quinoa Coconut Granola With Mango(en:cereals-and-potatoes, en:plant-based-foods...
23Fire Roasted Hatch Green Chile Almonds(en:plant-based-foods, en:plant-based-foods-an...
24Peanut Butter Power Chews(en:plant-based-foods, en:spreads)
25Real Salt Granular(en:groceries,)
26Organic Unswt Berry Coconut Granola(en:plant-based-foods-and-beverages,)
27Roasted Salted Black Pepper Cashews(en:appetizers, en:plant-based-foods-and-bever...
28Thai Curry Roasted Cashews(en:plant-based-foods, en:plant-based-foods-an...
29Wasabi Tamari Almonds()
.........
254702Fairy Tail()
254703Biscuits aux céréales, aux pépites de chocolat...(en:biscuits, en:biscuits-and-cakes, en:sugary...
254704Dico anglais(en:plant-based-foods, en:plant-based-foods-an...
254705Neszt Cochon Con()
254706Drid apricot the queen(en:beverages,)
254707Natural Cassava(en:beverages,)
254708nan()
254709nan()
254710Soda 1(en:beverages, en:carbonated-drinks, en:sodas,...
254711Merci 1(en:candies, en:chocolates, en:confectioneries...
254712Merci2()
254713Merci3()
254714Libro parachute3()
254715nan()
254716Vegan easy()
254717Tarifs djoghrafia()
254718nan()
254719Ferrero Rocher(en:chocolates,)
254720nan()
254721Raspados Ice Bars(en:sugary-snacks,)
254722nf test()
254723Amandes(en:nuts-and-their-products, en:plant-based-fo...
254724Mleko wiejskie(en:dairies, en:milks)
254725Poireaux(en:fruits-and-vegetables-based-foods, en:plan...
254726Cheese cake thé vert, yuzu(en:herbal-teas,)
254727Tomato & ricotta(en:plant-based-foods, en:plant-based-foods-an...
254728Mint Melange Tea A Blend Of Peppermint, Lemon ...(en:beverages, en:non-sugared-beverages)
254729Biscottes bio(en:breads, en:cereals-and-potatoes, en:plant-...
254730Tomates aux Vermicelles(en:fruits-and-vegetables-based-foods, en:plan...
254731Sugar Free Drink Mix, Peach Tea(en:beverages, en:plant-based-beverages, en:su...
\n", "

254732 rows × 2 columns

\n", "
" ], "text/plain": [ " product_name \\\n", "0 Farine de blé noir \n", "1 Banana Chips Sweetened (Whole) \n", "2 Peanuts \n", "3 Organic Salted Nut Mix \n", "4 Organic Polenta \n", "5 Breadshop Honey Gone Nuts Granola \n", "6 Organic Long Grain White Rice \n", "7 Organic Muesli \n", "8 Organic Dark Chocolate Minis \n", "9 Organic Sunflower Oil \n", "10 Organic Adzuki Beans \n", "11 Organic Penne Pasta \n", "12 Zen Party Mix \n", "13 Organic Golden Flax Seeds \n", "14 Organic Spicy Punks \n", "15 Cinnamon Nut Granola \n", "16 Organic Hazelnuts \n", "17 Organic Sweetened Banana Chips \n", "18 Lotus Organic Brown Jasmine Rice \n", "19 Organic Oat Groats \n", "20 Energy Power Mix \n", "21 Antioxidant Mix - Berries & Chocolate \n", "22 Organic Quinoa Coconut Granola With Mango \n", "23 Fire Roasted Hatch Green Chile Almonds \n", "24 Peanut Butter Power Chews \n", "25 Real Salt Granular \n", "26 Organic Unswt Berry Coconut Granola \n", "27 Roasted Salted Black Pepper Cashews \n", "28 Thai Curry Roasted Cashews \n", "29 Wasabi Tamari Almonds \n", "... ... \n", "254702 Fairy Tail \n", "254703 Biscuits aux céréales, aux pépites de chocolat... \n", "254704 Dico anglais \n", "254705 Neszt Cochon Con \n", "254706 Drid apricot the queen \n", "254707 Natural Cassava \n", "254708 nan \n", "254709 nan \n", "254710 Soda 1 \n", "254711 Merci 1 \n", "254712 Merci2 \n", "254713 Merci3 \n", "254714 Libro parachute3 \n", "254715 nan \n", "254716 Vegan easy \n", "254717 Tarifs djoghrafia \n", "254718 nan \n", "254719 Ferrero Rocher \n", "254720 nan \n", "254721 Raspados Ice Bars \n", "254722 nf test \n", "254723 Amandes \n", "254724 Mleko wiejskie \n", "254725 Poireaux \n", "254726 Cheese cake thé vert, yuzu \n", "254727 Tomato & ricotta \n", "254728 Mint Melange Tea A Blend Of Peppermint, Lemon ... \n", "254729 Biscottes bio \n", "254730 Tomates aux Vermicelles \n", "254731 Sugar Free Drink Mix, Peach Tea \n", "\n", " guessed_labels \n", "0 (en:cereals-and-potatoes, en:cereals-and-their... \n", "1 (en:appetizers, en:chips-and-fries, en:crisps,... \n", "2 (en:legumes, en:legumes-and-their-products, en... \n", "3 () \n", "4 (en:cereals-and-potatoes, en:cereals-and-their... \n", "5 (en:breakfasts, en:cereals-and-potatoes, en:ce... \n", "6 (en:cereal-grains, en:cereals-and-potatoes, en... \n", "7 (en:breakfast-cereals, en:breakfasts, en:cerea... \n", "8 (en:chocolates, en:dark-chocolates, en:sugary-... \n", "9 (en:fats, en:plant-based-foods, en:plant-based... \n", "10 (en:legumes-and-their-products, en:plant-based... \n", "11 (en:cereals-and-potatoes, en:cereals-and-their... \n", "12 () \n", "13 (en:plant-based-foods, en:plant-based-foods-an... \n", "14 () \n", "15 (en:breakfast-cereals, en:breakfasts, en:cerea... \n", "16 (en:sugary-snacks,) \n", "17 (en:appetizers, en:chips-and-fries, en:crisps,... \n", "18 (en:cereals-and-potatoes, en:cereals-and-their... \n", "19 () \n", "20 (en:beverages,) \n", "21 (en:sugary-snacks,) \n", "22 (en:cereals-and-potatoes, en:plant-based-foods... \n", "23 (en:plant-based-foods, en:plant-based-foods-an... \n", "24 (en:plant-based-foods, en:spreads) \n", "25 (en:groceries,) \n", "26 (en:plant-based-foods-and-beverages,) \n", "27 (en:appetizers, en:plant-based-foods-and-bever... \n", "28 (en:plant-based-foods, en:plant-based-foods-an... \n", "29 () \n", "... ... \n", "254702 () \n", "254703 (en:biscuits, en:biscuits-and-cakes, en:sugary... \n", "254704 (en:plant-based-foods, en:plant-based-foods-an... \n", "254705 () \n", "254706 (en:beverages,) \n", "254707 (en:beverages,) \n", "254708 () \n", "254709 () \n", "254710 (en:beverages, en:carbonated-drinks, en:sodas,... \n", "254711 (en:candies, en:chocolates, en:confectioneries... \n", "254712 () \n", "254713 () \n", "254714 () \n", "254715 () \n", "254716 () \n", "254717 () \n", "254718 () \n", "254719 (en:chocolates,) \n", "254720 () \n", "254721 (en:sugary-snacks,) \n", "254722 () \n", "254723 (en:nuts-and-their-products, en:plant-based-fo... \n", "254724 (en:dairies, en:milks) \n", "254725 (en:fruits-and-vegetables-based-foods, en:plan... \n", "254726 (en:herbal-teas,) \n", "254727 (en:plant-based-foods, en:plant-based-foods-an... \n", "254728 (en:beverages, en:non-sugared-beverages) \n", "254729 (en:breads, en:cereals-and-potatoes, en:plant-... \n", "254730 (en:fruits-and-vegetables-based-foods, en:plan... \n", "254731 (en:beverages, en:plant-based-beverages, en:su... \n", "\n", "[254732 rows x 2 columns]" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prediction_dataframe = pandas.DataFrame({\n", " 'product_name': products_without_categories['product_name'].values.astype('U'),\n", " 'guessed_labels': all_labels\n", "}, columns=['product_name', 'guessed_labels'])\n", "prediction_dataframe.to_csv('prediction.csv', index=False)\n", "prediction_dataframe" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 True\n", "1 True\n", "2 True\n", "3 False\n", "4 True\n", "5 True\n", "6 True\n", "7 True\n", "8 True\n", "9 True\n", "10 True\n", "11 True\n", "12 False\n", "13 True\n", "14 False\n", "15 True\n", "16 True\n", "17 True\n", "18 True\n", "19 False\n", "20 True\n", "21 True\n", "22 True\n", "23 True\n", "24 True\n", "25 True\n", "26 True\n", "27 True\n", "28 True\n", "29 False\n", " ... \n", "254702 False\n", "254703 True\n", "254704 True\n", "254705 False\n", "254706 True\n", "254707 True\n", "254708 False\n", "254709 False\n", "254710 True\n", "254711 True\n", "254712 False\n", "254713 False\n", "254714 False\n", "254715 False\n", "254716 False\n", "254717 False\n", "254718 False\n", "254719 True\n", "254720 False\n", "254721 True\n", "254722 False\n", "254723 True\n", "254724 True\n", "254725 True\n", "254726 True\n", "254727 True\n", "254728 True\n", "254729 True\n", "254730 True\n", "254731 True\n", "Name: guessed_labels, Length: 254732, dtype: bool" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prediction_dataframe['guessed_labels'].str.len() > 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }