{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "import itertools\n",
    "import json\n",
    "import os\n",
    "import random\n",
    "\n",
    "import numpy as np\n",
    "import pandas\n",
    "\n",
    "try:\n",
    "    # Standalone joblib: `sklearn.externals.joblib` was deprecated in\n",
    "    # scikit-learn 0.21 and removed in 0.23.\n",
    "    import joblib\n",
    "except ImportError:\n",
    "    # Older environments that only ship the copy vendored by scikit-learn.\n",
    "    from sklearn.externals import joblib\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.multiclass import OneVsRestClassifier\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "from sklearn.svm import LinearSVC\n",
    "\n",
    "# A category is kept only when strictly more than this many products carry it.\n",
    "MIN_NUMBER_PRODUCTS_PER_CATEGORY = 25\n",
    "# Fraction of the categorized products sampled for training.\n",
    "TRAINING_DATASET_SIZE = 50 / 100\n",
    "# Seed of the training sample, so that Restart & Run All reproduces the split.\n",
    "RANDOM_SEED = 42"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data from CSV dump"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns are selected by name (previously by positions 0, 7 and 15) so that a\n",
    "# change in the dump layout fails loudly instead of loading the wrong fields.\n",
    "csv = pandas.read_csv(\n",
    "    'en.openfoodfacts.org.products.csv',\n",
    "    sep='\\t',\n",
    "    usecols=['code', 'product_name', 'categories_tags'],\n",
    "    dtype={'code': 'str', 'product_name': 'str'},\n",
    "    # Empty cells become NaN so that notnull/isnull can split the products below.\n",
    "    converters={'categories_tags': lambda x: x.split(',') if x else np.nan}\n",
    ")\n",
    "# Filter products with and without categories in two different DataFrames\n",
    "products_with_categories = csv[pandas.notnull(csv['categories_tags'])]\n",
    "products_without_categories = csv[pandas.isnull(csv['categories_tags'])]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fitting on dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's build vectors of products and categories, for training purpose.\n",
    "# Count how many products carry each category tag.\n",
    "category_counts = collections.Counter(\n",
    "    itertools.chain.from_iterable(products_with_categories['categories_tags'])\n",
    ")\n",
    "categories = [\n",
    "    category\n",
    "    for category, count in category_counts.items()\n",
    "    if count > MIN_NUMBER_PRODUCTS_PER_CATEGORY and category != ''\n",
    "]\n",
    "# Set view of `categories`: membership is tested once per tag of every\n",
    "# product, which is needlessly slow against a list.\n",
    "known_categories = frozenset(categories)\n",
    "# Keep only the retained categories on each product, then filter out the\n",
    "# products left with an empty list of categories.\n",
    "XY = products_with_categories.copy()\n",
    "XY['categories_tags'] = XY['categories_tags'].map(\n",
    "    lambda c_list: [c for c in c_list if c in known_categories]\n",
    ")\n",
    "mask = XY['categories_tags'].str.len() > 0\n",
    "XY = XY[mask]\n",
    "# TODO: We should ensure each category is sufficiently represented"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select training sample\n",
    "# Seeded, so the split (and everything fitted on it) is reproducible.\n",
    "XY_training = XY.sample(frac=TRAINING_DATASET_SIZE, random_state=RANDOM_SEED)\n",
    "X_train = XY_training['product_name'].values.astype('U')\n",
    "Y_train = [np.array(c).astype('U') for c in XY_training['categories_tags'].values]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "mlb = MultiLabelBinarizer()\n",
    "Y_train_transformed = mlb.fit_transform(Y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 335,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       " steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       " dtype=, encoding='utf-8', input='content',\n",
       " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
       " ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
       " verbose=0),\n",
       " n_jobs=1))])"
      ]
     },
     "execution_count": 335,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit our classifier\n",
    "classifier = Pipeline([\n",
    "    ('vectorizer', CountVectorizer()),\n",
    "    ('tfidf', TfidfTransformer()),\n",
    "    ('clf', OneVsRestClassifier(LinearSVC()))])\n",
    "\n",
    "classifier.fit(X_train, Y_train_transformed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 380,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a held-out testing set and predict on it\n",
    "# The mask is True for the products whose name does not appear anywhere in the\n",
    "# training sample, i.e. it selects testing rows, not training rows.\n",
    "test_mask = np.isin(XY['product_name'].values.astype('U'), X_train, invert=True)\n",
    "X_test = XY.loc[test_mask, 'product_name']\n",
    "Y_test = [np.array(c).astype('U') for c in XY.loc[test_mask, 'categories_tags'].values]\n",
    "\n",
    "predicted = classifier.predict(X_test)\n",
    "all_labels = mlb.inverse_transform(predicted)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 394,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
product_nameoriginal_labelsguessed_labels
176Salade Cesar[en:plant-based-foods-and-beverages, en:plant-...(en:meals, fr:salades-composees)
184lentilles vertes[en:plant-based-foods-and-beverages, en:plant-...(en:green-lentils, en:legume-seeds, en:legumes...
185Root Beer[en:beverages, en:carbonated-drinks, en:sodas,...(en:alcoholic-beverages, en:beers, en:beverage...
186Biscuits sablés fourrage au cacao[en:sugary-snacks, en:biscuits-and-cakes, en:b...(en:biscuits, en:biscuits-and-cakes, en:chocol...
238Blle Pet 50CL Coca Cola Cherry[en:beverages, en:sugared-beverages](en:beverages, en:non-sugared-beverages, en:so...
244Cauliflower[en:plant-based-foods-and-beverages, en:plant-...(en:fruits-and-vegetables-based-foods, en:leaf...
247Salsa de mostaza[en:groceries, en:condiments, en:sauces, en:mu...(en:condiments, en:groceries, en:mustards, en:...
2497Up[en:plant-based-foods-and-beverages, en:bevera...(en:beverages, en:non-sugared-beverages)
276Mehrkomponeneten Protein 90 C6 Haselnuß[en:dietary-supplements, en:bodybuilding-suppl...(en:bodybuilding-supplements, en:dietary-suppl...
292Cakes aux Fruits[en:sugary-snacks, en:biscuits-and-cakes, en:d...(en:biscuits-and-cakes, en:cakes, en:desserts,...
309Whey Protein aus Molke 1000 Gramm Vanilla[en:dietary-supplements, en:bodybuilding-suppl...(en:bodybuilding-supplements, en:dietary-suppl...
310Fondants Citron[en:sugary-snacks, en:biscuits-and-cakes, en:d...()
369Sour Fruit Gummies[en:sugary-snacks, en:confectioneries, en:cand...(en:candies, en:confectioneries, en:sugary-sna...
421Mixed peppers[en:plant-based-foods-and-beverages, en:plant-...(en:fruits-and-vegetables-based-foods, en:plan...
43730 Panach' Fruits[en:sugary-snacks, en:biscuits-and-cakes, en:d...(en:fruits-and-vegetables-based-foods, en:frui...
440Foie gras de canard du Périgord[en:fish-and-meat-and-eggs, fr:foies-gras, fr:...(en:fish-and-meat-and-eggs, fr:foies-gras, fr:...
454Terrine de caille aux pruneaux d'Agen[en:terrine, fr:terrines-de-volailles](en:terrine,)
455Foie de canard aux figues[en:fish-and-meat-and-eggs, fr:foies-gras, fr:...(en:fish-and-meat-and-eggs, en:meats)
458Foie gras d'oie Périgord[en:fish-and-meat-and-eggs, fr:foies-gras](en:fish-and-meat-and-eggs, fr:foies-gras)
459Foie gras d'oie du Périgord[en:fish-and-meat-and-eggs, fr:foies-gras](en:fish-and-meat-and-eggs, fr:foies-gras)
464All Butter Belgian White Chocolate Chunk Cookies[en:sugary-snacks, en:biscuits-and-cakes, en:b...(en:biscuits, en:biscuits-and-cakes, en:chocol...
465All Butter Fruity Flapjack Cookies[en:sugary-snacks, en:biscuits-and-cakes, en:b...(en:biscuits, en:biscuits-and-cakes, en:cookie...
467All butter Cranberry & Orange Cookies[en:sugary-snacks, en:biscuits-and-cakes, en:b...(en:biscuits, en:biscuits-and-cakes, en:cookie...
468All Butter Triple Belgian Chocolate Chunk Cookies[en:sugary-snacks, en:biscuits-and-cakes, en:b...(en:biscuits, en:biscuits-and-cakes, en:cookie...
469Cookies Stem Ginger[en:sugary-snacks, en:biscuits-and-cakes, en:b...(en:biscuits, en:biscuits-and-cakes, en:cookie...
472Stem Ginger Dunkers[en:sugary-snacks, en:biscuits-and-cakes, en:b...()
484Reduced Fat Mayonnaise[en:groceries, en:sauces, en:mayonnaises](en:dairies, en:groceries, en:milks, en:sauces)
492mostly mesquite honey[en:spreads, en:breakfasts, en:sweet-spreads, ...(en:breakfasts,)
496Clam Chowder A Condensed Soup[en:meals, en:soups](en:meals, en:soups)
509Salade Mac[en:plant-based-foods-and-beverages, en:plant-...()
............
358216Sauc Nuoc Mam[en:groceries, en:sauces](en:plant-based-foods, en:plant-based-foods-an...
358217Sauce piment doux Thaï[en:groceries, en:sauces, en:pimented-sauces](en:groceries, en:sauces)
358220Sauce Piment Sriracha[en:plant-based-foods-and-beverages, en:plant-...(en:groceries, en:pimented-sauces, en:sauces)
358222Sauce Thaï Satay[en:groceries, en:sauces, en:pimented-sauces](en:groceries, en:sauces)
358225Crème de coco allégée[en:plant-based-foods-and-beverages, en:plant-...(en:plant-based-creams, en:plant-based-foods-a...
358226Nouilles instantanées[en:plant-based-foods-and-beverages, en:plant-...(en:cereals-and-potatoes, en:cereals-and-their...
358231Pad Thaï Sauce Wok[en:groceries, en:sauces](en:sauces,)
358233Pâte de Curry Vert[en:groceries, en:sauces](en:curry-pastes, en:groceries)
358234Pâte de curry rouge[en:groceries, en:sauces, en:curry-pastes](en:groceries,)
358242Gula Gula Durian[en:sugary-snacks, en:confectioneries, en:cand...()
358254Boisson au chrysanthème[en:plant-based-foods-and-beverages, en:bevera...(en:beverages, en:non-sugared-beverages, en:pl...
358255Lychee Drink[en:plant-based-foods-and-beverages, en:bevera...(en:beverages, en:fruit-based-beverages, en:pl...
358279Pudding nata de coco[en:desserts, en:puddings](en:desserts, en:puddings)
358282Glinter Soft Drink Orange[en:beverages, en:non-sugared-beverages](en:beverages, en:plant-based-foods-and-bevera...
358283Soft Drink[en:beverages, en:artificially-sweetened-bever...()
358293Healtier palm oil (L'huile de palme)[en:plant-based-foods-and-beverages, en:plant-...(en:fats, en:plant-based-foods, en:plant-based...
358306100% Pur Jus 4 agrumes[en:plant-based-foods-and-beverages, en:bevera...(en:beverages, en:fruit-based-beverages, en:fr...
358325tuna chunks in spring water[en:canned-foods, en:seafood, en:fishes, en:ca...(en:canned-fishes, en:canned-foods, en:canned-...
358335Santa Cruz Chilli & Lime Dressing[en:groceries, en:sauces, en:salad-dressings](en:groceries, en:salad-dressings, en:sauces)
358343Fisherman's Friend Miel-Citron[en:sugary-snacks, en:confectioneries, en:cand...(en:candies, en:confectioneries, en:sugary-sna...
358347Dessert Noir (lot de 2)[en:sugary-snacks, en:chocolates, en:dark-choc...(en:chocolates, en:dark-chocolates, en:dessert...
358351Kirkland Purified Drinking Water[en:beverages, en:waters, en:non-sugared-bever...(en:beverages, en:non-sugared-beverages, en:wa...
358372Boîte de saumon frais trouvée à l'extérieur d'...[en:seafood, en:fishes, en:salmons]()
358389Cervoise Mexicaine[en:beverages, en:alcoholic-beverages, en:arti...(en:alcoholic-beverages, en:amber-beers, en:be...
358399Ma bite[en:beverages, en:non-sugared-beverages](en:plant-based-foods, en:plant-based-foods-an...
358400Les schtroumpfs & le village des fille[en:fats](en:confectioneries, en:sugary-snacks)
358426Les Belles Tranches Bacon fumé[en:meats, en:pork, en:bacon, en:sliced-bacon](en:bacon, en:meats, en:pork, en:sliced-bacon)
358428Tartines craquantes bio au sarrasin[en:plant-based-foods-and-beverages, en:plant-...(en:breads, en:cereals-and-potatoes, en:crispb...
358438Roussette du Bugey (2011)[en:beverages, en:alcoholic-beverages, en:wine...(en:alcoholic-beverages, en:beverages, en:fren...
358446乐吧泡菜味薯片[en:salty-snacks, en:appetizers, en:chips-and-...()
\n", "

38381 rows × 3 columns

\n", "
" ], "text/plain": [ " product_name \\\n", "176 Salade Cesar \n", "184 lentilles vertes \n", "185 Root Beer \n", "186 Biscuits sablés fourrage au cacao \n", "238 Blle Pet 50CL Coca Cola Cherry \n", "244 Cauliflower \n", "247 Salsa de mostaza \n", "249 7Up \n", "276 Mehrkomponeneten Protein 90 C6 Haselnuß \n", "292 Cakes aux Fruits \n", "309 Whey Protein aus Molke 1000 Gramm Vanilla \n", "310 Fondants Citron \n", "369 Sour Fruit Gummies \n", "421 Mixed peppers \n", "437 30 Panach' Fruits \n", "440 Foie gras de canard du Périgord \n", "454 Terrine de caille aux pruneaux d'Agen \n", "455 Foie de canard aux figues \n", "458 Foie gras d'oie Périgord \n", "459 Foie gras d'oie du Périgord \n", "464 All Butter Belgian White Chocolate Chunk Cookies \n", "465 All Butter Fruity Flapjack Cookies \n", "467 All butter Cranberry & Orange Cookies \n", "468 All Butter Triple Belgian Chocolate Chunk Cookies \n", "469 Cookies Stem Ginger \n", "472 Stem Ginger Dunkers \n", "484 Reduced Fat Mayonnaise \n", "492 mostly mesquite honey \n", "496 Clam Chowder A Condensed Soup \n", "509 Salade Mac \n", "... ... \n", "358216 Sauc Nuoc Mam \n", "358217 Sauce piment doux Thaï \n", "358220 Sauce Piment Sriracha \n", "358222 Sauce Thaï Satay \n", "358225 Crème de coco allégée \n", "358226 Nouilles instantanées \n", "358231 Pad Thaï Sauce Wok \n", "358233 Pâte de Curry Vert \n", "358234 Pâte de curry rouge \n", "358242 Gula Gula Durian \n", "358254 Boisson au chrysanthème \n", "358255 Lychee Drink \n", "358279 Pudding nata de coco \n", "358282 Glinter Soft Drink Orange \n", "358283 Soft Drink \n", "358293 Healtier palm oil (L'huile de palme) \n", "358306 100% Pur Jus 4 agrumes \n", "358325 tuna chunks in spring water \n", "358335 Santa Cruz Chilli & Lime Dressing \n", "358343 Fisherman's Friend Miel-Citron \n", "358347 Dessert Noir (lot de 2) \n", "358351 Kirkland Purified Drinking Water \n", "358372 Boîte de saumon frais trouvée à l'extérieur d'... 
\n", "358389 Cervoise Mexicaine \n", "358399 Ma bite \n", "358400 Les schtroumpfs & le village des fille \n", "358426 Les Belles Tranches Bacon fumé \n", "358428 Tartines craquantes bio au sarrasin \n", "358438 Roussette du Bugey (2011) \n", "358446 乐吧泡菜味薯片 \n", "\n", " original_labels \\\n", "176 [en:plant-based-foods-and-beverages, en:plant-... \n", "184 [en:plant-based-foods-and-beverages, en:plant-... \n", "185 [en:beverages, en:carbonated-drinks, en:sodas,... \n", "186 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n", "238 [en:beverages, en:sugared-beverages] \n", "244 [en:plant-based-foods-and-beverages, en:plant-... \n", "247 [en:groceries, en:condiments, en:sauces, en:mu... \n", "249 [en:plant-based-foods-and-beverages, en:bevera... \n", "276 [en:dietary-supplements, en:bodybuilding-suppl... \n", "292 [en:sugary-snacks, en:biscuits-and-cakes, en:d... \n", "309 [en:dietary-supplements, en:bodybuilding-suppl... \n", "310 [en:sugary-snacks, en:biscuits-and-cakes, en:d... \n", "369 [en:sugary-snacks, en:confectioneries, en:cand... \n", "421 [en:plant-based-foods-and-beverages, en:plant-... \n", "437 [en:sugary-snacks, en:biscuits-and-cakes, en:d... \n", "440 [en:fish-and-meat-and-eggs, fr:foies-gras, fr:... \n", "454 [en:terrine, fr:terrines-de-volailles] \n", "455 [en:fish-and-meat-and-eggs, fr:foies-gras, fr:... \n", "458 [en:fish-and-meat-and-eggs, fr:foies-gras] \n", "459 [en:fish-and-meat-and-eggs, fr:foies-gras] \n", "464 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n", "465 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n", "467 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n", "468 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n", "469 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n", "472 [en:sugary-snacks, en:biscuits-and-cakes, en:b... \n", "484 [en:groceries, en:sauces, en:mayonnaises] \n", "492 [en:spreads, en:breakfasts, en:sweet-spreads, ... 
\n", "496 [en:meals, en:soups] \n", "509 [en:plant-based-foods-and-beverages, en:plant-... \n", "... ... \n", "358216 [en:groceries, en:sauces] \n", "358217 [en:groceries, en:sauces, en:pimented-sauces] \n", "358220 [en:plant-based-foods-and-beverages, en:plant-... \n", "358222 [en:groceries, en:sauces, en:pimented-sauces] \n", "358225 [en:plant-based-foods-and-beverages, en:plant-... \n", "358226 [en:plant-based-foods-and-beverages, en:plant-... \n", "358231 [en:groceries, en:sauces] \n", "358233 [en:groceries, en:sauces] \n", "358234 [en:groceries, en:sauces, en:curry-pastes] \n", "358242 [en:sugary-snacks, en:confectioneries, en:cand... \n", "358254 [en:plant-based-foods-and-beverages, en:bevera... \n", "358255 [en:plant-based-foods-and-beverages, en:bevera... \n", "358279 [en:desserts, en:puddings] \n", "358282 [en:beverages, en:non-sugared-beverages] \n", "358283 [en:beverages, en:artificially-sweetened-bever... \n", "358293 [en:plant-based-foods-and-beverages, en:plant-... \n", "358306 [en:plant-based-foods-and-beverages, en:bevera... \n", "358325 [en:canned-foods, en:seafood, en:fishes, en:ca... \n", "358335 [en:groceries, en:sauces, en:salad-dressings] \n", "358343 [en:sugary-snacks, en:confectioneries, en:cand... \n", "358347 [en:sugary-snacks, en:chocolates, en:dark-choc... \n", "358351 [en:beverages, en:waters, en:non-sugared-bever... \n", "358372 [en:seafood, en:fishes, en:salmons] \n", "358389 [en:beverages, en:alcoholic-beverages, en:arti... \n", "358399 [en:beverages, en:non-sugared-beverages] \n", "358400 [en:fats] \n", "358426 [en:meats, en:pork, en:bacon, en:sliced-bacon] \n", "358428 [en:plant-based-foods-and-beverages, en:plant-... \n", "358438 [en:beverages, en:alcoholic-beverages, en:wine... \n", "358446 [en:salty-snacks, en:appetizers, en:chips-and-... \n", "\n", " guessed_labels \n", "176 (en:meals, fr:salades-composees) \n", "184 (en:green-lentils, en:legume-seeds, en:legumes... \n", "185 (en:alcoholic-beverages, en:beers, en:beverage... 
\n", "186 (en:biscuits, en:biscuits-and-cakes, en:chocol... \n", "238 (en:beverages, en:non-sugared-beverages, en:so... \n", "244 (en:fruits-and-vegetables-based-foods, en:leaf... \n", "247 (en:condiments, en:groceries, en:mustards, en:... \n", "249 (en:beverages, en:non-sugared-beverages) \n", "276 (en:bodybuilding-supplements, en:dietary-suppl... \n", "292 (en:biscuits-and-cakes, en:cakes, en:desserts,... \n", "309 (en:bodybuilding-supplements, en:dietary-suppl... \n", "310 () \n", "369 (en:candies, en:confectioneries, en:sugary-sna... \n", "421 (en:fruits-and-vegetables-based-foods, en:plan... \n", "437 (en:fruits-and-vegetables-based-foods, en:frui... \n", "440 (en:fish-and-meat-and-eggs, fr:foies-gras, fr:... \n", "454 (en:terrine,) \n", "455 (en:fish-and-meat-and-eggs, en:meats) \n", "458 (en:fish-and-meat-and-eggs, fr:foies-gras) \n", "459 (en:fish-and-meat-and-eggs, fr:foies-gras) \n", "464 (en:biscuits, en:biscuits-and-cakes, en:chocol... \n", "465 (en:biscuits, en:biscuits-and-cakes, en:cookie... \n", "467 (en:biscuits, en:biscuits-and-cakes, en:cookie... \n", "468 (en:biscuits, en:biscuits-and-cakes, en:cookie... \n", "469 (en:biscuits, en:biscuits-and-cakes, en:cookie... \n", "472 () \n", "484 (en:dairies, en:groceries, en:milks, en:sauces) \n", "492 (en:breakfasts,) \n", "496 (en:meals, en:soups) \n", "509 () \n", "... ... \n", "358216 (en:plant-based-foods, en:plant-based-foods-an... \n", "358217 (en:groceries, en:sauces) \n", "358220 (en:groceries, en:pimented-sauces, en:sauces) \n", "358222 (en:groceries, en:sauces) \n", "358225 (en:plant-based-creams, en:plant-based-foods-a... \n", "358226 (en:cereals-and-potatoes, en:cereals-and-their... \n", "358231 (en:sauces,) \n", "358233 (en:curry-pastes, en:groceries) \n", "358234 (en:groceries,) \n", "358242 () \n", "358254 (en:beverages, en:non-sugared-beverages, en:pl... \n", "358255 (en:beverages, en:fruit-based-beverages, en:pl... 
\n",
       "358279 (en:desserts, en:puddings) \n",
       "358282 (en:beverages, en:plant-based-foods-and-bevera... \n",
       "358283 () \n",
       "358293 (en:fats, en:plant-based-foods, en:plant-based... \n",
       "358306 (en:beverages, en:fruit-based-beverages, en:fr... \n",
       "358325 (en:canned-fishes, en:canned-foods, en:canned-... \n",
       "358335 (en:groceries, en:salad-dressings, en:sauces) \n",
       "358343 (en:candies, en:confectioneries, en:sugary-sna... \n",
       "358347 (en:chocolates, en:dark-chocolates, en:dessert... \n",
       "358351 (en:beverages, en:non-sugared-beverages, en:wa... \n",
       "358372 () \n",
       "358389 (en:alcoholic-beverages, en:amber-beers, en:be... \n",
       "358399 (en:plant-based-foods, en:plant-based-foods-an... \n",
       "358400 (en:confectioneries, en:sugary-snacks) \n",
       "358426 (en:bacon, en:meats, en:pork, en:sliced-bacon) \n",
       "358428 (en:breads, en:cereals-and-potatoes, en:crispb... \n",
       "358438 (en:alcoholic-beverages, en:beverages, en:fren... \n",
       "358446 () \n",
       "\n",
       "[38381 rows x 3 columns]"
      ]
     },
     "execution_count": 394,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Put the known and the guessed labels of the testing set side by side\n",
    "testing_columns = ['product_name', 'original_labels', 'guessed_labels']\n",
    "testing_dataframe = pandas.DataFrame(\n",
    "    dict(zip(testing_columns, (X_test, Y_test, all_labels))),\n",
    "    columns=testing_columns\n",
    ")\n",
    "testing_dataframe.to_csv('testing.csv', index=False)\n",
    "testing_dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['offClassifier.pkl']"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Persist the label binarizer together with the fitted pipeline\n",
    "fitted_model = (mlb, classifier)\n",
    "joblib.dump(fitted_model, 'offClassifier.pkl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict!"
] },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def batch(iterable, size):\n",
    "    \"\"\"\n",
    "    Get items from a sequence a batch at a time.\n",
    "\n",
    "    Every batch is a lazy iterator reading from the same underlying source,\n",
    "    so a batch has to be consumed before the next one is requested.\n",
    "\n",
    "    :param iterable: The iterable to get the items from.\n",
    "    :param size: The size of the batches.\n",
    "    :return: A new iterable, yielding one iterator per non-empty batch.\n",
    "    \"\"\"\n",
    "    sourceiter = iter(iterable)\n",
    "    while True:\n",
    "        batchiter = itertools.islice(sourceiter, size)\n",
    "        try:\n",
    "            first = next(batchiter)\n",
    "        except StopIteration:\n",
    "            # Source exhausted: end the generator explicitly. Letting\n",
    "            # StopIteration escape a generator body is a RuntimeError since\n",
    "            # Python 3.7 (PEP 479).\n",
    "            return\n",
    "        yield itertools.chain([first], batchiter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the classifier\n",
    "mlb, classifier = joblib.load('offClassifier.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of product names handed to the classifier per predict() call.\n",
    "PREDICTION_BATCH_SIZE = 30000\n",
    "\n",
    "X_predicted = products_without_categories['product_name'].values.astype('U')\n",
    "\n",
    "all_labels = []\n",
    "\n",
    "# Predict chunk by chunk (presumably to bound memory use), materializing each\n",
    "# lazy batch into a list before handing it to the pipeline.\n",
    "for chunk in batch(X_predicted, PREDICTION_BATCH_SIZE):\n",
    "    predicted = classifier.predict(list(chunk))\n",
    "    all_labels.extend(mlb.inverse_transform(predicted))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
product_nameguessed_labels
0Farine de blé noir(en:cereal-flours, en:cereals-and-potatoes, en...
1Banana Chips Sweetened (Whole)()
2Peanuts(en:legumes, en:legumes-and-their-products, en...
3Organic Salted Nut Mix()
4Organic Polenta(en:cereals-and-potatoes, en:cereals-and-their...
5Breadshop Honey Gone Nuts Granola(en:breakfasts, en:sugary-snacks)
6Organic Long Grain White Rice(en:cereal-grains, en:cereals-and-potatoes, en...
7Organic Muesli(en:breakfast-cereals, en:breakfasts, en:cerea...
8Organic Dark Chocolate Minis(en:chocolates, en:dark-chocolates, en:sugary-...
9Organic Sunflower Oil(en:fats, en:plant-based-foods, en:plant-based...
10Organic Adzuki Beans(en:legumes-and-their-products, en:plant-based...
11Organic Penne Pasta(en:cereals-and-potatoes, en:cereals-and-their...
12Zen Party Mix()
13Organic Golden Flax Seeds(en:plant-based-foods, en:plant-based-foods-an...
14Organic Spicy Punks()
15Cinnamon Nut Granola(en:breakfast-cereals, en:breakfasts, en:cerea...
16Organic Hazelnuts()
17Organic Sweetened Banana Chips()
18Lotus Organic Brown Jasmine Rice(en:cereal-grains, en:cereals-and-their-produc...
19Organic Oat Groats()
20Energy Power Mix()
21Antioxidant Mix - Berries & Chocolate(en:plant-based-foods-and-beverages,)
22Organic Quinoa Coconut Granola With Mango(en:plant-based-foods, en:plant-based-foods-an...
23Fire Roasted Hatch Green Chile Almonds()
24Peanut Butter Power Chews(en:plant-based-foods,)
25Real Salt Granular(en:chips-and-fries, en:crisps, en:groceries, ...
26Organic Unswt Berry Coconut Granola(en:plant-based-foods, en:plant-based-foods-an...
27Roasted Salted Black Pepper Cashews(en:cashew-nuts, en:nuts, en:plant-based-foods...
28Thai Curry Roasted Cashews(en:cashew-nuts, en:plant-based-foods, en:plan...
29Wasabi Tamari Almonds(en:condiments, en:groceries, en:plant-based-f...
.........
254346Fairy Tail()
254347Biscuits aux céréales, aux pépites de chocolat...(en:biscuits, en:biscuits-and-cakes, en:sugary...
254348Neszt Cochon Con()
254349nan()
254350Drid apricot the queen(en:beverages,)
254351Natural Cassava(en:plant-based-foods-and-beverages,)
254352nan()
254353nan()
254354Soda 1(en:beverages, en:carbonated-drinks, en:sodas,...
254355Merci 1(en:bonbons, en:candies, en:chocolates, en:cho...
254356Merci2()
254357Merci3()
254358Libro parachute3()
254359nan()
254360Vegan easy(en:plant-based-foods, en:plant-based-foods-an...
254361Tarifs djoghrafia()
254362nan()
254363Ferrero Rocher(en:chocolates, en:sugary-snacks)
254364nan()
254365Raspados Ice Bars(en:bars, en:cereal-bars, en:sugary-snacks)
254366nf test(en:beverages,)
254367Amandes(en:almonds, en:chocolates-with-almonds, en:nu...
254368Mleko wiejskie(en:dairies, en:milks)
254369Poireaux(en:fruits-and-vegetables-based-foods, en:leek...
254370Cheese cake thé vert, yuzu()
254371Tomato & ricotta()
254372Mint Melange Tea A Blend Of Peppermint, Lemon ...(en:beverages, en:non-sugared-beverages, en:su...
254373Biscottes bio(en:breads, en:cereals-and-potatoes, en:plant-...
254374Tomates aux Vermicelles(en:fruits-and-vegetables-based-foods, en:meal...
254375Sugar Free Drink Mix, Peach Tea(en:artificially-sweetened-beverages, en:bever...
\n", "

254376 rows × 2 columns

\n", "
" ], "text/plain": [ " product_name \\\n", "0 Farine de blé noir \n", "1 Banana Chips Sweetened (Whole) \n", "2 Peanuts \n", "3 Organic Salted Nut Mix \n", "4 Organic Polenta \n", "5 Breadshop Honey Gone Nuts Granola \n", "6 Organic Long Grain White Rice \n", "7 Organic Muesli \n", "8 Organic Dark Chocolate Minis \n", "9 Organic Sunflower Oil \n", "10 Organic Adzuki Beans \n", "11 Organic Penne Pasta \n", "12 Zen Party Mix \n", "13 Organic Golden Flax Seeds \n", "14 Organic Spicy Punks \n", "15 Cinnamon Nut Granola \n", "16 Organic Hazelnuts \n", "17 Organic Sweetened Banana Chips \n", "18 Lotus Organic Brown Jasmine Rice \n", "19 Organic Oat Groats \n", "20 Energy Power Mix \n", "21 Antioxidant Mix - Berries & Chocolate \n", "22 Organic Quinoa Coconut Granola With Mango \n", "23 Fire Roasted Hatch Green Chile Almonds \n", "24 Peanut Butter Power Chews \n", "25 Real Salt Granular \n", "26 Organic Unswt Berry Coconut Granola \n", "27 Roasted Salted Black Pepper Cashews \n", "28 Thai Curry Roasted Cashews \n", "29 Wasabi Tamari Almonds \n", "... ... \n", "254346 Fairy Tail \n", "254347 Biscuits aux céréales, aux pépites de chocolat... \n", "254348 Neszt Cochon Con \n", "254349 nan \n", "254350 Drid apricot the queen \n", "254351 Natural Cassava \n", "254352 nan \n", "254353 nan \n", "254354 Soda 1 \n", "254355 Merci 1 \n", "254356 Merci2 \n", "254357 Merci3 \n", "254358 Libro parachute3 \n", "254359 nan \n", "254360 Vegan easy \n", "254361 Tarifs djoghrafia \n", "254362 nan \n", "254363 Ferrero Rocher \n", "254364 nan \n", "254365 Raspados Ice Bars \n", "254366 nf test \n", "254367 Amandes \n", "254368 Mleko wiejskie \n", "254369 Poireaux \n", "254370 Cheese cake thé vert, yuzu \n", "254371 Tomato & ricotta \n", "254372 Mint Melange Tea A Blend Of Peppermint, Lemon ... \n", "254373 Biscottes bio \n", "254374 Tomates aux Vermicelles \n", "254375 Sugar Free Drink Mix, Peach Tea \n", "\n", " guessed_labels \n", "0 (en:cereal-flours, en:cereals-and-potatoes, en... 
\n", "1 () \n", "2 (en:legumes, en:legumes-and-their-products, en... \n", "3 () \n", "4 (en:cereals-and-potatoes, en:cereals-and-their... \n", "5 (en:breakfasts, en:sugary-snacks) \n", "6 (en:cereal-grains, en:cereals-and-potatoes, en... \n", "7 (en:breakfast-cereals, en:breakfasts, en:cerea... \n", "8 (en:chocolates, en:dark-chocolates, en:sugary-... \n", "9 (en:fats, en:plant-based-foods, en:plant-based... \n", "10 (en:legumes-and-their-products, en:plant-based... \n", "11 (en:cereals-and-potatoes, en:cereals-and-their... \n", "12 () \n", "13 (en:plant-based-foods, en:plant-based-foods-an... \n", "14 () \n", "15 (en:breakfast-cereals, en:breakfasts, en:cerea... \n", "16 () \n", "17 () \n", "18 (en:cereal-grains, en:cereals-and-their-produc... \n", "19 () \n", "20 () \n", "21 (en:plant-based-foods-and-beverages,) \n", "22 (en:plant-based-foods, en:plant-based-foods-an... \n", "23 () \n", "24 (en:plant-based-foods,) \n", "25 (en:chips-and-fries, en:crisps, en:groceries, ... \n", "26 (en:plant-based-foods, en:plant-based-foods-an... \n", "27 (en:cashew-nuts, en:nuts, en:plant-based-foods... \n", "28 (en:cashew-nuts, en:plant-based-foods, en:plan... \n", "29 (en:condiments, en:groceries, en:plant-based-f... \n", "... ... \n", "254346 () \n", "254347 (en:biscuits, en:biscuits-and-cakes, en:sugary... \n", "254348 () \n", "254349 () \n", "254350 (en:beverages,) \n", "254351 (en:plant-based-foods-and-beverages,) \n", "254352 () \n", "254353 () \n", "254354 (en:beverages, en:carbonated-drinks, en:sodas,... \n", "254355 (en:bonbons, en:candies, en:chocolates, en:cho... \n", "254356 () \n", "254357 () \n", "254358 () \n", "254359 () \n", "254360 (en:plant-based-foods, en:plant-based-foods-an... \n", "254361 () \n", "254362 () \n", "254363 (en:chocolates, en:sugary-snacks) \n", "254364 () \n", "254365 (en:bars, en:cereal-bars, en:sugary-snacks) \n", "254366 (en:beverages,) \n", "254367 (en:almonds, en:chocolates-with-almonds, en:nu... 
\n", "254368 (en:dairies, en:milks) \n", "254369 (en:fruits-and-vegetables-based-foods, en:leek... \n", "254370 () \n", "254371 () \n", "254372 (en:beverages, en:non-sugared-beverages, en:su... \n", "254373 (en:breads, en:cereals-and-potatoes, en:plant-... \n", "254374 (en:fruits-and-vegetables-based-foods, en:meal... \n", "254375 (en:artificially-sweetened-beverages, en:bever... \n", "\n", "[254376 rows x 2 columns]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prediction_dataframe = pandas.DataFrame({\n", " 'product_name': products_without_categories['product_name'].values.astype('U'),\n", " 'guessed_labels': all_labels\n", "}, columns=['product_name', 'guessed_labels'])\n", "prediction_dataframe.to_csv('prediction.csv', index=False)\n", "prediction_dataframe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Port to JS" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from sklearn_porter import Porter" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Load the classifier\n", "mlb, classifier = joblib.load('offClassifier.pkl')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "The given model 'Pipeline(memory=None,\n steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n dtype=, encoding='utf-8', input='content',\n lowercase=True, max_df=1.0, max_features=None, min_df=1,\n ngram_range=(1, 1), preprocessor=None, stop_words=None,\n ...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0),\n n_jobs=1))])' isn't supported.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mporter\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPorter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclassifier\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlanguage\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'javascript'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mporter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexport\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/share/virtualenvs/machine_learning/lib/python3.6/site-packages/sklearn_porter/Porter.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, model, language, method, **kwargs)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0merror\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"The given model '{model}' isn't\"\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0;34m\" supported.\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__dict__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 77\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merror\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;31m# Import model class:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: The given model 'Pipeline(memory=None,\n steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n dtype=, encoding='utf-8', input='content',\n lowercase=True, max_df=1.0, max_features=None, min_df=1,\n ngram_range=(1, 1), preprocessor=None, stop_words=None,\n ...lti_class='ovr', 
penalty='l2', random_state=None, tol=0.0001,\n verbose=0),\n n_jobs=1))])' isn't supported." ] } ], "source": [ "porter = Porter(classifier, language='javascript')\n", "output = porter.export()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline\n" ] } ], "source": [ "# Show the concrete class of the unpickled model; the ValueError raised by Porter\n", "# above names this same object as the unsupported model.\n", "algorithm_name = type(classifier).__name__  # __name__ is already a str\n", "print(algorithm_name)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }