{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# 2A.ml - Classification binaire avec features textuelles\n", "\n", "Ce notebook montre comment incorporer des features textuelles et mesurer l'am\u00e9lioration des performances qui en r\u00e9sulte sur une classification binaire."]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", ""], "text/plain": ["\n", " | code | \n", "url | \n", "creator | \n", "created_t | \n", "created_datetime | \n", "last_modified_t | \n", "last_modified_datetime | \n", "product_name | \n", "generic_name | \n", "quantity | \n", "... | \n", "collagen-meat-protein-ratio_100g | \n", "cocoa_100g | \n", "chlorophyl_100g | \n", "carbon-footprint_100g | \n", "nutrition-score-fr_100g | \n", "nutrition-score-uk_100g | \n", "glycemic-index_100g | \n", "water-hardness_100g | \n", "hasE | \n", "s100 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1.008255e+10 | \n", "http://world-fr.openfoodfacts.org/produit/0010... | \n", "usda-ndb-import | \n", "1489064583 | \n", "2017-03-09T13:03:03Z | \n", "1489064583 | \n", "2017-03-09T13:03:03Z | \n", "Golden Island, Pork Jerky, Grilled Barbecue | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "23.0 | \n", "23.0 | \n", "NaN | \n", "NaN | \n", "False | \n", "17.0 | \n", "
1 | \n", "1.182204e+10 | \n", "http://world-fr.openfoodfacts.org/produit/0011... | \n", "usda-ndb-import | \n", "1489070197 | \n", "2017-03-09T14:36:37Z | \n", "1489070197 | \n", "2017-03-09T14:36:37Z | \n", "Big Fizz, Soda, Orange | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "True | \n", "7.0 | \n", "
2 | \n", "2.548401e+10 | \n", "http://world-fr.openfoodfacts.org/produit/0025... | \n", "usda-ndb-import | \n", "1489052024 | \n", "2017-03-09T09:33:44Z | \n", "1489052024 | \n", "2017-03-09T09:33:44Z | \n", "Tofubaked Marinated Baked Tofu, Sesame Ginger | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "-2.0 | \n", "-2.0 | \n", "NaN | \n", "NaN | \n", "True | \n", "17.0 | \n", "
3 | \n", "1.229250e+10 | \n", "http://world-fr.openfoodfacts.org/produit/0012... | \n", "usda-ndb-import | \n", "1489133493 | \n", "2017-03-10T08:11:33Z | \n", "1489133493 | \n", "2017-03-10T08:11:33Z | \n", "Milk Chocolate Eggs | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "23.0 | \n", "23.0 | \n", "NaN | \n", "NaN | \n", "True | \n", "17.0 | \n", "
4 | \n", "1.115054e+10 | \n", "http://world-fr.openfoodfacts.org/produit/0011... | \n", "usda-ndb-import | \n", "1489052892 | \n", "2017-03-09T09:48:12Z | \n", "1489052892 | \n", "2017-03-09T09:48:12Z | \n", "Fresh Polish Sausage | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "22.0 | \n", "22.0 | \n", "NaN | \n", "NaN | \n", "True | \n", "17.0 | \n", "
5 rows \u00d7 165 columns
\n", "\n", " | 0 | \n", "1 | \n", "
---|---|---|
code | \n", "1.00826e+10 | \n", "1.1822e+10 | \n", "
url | \n", "http://world-fr.openfoodfacts.org/produit/0010... | \n", "http://world-fr.openfoodfacts.org/produit/0011... | \n", "
creator | \n", "usda-ndb-import | \n", "usda-ndb-import | \n", "
created_t | \n", "1489064583 | \n", "1489070197 | \n", "
created_datetime | \n", "2017-03-09T13:03:03Z | \n", "2017-03-09T14:36:37Z | \n", "
last_modified_t | \n", "1489064583 | \n", "1489070197 | \n", "
last_modified_datetime | \n", "2017-03-09T13:03:03Z | \n", "2017-03-09T14:36:37Z | \n", "
product_name | \n", "Golden Island, Pork Jerky, Grilled Barbecue | \n", "Big Fizz, Soda, Orange | \n", "
generic_name | \n", "NaN | \n", "NaN | \n", "
quantity | \n", "NaN | \n", "NaN | \n", "
packaging | \n", "NaN | \n", "NaN | \n", "
packaging_tags | \n", "NaN | \n", "NaN | \n", "
brands | \n", "Golden Island Jerky Inc. | \n", "Rite Aid Corporation | \n", "
brands_tags | \n", "golden-island-jerky-inc | \n", "rite-aid-corporation | \n", "
categories | \n", "NaN | \n", "NaN | \n", "
categories_tags | \n", "NaN | \n", "NaN | \n", "
categories_fr | \n", "NaN | \n", "NaN | \n", "
origins | \n", "NaN | \n", "NaN | \n", "
origins_tags | \n", "NaN | \n", "NaN | \n", "
manufacturing_places | \n", "NaN | \n", "NaN | \n", "
manufacturing_places_tags | \n", "NaN | \n", "NaN | \n", "
labels | \n", "NaN | \n", "NaN | \n", "
labels_tags | \n", "NaN | \n", "NaN | \n", "
labels_fr | \n", "NaN | \n", "NaN | \n", "
emb_codes | \n", "NaN | \n", "NaN | \n", "
emb_codes_tags | \n", "NaN | \n", "NaN | \n", "
first_packaging_code_geo | \n", "NaN | \n", "NaN | \n", "
cities | \n", "NaN | \n", "NaN | \n", "
cities_tags | \n", "NaN | \n", "NaN | \n", "
purchase_places | \n", "NaN | \n", "NaN | \n", "
stores | \n", "NaN | \n", "NaN | \n", "
countries | \n", "US | \n", "US | \n", "
countries_tags | \n", "en:united-states | \n", "en:united-states | \n", "
countries_fr | \n", "\u00c9tats-Unis | \n", "\u00c9tats-Unis | \n", "
ingredients_text | \n", "Pork, sugar, water, brown sugar, gluten free s... | \n", "Carbonated water, high fructose corn syrup, ci... | \n", "
allergens | \n", "NaN | \n", "NaN | \n", "
allergens_fr | \n", "NaN | \n", "NaN | \n", "
traces | \n", "NaN | \n", "NaN | \n", "
traces_tags | \n", "NaN | \n", "NaN | \n", "
traces_fr | \n", "NaN | \n", "NaN | \n", "
serving_size | \n", "28 g (1 oz) | \n", "240 ml (8 fl oz) | \n", "
no_nutriments | \n", "NaN | \n", "NaN | \n", "
additives_n | \n", "0 | \n", "6 | \n", "
additives | \n", "en:2-or-less;en:brown-sugar;en:contain-rice;en... | \n", "en:and-brominated-vegetable-oil;en:carbonated-... | \n", "
additives_tags | \n", "NaN | \n", "en:e110,en:e211,en:e330,en:e414,en:e443,en:e445 | \n", "
additives_fr | \n", "NaN | \n", "E110 - Jaune orang\u00e9 S,E211 - Benzoate de sodiu... | \n", "
ingredients_from_palm_oil_n | \n", "0 | \n", "0 | \n", "
ingredients_from_palm_oil | \n", "NaN | \n", "NaN | \n", "
ingredients_from_palm_oil_tags | \n", "NaN | \n", "NaN | \n", "
ingredients_that_may_be_from_palm_oil_n | \n", "0 | \n", "0 | \n", "
\n", " | 0 | \n", "1 | \n", "
---|---|---|
ingredients_that_may_be_from_palm_oil | \n", "NaN | \n", "NaN | \n", "
ingredients_that_may_be_from_palm_oil_tags | \n", "NaN | \n", "NaN | \n", "
nutrition_grade_uk | \n", "NaN | \n", "NaN | \n", "
nutrition_grade_fr | \n", "e | \n", "NaN | \n", "
pnns_groups_1 | \n", "NaN | \n", "NaN | \n", "
pnns_groups_2 | \n", "NaN | \n", "NaN | \n", "
states | \n", "en:to-be-completed, en:nutrition-facts-complet... | \n", "en:to-be-completed, en:nutrition-facts-complet... | \n", "
states_tags | \n", "en:to-be-completed,en:nutrition-facts-complete... | \n", "en:to-be-completed,en:nutrition-facts-complete... | \n", "
states_fr | \n", "A compl\u00e9ter,Informations nutritionnelles compl... | \n", "A compl\u00e9ter,Informations nutritionnelles compl... | \n", "
main_category | \n", "NaN | \n", "NaN | \n", "
main_category_fr | \n", "NaN | \n", "NaN | \n", "
image_url | \n", "NaN | \n", "NaN | \n", "
image_small_url | \n", "NaN | \n", "NaN | \n", "
energy_100g | \n", "1494 | \n", "226 | \n", "
energy-from-fat_100g | \n", "NaN | \n", "NaN | \n", "
fat_100g | \n", "7.14 | \n", "0 | \n", "
saturated-fat_100g | \n", "1.79 | \n", "NaN | \n", "
butyric-acid_100g | \n", "NaN | \n", "NaN | \n", "
caproic-acid_100g | \n", "NaN | \n", "NaN | \n", "
caprylic-acid_100g | \n", "NaN | \n", "NaN | \n", "
capric-acid_100g | \n", "NaN | \n", "NaN | \n", "
lauric-acid_100g | \n", "NaN | \n", "NaN | \n", "
myristic-acid_100g | \n", "NaN | \n", "NaN | \n", "
palmitic-acid_100g | \n", "NaN | \n", "NaN | \n", "
stearic-acid_100g | \n", "NaN | \n", "NaN | \n", "
arachidic-acid_100g | \n", "NaN | \n", "NaN | \n", "
behenic-acid_100g | \n", "NaN | \n", "NaN | \n", "
lignoceric-acid_100g | \n", "NaN | \n", "NaN | \n", "
cerotic-acid_100g | \n", "NaN | \n", "NaN | \n", "
montanic-acid_100g | \n", "NaN | \n", "NaN | \n", "
melissic-acid_100g | \n", "NaN | \n", "NaN | \n", "
monounsaturated-fat_100g | \n", "NaN | \n", "NaN | \n", "
polyunsaturated-fat_100g | \n", "NaN | \n", "NaN | \n", "
omega-3-fat_100g | \n", "NaN | \n", "NaN | \n", "
alpha-linolenic-acid_100g | \n", "NaN | \n", "NaN | \n", "
eicosapentaenoic-acid_100g | \n", "NaN | \n", "NaN | \n", "
docosahexaenoic-acid_100g | \n", "NaN | \n", "NaN | \n", "
omega-6-fat_100g | \n", "NaN | \n", "NaN | \n", "
linoleic-acid_100g | \n", "NaN | \n", "NaN | \n", "
arachidonic-acid_100g | \n", "NaN | \n", "NaN | \n", "
gamma-linolenic-acid_100g | \n", "NaN | \n", "NaN | \n", "
dihomo-gamma-linolenic-acid_100g | \n", "NaN | \n", "NaN | \n", "
omega-9-fat_100g | \n", "NaN | \n", "NaN | \n", "
oleic-acid_100g | \n", "NaN | \n", "NaN | \n", "
elaidic-acid_100g | \n", "NaN | \n", "NaN | \n", "
gondoic-acid_100g | \n", "NaN | \n", "NaN | \n", "
mead-acid_100g | \n", "NaN | \n", "NaN | \n", "
erucic-acid_100g | \n", "NaN | \n", "NaN | \n", "
nervonic-acid_100g | \n", "NaN | \n", "NaN | \n", "
trans-fat_100g | \n", "0 | \n", "NaN | \n", "
\n", " | 0 | \n", "1 | \n", "
---|---|---|
cholesterol_100g | \n", "0.089 | \n", "NaN | \n", "
carbohydrates_100g | \n", "39.29 | \n", "13.75 | \n", "
sugars_100g | \n", "39.29 | \n", "13.75 | \n", "
sucrose_100g | \n", "NaN | \n", "NaN | \n", "
glucose_100g | \n", "NaN | \n", "NaN | \n", "
fructose_100g | \n", "NaN | \n", "NaN | \n", "
lactose_100g | \n", "NaN | \n", "NaN | \n", "
maltose_100g | \n", "NaN | \n", "NaN | \n", "
maltodextrins_100g | \n", "NaN | \n", "NaN | \n", "
starch_100g | \n", "NaN | \n", "NaN | \n", "
polyols_100g | \n", "NaN | \n", "NaN | \n", "
fiber_100g | \n", "0 | \n", "NaN | \n", "
proteins_100g | \n", "28.57 | \n", "0 | \n", "
casein_100g | \n", "NaN | \n", "NaN | \n", "
serum-proteins_100g | \n", "NaN | \n", "NaN | \n", "
nucleotides_100g | \n", "NaN | \n", "NaN | \n", "
salt_100g | \n", "2.81178 | \n", "0.04826 | \n", "
sodium_100g | \n", "1.107 | \n", "0.019 | \n", "
alcohol_100g | \n", "NaN | \n", "NaN | \n", "
vitamin-a_100g | \n", "0 | \n", "NaN | \n", "
beta-carotene_100g | \n", "NaN | \n", "NaN | \n", "
vitamin-d_100g | \n", "NaN | \n", "NaN | \n", "
vitamin-e_100g | \n", "NaN | \n", "NaN | \n", "
vitamin-k_100g | \n", "NaN | \n", "NaN | \n", "
vitamin-c_100g | \n", "0 | \n", "NaN | \n", "
vitamin-b1_100g | \n", "NaN | \n", "NaN | \n", "
vitamin-b2_100g | \n", "NaN | \n", "NaN | \n", "
vitamin-pp_100g | \n", "NaN | \n", "NaN | \n", "
vitamin-b6_100g | \n", "NaN | \n", "NaN | \n", "
vitamin-b9_100g | \n", "NaN | \n", "NaN | \n", "
folates_100g | \n", "NaN | \n", "NaN | \n", "
vitamin-b12_100g | \n", "NaN | \n", "NaN | \n", "
biotin_100g | \n", "NaN | \n", "NaN | \n", "
pantothenic-acid_100g | \n", "NaN | \n", "NaN | \n", "
silica_100g | \n", "NaN | \n", "NaN | \n", "
bicarbonate_100g | \n", "NaN | \n", "NaN | \n", "
potassium_100g | \n", "NaN | \n", "NaN | \n", "
chloride_100g | \n", "NaN | \n", "NaN | \n", "
calcium_100g | \n", "0 | \n", "NaN | \n", "
phosphorus_100g | \n", "NaN | \n", "NaN | \n", "
iron_100g | \n", "0.00257 | \n", "NaN | \n", "
magnesium_100g | \n", "NaN | \n", "NaN | \n", "
zinc_100g | \n", "NaN | \n", "NaN | \n", "
copper_100g | \n", "NaN | \n", "NaN | \n", "
manganese_100g | \n", "NaN | \n", "NaN | \n", "
fluoride_100g | \n", "NaN | \n", "NaN | \n", "
selenium_100g | \n", "NaN | \n", "NaN | \n", "
chromium_100g | \n", "NaN | \n", "NaN | \n", "
molybdenum_100g | \n", "NaN | \n", "NaN | \n", "
iodine_100g | \n", "NaN | \n", "NaN | \n", "
\n", " | 0 | \n", "1 | \n", "
---|---|---|
caffeine_100g | \n", "NaN | \n", "NaN | \n", "
taurine_100g | \n", "NaN | \n", "NaN | \n", "
ph_100g | \n", "NaN | \n", "NaN | \n", "
fruits-vegetables-nuts_100g | \n", "NaN | \n", "NaN | \n", "
fruits-vegetables-nuts-estimate_100g | \n", "NaN | \n", "NaN | \n", "
collagen-meat-protein-ratio_100g | \n", "NaN | \n", "NaN | \n", "
cocoa_100g | \n", "NaN | \n", "NaN | \n", "
chlorophyl_100g | \n", "NaN | \n", "NaN | \n", "
carbon-footprint_100g | \n", "NaN | \n", "NaN | \n", "
nutrition-score-fr_100g | \n", "23 | \n", "NaN | \n", "
nutrition-score-uk_100g | \n", "23 | \n", "NaN | \n", "
glycemic-index_100g | \n", "NaN | \n", "NaN | \n", "
water-hardness_100g | \n", "NaN | \n", "NaN | \n", "
hasE | \n", "False | \n", "True | \n", "
s100 | \n", "17 | \n", "7 | \n", "