# Add the jyquickhelper notebook menu to this notebook
# (presumably a navigation menu built from the section headings -- confirm).
from jyquickhelper import add_notebook_menu
add_notebook_menu()


# IPython magic (not plain Python): select matplotlib's inline backend so
# figures are rendered directly in the notebook output.
%matplotlib inline


from papierstat.datasets import load_wines_dataset

# Load the wine table, then separate the target from the predictors.
df = load_wines_dataset()

# Target: the quality score of each wine.
y = df['quality']

# Predictors: every column except the target and the 'color' label.
X = df.drop(columns=['quality', 'color'])

# Preview the first rows of the raw table.
df.head()


from sklearn.model_selection import train_test_split

# test_size=0.9 keeps only 10% of the rows for training
# (presumably to keep the LIME / SHAP explainers fast -- confirm).
# The split is seeded so the score and the explanations shown further down
# can be reproduced from one run to the next, in line with the
# random_state=0 already used for the partial dependence plots.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.9, random_state=0)


from sklearn.ensemble import HistGradientBoostingRegressor

# Gradient boosting regressor limited to 10 boosting iterations.
# NOTE: the variable is called `rf` although the model is not a random
# forest; the name is kept because the following cells rely on it.
rf = HistGradientBoostingRegressor(max_iter=10)

# Fit on the training split; as the last expression of the cell, the fitted
# estimator is echoed in the notebook output.
rf.fit(X=X_train, y=y_train)

HistGradientBoostingRegressor(max_iter=10)


from sklearn.metrics import r2_score

# Coefficient of determination of the model on the held-out split.
r2_score(y_true=y_test, y_pred=rf.predict(X_test))

0.24727950503706164


# Sanity check: the training matrix has as many columns as the model
# recorded feature names.
X_train.shape, rf.feature_names_in_.size

((649, 11), 11)


# Feature names recorded by the estimator when it was fitted.
rf.feature_names_in_

array(['fixed_acidity', 'volatile_acidity', 'citric_acid',
       'residual_sugar', 'chlorides', 'free_sulfur_dioxide',
       'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
      dtype=object)


import matplotlib.pyplot as plt
from sklearn.inspection import (
    partial_dependence, PartialDependenceDisplay)

# One panel per feature the model was fitted on.
features = list(rf.feature_names_in_)

fig, ax = plt.subplots(4, 3, figsize=(14, 8))
axes = ax.ravel()

# The grid has more cells than there are features: switch off the cells
# that receive no plot so no empty frame is drawn.
for unused in axes[len(features):]:
    unused.set_axis_off()

# kind="both" draws the individual curves and their average; the individual
# curves are drawn for a subsample of 50 rows, seeded with random_state=0.
# NOTE(review): in a notebook, the name `display` shadows IPython's built-in
# display() for the following cells -- kept as is for compatibility.
display = PartialDependenceDisplay.from_estimator(
    rf, X_train, features, kind="both",
    subsample=50, n_jobs=3, grid_resolution=20,
    random_state=0, ax=axes[:len(features)])


import warnings
from interpret import show
from interpret.blackbox import LimeTabular

# Local LIME explanations for the first five rows of the held-out split.
# All warnings raised while building and running the explainer are silenced.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    lime = LimeTabular(
        predict_fn=rf.predict,
        data=X_train,
        feature_names=list(rf.feature_names_in_),
    )
    lime_local = lime.explain_local(X_test.head(5), y_test.head(5))

# Display the explanations with interpret's show().
show(lime_local)


import warnings
from interpret.blackbox import ShapKernel

# Local SHAP (kernel) explanations for the first five rows of the held-out
# split. While the explainer runs, scikit-learn repeatedly warns that
# "X does not have valid feature names"; these warnings are silenced the same
# way the LIME cell does, so the cell output stays readable.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    shap = ShapKernel(predict_fn=rf.predict, data=X_train,
                      feature_names=list(rf.feature_names_in_))
    shap_local = shap.explain_local(X_test[:5], y_test[:5])

X does not have valid feature names, but HistGradientBoostingRegressor was fitted with feature names
X does not have valid feature names, but HistGradientBoostingRegressor was fitted with feature names
Using 1299 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.

  0%|          | 0/5 [00:00<?, ?it/s]

X does not have valid feature names, but HistGradientBoostingRegressor was fitted with feature names
X does not have valid feature names, but HistGradientBoostingRegressor was fitted with feature names
X does not have valid feature names, but HistGradientBoostingRegressor was fitted with feature names
X does not have valid feature names, but HistGradientBoostingRegressor was fitted with feature names
X does not have valid feature names, but HistGradientBoostingRegressor was fitted with feature names
X does not have valid feature names, but HistGradientBoostingRegressor was fitted with feature names
X does not have valid feature names, but HistGradientBoostingRegressor was fitted with feature names
X does not have valid feature names, but HistGradientBoostingRegressor was fitted with feature names
X does not have valid feature names, but HistGradientBoostingRegressor was fitted with feature names
X does not have valid feature names, but HistGradientBoostingRegressor was fitted with feature names
X does not have valid feature names, but HistGradientBoostingRegressor was fitted with feature names


# Display the SHAP explanations computed in the previous cell.
show(shap_local)

	fixed_acidity	volatile_acidity	citric_acid	residual_sugar	chlorides	free_sulfur_dioxide	total_sulfur_dioxide	density	pH	sulphates	alcohol	quality	color
0	7.4	0.70	0.00	1.9	0.076	11.0	34.0	0.9978	3.51	0.56	9.4	5	red
1	7.8	0.88	0.00	2.6	0.098	25.0	67.0	0.9968	3.20	0.68	9.8	5	red
2	7.8	0.76	0.04	2.3	0.092	15.0	54.0	0.9970	3.26	0.65	9.8	5	red
3	11.2	0.28	0.56	1.9	0.075	17.0	60.0	0.9980	3.16	0.58	9.8	6	red
4	7.4	0.70	0.00	1.9	0.076	11.0	34.0	0.9978	3.51	0.56	9.4	5	red

Interprétation de la note d'un vin¶

Un aperçu des données¶

Partial dependence¶

LIME¶

Shap¶