from jyquickhelper import add_notebook_menu
# Notebook-level menu call (third-party jyquickhelper helper); presumably renders
# a table of contents down to heading level 3 -- behaviour not visible in this file.
add_notebook_menu(last_level=3)


%matplotlib inline
import matplotlib.pyplot as plt


# Sub-menu with id "reg" restricted to heading levels 3-4; keep_item=0 presumably
# selects which top-level section is kept -- TODO confirm against jyquickhelper docs.
add_notebook_menu(menu_id="reg", first_level=3, last_level=4, keep_item=0)


# Synthetic regression sample: Y = X^2 plus Gaussian noise scaled down by 10.
# "import numpy.random" also binds the name "numpy" used by later cells.
import numpy.random
X = numpy.random.random(size=(100, 1))
# noise is drawn after X so the random stream is consumed in the same order
xx = numpy.random.normal(size=(100, 1)) / 10
Y = xx + X * X


# Scatter plot of the raw sample (points only, no connecting line).
fig, ax = plt.subplots(nrows=1, ncols=1)
ax.plot(X, Y, ".", label="data")
ax.legend()
# trailing ";" keeps the notebook from echoing the returned Text object
ax.set_title("Courbe façon nuage de points");


# model 1: linear regression fitted on the full sample
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and training can be chained
clr = LinearRegression().fit(X, Y)
# predictions on the training points themselves
Yp = clr.predict(X)


# model 2: regression tree limited to depth 4, fitted on the same sample
from sklearn.tree import DecisionTreeRegressor
# fit() returns the estimator itself, so construction and training can be chained
clr2 = DecisionTreeRegressor(max_depth=4).fit(X, Y)
# predictions on the training points themselves
Yp2 = clr2.predict(X)


# Data and both models' predictions against X on one set of axes.
fig, ax = plt.subplots()
# (values, matplotlib format string, legend label), drawn in this order
series = [(Y, ".", "data"), (Yp, ".", "model 1"), (Yp2, "g.", "model 2")]
for values, fmt, name in series:
    ax.plot(X, values, fmt, label=name)
ax.legend()
ax.set_title("Régressions linéaires");


# Predicted value against expected value for both regression models.
fig, ax = plt.subplots()
for predicted, fmt, name in ((Yp, ".", "model 1"), (Yp2, "g.", "model 2")):
    ax.plot(Y, predicted, fmt, label=name)
# dashed diagonal spanning the range of Y: points on it are predicted exactly
mm = [Y.min(), Y.max()]
ax.plot(mm, mm, "--")
ax.set_xlabel("Y attendu")
ax.set_ylabel("Y prédit")
ax.legend()
ax.set_title("Régression linéaire");


# Residuals of each model, flattened to 1-D and sorted in increasing order.
residuals_1 = Y - Yp
err1 = numpy.sort(residuals_1.reshape((len(Y),)))
# Yp2 is reshaped to a column first so the subtraction is element-wise
residuals_2 = Y - Yp2.reshape((len(Y), 1))
err2 = numpy.sort(residuals_2.reshape((len(Y),)))


# Error curves of both regression models (err1 / err2) on the same axes.
fig, ax = plt.subplots()
ax.plot(err1, label="model 1")
ax.plot(err2, label="model 2")
ax.set_xlabel("observations")
ax.set_ylabel("erreur")
# The curves carry labels; without legend() they are never displayed and the
# two models cannot be told apart on the figure.
ax.legend()
ax.set_title("Répartition des erreurs");


# Sub-menu with id "classif" restricted to heading levels 3-4; keep_item=1 presumably
# selects which top-level section is kept -- TODO confirm against jyquickhelper docs.
add_notebook_menu(menu_id="classif", first_level=3, last_level=4, keep_item=1)


from pyquickhelper.helpgen import NbImage
# Displays the local file "dt.png" as the cell output, 600 wide
# (presumably pixels -- NbImage's signature is not visible here).
NbImage("dt.png", width=600)


# Synthetic binary classification sample: 2-D Gaussian points with random
# 0/1 labels; the points labelled 1 are shifted by 1.2 along both coordinates.
import numpy.random
X = numpy.random.normal(size=(100, 2))
Y = numpy.random.randint(2, size=(100,))
# X has exactly two columns, so shifting whole rows moves both coordinates
X[Y == 1] += 1.2


# Scatter plot of the sample, one colour per class (default colour cycle).
# The figure handle was misspelled "fix"; it is named "fig" like in the other cells.
fig, ax = plt.subplots()
for label in (0, 1):
    members = Y == label
    ax.plot(X[members, 0], X[members, 1], "o")
ax.set_title("Nuage de points avec deux classes");


# model 1: logistic regression with default settings, fitted on the full sample
from sklearn.linear_model import LogisticRegression
# fit() returns the estimator itself, so construction and training can be chained
clf = LogisticRegression().fit(X, Y)
# predicted class labels on the training points
Yp = clf.predict(X)


# model 2: decision tree limited to depth 3, fitted on the same sample
from sklearn.tree import DecisionTreeClassifier
# fit() returns the estimator itself, so construction and training can be chained
clf2 = DecisionTreeClassifier(max_depth=3).fit(X, Y)
# predicted class labels on the training points
Yp2 = clf2.predict(X)


# True classes (large circles) with each model's predicted class drawn next to
# every point as a small square: model 1 shifted right, model 2 shifted left,
# both shifted up. Red = class 0, blue = class 1.
fig, ax = plt.subplots()
class_colors = ((0, "r"), (1, "b"))
for y_hat, shift, tag in ((Yp, 0.1, "model 1"), (Yp2, -0.1, "model 2")):
    for k, col in class_colors:
        ax.plot(X[y_hat == k, 0] + shift, X[y_hat == k, 1] + 0.1, col + "s",
                label="cl%d %s" % (k, tag), ms=3)
for k, col in class_colors:
    ax.plot(X[Y == k, 0], X[Y == k, 1], col + "o", label="cl%d" % k)
ax.legend()
ax.set_title("Résultat de la classification binaire");


from sklearn.metrics import roc_curve, auc

# probabilities returned by model 1 for every observation
Ypr = clf.predict_proba(X)


# column 1 is used as the score; split it by true label
proba_cl1 = Ypr[:, 1]
score_pos = proba_cl1[Y == 1]
score_neg = proba_cl1[Y == 0]


import numpy
# mean score among true negatives, then among true positives (displayed as a tuple)
score_neg.mean(), score_pos.mean()

(0.36854464309856544, 0.7010128988674563)


# Overlaid histograms of the model-1 scores for true positives and true negatives.
f, ax = plt.subplots()
ax.hist(score_pos, label="score positifs", alpha=0.5)
ax.hist(score_neg, label="score négatifs", alpha=0.5)
ax.set_xlabel("score")
# NOTE(review): hist() is called without density/weights, so the bars look like
# raw counts rather than proportions -- confirm whether this label is intended.
ax.set_ylabel("proportion")
ax.legend()
# The histogram is drawn with matplotlib only; the former title mentioned
# seaborn, which this cell never uses.
ax.set_title("Distribution des scores");


from sklearn.metrics import roc_curve, auc

# model 1: ROC points computed from the score of class 1
Ypr = clf.predict_proba(X)
fpr, tpr, thresholds = roc_curve(y_true=Y, y_score=Ypr[:, 1])


# model 2: same computation with the decision tree
Ypr2 = clf2.predict_proba(X)
fpr2, tpr2, thresholds2 = roc_curve(y_true=Y, y_score=Ypr2[:, 1])


# area under each ROC curve, displayed as a tuple (model 1, model 2)
auc(fpr, tpr), auc(fpr2, tpr2)

(0.8337359098228664, 0.8836553945249597)


import pandas
# one row per ROC point of model 1, columns in the order seuil / fpr / tpr
df = pandas.DataFrame({"seuil": thresholds, "fpr": fpr, "tpr": tpr},
                      columns=["seuil", "fpr", "tpr"])
# show the first five and the last five rows
pandas.concat([df.iloc[:5], df.iloc[-5:]])


# ROC curves of both classifiers, with the AUC shown in each legend entry.
# The figure handle was misspelled "fix"; it is named "fig" like in the other cells.
fig, ax = plt.subplots()
curves = ((fpr, tpr, "model 1 auc=%0.2f"), (fpr2, tpr2, "model 2 auc=%0.2f"))
for rate_fp, rate_tp, pattern in curves:
    ax.plot(rate_fp, rate_tp, ".-", label=pattern % auc(rate_fp, rate_tp))
ax.set_xlabel("FPR = False Positive Rate")
ax.set_ylabel("TPR = True Positive Rate")
ax.legend()
ax.set_title("Courbe ROC");


# scatter plot of the sample drawn over a colour map of model 1's score
from matplotlib.colors import ListedColormap
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
cm = plt.cm.RdBu

fig, ax = plt.subplots()

# grid: step h, covering the data range widened by 0.5 on every side
h = .02
lower = X.min(axis=0) - .5
upper = X.max(axis=0) + .5
x_min, y_min = lower[0], lower[1]
x_max, y_max = upper[0], upper[1]
xx, yy = numpy.meshgrid(numpy.arange(x_min, x_max, h),
                        numpy.arange(y_min, y_max, h))

# score of every grid node: decision_function when the model exposes one,
# otherwise the second column of predict_proba
grid = numpy.c_[xx.ravel(), yy.ravel()]
scorer = getattr(clf, "decision_function", None)
if scorer is None:
    Z = clf.predict_proba(grid)[:, 1]
else:
    Z = scorer(grid)

# back to the grid shape expected by contourf
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=cm_bright)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_title("Utilisation de la couleur pour\nreprésenter les scores");


# scatter plot of the sample drawn over a colour map of model 2's score
from matplotlib.colors import ListedColormap
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
cm = plt.cm.RdBu

fig, ax = plt.subplots()

# grid: step h, covering the data range widened by 0.5 on every side
h = .02
first_axis, second_axis = X[:, 0], X[:, 1]
x_min = first_axis.min() - .5
x_max = first_axis.max() + .5
y_min = second_axis.min() - .5
y_max = second_axis.max() + .5
xx, yy = numpy.meshgrid(numpy.arange(x_min, x_max, h),
                        numpy.arange(y_min, y_max, h))

# score of every grid node: predict_proba's second column unless the model
# exposes a decision_function
nodes = numpy.c_[xx.ravel(), yy.ravel()]
if not hasattr(clf2, "decision_function"):
    Z = clf2.predict_proba(nodes)[:, 1]
else:
    Z = clf2.decision_function(nodes)

# back to the grid shape expected by contourf
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=cm_bright)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_title("Couleur, score et arbres");


from sklearn.metrics import confusion_matrix
# confusion matrix of model 1: true labels against its predicted labels
confusion_matrix(y_true=Y, y_pred=Yp)

array([[31, 15],
       [ 7, 47]], dtype=int64)


# confusion matrix of model 2: true labels against its predicted labels
confusion_matrix(y_true=Y, y_pred=Yp2)

array([[32, 14],
       [ 4, 50]], dtype=int64)


# Sub-menu with id "rank" restricted to heading levels 3-4; keep_item=3 presumably
# selects which top-level section is kept -- TODO confirm against jyquickhelper docs.
add_notebook_menu(menu_id="rank", first_level=3, last_level=4, keep_item=3)


from pyquickhelper.helpgen import NbImage
# Displays a local PNG as the cell output; judging by the file name it is
# presumably a clustering comparison figure -- the image itself is not in this file.
NbImage("sphx_glr_plot_cluster_comparison_001.png")


# Presumably a PCA illustration on the iris data (inferred from the file name only).
NbImage("sphx_glr_plot_pca_iris_001.png")


# Presumably a comparison of several methods (inferred from the file name only).
NbImage("sphx_glr_plot_compare_methods_001.png")


# NbImage is imported again so this cell can run on its own.
from pyquickhelper.helpgen import NbImage
NbImage("pipeline.png")


from sklearn.datasets import load_iris
# iris dataset: X receives the feature matrix, Y the class labels
data = load_iris()
X, Y = data.data, data.target


from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# two named steps chained in order: PCA ("acp"), then logistic regression ("logreg")
steps = [("acp", PCA()), ("logreg", LogisticRegression())]
pipe = Pipeline(steps=steps)
# kept as the last expression so the notebook displays the fitted pipeline
pipe.fit(X, Y)

Pipeline(memory=None,
     steps=[('acp', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logreg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


# Predictions of the fitted pipeline on the data it was trained on.
pred = pipe.predict(X)
# Last expression of the cell: displays the first five predictions.
pred[:5]

array([0, 0, 0, 0, 0])

	seuil	fpr	tpr
0	0.994020	0.000000	0.018519
1	0.878538	0.000000	0.203704
2	0.876186	0.021739	0.203704
3	0.844416	0.021739	0.277778
4	0.837452	0.043478	0.277778
28	0.166698	0.739130	0.962963
29	0.143926	0.739130	0.981481
30	0.111130	0.782609	0.981481
31	0.110802	0.782609	1.000000
32	0.032959	1.000000	1.000000

2A.ml - Problèmes classiques de machine learning illustrés¶

Supervisés¶

Régression¶

données¶

sortie¶

optimisation¶

évaluation¶

exemple¶

graphe erreur XY¶

Classification binaire¶

données¶

sortie¶

optimisation¶

évaluation¶

exemple¶

distribution des scores¶

graphe erreur ROC¶

graphe frontière¶

matrice de confusion¶

Plusieurs dimensions¶

Régression¶

Classification¶

Ranking¶

données¶

sortie¶

optimisation¶

évaluation¶

Non supervisés¶

Clustering¶

Réduction de dimensions¶

Ridge / Lasso / Régularisation¶

Learner / Transformer / Pipeline¶

Exercice 1 : classifier avec plusieurs classes¶

Exercice 2 : grand nombre de classes ?¶

Exercice 3 : ranking¶