2016 - Une solution à la compétition de machine learning 2A#

Links: notebook, html, python, slides, GitHub

Ce notebook a été proposé par un étudiant pour la compétition organisée pour ce cours : classification binaire.

from pyensae.datasource import download_data
['ensae_competition_test_X.txt', 'ensae_competition_train.txt']
# packages
import pandas as pd
import numpy as np
from sklearn import svm, linear_model, datasets, metrics
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.nonparametric import smoothers_lowess
# dataframe
# df = pd.read_excel("default_of_credit_card_clients.xls", header=[0, 1], encoding="utf8", index_col=0, engine='openpyxl')
df = pd.read_csv("ensae_competition_train.txt", header=[0, 1], encoding="utf8", index_col=0, sep="\t")
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 ... X15 X16 X17 X18 X19 X20 X21 X22 X23 Y
0 180000 1 2 1 47 0 0 0 0 0 ... 99694 65977 67415 3700 3700 4100 2360 2500 2618 0
1 110000 2 2 1 35 0 0 0 0 0 ... 4869 4966 5070 1053 1073 1081 178 184 185 1
2 70000 2 2 2 22 0 0 0 0 0 ... 69927 50579 49483 2501 3001 2608 1777 1792 1793 1
3 200000 2 1 2 27 -2 -2 -2 -2 -2 ... 1665 3370 -36 5610 15616 1673 3385 0 95456 0
4 370000 2 1 1 39 0 0 0 0 0 ... 48216 47675 48074 2157 2000 1668 2000 3000 1000 0
5 260000 2 1 1 29 0 0 0 -2 -2 ... 0 0 0 3090 0 0 0 0 141516 0
6 90000 2 1 1 43 -1 -1 2 -1 -1 ... 7660 21175 4009 4367 9 7660 21175 4009 7452 0
7 220000 2 1 1 43 -1 3 2 0 0 ... 1090 1090 0 167 0 0 0 0 0 1
8 50000 1 2 1 35 1 2 0 0 0 ... 21260 70 29575 0 2052 1800 0 29935 1200 1
9 50000 2 3 2 40 0 0 0 0 0 ... 8292 8465 8650 1271 1130 1000 307 325 436 0

10 rows × 24 columns

MultiIndex(levels=[['X1', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X2', 'X20', 'X21', 'X22', 'X23', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'Y'], ['AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'EDUCATION', 'LIMIT_BAL', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'SEX', 'default payment next month']],
           labels=[[0, 11, 16, 17, 18, 19, 20, 21, 22, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 23], [8, 22, 7, 9, 0, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 16, 17, 18, 19, 20, 21, 23]],
           names=[None, 'ID'])
# Retrait 2ème ligne header

df1 = df.copy()
df1.columns = df1.columns.droplevel(-1)
Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11',
       'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21',
       'X22', 'X23', 'Y'],
# statistiques descriptives

# paramètres des graphes
fig = plt.figure(figsize=(12, 6))
alpha=alpha_scatterplot = 0.2
alpha_bar_chart = 0.55

'''graphs - the history of past payment'''

# September 2005
plt.scatter(df1.Y, df1.X6, alpha=alpha_scatterplot)
# axe x
# axe y
plt.ylabel("Payment delay")
# grid - titre
plt.grid(b=True, which='major', axis='y')
plt.title("September 2005")

# August 2005
plt.scatter(df1.Y, df1.X7, alpha=alpha_scatterplot)
# axe x
# axe y
plt.ylabel("Payment delay")
# grid - titre
plt.grid(b=True, which='major', axis='y')
plt.title("August 2005")

# July 2005
plt.scatter(df1.Y, df1.X8, alpha=alpha_scatterplot)
# axe x
# axe y
plt.ylabel("Payment delay")
# grid - titre
plt.grid(b=True, which='major', axis='y')
plt.title("July 2005")

# May 2005
plt.scatter(df1.Y, df1.X9, alpha=alpha_scatterplot)
# axe x
# axe y
plt.ylabel("Payment delay")
# grid - titre
plt.grid(b=True, which='major', axis='y')
plt.title("May 2005")

# April 2005
plt.scatter(df1.Y, df1.X10, alpha=alpha_scatterplot)
# axe x
# axe y
plt.ylabel("Payment delay")
# grid - titre
plt.grid(b=True, which='major', axis='y')
plt.title("April 2005")

# March 2005
plt.scatter(df1.Y, df1.X11, alpha=alpha_scatterplot)
# axe x
# axe y
plt.ylabel("Payment delay")
# grid - titre
plt.grid(b=True, which='major', axis='y')
plt.title("March 2005")
<matplotlib.text.Text at 0x2a5062587f0>
fig = plt.figure(figsize=(12, 6))
alpha=alpha_scatterplot = 0.2
alpha_bar_chart = 0.55

'''Graphs - bill statement'''

# personnes pas en défaut de paiement
ax1 = plt.subplot2grid((3,6),(1,0), colspan=3)
# kernel density
df1.X12[df1.Y == 0].plot(kind='kde')
df1.X13[df1.Y == 0].plot(kind='kde')
df1.X14[df1.Y == 0].plot(kind='kde')
df1.X15[df1.Y == 0].plot(kind='kde')
df1.X16[df1.Y == 0].plot(kind='kde')
df1.X17[df1.Y == 0].plot(kind='kde')
# axes
plt.xlabel("Bill statement")
plt.title("People distribution, no default")
# limites
ax1.set_xlim(0, 200000)
# légende

# personnes en défaut de paiement
ax2 = plt.subplot2grid((3,6),(1,3), colspan=3)
# kernel density
df1.X12[df1.Y == 1].plot(kind='kde')
df1.X13[df1.Y == 1].plot(kind='kde')
df1.X14[df1.Y == 1].plot(kind='kde')
df1.X15[df1.Y == 1].plot(kind='kde')
df1.X16[df1.Y == 1].plot(kind='kde')
df1.X17[df1.Y == 1].plot(kind='kde')
# axes
plt.xlabel("Bill statement")
plt.title("People distribution, default")
# limites
ax2.set_xlim(0, 200000)
# légende

'''Graphs - amount of bill payed'''

# personnes pas en défaut de paiement
ax1 = plt.subplot2grid((3,6),(2,0), colspan=3)
# kernel density
df1.X18[df1.Y == 0].plot(kind='kde')
df1.X19[df1.Y == 0].plot(kind='kde')
df1.X20[df1.Y == 0].plot(kind='kde')
df1.X21[df1.Y == 0].plot(kind='kde')
df1.X22[df1.Y == 0].plot(kind='kde')
df1.X23[df1.Y == 0].plot(kind='kde')
# axes
plt.xlabel("Amount of bill payed")
plt.title("People distribution, no default")
# limites
ax1.set_xlim(0, 25000)
# légende

# personnes en défaut de paiement
ax2 = plt.subplot2grid((3,6),(2,3), colspan=3)
# kernel density
df1.X18[df1.Y == 1].plot(kind='kde')
df1.X19[df1.Y == 1].plot(kind='kde')
df1.X20[df1.Y == 1].plot(kind='kde')
df1.X21[df1.Y == 1].plot(kind='kde')
df1.X22[df1.Y == 1].plot(kind='kde')
df1.X23[df1.Y == 1].plot(kind='kde')
# axes
plt.xlabel("Amount of bill payed")
plt.title("People distribution, default")
# limites
ax2.set_xlim(0, 25000)
# légende
<matplotlib.legend.Legend at 0x2a5065c50b8>
# Matrice des corrélations

sns.set(context="paper", font="monospace")
corrmat = df1.corr()

# atplotlib figure
f, ax = plt.subplots(figsize=(12, 9))

# Draw the heatmap using seaborn
sns.heatmap(corrmat, vmax=.8, square=True)
<matplotlib.axes._subplots.AxesSubplot at 0x2a506461588>
# on modifie les colonnes (création de variables d'intérêt)

df1['TotalDelay'] = df1.X11 + 2*df1.X10 + 4*df1.X9 + 8*df1.X8 + 16*df1.X7 + 32*df1.X6
df1['TotalPayment'] = df1.X23 + 2*df1.X22 + 3*df1.X21 + 4*df1.X20 + 5*df1.X19 + 6*df1.X18
df1['PartMay'] = -(df1.X22 - df1.X17)/(df1.X17 + 1)
df1['PartJune'] = -(df1.X21 - df1.X16)/(df1.X16 + 1)
df1['PartJuly'] = -(df1.X20 - df1.X15)/(df1.X15 + 1)
df1['PartAugust'] = -(df1.X19 - df1.X14)/(df1.X14 + 1)
df1['PartSeptember'] = -(df1.X18 - df1.X13)/(df1.X13 + 1)
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 ... X22 X23 Y TotalDelay TotalPayment PartMay PartJune PartJuly PartAugust PartSeptember
0 180000 1 2 1 47 0 0 0 0 0 ... 2500 2618 0 0 71798 0.962902 0.964215 0.958865 0.961978 0.961112
1 110000 2 2 1 35 0 0 0 0 0 ... 184 185 1 0 17094 0.963518 0.963962 0.777823 0.721906 0.850305
2 70000 2 2 2 22 0 0 0 0 0 ... 1792 1793 1 0 51151 0.963766 0.964848 0.962690 0.956517 0.962942
3 200000 2 1 2 27 -2 -2 -2 -2 -2 ... 0 95456 0 -126 224043 1.028571 -0.004450 -0.004802 -0.004502 -0.009899
4 370000 2 1 1 39 0 0 0 0 0 ... 3000 1000 0 0 42614 0.937577 0.958029 0.965386 0.959245 0.968143
5 260000 2 1 1 29 0 0 0 -2 -2 ... 0 141516 0 -14 160056 0.000000 0.000000 0.000000 0.000000 0.942813
6 90000 2 1 1 43 -1 -1 2 -1 -1 ... 4009 7452 0 -39 135882 0.000000 0.000000 0.000000 0.997711 0.393333
7 220000 2 1 1 43 -1 3 2 0 0 ... 0 0 1 32 1002 0.000000 0.999083 0.999083 0.999083 0.866455
8 50000 1 2 1 35 1 2 0 0 0 ... 29935 1200 1 63 78530 -0.012172 0.985915 0.915291 0.956269 0.999979
9 50000 2 3 2 40 0 0 0 0 0 ... 325 436 0 0 19283 0.962316 0.963619 0.879296 0.847636 0.806216
10 130000 1 2 1 24 0 0 2 0 0 ... 0 780 0 16 38480 0.997442 0.999978 0.959249 0.999979 0.899013
11 200000 2 1 2 25 -1 -1 -1 -1 -1 ... 4970 8888 0 -63 82748 0.000000 0.000000 0.000000 0.000000 0.000000
12 230000 2 2 1 38 -2 -2 -2 -2 -2 ... 2132 2204 0 -126 111453 0.000000 0.000000 0.000000 0.000000 0.000000
13 90000 2 1 2 29 -2 -2 -2 -2 -2 ... 0 0 0 -126 0 1.004184 1.004184 1.004184 1.004184 1.004184
14 230000 1 3 2 37 -1 0 0 0 -1 ... 5003 3016 0 -34 287273 0.887873 -0.000340 0.769071 0.760863 0.952064
15 130000 1 2 2 33 2 2 -1 -1 -2 ... 0 0 0 78 3578 0.000000 0.000000 0.000000 0.000000 0.993003
16 90000 2 2 1 35 0 0 0 0 0 ... 4000 0 0 2 91108 0.954684 0.883510 0.952754 0.953646 0.952596
17 10000 2 2 1 37 -1 4 3 2 2 ... 0 36 0 70 3236 0.999550 0.848221 0.800478 0.999590 0.999652
18 80000 1 3 1 36 0 0 0 0 0 ... 3000 6200 0 0 73411 0.961214 0.926935 0.962670 0.963936 0.962893
19 320000 2 1 1 36 -1 2 0 0 0 ... 5000 11906 0 0 96906 0.792507 0.755381 0.703680 0.400943 0.999851

20 rows × 31 columns

# Matrice des corrélations

sns.set(context="paper", font="monospace")
corrmat = df1.corr()

# matplotlib figure
f, ax = plt.subplots(figsize=(12, 9))

# Draw the heatmap using seaborn
sns.heatmap(corrmat, vmax=.8, square=True)
<matplotlib.axes._subplots.AxesSubplot at 0x2a5067c70f0>
# drop some columns

df1 = df1.drop(['X'+str(n) for n in range(7,12)] + ['X'+str(n) for n in range(13,24)], axis=1)
X1 X2 X3 X4 X5 X6 X12 Y TotalDelay TotalPayment PartMay PartJune PartJuly PartAugust PartSeptember
0 180000 1 2 1 47 0 179253 0 0 71798 0.962902 0.964215 0.958865 0.961978 0.961112
1 110000 2 2 1 35 0 6137 1 0 17094 0.963518 0.963962 0.777823 0.721906 0.850305
2 70000 2 2 2 22 0 66505 1 0 51151 0.963766 0.964848 0.962690 0.956517 0.962942
3 200000 2 1 2 27 -2 4941 0 -126 224043 1.028571 -0.004450 -0.004802 -0.004502 -0.009899
4 370000 2 1 1 39 0 141552 0 0 42614 0.937577 0.958029 0.965386 0.959245 0.968143
5 260000 2 1 1 29 0 71864 0 -14 160056 0.000000 0.000000 0.000000 0.000000 0.942813
6 90000 2 1 1 43 -1 16139 0 -39 135882 0.000000 0.000000 0.000000 0.997711 0.393333
7 220000 2 1 1 43 -1 1090 1 32 1002 0.000000 0.999083 0.999083 0.999083 0.866455
8 50000 1 2 1 35 1 48047 1 63 78530 -0.012172 0.985915 0.915291 0.956269 0.999979
9 50000 2 3 2 40 0 5538 0 0 19283 0.962316 0.963619 0.879296 0.847636 0.806216
10 130000 1 2 1 24 0 46113 0 16 38480 0.997442 0.999978 0.959249 0.999979 0.899013
11 200000 2 1 2 25 -1 8926 0 -63 82748 0.000000 0.000000 0.000000 0.000000 0.000000
12 230000 2 2 1 38 -2 12696 0 -126 111453 0.000000 0.000000 0.000000 0.000000 0.000000
13 90000 2 1 2 29 -2 -240 0 -126 0 1.004184 1.004184 1.004184 1.004184 1.004184
14 230000 1 3 2 37 -1 36571 0 -34 287273 0.887873 -0.000340 0.769071 0.760863 0.952064
15 130000 1 2 2 33 2 2183 0 78 3578 0.000000 0.000000 0.000000 0.000000 0.993003
16 90000 2 2 1 35 0 72112 0 2 91108 0.954684 0.883510 0.952754 0.953646 0.952596
17 10000 2 2 1 37 -1 3305 0 70 3236 0.999550 0.848221 0.800478 0.999590 0.999652
18 80000 1 3 1 36 0 81066 0 0 73411 0.961214 0.926935 0.962670 0.963936 0.962893
19 320000 2 1 1 36 -1 7868 0 0 96906 0.792507 0.755381 0.703680 0.400943 0.999851
from sklearn.decomposition import PCA
from numpy import inf
pca = PCA(n_components=2, svd_solver='randomized')
dfpca = df1.values
dfpca[dfpca == -inf] = 0
y = dfpca[:, 7]
proj = pca.fit_transform(dfpca[:, :7 + 8:])
plt.scatter(proj[:, 0], proj[:, 1], c=y)
<matplotlib.colorbar.Colorbar at 0x2a506a1a0b8>
# training/crossval set

X = df1.values
X[X==-inf] = 0
# training set
X_train = X[:, :]
Y_train = X[:, 7].ravel()
X_train = np.delete(X_train, 7, axis=1)
# expected result
expected = X[20000:, 7].ravel()
# cross-validation data set
X_cross = X[20000:, :]
X_cross = np.delete(X_cross, 7, axis=1)
       X1  X2  X3  X4  X5  X6     X12  Y  TotalDelay  TotalPayment   PartMay  0  180000   1   2   1  47   0  179253  0           0         71798  0.962902
1  110000   2   2   1  35   0    6137  1           0         17094  0.963518
2   70000   2   2   2  22   0   66505  1           0         51151  0.963766
3  200000   2   1   2  27  -2    4941  0        -126        224043  1.028571
4  370000   2   1   1  39   0  141552  0           0         42614  0.937577
   PartJune  PartJuly  PartAugust  PartSeptember
0  0.964215  0.958865    0.961978       0.961112
1  0.963962  0.777823    0.721906       0.850305
2  0.964848  0.962690    0.956517       0.962942
3 -0.004450 -0.004802   -0.004502      -0.009899
4  0.958029  0.965386    0.959245       0.968143
from sklearn.naive_bayes import GaussianNB

# train the model
GNB = GaussianNB()
GNB.fit(X_train, Y_train)

# use the model to predict the labels of the test data
predicted = GNB.predict(X_cross)

print(metrics.confusion_matrix(expected, predicted))
[[ 209 1732]
 [  26  533]]
from sklearn.ensemble import GradientBoostingClassifier

GBR = GradientBoostingClassifier()
predicted = GBR.predict(X_cross)
print(metrics.confusion_matrix(expected, predicted))
[[1848   93]
 [ 352  207]]
from sklearn.neighbors import KNeighborsClassifier

KNC = KNeighborsClassifier(5)
KNC.fit(X_train, Y_train)
predicted = KNC.predict(X_cross)
print(metrics.confusion_matrix(expected, predicted))
pred = KNC.predict_proba(X_train)
[[1864   77]
 [ 368  191]]
[[ 1.   0. ]
 [ 0.6  0.4]
 [ 0.6  0.4]
 [ 1.   0. ]
 [ 1.   0. ]
 [ 0.6  0.4]
 [ 1.   0. ]
 [ 0.4  0.6]
 [ 0.4  0.6]
 [ 0.8  0.2]]
[ 0.  1.  1.  0.  0.  0.  0.  1.  1.  0.]
# neural network

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

# optimisation - choix du nombre de couches
param_grid = [
  {'hidden_layer_sizes': [(nb,) for nb in range(20,50,10)]},
  {'alpha': [a/100 for a in range(0,40,20)]}

neural2 = GridSearchCV(MLPClassifier(), param_grid, verbose=1)
neural2.fit(X_train, Y_train)
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   27.2s finished
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
neural = MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(170,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
neural.fit(X_train, Y_train)
predicted = neural.predict(X_cross)
print(metrics.confusion_matrix(expected, predicted))
[[1780  161]
 [ 378  181]]
array([[  1.00000000e+000,   4.54194423e-294],
       [  1.00000000e+000,   1.14673239e-101],
       [  1.00000000e+000,   1.13397258e-051],
       [  1.00000000e+000,   1.89540529e-117],
       [  1.00000000e+000,   8.01811448e-032],
       [  1.00000000e+000,   1.14003085e-160],
       [  1.00000000e+000,   1.02562443e-115],
       [  1.00000000e+000,   9.41507727e-017],
       [  1.00000000e+000,   3.16744761e-026],
       [  6.95744980e-001,   3.04255020e-001]])
if_you_have_time = False
if if_you_have_time:
    from sklearn.gaussian_process import GaussianProcessClassifier

    GPC = GaussianProcessClassifier()
    GPC.fit(X_train, Y_train)
    predicted = GPC.predict(X_cross)
    print(metrics.confusion_matrix(expected, predicted))
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(5)
RFC.fit(X_train, Y_train)
predicted = RFC.predict(X_cross)
print(metrics.confusion_matrix(expected, predicted))
[[1928   13]
 [  65  494]]
[[ 1.   0. ]
 [ 1.   0. ]
 [ 0.   1. ]
 [ 1.   0. ]
 [ 1.   0. ]
 [ 0.8  0.2]]
[ 0.  0.  1. ...,  0.  0.  0.]
if if_you_have_time:
    from sklearn.svm import SVC

    SVC = SVC(probability = True)
    SVC.fit(X_train, Y_train)
    predicted = SVC.predict(X_cross)
    print(metrics.confusion_matrix(expected, predicted))
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(X_train, Y_train)
predicted = LR.predict(X_cross)
print(metrics.confusion_matrix(expected, predicted))
[[1861   80]
 [ 413  146]]
[[ 0.85603222  0.14396778]
 [ 0.72020052  0.27979948]
 [ 0.69524708  0.30475292]
 [ 0.69884026  0.30115974]
 [ 0.85309221  0.14690779]
 [ 0.75099513  0.24900487]]
[ 0.  1.  1. ...,  0.  0.  0.]
# modèle final #

# dataframe
dfend = pd.read_csv("ensae_competition_test_X.txt", header=[0, 1], sep='\t', encoding="utf8", index_col=0)
dfend.columns = dfend.columns.droplevel(-1)

# modifications colonnes
dfend['TotalDelay'] = dfend.X11 + 2*dfend.X10 + 4*dfend.X9 + 8*dfend.X8 + 16*dfend.X7 + 32*dfend.X6
dfend['TotalPayment'] = dfend.X23 + 2*dfend.X22 + 3*dfend.X21 + 4*dfend.X20 + 5*dfend.X19 + 6*dfend.X18
dfend['PartMay'] = -(dfend.X22 - dfend.X17)/(dfend.X17 + 1)
dfend['PartJune'] = -(dfend.X21 - dfend.X16)/(dfend.X16 + 1)
dfend['PartJuly'] = -(dfend.X20 - dfend.X15)/(dfend.X15 + 1)
dfend['PartAugust'] = -(dfend.X19 - dfend.X14)/(dfend.X14 + 1)
dfend['PartSeptember'] = -(dfend.X18 - dfend.X13)/(dfend.X13 + 1)
dfend = dfend.drop(['X'+str(n) for n in range(7,12)] + ['X'+str(n) for n in range(13,24)], axis=1)

# dataset as array
X = dfend.values
X[X==-inf] = 0
# prédictions

# réseau de neuronnes
l = neural.predict(X)
text_file = open('answerN.txt','w')
for e in l:
    text_file.write(str(int(e)) + '\n')
# random forest
l = RFC.predict(X)
text_file = open('answerRF.txt','w')
for e in l:
    text_file.write(str(int(e)) + '\n')