Example with xgboost


Test XGBoost after it has been compiled: train a few models, pickle them, and unpickle them.

from jyquickhelper import add_notebook_menu
# adds a navigation menu at the top of the notebook
add_notebook_menu()

This is an example taken from the xgboost website.

import pickle

import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris, load_digits, load_diabetes
import xgboost as xgb

Zeros and Ones from the Digits dataset: binary classification

rng = np.random.RandomState(31337)

digits = load_digits(n_class=2)
y = digits['target']
X = digits['data']
conf = []
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X, y):
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    conf.append(confusion_matrix(actuals, predictions))
conf
[array([[87,  0],
        [ 1, 92]], dtype=int64), array([[91,  0],
        [ 3, 86]], dtype=int64)]
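Each confusion matrix stores the correct predictions on its diagonal, so the per-fold accuracy follows directly (a quick check on the conf list built above):

# accuracy = trace (correct predictions) / total samples, per fold
[np.trace(c) / c.sum() for c in conf]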

Iris: multiclass classification

iris = load_iris()
y = iris['target']
X = iris['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
conf = []
for train_index, test_index in kf.split(X, y):
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    conf.append(confusion_matrix(actuals, predictions))
conf
[array([[19,  0,  0],
        [ 0, 31,  3],
        [ 0,  1, 21]], dtype=int64), array([[31,  0,  0],
        [ 0, 16,  0],
        [ 0,  3, 25]], dtype=int64)]

Diabetes: regression

data = load_diabetes()
y = data['target']
X = data['data']
err = []
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
for train_index, test_index in kf.split(X, y):
    xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    err.append(mean_squared_error(actuals, predictions))
err
[9.860776812557337, 15.942418468446029]
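Since mean_squared_error reports squared errors, taking the square root puts them back on the scale of the target (a one-line check on the err list from the previous cell):

# root mean squared error for each fold
np.sqrt(err)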

Parameter optimization

import joblib  # to check you can parallelize GridSearchCV
# load_boston was not imported above; note it was removed in scikit-learn 1.2,
# so this cell only runs with the older release the notebook was written for
from sklearn.datasets import load_boston

boston = load_boston()
y = boston['target']
X = boston['data']
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2,4,6],
                    'n_estimators': [50,100,200]}, verbose=1, n_jobs=1, pre_dispatch=1, cv=5)
clf.fit(X,y)
clf.best_score_, clf.best_params_
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    2.3s finished
c:\python370_x64\lib\site-packages\sklearn\model_selection\_search.py:841: DeprecationWarning: The default of the iid parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
(0.6699572097100618, {'max_depth': 2, 'n_estimators': 100})
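joblib is imported above only to show that the search can be parallelized; here is a minimal sketch of doing so, assuming several cores are available (clf_parallel is just an illustrative name):

# run the same grid search on all cores under the loky backend
with joblib.parallel_backend('loky'):
    clf_parallel = GridSearchCV(
        xgb.XGBRegressor(),
        {'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200]},
        verbose=1, n_jobs=-1, cv=5)
    clf_parallel.fit(X, y)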

Pickling sklearn API models

# The sklearn API models are picklable
# must open in binary format to pickle
with open("best_boston.pkl", "wb") as f:
    pickle.dump(clf, f)
with open("best_boston.pkl", "rb") as f:
    clf2 = pickle.load(f)
np.allclose(clf.predict(X), clf2.predict(X))
True
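Pickle ties the saved model to the Python and library versions. An alternative, sketched below, is xgboost's own binary format via Booster.save_model, assuming a release where the sklearn wrapper exposes get_booster(); best_boston.model is an arbitrary file name:

# save the underlying Booster in xgboost's own format
booster = clf.best_estimator_.get_booster()
booster.save_model("best_boston.model")

# reload into a fresh Booster; it predicts from a DMatrix, not a numpy array
booster2 = xgb.Booster()
booster2.load_model("best_boston.model")
preds = booster2.predict(xgb.DMatrix(X))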

Early stopping

X = digits['data']
y = digits['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
        eval_set=[(X_test, y_test)])
[0] validation_0-auc:0.999497
Will train until validation_0-auc hasn't improved in 10 rounds.
[1] validation_0-auc:0.999497
[2] validation_0-auc:0.999497
[3] validation_0-auc:0.999749
[4] validation_0-auc:0.999749
[5] validation_0-auc:0.999749
[6] validation_0-auc:0.999749
[7] validation_0-auc:0.999749
[8] validation_0-auc:0.999749
[9] validation_0-auc:0.999749
[10]        validation_0-auc:1
[11]        validation_0-auc:1
[12]        validation_0-auc:1
[13]        validation_0-auc:1
[14]        validation_0-auc:1
[15]        validation_0-auc:1
[16]        validation_0-auc:1
[17]        validation_0-auc:1
[18]        validation_0-auc:1
[19]        validation_0-auc:1
[20]        validation_0-auc:1
Stopping. Best iteration:
[10]        validation_0-auc:1
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
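With early stopping, the fitted classifier records the best round. In the xgboost release used here the attributes below are available; later releases replaced ntree_limit with iteration_range, so treat this as a sketch for the older API:

# where training stopped and the score it reached
print(clf.best_iteration, clf.best_score)
# predict using only the trees built up to the best iteration
predictions = clf.predict(X_test, ntree_limit=clf.best_ntree_limit)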