Transformed Target#

Links: notebook, html, PDF, python, slides, GitHub

TransformedTargetRegressor proposes a way to modify the target before training. The notebook extends the concept to classifiers.

from jyquickhelper import add_notebook_menu
%matplotlib inline
import warnings


Let’s reuse the example from Effect of transforming the targets in regression model.

import numpy
from numpy.random import random, randn

# Draw one uniform feature column and an independent Gaussian noise vector.
rnd = random((1000, 1))
rndn = randn(1000)

# Feature matrix: the uniform draw rescaled by 10, kept two-dimensional.
X = 10 * rnd[:, :1]

# Noisy linear signal, shifted by the absolute value of its minimum and
# exponentiated, which produces a strongly skewed target.
y = 5 * rnd[:, 0] + 0.5 * rndn
y = numpy.exp(0.5 * (y + numpy.abs(y.min())))

# The log1p transform of that target, used below for comparison.
y_trans = numpy.log1p(y)
import matplotlib.pyplot as plt
# Two panels side by side: the raw target on the left, its log1p
# transform on the right, both against the single feature column.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))
ax[0].set_title('Exponential target')
ax[0].plot(X[:, 0], y, '.')
ax[1].set_title('Exponential target transform with log1p')
ax[1].plot(X[:, 0], y_trans, '.');
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor

# Baseline: a plain linear regression fitted directly on the raw target.
reg = LinearRegression()
reg.fit(X, y)

# Same linear model, but trained on log1p(y); predictions are mapped back
# to the original scale with expm1.
regr_trans = TransformedTargetRegressor(regressor=LinearRegression(),
                                        func=numpy.log1p,
                                        inverse_func=numpy.expm1)
regr_trans.fit(X, y)
TransformedTargetRegressor(func=<ufunc 'log1p'>, inverse_func=<ufunc 'expm1'>,
                           regressor=LinearRegression())
# Compare both fitted models against the data, one panel per model.
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
ax[0].plot(X[:, 0], y, '.')
ax[0].plot(X[:, 0], reg.predict(X), '.', label="Regular Linear Regression")
ax[0].legend()  # without a legend the label above is never displayed
ax[1].plot(X[:, 0], y, '.')
ax[1].plot(X[:, 0], regr_trans.predict(X), '.', label="Linear Regression with modified target")
ax[1].legend();


Same thing with mlinsights.

from mlinsights.mlmodel import TransformedTargetRegressor2
# mlinsights variant: the transform is named by a string and no inverse
# function is passed.
regr_trans2 = TransformedTargetRegressor2(regressor=LinearRegression(),
                                          transformer='log1p')
regr_trans2.fit(X, y)
TransformedTargetRegressor2(regressor=LinearRegression(), transformer='log1p')
# One panel per model: plain regression, scikit-learn's transformed-target
# regressor, and the mlinsights transformed-target regressor.
fig, ax = plt.subplots(1, 3, figsize=(14, 4))
ax[0].plot(X[:, 0], y, '.')
ax[0].plot(X[:, 0], reg.predict(X), '.', label="Regular Linear Regression")
ax[0].legend()  # without a legend the label above is never displayed
ax[1].plot(X[:, 0], y, '.')
ax[1].plot(X[:, 0], regr_trans.predict(X), '.', label="Linear Regression with modified target")
ax[1].legend()
ax[2].plot(X[:, 0], y, '.')
ax[2].plot(X[:, 0], regr_trans2.predict(X), '.', label="Linear Regression with modified target")
ax[2].legend();

It works the same way except the user does not have to specify the inverse function.

Why another?#

import pickle

# Serialize both transformed-target regressors, restore them, and report
# the largest absolute gap between the predictions of the restored models.
by1, by2 = (pickle.dumps(model) for model in (regr_trans, regr_trans2))
tr1, tr2 = pickle.loads(by1), pickle.loads(by2)
numpy.abs(tr1.predict(X) - tr2.predict(X)).max()

Well, to be honest, I did not expect numpy functions to be picklable. Lambda functions are not.

from pickle import PicklingError

# Same model as before, but the forward transform is a lambda.
regr_trans3 = TransformedTargetRegressor(regressor=LinearRegression(),
                                        func=lambda x: numpy.log1p(x),
                                        inverse_func=numpy.expm1)
regr_trans3.fit(X, y)

# Pickling is attempted only to display the resulting error message.
try:
    pickle.dumps(regr_trans3)
except PicklingError as e:
    print(e)
Can't pickle <function <lambda> at 0x00000195DBFD0C10>: attribute lookup <lambda> on __main__ failed

Classifier and classes permutation#

One question I get sometimes from my students is: regression or classification?

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Load the iris dataset and hold out a test set with a fixed seed.
data = load_iris()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)
from sklearn.linear_model import LinearRegression, LogisticRegression
# Fit a regressor and a classifier on the same integer class labels.
reg = LinearRegression()
reg.fit(X_train, y_train)
log = LogisticRegression()
log.fit(X_train, y_train)
from sklearn.metrics import r2_score
# r2 accepts both outputs, so it can score the two models side by side.
r2_score(y_test, reg.predict(X_test)), r2_score(y_test, log.predict(X_test))
(0.8752883470101486, 0.8325991189427313)

The accuracy cannot be computed on the regression output because the regressor produces floats.

from sklearn.metrics import accuracy_score

# accuracy_score rejects the continuous output of the regressor; the
# ValueError is caught so that its message can be displayed.
try:
    accuracy_score(y_test, reg.predict(X_test)), accuracy_score(y_test, log.predict(X_test))
except ValueError as e:
    print(e)
Classification metrics can't handle a mix of multiclass and continuous targets

Based on that figure, a regression model would be better than a classification model on a problem which is known to be a classification problem. Let’s play a little bit.

from sklearn.exceptions import ConvergenceWarning
from sklearn.utils._testing import ignore_warnings

@ignore_warnings(category=(ConvergenceWarning, ))
def evaluation():
    """Compare regression and classification over 200 train/test splits.

    For every seed in ``range(0, 200)``, the module-level ``X`` and ``y``
    are split with ``train_test_split(random_state=seed)``, a
    ``LinearRegression`` and a ``LogisticRegression`` are fitted on the
    training part, and both are scored with ``r2_score`` on the test part.

    Returns:
        A tuple ``(rnd, perf_reg, perf_clr)`` of three lists of equal
        length: the seeds, the r2 scores of the regressor and the r2
        scores of the classifier.
    """
    rnd = []
    perf_reg = []
    perf_clr = []
    for rs in range(0, 200):
        # Record the seed so the scores can be plotted against it.
        rnd.append(rs)
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rs)
        reg = LinearRegression()
        reg.fit(X_train, y_train)
        log = LogisticRegression()
        log.fit(X_train, y_train)
        perf_reg.append(r2_score(y_test, reg.predict(X_test)))
        perf_clr.append(r2_score(y_test, log.predict(X_test)))
    return rnd, perf_reg, perf_clr

# Run the comparison and plot both score series against the split seed.
rnd, perf_reg, perf_clr = evaluation()
fig, ax = plt.subplots(1, 1, figsize=(12, 4))
ax.plot(rnd, perf_reg, label="regression")
ax.plot(rnd, perf_clr, label="classification")
ax.legend()  # without a legend the two curves cannot be told apart
ax.set_title("Comparison between regression and classificaton\non the same problem");

Difficult to say. Since the expected value is known to be an integer, let’s round the predictions made by the regression.

from sklearn.exceptions import ConvergenceWarning
from sklearn.utils._testing import ignore_warnings

def float2int(y):
    """Round *y* to the nearest integer, with halves rounded up.

    Works on a scalar or on a numpy array and returns ``numpy.int32``
    value(s).

    The value is floored after adding 0.5. A bare ``numpy.int32(y + 0.5)``
    truncates toward zero, which misrounds values below -0.5
    (for example -0.6 would become 0 instead of -1).
    """
    return numpy.int32(numpy.floor(y + 0.5))

fct2float2int = numpy.vectorize(float2int)

@ignore_warnings(category=(ConvergenceWarning, ))
def evaluation2():
    """Compare rounded regression and classification over 50 splits.

    For every seed in ``range(0, 50)``, the module-level ``X`` and ``y``
    are split with ``train_test_split(random_state=seed)``, a
    ``LinearRegression`` and a ``LogisticRegression`` are fitted on the
    training part, and the regression predictions are rounded with
    ``float2int``. Both models are scored on the test part with
    ``r2_score`` and ``accuracy_score``.

    Returns:
        A tuple of five numpy arrays of equal length: the seeds, the r2
        scores of the regressor and of the classifier, then the accuracy
        of the regressor and of the classifier.
    """
    rnd = []
    perf_reg = []
    perf_clr = []
    acc_reg = []
    acc_clr = []
    for rs in range(0, 50):
        # Record the seed so the scores can be plotted against it.
        rnd.append(rs)
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rs)
        reg = LinearRegression()
        reg.fit(X_train, y_train)
        log = LogisticRegression()
        log.fit(X_train, y_train)
        # Predict once per model and reuse the result for both metrics.
        pred_reg = float2int(reg.predict(X_test))
        pred_clr = log.predict(X_test)
        perf_reg.append(r2_score(y_test, pred_reg))
        perf_clr.append(r2_score(y_test, pred_clr))
        acc_reg.append(accuracy_score(y_test, pred_reg))
        acc_clr.append(accuracy_score(y_test, pred_clr))
    return (numpy.array(rnd), numpy.array(perf_reg), numpy.array(perf_clr),
            numpy.array(acc_reg), numpy.array(acc_clr))

# Left panel: r2 score per split seed; right panel: accuracy per split seed.
rnd2, perf_reg2, perf_clr2, acc_reg2, acc_clr2 = evaluation2()
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
ax[0].plot(rnd2, perf_reg2, label="regression")
ax[0].plot(rnd2, perf_clr2, label="classification")
ax[0].legend()  # without a legend the two curves cannot be told apart
ax[0].set_title("Comparison between regression and classificaton\non the same problem with r2_score")
ax[1].plot(rnd2, acc_reg2, label="regression")
ax[1].plot(rnd2, acc_clr2, label="classification")
ax[1].legend()
ax[1].set_title("Comparison between regression and classificaton\non the same problem with accuracy_score");

Visually, the comparison is inconclusive.

# Each sum is the number of splits where the regression scores higher minus
# the number where the classification scores higher (ties contribute zero).
numpy.sum(numpy.sign(perf_reg2 - perf_clr2))
numpy.sum(numpy.sign(acc_reg2 - acc_clr2))

As strange as it seems to be, the regression wins on Iris data.

But… There is always a but…

The but…#

There is one tiny difference between regression and classification: classification is immune to a permutation of the labels.

# Reload iris and fit both models on the original labels for reference.
data = load_iris()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12)
reg = LinearRegression()
reg.fit(X_train, y_train)
log = LogisticRegression()
log.fit(X_train, y_train)
# Regression predictions are rounded to integers before scoring.
(r2_score(y_test, fct2float2int(reg.predict(X_test))),
 r2_score(y_test, log.predict(X_test)))
(1.0, 0.9609053497942387)

Let’s permute between 1 and 2.

def permute(y):
    """Return a copy of *y* in which the labels 1 and 2 are exchanged.

    The input array is left untouched; every other value is kept as is.
    """
    # Both masks are computed on the original labels, so the second
    # assignment cannot undo the first one.
    ones = (y == 1)
    twos = (y == 2)
    swapped = y.copy()
    swapped[ones] = 2
    swapped[twos] = 1
    return swapped

# Exchange labels 1 and 2 in both the training and the test targets.
y_train_permuted = permute(y_train)
y_test_permuted = permute(y_test)

# Refit both models on the permuted labels and score them against the
# permuted test labels.
regp = LinearRegression()
regp.fit(X_train, y_train_permuted)
logp = LogisticRegression()
logp.fit(X_train, y_train_permuted)
(r2_score(y_test_permuted, fct2float2int(regp.predict(X_test))),
 r2_score(y_test_permuted, logp.predict(X_test)))
(0.43952802359882015, 0.9626352015732547)

The classifier produces almost the same performance, while the regressor seems off. Let’s check whether this is just luck.

from mlinsights.mlmodel import TransformedTargetClassifier2
from pandas import DataFrame

# Repeat the experiment ten times with the 'permute' transformer and record,
# for each model, the permutation it used and its r2 score on the test set.
rows = []
for i in range(0, 10):
    regpt = TransformedTargetRegressor2(LinearRegression(), transformer='permute')
    regpt.fit(X_train, y_train)
    logpt = TransformedTargetClassifier2(LogisticRegression(max_iter=200), transformer='permute')
    logpt.fit(X_train, y_train)
    rows.append({
        'reg_perm': regpt.transformer_.permutation_,
        'reg_score': r2_score(y_test, fct2float2int(regpt.predict(X_test))),
        'log_perm': logpt.transformer_.permutation_,
        'log_score': r2_score(y_test, logpt.predict(X_test)),
    })

df = DataFrame(rows)
df
reg_perm reg_score log_perm log_score
0 {0: 2, 1: 0, 2: 1} 0.061728 {0: 1, 1: 2, 2: 0} 0.960905
1 {0: 1, 1: 2, 2: 0} -0.759259 {0: 0, 1: 2, 2: 1} 0.960905
2 {0: 2, 1: 1, 2: 0} 1.000000 {0: 0, 1: 1, 2: 2} 0.960905
3 {0: 0, 1: 2, 2: 1} 0.061728 {0: 1, 1: 2, 2: 0} 0.960905
4 {0: 1, 1: 0, 2: 2} -0.759259 {0: 1, 1: 2, 2: 0} 0.960905
5 {0: 1, 1: 2, 2: 0} -0.759259 {0: 2, 1: 1, 2: 0} 0.960905
6 {0: 2, 1: 0, 2: 1} 0.061728 {0: 1, 1: 2, 2: 0} 0.960905
7 {0: 0, 1: 1, 2: 2} 1.000000 {0: 2, 1: 1, 2: 0} 0.960905
8 {0: 2, 1: 0, 2: 1} 0.061728 {0: 1, 1: 0, 2: 2} 0.960905
9 {0: 1, 1: 2, 2: 0} -0.759259 {0: 1, 1: 0, 2: 2} 0.960905

The classifier produces a constant performance; the regressor does not.