Coverage for mlinsights/mlmodel/quantile_mlpregressor.py: 94%

78 statements  

coverage.py v7.1.0, created at 2023-02-28 08:46 +0100

# -*- coding: utf-8 -*-
"""
@file
@brief Implements a quantile non-linear regression.
"""
import inspect
import numpy as np
from sklearn.base import RegressorMixin
from sklearn.utils import check_X_y, column_or_1d
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.neural_network._base import DERIVATIVES, LOSS_FUNCTIONS
try:
    from sklearn.neural_network._multilayer_perceptron import BaseMultilayerPerceptron
except ImportError:  # pragma: no cover
    # scikit-learn < 0.22
    from sklearn.neural_network.multilayer_perceptron import BaseMultilayerPerceptron
from sklearn.metrics import mean_absolute_error


def absolute_loss(y_true, y_pred):
    """
    Computes the absolute loss for regression.

    :param y_true: array-like or label indicator matrix
        Ground truth (correct) values.
    :param y_pred: array-like or label indicator matrix
        Predicted values, as returned by a regression estimator.
    :return: loss, float
        The mean absolute difference between *y_true* and *y_pred*
        (the lower, the better).
    """
    return np.sum(np.abs(y_true - y_pred)) / y_true.shape[0]


def float_sign(a):
    "Returns 1 if *a > 0*, -1 if *a < 0*, and 0 if *a* is within 1e-8 of zero."
    if a > 1e-8:
        return 1.
    if a < -1e-8:
        return -1.
    return 0.


EXTENDED_LOSS_FUNCTIONS = {'absolute_loss': absolute_loss}
DERIVATIVE_LOSS_FUNCTIONS = {'absolute_loss': np.vectorize(float_sign)}
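# Illustration: for y_true = np.array([0., 1., 2.]) and y_pred = np.array([0., 2., 0.]),
# absolute_loss(y_true, y_pred) is (0 + 1 + 2) / 3 = 1.0, and
# DERIVATIVE_LOSS_FUNCTIONS['absolute_loss'](y_pred - y_true) turns the residuals
# [0., 1., -2.] into their signs [0., 1., -1.], the subgradient of the absolute
# loss that _backprop substitutes for the raw residuals below.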

class CustomizedMultilayerPerceptron(BaseMultilayerPerceptron):
    """
    Customized multilayer perceptron based on
    `BaseMultilayerPerceptron
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/neural_network/multilayer_perceptron.py#L40>`_.
    """

    def __init__(self, hidden_layer_sizes, activation, solver,
                 alpha, batch_size, learning_rate, learning_rate_init, power_t,
                 max_iter, loss, shuffle, random_state, tol, verbose,
                 warm_start, momentum, nesterovs_momentum, early_stopping,
                 validation_fraction, beta_1, beta_2, epsilon,
                 n_iter_no_change, max_fun):
        if 'max_fun' in inspect.signature(BaseMultilayerPerceptron.__init__).parameters:
            args = [15000]
        else:
            args = []
        BaseMultilayerPerceptron.__init__(  # pylint: disable=E1121
            self, hidden_layer_sizes, activation, solver, alpha, batch_size,
            learning_rate, learning_rate_init, power_t, max_iter, loss,
            shuffle, random_state, tol, verbose, warm_start, momentum,
            nesterovs_momentum, early_stopping, validation_fraction, beta_1, beta_2,
            epsilon, n_iter_no_change, *args)

    def _get_loss_function(self, loss_func_name):
        """
        Returns the loss function associated to *loss_func_name*.

        @param      loss_func_name      loss function name, see
                                        :epkg:`sklearn:neural_networks:MLPRegressor`
        """
        return LOSS_FUNCTIONS.get(loss_func_name, EXTENDED_LOSS_FUNCTIONS[loss_func_name])

    def _modify_loss_derivatives(self, last_deltas):
        """
        Modifies the loss derivatives.

        @param      last_deltas     difference between the output and the expected output
        @return                     modified derivatives
        """
        if self.loss == 'absolute_loss':
            return DERIVATIVE_LOSS_FUNCTIONS['absolute_loss'](last_deltas)
        return last_deltas  # pragma: no cover

    def _backprop(self, X, y, activations, deltas, coef_grads,
                  intercept_grads):
        """
        Computes the MLP loss function and its corresponding derivatives
        with respect to each parameter: weights and bias vectors.

        :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.
        :param y: array-like, shape (n_samples,)
            The target values.
        :param activations: list, length = n_layers - 1
            The ith element of the list holds the values of the ith layer.
        :param deltas: list, length = n_layers - 1
            The ith element of the list holds the difference between the
            activations of the i + 1 layer and the backpropagated error.
            More specifically, deltas are gradients of loss with respect to z
            in each layer, where z = wx + b is the value of a particular layer
            before passing through the activation function.
        :param coef_grads: list, length = n_layers - 1
            The ith element contains the amount of change used to update the
            coefficient parameters of the ith layer in an iteration.
        :param intercept_grads: list, length = n_layers - 1
            The ith element contains the amount of change used to update the
            intercept parameters of the ith layer in an iteration.
        :return: loss, float
        :return: coef_grads, list, length = n_layers - 1
        :return: intercept_grads, list, length = n_layers - 1
        """
        n_samples = X.shape[0]

        # Forward propagate
        activations = self._forward_pass(activations)

        # Get loss
        loss_func_name = self.loss
        if loss_func_name == 'log_loss' and self.out_activation_ == 'logistic':
            loss_func_name = 'binary_log_loss'
        loss_function = self._get_loss_function(loss_func_name)
        loss = loss_function(y, activations[-1])
        # Add L2 regularization term to loss
        values = np.sum(
            np.array([np.dot(s.ravel(), s.ravel()) for s in self.coefs_]))
        loss += (0.5 * self.alpha) * values / n_samples

        # Backward propagate
        last = self.n_layers_ - 2

        # The calculation of delta[last] here works with the following
        # combinations of output activation and loss function:
        # sigmoid and binary cross entropy, softmax and categorical cross
        # entropy, and identity with squared loss.
        deltas[last] = activations[-1] - y

        # The gradient is modified here to reflect the modified loss function.
        deltas[last] = self._modify_loss_derivatives(deltas[last])

        # Compute gradient for the last layer
        temp = self._compute_loss_grad(  # pylint: disable=E1111
            last, n_samples, activations, deltas, coef_grads, intercept_grads)
        if temp is None:
            # recent versions of scikit-learn update the gradients in place
            # Compute gradient for the last layer
            self._compute_loss_grad(
                last, n_samples, activations, deltas, coef_grads, intercept_grads)

            inplace_derivative = DERIVATIVES[self.activation]
            # Iterate over the hidden layers
            for i in range(self.n_layers_ - 2, 0, -1):
                deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T)
                inplace_derivative(activations[i], deltas[i - 1])

                self._compute_loss_grad(
                    i - 1, n_samples, activations, deltas, coef_grads,
                    intercept_grads)
        else:  # pragma: no cover
            # older versions of scikit-learn return the updated gradients
            coef_grads, intercept_grads = temp  # pylint: disable=E0633

            # Iterate over the hidden layers
            for i in range(self.n_layers_ - 2, 0, -1):
                deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T)
                inplace_derivative = DERIVATIVES[self.activation]
                inplace_derivative(activations[i], deltas[i - 1])

                coef_grads, intercept_grads = self._compute_loss_grad(  # pylint: disable=E1111,E0633
                    i - 1, n_samples, activations, deltas, coef_grads,
                    intercept_grads)

        return loss, coef_grads, intercept_grads


class QuantileMLPRegressor(CustomizedMultilayerPerceptron, RegressorMixin):
    """
    Quantile MLP regression, that is, neural network regression trained with
    the :epkg:`L1` norm, which makes the model estimate the conditional median
    rather than the conditional mean. It follows the same API as
    :epkg:`sklearn:neural_networks:MLPRegressor` and optimizes the absolute
    loss using LBFGS or stochastic gradient descent.
    See @see cl CustomizedMultilayerPerceptron and @see fn absolute_loss.

    :param hidden_layer_sizes: tuple, length = n_layers - 2, default (100,)
        The ith element represents the number of neurons in the ith
        hidden layer.
    :param activation: {'identity', 'logistic', 'tanh', 'relu'}, default 'relu'
        Activation function for the hidden layer.

        - 'identity', no-op activation, useful to implement linear bottleneck,
          returns :math:`f(x) = x`.
        - 'logistic', the logistic sigmoid function,
          returns :math:`f(x) = 1 / (1 + exp(-x))`.
        - 'tanh', the hyperbolic tan function,
          returns :math:`f(x) = tanh(x)`.
        - 'relu', the rectified linear unit function,
          returns :math:`f(x) = \\max(0, x)`.
    :param solver: ``{'lbfgs', 'sgd', 'adam'}``, default 'adam'
        The solver for weight optimization.

        - *'lbfgs'* is an optimizer in the family of quasi-Newton methods.
        - *'sgd'* refers to stochastic gradient descent.
        - *'adam'* refers to a stochastic gradient-based optimizer proposed by
          Kingma, Diederik, and Jimmy Ba.

        Note: The default solver 'adam' works pretty well on relatively
        large datasets (with thousands of training samples or more) in terms of
        both training time and validation score.
        For small datasets, however, 'lbfgs' can converge faster and perform
        better.
    :param alpha: float, optional, default 0.0001
        :epkg:`L2` penalty (regularization term) parameter.
    :param batch_size: int, optional, default 'auto'
        Size of minibatches for stochastic optimizers.
        If the solver is 'lbfgs', the regressor will not use minibatches.
        When set to "auto", ``batch_size=min(200, n_samples)``.
    :param learning_rate: {'constant', 'invscaling', 'adaptive'}, default 'constant'
        Learning rate schedule for weight updates.

        - 'constant' is a constant learning rate given by
          'learning_rate_init'.
        - 'invscaling' gradually decreases the learning rate ``learning_rate_``
          at each time step 't' using an inverse scaling exponent of 'power_t':
          ``effective_learning_rate = learning_rate_init / pow(t, power_t)``.
        - 'adaptive' keeps the learning rate constant to
          'learning_rate_init' as long as training loss keeps decreasing.
          Each time two consecutive epochs fail to decrease training loss by at
          least tol, or fail to increase validation score by at least tol if
          'early_stopping' is on, the current learning rate is divided by 5.

        Only used when solver='sgd'.
    :param learning_rate_init: double, optional, default 0.001
        The initial learning rate used. It controls the step-size
        in updating the weights. Only used when solver='sgd' or 'adam'.
    :param power_t: double, optional, default 0.5
        The exponent for inverse scaling learning rate.
        It is used in updating the effective learning rate when the learning_rate
        is set to 'invscaling'. Only used when solver='sgd'.
    :param max_iter: int, optional, default 200
        Maximum number of iterations. The solver iterates until convergence
        (determined by 'tol') or this number of iterations. For stochastic
        solvers ('sgd', 'adam'), note that this determines the number of epochs
        (how many times each data point will be used), not the number of
        gradient steps.
    :param shuffle: bool, optional, default True
        Whether to shuffle samples in each iteration. Only used when
        solver='sgd' or 'adam'.
    :param random_state: int, RandomState instance or None, optional, default None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    :param tol: float, optional, default 1e-4
        Tolerance for the optimization. When the loss or score is not improving
        by at least ``tol`` for ``n_iter_no_change`` consecutive iterations,
        unless ``learning_rate`` is set to 'adaptive', convergence is
        considered to be reached and training stops.
    :param verbose: bool, optional, default False
        Whether to print progress messages to stdout.
    :param warm_start: bool, optional, default False
        When set to True, reuse the solution of the previous
        call to fit as initialization, otherwise, just erase the
        previous solution. See :term:`the Glossary <warm_start>`.
    :param momentum: float, default 0.9
        Momentum for gradient descent update. Should be between 0 and 1. Only
        used when solver='sgd'.
    :param nesterovs_momentum: boolean, default True
        Whether to use Nesterov's momentum. Only used when solver='sgd' and
        momentum > 0.
    :param early_stopping: bool, default False
        Whether to use early stopping to terminate training when validation
        score is not improving. If set to true, it will automatically set
        aside 10% of training data as validation and terminate training when
        validation score is not improving by at least ``tol`` for
        ``n_iter_no_change`` consecutive epochs.
        Only effective when solver='sgd' or 'adam'.
    :param validation_fraction: float, optional, default 0.1
        The proportion of training data to set aside as validation set for
        early stopping. Must be between 0 and 1.
        Only used if early_stopping is True.
    :param beta_1: float, optional, default 0.9
        Exponential decay rate for estimates of first moment vector in adam,
        should be in [0, 1). Only used when solver='adam'.
    :param beta_2: float, optional, default 0.999
        Exponential decay rate for estimates of second moment vector in adam,
        should be in [0, 1). Only used when solver='adam'.
    :param epsilon: float, optional, default 1e-8
        Value for numerical stability in adam. Only used when solver='adam'.
    :param n_iter_no_change: int, optional, default 10
        Maximum number of epochs to not meet ``tol`` improvement.
        Only effective when solver='sgd' or 'adam'.

    Fitted attributes:

    * `loss_`: float
      The current loss computed with the loss function.
    * `coefs_`: list, length n_layers - 1
      The ith element in the list represents the weight matrix corresponding
      to layer i.
    * `intercepts_`: list, length n_layers - 1
      The ith element in the list represents the bias vector corresponding to
      layer i + 1.
    * `n_iter_`: int
      The number of iterations the solver has run.
    * `n_layers_`: int
      Number of layers.
    * `n_outputs_`: int
      Number of outputs.
    * `out_activation_`: string
      Name of the output activation function.
    """

    def __init__(self,
                 hidden_layer_sizes=(100,), activation="relu",
                 solver='adam', alpha=0.0001,
                 batch_size='auto', learning_rate="constant",
                 learning_rate_init=0.001,
                 power_t=0.5, max_iter=200, shuffle=True,
                 random_state=None, tol=1e-4,
                 verbose=False, warm_start=False, momentum=0.9,
                 nesterovs_momentum=True, early_stopping=False,
                 validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, n_iter_no_change=10,
                 **kwargs):
        """
        See :epkg:`sklearn:neural_networks:MLPRegressor`.
        """
        sup = super(QuantileMLPRegressor, self)  # pylint: disable=R1725
        if "max_fun" not in kwargs:
            sig = inspect.signature(sup.__init__)
            if "max_fun" in sig.parameters:
                kwargs['max_fun'] = 15000
        sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
                     activation=activation, solver=solver, alpha=alpha,
                     batch_size=batch_size, learning_rate=learning_rate,
                     learning_rate_init=learning_rate_init, power_t=power_t,
                     max_iter=max_iter, loss='absolute_loss', shuffle=shuffle,
                     random_state=random_state, tol=tol, verbose=verbose,
                     warm_start=warm_start, momentum=momentum,
                     nesterovs_momentum=nesterovs_momentum,
                     early_stopping=early_stopping,
                     validation_fraction=validation_fraction,
                     beta_1=beta_1, beta_2=beta_2, epsilon=epsilon,
                     n_iter_no_change=n_iter_no_change, **kwargs)

    def predict(self, X):
        """
        Predicts using the multi-layer perceptron model.

        :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.
        :return: y, array-like, shape (n_samples, n_outputs)
            The predicted values.
        """
        check_is_fitted(self)
        if hasattr(self, '_predict'):
            y_pred = self._predict(X)
        else:
            y_pred = self._forward_pass_fast(X)
        if y_pred.shape[1] == 1:
            return y_pred.ravel()
        return y_pred

    def _validate_input(self, X, y, incremental, reset=False):
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                         multi_output=True, y_numeric=True)
        if y.ndim == 2 and y.shape[1] == 1:
            y = column_or_1d(y, warn=True)
        return X, y

    def score(self, X, y, sample_weight=None):
        """
        Returns the mean absolute error regression loss.

        :param X: array-like, shape = (n_samples, n_features)
            Test samples.
        :param y: array-like, shape = (n_samples) or (n_samples, n_outputs)
            True values for X.
        :param sample_weight: array-like, shape = [n_samples], optional
            Sample weights.
        :return: score, float
            mean absolute error regression loss
        """
        pred = self.predict(X)
        return mean_absolute_error(y, pred, sample_weight=sample_weight)
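
A minimal usage sketch, separate from the module above: it fits QuantileMLPRegressor next to a standard scikit-learn MLPRegressor on data with asymmetric noise, where the L1-trained network tracks the conditional median and the L2-trained one the conditional mean. It assumes the class is exported from mlinsights.mlmodel (otherwise import it from mlinsights.mlmodel.quantile_mlpregressor); the hidden layer size, iteration count, and noise level are arbitrary illustration choices.

import numpy as np
from sklearn.neural_network import MLPRegressor
from mlinsights.mlmodel import QuantileMLPRegressor  # assumed export path

# Synthetic data with asymmetric (exponential) noise: median and mean differ.
rng = np.random.RandomState(0)
X = rng.uniform(0, 1, size=(500, 1))
y = 3 * X.ravel() + rng.exponential(0.5, size=500)

# L1-trained network (this module): estimates the conditional median.
qreg = QuantileMLPRegressor(hidden_layer_sizes=(30,), max_iter=500, random_state=0)
qreg.fit(X, y)

# L2-trained network (scikit-learn): estimates the conditional mean.
reg = MLPRegressor(hidden_layer_sizes=(30,), max_iter=500, random_state=0)
reg.fit(X, y)

# QuantileMLPRegressor.score returns the mean absolute error (lower is better),
# unlike MLPRegressor.score, which returns R^2.
print("MAE, L1-trained:", qreg.score(X, y))
print("MAE, L2-trained:", np.abs(y - reg.predict(X)).mean())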