Coverage for mlinsights/mlmodel/quantile_mlpregressor.py: 94%

78 statements  

coverage.py v7.1.0, created at 2023-02-28 08:46 +0100

# -*- coding: utf-8 -*-
"""
@file
@brief Implements a quantile non-linear regression.
"""
import inspect
import numpy as np
from sklearn.base import RegressorMixin
from sklearn.utils import check_X_y, column_or_1d
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.neural_network._base import DERIVATIVES, LOSS_FUNCTIONS
try:
    from sklearn.neural_network._multilayer_perceptron import BaseMultilayerPerceptron
except ImportError:  # pragma: no cover
    # scikit-learn < 0.22
    from sklearn.neural_network.multilayer_perceptron import BaseMultilayerPerceptron
from sklearn.metrics import mean_absolute_error


def absolute_loss(y_true, y_pred):
    """
    Computes the absolute loss for regression.

    :param y_true: array-like or label indicator matrix
        Ground truth (correct) values.
    :param y_pred: array-like or label indicator matrix
        Predicted values, as returned by a regression estimator.
    :return: loss, float
        The mean absolute difference between *y_true* and *y_pred*
        (the lower, the better).
    """
    return np.sum(np.abs(y_true - y_pred)) / y_true.shape[0]


def float_sign(a):
    "Returns 1 if *a > 0*, -1 if *a < 0*, and 0 if *a* is within 1e-8 of zero."
    if a > 1e-8:
        return 1.
    if a < -1e-8:
        return -1.
    return 0.


EXTENDED_LOSS_FUNCTIONS = {'absolute_loss': absolute_loss}
DERIVATIVE_LOSS_FUNCTIONS = {'absolute_loss': np.vectorize(float_sign)}
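# Illustration: for y_true = np.array([0., 1., 2.]) and y_pred = np.array([0., 2., 0.]),
# absolute_loss(y_true, y_pred) is (0 + 1 + 2) / 3 = 1.0, and
# DERIVATIVE_LOSS_FUNCTIONS['absolute_loss'](y_pred - y_true) turns the residuals
# [0., 1., -2.] into their signs [0., 1., -1.], the subgradient of the absolute
# loss that _backprop substitutes for the raw residuals below.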

class CustomizedMultilayerPerceptron(BaseMultilayerPerceptron):
    """
    Customized multilayer perceptron based on
    `BaseMultilayerPerceptron
    <https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/neural_network/multilayer_perceptron.py#L40>`_.
    """

    def __init__(self, hidden_layer_sizes, activation, solver,
                 alpha, batch_size, learning_rate, learning_rate_init, power_t,
                 max_iter, loss, shuffle, random_state, tol, verbose,
                 warm_start, momentum, nesterovs_momentum, early_stopping,
                 validation_fraction, beta_1, beta_2, epsilon,
                 n_iter_no_change, max_fun):
        if 'max_fun' in inspect.signature(BaseMultilayerPerceptron.__init__).parameters:
            args = [15000]
        else:
            args = []
        BaseMultilayerPerceptron.__init__(  # pylint: disable=E1121
            self, hidden_layer_sizes, activation, solver, alpha, batch_size,
            learning_rate, learning_rate_init, power_t, max_iter, loss,
            shuffle, random_state, tol, verbose, warm_start, momentum,
            nesterovs_momentum, early_stopping, validation_fraction, beta_1, beta_2,
            epsilon, n_iter_no_change, *args)

    def _get_loss_function(self, loss_func_name):
        """
        Returns the loss function associated to *loss_func_name*.

        @param      loss_func_name      loss function name, see
                                        :epkg:`sklearn:neural_networks:MLPRegressor`
        """
        return LOSS_FUNCTIONS.get(loss_func_name, EXTENDED_LOSS_FUNCTIONS[loss_func_name])

    def _modify_loss_derivatives(self, last_deltas):
        """
        Modifies the loss derivatives.

        @param      last_deltas     difference between the output and the expected output
        @return                     modified derivatives
        """
        if self.loss == 'absolute_loss':
            return DERIVATIVE_LOSS_FUNCTIONS['absolute_loss'](last_deltas)
        return last_deltas  # pragma: no cover

    def _backprop(self, X, y, activations, deltas, coef_grads,
                  intercept_grads):
        """
        Computes the MLP loss function and its corresponding derivatives
        with respect to each parameter: weights and bias vectors.

        :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.
        :param y: array-like, shape (n_samples,)
            The target values.
        :param activations: list, length = n_layers - 1
            The ith element of the list holds the values of the ith layer.
        :param deltas: list, length = n_layers - 1
            The ith element of the list holds the difference between the
            activations of the i + 1 layer and the backpropagated error.
            More specifically, deltas are gradients of loss with respect to z
            in each layer, where z = wx + b is the value of a particular layer
            before passing through the activation function.
        :param coef_grads: list, length = n_layers - 1
            The ith element contains the amount of change used to update the
            coefficient parameters of the ith layer in an iteration.
        :param intercept_grads: list, length = n_layers - 1
            The ith element contains the amount of change used to update the
            intercept parameters of the ith layer in an iteration.
        :return: loss, float
        :return: coef_grads, list, length = n_layers - 1
        :return: intercept_grads, list, length = n_layers - 1
        """
        n_samples = X.shape[0]

        # Forward propagate
        activations = self._forward_pass(activations)

        # Get loss
        loss_func_name = self.loss
        if loss_func_name == 'log_loss' and self.out_activation_ == 'logistic':
            loss_func_name = 'binary_log_loss'
        loss_function = self._get_loss_function(loss_func_name)
        loss = loss_function(y, activations[-1])
        # Add L2 regularization term to loss
        values = np.sum(
            np.array([np.dot(s.ravel(), s.ravel()) for s in self.coefs_]))
        loss += (0.5 * self.alpha) * values / n_samples

        # Backward propagate
        last = self.n_layers_ - 2

        # The calculation of delta[last] here works with the following
        # combinations of output activation and loss function:
        # sigmoid and binary cross entropy, softmax and categorical cross
        # entropy, and identity with squared loss.
        deltas[last] = activations[-1] - y

        # The gradient is modified here to reflect the modified loss function.
        deltas[last] = self._modify_loss_derivatives(deltas[last])

        # Compute gradient for the last layer
        temp = self._compute_loss_grad(  # pylint: disable=E1111
            last, n_samples, activations, deltas, coef_grads, intercept_grads)
        if temp is None:
            # recent versions of scikit-learn update the gradients in place
            # Compute gradient for the last layer
            self._compute_loss_grad(
                last, n_samples, activations, deltas, coef_grads, intercept_grads)

            inplace_derivative = DERIVATIVES[self.activation]
            # Iterate over the hidden layers
            for i in range(self.n_layers_ - 2, 0, -1):
                deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T)
                inplace_derivative(activations[i], deltas[i - 1])

                self._compute_loss_grad(
                    i - 1, n_samples, activations, deltas, coef_grads,
                    intercept_grads)
        else:  # pragma: no cover
            # older versions of scikit-learn return the updated gradients
            coef_grads, intercept_grads = temp  # pylint: disable=E0633

            # Iterate over the hidden layers
            for i in range(self.n_layers_ - 2, 0, -1):
                deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T)
                inplace_derivative = DERIVATIVES[self.activation]
                inplace_derivative(activations[i], deltas[i - 1])

                coef_grads, intercept_grads = self._compute_loss_grad(  # pylint: disable=E1111,E0633
                    i - 1, n_samples, activations, deltas, coef_grads,
                    intercept_grads)

        return loss, coef_grads, intercept_grads


class QuantileMLPRegressor(CustomizedMultilayerPerceptron, RegressorMixin):
    """
    Quantile MLP regression, that is, neural network regression trained with
    the :epkg:`L1` norm, which makes the model estimate the conditional median
    rather than the conditional mean. It follows the same API as
    :epkg:`sklearn:neural_networks:MLPRegressor` and optimizes the absolute
    loss using LBFGS or stochastic gradient descent.
    See @see cl CustomizedMultilayerPerceptron and @see fn absolute_loss.

    :param hidden_layer_sizes: tuple, length = n_layers - 2, default (100,)
        The ith element represents the number of neurons in the ith
        hidden layer.
    :param activation: {'identity', 'logistic', 'tanh', 'relu'}, default 'relu'
        Activation function for the hidden layer.

        - 'identity', no-op activation, useful to implement linear bottleneck,
          returns :math:`f(x) = x`.
        - 'logistic', the logistic sigmoid function,
          returns :math:`f(x) = 1 / (1 + exp(-x))`.
        - 'tanh', the hyperbolic tan function,
          returns :math:`f(x) = tanh(x)`.
        - 'relu', the rectified linear unit function,
          returns :math:`f(x) = \\max(0, x)`.
    :param solver: ``{'lbfgs', 'sgd', 'adam'}``, default 'adam'
        The solver for weight optimization.

        - *'lbfgs'* is an optimizer in the family of quasi-Newton methods.
        - *'sgd'* refers to stochastic gradient descent.
        - *'adam'* refers to a stochastic gradient-based optimizer proposed by
          Kingma, Diederik, and Jimmy Ba.

        Note: The default solver 'adam' works pretty well on relatively
        large datasets (with thousands of training samples or more) in terms of
        both training time and validation score.
        For small datasets, however, 'lbfgs' can converge faster and perform
        better.
    :param alpha: float, optional, default 0.0001
        :epkg:`L2` penalty (regularization term) parameter.
    :param batch_size: int, optional, default 'auto'
        Size of minibatches for stochastic optimizers.
        If the solver is 'lbfgs', the regressor will not use minibatches.
        When set to "auto", ``batch_size=min(200, n_samples)``.
    :param learning_rate: {'constant', 'invscaling', 'adaptive'}, default 'constant'
        Learning rate schedule for weight updates.

        - 'constant' is a constant learning rate given by
          'learning_rate_init'.
        - 'invscaling' gradually decreases the learning rate ``learning_rate_``
          at each time step 't' using an inverse scaling exponent of 'power_t':
          ``effective_learning_rate = learning_rate_init / pow(t, power_t)``.
        - 'adaptive' keeps the learning rate constant to
          'learning_rate_init' as long as training loss keeps decreasing.
          Each time two consecutive epochs fail to decrease training loss by at
          least tol, or fail to increase validation score by at least tol if
          'early_stopping' is on, the current learning rate is divided by 5.

        Only used when solver='sgd'.
    :param learning_rate_init: double, optional, default 0.001
        The initial learning rate used. It controls the step-size
        in updating the weights. Only used when solver='sgd' or 'adam'.
    :param power_t: double, optional, default 0.5
        The exponent for inverse scaling learning rate.
        It is used in updating the effective learning rate when the learning_rate
        is set to 'invscaling'. Only used when solver='sgd'.
    :param max_iter: int, optional, default 200
        Maximum number of iterations. The solver iterates until convergence
        (determined by 'tol') or this number of iterations. For stochastic
        solvers ('sgd', 'adam'), note that this determines the number of epochs
        (how many times each data point will be used), not the number of
        gradient steps.
    :param shuffle: bool, optional, default True
        Whether to shuffle samples in each iteration. Only used when
        solver='sgd' or 'adam'.
    :param random_state: int, RandomState instance or None, optional, default None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    :param tol: float, optional, default 1e-4
        Tolerance for the optimization. When the loss or score is not improving
        by at least ``tol`` for ``n_iter_no_change`` consecutive iterations,
        unless ``learning_rate`` is set to 'adaptive', convergence is
        considered to be reached and training stops.
    :param verbose: bool, optional, default False
        Whether to print progress messages to stdout.
    :param warm_start: bool, optional, default False
        When set to True, reuse the solution of the previous
        call to fit as initialization, otherwise, just erase the
        previous solution. See :term:`the Glossary <warm_start>`.
    :param momentum: float, default 0.9
        Momentum for gradient descent update. Should be between 0 and 1. Only
        used when solver='sgd'.
    :param nesterovs_momentum: boolean, default True
        Whether to use Nesterov's momentum. Only used when solver='sgd' and
        momentum > 0.
    :param early_stopping: bool, default False
        Whether to use early stopping to terminate training when validation
        score is not improving. If set to true, it will automatically set
        aside 10% of training data as validation and terminate training when
        validation score is not improving by at least ``tol`` for
        ``n_iter_no_change`` consecutive epochs.
        Only effective when solver='sgd' or 'adam'.
    :param validation_fraction: float, optional, default 0.1
        The proportion of training data to set aside as validation set for
        early stopping. Must be between 0 and 1.
        Only used if early_stopping is True.
    :param beta_1: float, optional, default 0.9
        Exponential decay rate for estimates of first moment vector in adam,
        should be in [0, 1). Only used when solver='adam'.
    :param beta_2: float, optional, default 0.999
        Exponential decay rate for estimates of second moment vector in adam,
        should be in [0, 1). Only used when solver='adam'.
    :param epsilon: float, optional, default 1e-8
        Value for numerical stability in adam. Only used when solver='adam'.
    :param n_iter_no_change: int, optional, default 10
        Maximum number of epochs to not meet ``tol`` improvement.
        Only effective when solver='sgd' or 'adam'.

    Fitted attributes:

    * `loss_`: float
      The current loss computed with the loss function.
    * `coefs_`: list, length n_layers - 1
      The ith element in the list represents the weight matrix corresponding
      to layer i.
    * `intercepts_`: list, length n_layers - 1
      The ith element in the list represents the bias vector corresponding to
      layer i + 1.
    * `n_iter_`: int
      The number of iterations the solver has run.
    * `n_layers_`: int
      Number of layers.
    * `n_outputs_`: int
      Number of outputs.
    * `out_activation_`: string
      Name of the output activation function.
    """

    def __init__(self,
                 hidden_layer_sizes=(100,), activation="relu",
                 solver='adam', alpha=0.0001,
                 batch_size='auto', learning_rate="constant",
                 learning_rate_init=0.001,
                 power_t=0.5, max_iter=200, shuffle=True,
                 random_state=None, tol=1e-4,
                 verbose=False, warm_start=False, momentum=0.9,
                 nesterovs_momentum=True, early_stopping=False,
                 validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, n_iter_no_change=10,
                 **kwargs):
        """
        See :epkg:`sklearn:neural_networks:MLPRegressor`.
        """
        sup = super(QuantileMLPRegressor, self)  # pylint: disable=R1725
        if "max_fun" not in kwargs:
            sig = inspect.signature(sup.__init__)
            if "max_fun" in sig.parameters:
                kwargs['max_fun'] = 15000
        sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
                     activation=activation, solver=solver, alpha=alpha,
                     batch_size=batch_size, learning_rate=learning_rate,
                     learning_rate_init=learning_rate_init, power_t=power_t,
                     max_iter=max_iter, loss='absolute_loss', shuffle=shuffle,
                     random_state=random_state, tol=tol, verbose=verbose,
                     warm_start=warm_start, momentum=momentum,
                     nesterovs_momentum=nesterovs_momentum,
                     early_stopping=early_stopping,
                     validation_fraction=validation_fraction,
                     beta_1=beta_1, beta_2=beta_2, epsilon=epsilon,
                     n_iter_no_change=n_iter_no_change, **kwargs)

    def predict(self, X):
        """
        Predicts using the multi-layer perceptron model.

        :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.
        :return: y, array-like, shape (n_samples, n_outputs)
            The predicted values.
        """
        check_is_fitted(self)
        if hasattr(self, '_predict'):
            y_pred = self._predict(X)
        else:
            y_pred = self._forward_pass_fast(X)
        if y_pred.shape[1] == 1:
            return y_pred.ravel()
        return y_pred

    def _validate_input(self, X, y, incremental, reset=False):
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                         multi_output=True, y_numeric=True)
        if y.ndim == 2 and y.shape[1] == 1:
            y = column_or_1d(y, warn=True)
        return X, y

    def score(self, X, y, sample_weight=None):
        """
        Returns the mean absolute error regression loss.

        :param X: array-like, shape = (n_samples, n_features)
            Test samples.
        :param y: array-like, shape = (n_samples) or (n_samples, n_outputs)
            True values for X.
        :param sample_weight: array-like, shape = [n_samples], optional
            Sample weights.
        :return: score, float
            mean absolute error regression loss
        """
        pred = self.predict(X)
        return mean_absolute_error(y, pred, sample_weight=sample_weight)
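
A minimal usage sketch, separate from the module above: it fits QuantileMLPRegressor next to a standard scikit-learn MLPRegressor on data with asymmetric noise, where the L1-trained network tracks the conditional median and the L2-trained one the conditional mean. It assumes the class is exported from mlinsights.mlmodel (otherwise import it from mlinsights.mlmodel.quantile_mlpregressor); the hidden layer size, iteration count, and noise level are arbitrary illustration choices.

import numpy as np
from sklearn.neural_network import MLPRegressor
from mlinsights.mlmodel import QuantileMLPRegressor  # assumed export path

# Synthetic data with asymmetric (exponential) noise: median and mean differ.
rng = np.random.RandomState(0)
X = rng.uniform(0, 1, size=(500, 1))
y = 3 * X.ravel() + rng.exponential(0.5, size=500)

# L1-trained network (this module): estimates the conditional median.
qreg = QuantileMLPRegressor(hidden_layer_sizes=(30,), max_iter=500, random_state=0)
qreg.fit(X, y)

# L2-trained network (scikit-learn): estimates the conditional mean.
reg = MLPRegressor(hidden_layer_sizes=(30,), max_iter=500, random_state=0)
reg.fit(X, y)

# QuantileMLPRegressor.score returns the mean absolute error (lower is better),
# unlike MLPRegressor.score, which returns R^2.
print("MAE, L1-trained:", qreg.score(X, y))
print("MAE, L2-trained:", np.abs(y - reg.predict(X)).mean())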