Coverage for src/mlstatpy/optim/sgd.py: 82%

132 statements  

coverage.py v7.1.0, created at 2023-02-27 05:59 +0100

1""" 

2@file 

3@brief Implements simple stochastic gradient optimisation. 

4It is inspired from `_stochastic_optimizers.py 

5<https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/ 

6neural_network/_stochastic_optimizers.py>`_. 

7""" 

8import numpy 

9from numpy.core._exceptions import UFuncTypeError 

10 

11 

12class BaseOptimizer: 

13 """ 

14 Base stochastic gradient descent optimizer. 

15 

16 :param coef: array, initial coefficient 

17 :param learning_rate_init: float 

18 The initial learning rate used. It controls the step-size 

19 in updating the weights. 

20 :param min_threshold: coefficients must be higher than *min_thresold* 

21 :param max_threshold: coefficients must be below than *max_thresold* 

22 

23 The class holds the following attributes: 

24 

25 * *learning_rate*: float, the current learning rate 

26 * *coef*: optimized coefficients 

27 * *min_threshold*, *max_threshold*: coefficients thresholds 

28 * *l2*: L2 regularization 

29 * *l1*: L1 regularization 
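
    A subclass only needs to implement :meth:`_get_updates`.
    A minimal sketch (``PlainGradientDescent`` is only an illustration,
    it is not part of this module, and the import assumes the module is
    used as ``mlstatpy.optim.sgd``):

    ::

        import numpy
        from mlstatpy.optim.sgd import BaseOptimizer


        class PlainGradientDescent(BaseOptimizer):
            "Steps in the direction opposite to the gradient."

            def _get_updates(self, grad):
                # value added to the coefficients by update_coef
                return -self.learning_rate * grad


        opt = PlainGradientDescent(numpy.zeros(3), learning_rate_init=0.05)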

    """

    def __init__(self, coef, learning_rate_init=0.1,
                 min_threshold=None, max_threshold=None,
                 l1=0., l2=0.):
        if not isinstance(coef, numpy.ndarray):
            raise TypeError("coef must be an array.")
        self.coef = coef
        self.learning_rate_init = learning_rate_init
        self.learning_rate = float(learning_rate_init)
        if (min_threshold is not None and
                not isinstance(min_threshold, (float, numpy.float64, numpy.float32))):
            raise TypeError("min_threshold must be a float")
        if (max_threshold is not None and
                not isinstance(max_threshold, (float, numpy.float64, numpy.float32))):
            raise TypeError("max_threshold must be a float")
        self.min_threshold = min_threshold
        self.max_threshold = max_threshold
        self.l1 = l1
        self.l2 = l2

    def _get_updates(self, grad):
        raise NotImplementedError("Must be overwritten.")  # pragma: no cover

    def update_coef(self, grad):
        """
        Updates coefficients with a given gradient.

        :param grad: array, gradient
        """
        if self.coef.shape != grad.shape:
            raise ValueError(
                "coef and grad must have the same shape coef {} != gradient {}."
                "".format(self.coef.shape, grad.shape))
        update = self._get_updates(grad)
        self.coef += update
        if self.min_threshold is not None:
            try:
                self.coef = numpy.maximum(self.coef, self.min_threshold)
            except UFuncTypeError:  # pragma: no cover
                raise RuntimeError(  # pylint: disable=W0707
                    "Unable to compute a lower bound with coef={} "
                    "min_threshold={}".format(self.coef, self.min_threshold))
        if self.max_threshold is not None:
            try:
                self.coef = numpy.minimum(self.coef, self.max_threshold)
            except UFuncTypeError:  # pragma: no cover
                raise RuntimeError(  # pylint: disable=W0707
                    "Unable to compute an upper bound with coef={} "
                    "max_threshold={}".format(self.coef, self.max_threshold))

    def iteration_ends(self, time_step):
        """
        Performs updates to the learning rate and potentially other states
        at the end of an iteration.
        """
        pass  # pragma: no cover

    def train(self, X, y, fct_loss, fct_grad, max_iter=100,
              early_th=None, verbose=False):
        """
        Optimizes the coefficients.

        :param X: dataset (array)
        :param y: expected target
        :param fct_loss: loss function, signature: `f(coef, X, y) -> float`
        :param fct_grad: gradient function,
            signature: `g(coef, x, y, i) -> array`
        :param max_iter: maximum number of iterations
        :param early_th: stops the training if the error goes below
            this threshold
        :param verbose: display information
        :return: loss

        The method keeps the best coefficients for the
        minimal loss.
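
        For instance, for a least squares problem, the two functions could
        look like the following sketch (``fct_loss`` and ``fct_grad`` below
        are only illustrations, any pair with these signatures works):

        ::

            import numpy

            def fct_loss(coef, X, y):
                # loss over the whole dataset
                return numpy.linalg.norm(X @ coef - y) ** 2

            def fct_grad(coef, x, y, i=0):
                # gradient contributed by a single sample x (one row of X)
                return 2 * x * (x @ coef - y)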

        """
        if not isinstance(X, numpy.ndarray):
            raise TypeError("X must be an array.")
        if not isinstance(y, numpy.ndarray):
            raise TypeError("y must be an array.")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must have the same number of rows.")
        if any(numpy.isnan(X.ravel())):
            raise ValueError("X contains nan values.")
        if any(numpy.isnan(y.ravel())):
            raise ValueError("y contains nan values.")

        loss = fct_loss(self.coef, X, y)
        losses = [loss]
        best_coef = None
        best_loss = None
        if verbose:
            self._display_progress(0, max_iter, loss)
        n_samples = 0
        for it in range(max_iter):
            irows = numpy.random.choice(X.shape[0], X.shape[0])
            for irow in irows:
                grad = fct_grad(self.coef, X[irow, :], y[irow], irow)
                self._regularize_gradient(grad)
                if isinstance(verbose, int) and verbose >= 10:
                    self._display_progress(0, max_iter, loss, grad, 'grad')
                if numpy.isnan(grad).sum() > 0:
                    raise RuntimeError(  # pragma: no cover
                        "The gradient has nan values.")
                self.update_coef(grad)
                n_samples += 1

            self.iteration_ends(n_samples)
            loss = fct_loss(self.coef, X, y) + \
                self.loss_regularization(self.coef)
            if verbose:
                self._display_progress(it + 1, max_iter, loss)
            self.iter_ = it + 1
            losses.append(loss)
            if best_loss is None or loss < best_loss:
                best_loss = loss
                best_coef = self.coef.copy()
            if self._evaluate_early_stopping(
                    it, max_iter, losses, early_th, verbose=verbose):
                break
        self.coef = best_coef
        return best_loss

    def loss_regularization(self, coef):
        """
        Returns the regularization term
        `l1 * sum(|coef|) + l2 * sum(coef ** 2)` added to the loss.
        """
        loss = 0
        if self.l1 > 0:
            loss += numpy.sum(numpy.abs(coef)) * self.l1
        if self.l2 > 0:
            loss += numpy.sum(coef ** 2) * self.l2
        return loss

    def _regularize_gradient(self, grad):
        """
        Adds the regularization terms to the gradient *grad* (in place).
        """
        self.velocity_grad = grad
        if self.l2 > 0:
            grad += self.coef * self.l2
        if self.l1 > 0:
            grad += numpy.sign(self.coef) * self.l1

    def _evaluate_early_stopping(
            self, it, max_iter, losses, early_th, verbose=False):
        if len(losses) < 5 or early_th is None:
            return False
        if numpy.isnan(losses[-5]):
            if numpy.isnan(losses[-1]):
                if verbose:  # pragma: no cover
                    self._display_progress(it + 1, max_iter, losses[-1],
                                           losses=losses[-5:])
                return True
            return False
        if numpy.isnan(losses[-1]):
            if verbose:  # pragma: no cover
                self._display_progress(it + 1, max_iter, losses[-1],
                                       losses=losses[-5:])
            return True
        if abs(losses[-1] - losses[-5]) <= early_th:
            if verbose:  # pragma: no cover
                self._display_progress(it + 1, max_iter, losses[-1],
                                       losses=losses[-5:])
            return True
        return False

    def _display_progress(self, it, max_iter, loss, losses=None, msg=None):
        'Displays training progress.'
        mxc = numpy.abs(self.coef.ravel()).max()
        l1 = numpy.sum(numpy.abs(self.coef))
        l2 = numpy.sum(self.coef * self.coef)
        vl1 = numpy.sum(numpy.abs(self.velocity_grad))
        vl2 = numpy.sum(self.velocity_grad * self.velocity_grad)
        if losses is None:
            print('{}/{}: loss: {:1.4g} max(coef): {:1.2g} '
                  'l1={:1.2g}/{:1.2g} l2={:1.2g}/{:1.2g}'.format(
                      it, max_iter, loss, mxc, vl1, l1, vl2, l2))
        else:
            print('{}/{}: loss: {:1.4g} losses: {} max(coef): {:1.4g} '
                  'l1={:1.2g}/{:1.2g} l2={:1.2g}/{:1.2g}'.format(
                      it, max_iter, loss, losses, mxc, vl1, l1, vl2, l2))


class SGDOptimizer(BaseOptimizer):
    """
    Stochastic gradient descent optimizer with momentum.

    :param coef: array, initial coefficients
    :param learning_rate_init: float
        The initial learning rate used. It controls the step-size
        in updating the weights.
    :param lr_schedule: `{'constant', 'invscaling'}`,
        learning rate schedule for weight updates,
        `'constant'` for a constant learning rate given by
        *learning_rate_init*, `'invscaling'` gradually decreases
        the learning rate *learning_rate_* at each time step *t*
        using an inverse scaling exponent of *power_t*:
        `learning_rate_ = learning_rate_init / pow(t, power_t)`.
        The `'adaptive'` schedule of *scikit-learn*, which keeps the
        learning rate constant as long as the training loss keeps
        decreasing, is not implemented here and raises an exception.
    :param momentum: float
        Value of momentum used, must be larger than or equal to 0.
    :param power_t: double
        The exponent for inverse scaling learning rate.
    :param early_th: stops if the error goes below that threshold
    :param min_threshold: lower bound for parameters (can be None)
    :param max_threshold: upper bound for parameters (can be None)
    :param l1: L1 regularization
    :param l2: L2 regularization

    The class holds the following attributes:

    * *learning_rate*: float, the current learning rate
    * *velocity*: array, velocities used to update the coefficients

    .. exref::
        :title: Stochastic Gradient Descent applied to linear regression

        The following example shows how to optimize a simple linear regression.

        .. runpython::
            :showcode:

            import numpy
            from mlstatpy.optim import SGDOptimizer


            def fct_loss(c, X, y):
                return numpy.linalg.norm(X @ c - y) ** 2


            def fct_grad(c, x, y, i=0):
                return x * (x @ c - y) * 0.1


            coef = numpy.array([0.5, 0.6, -0.7])
            X = numpy.random.randn(10, 3)
            y = X @ coef

            sgd = SGDOptimizer(numpy.random.randn(3))
            sgd.train(X, y, fct_loss, fct_grad, max_iter=15, verbose=True)
            print('optimized coefficients:', sgd.coef)
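
    Bounds on the coefficients and regularization can be specified through
    the constructor. A short sketch (the values below are arbitrary, only
    meant to show the parameters):

    ::

        import numpy
        from mlstatpy.optim import SGDOptimizer

        sgd = SGDOptimizer(numpy.random.randn(3),
                           min_threshold=-1.0, max_threshold=1.0,
                           l2=0.01)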

    """

    def __init__(self, coef, learning_rate_init=0.1, lr_schedule='invscaling',
                 momentum=0.9, power_t=0.5, early_th=None,
                 min_threshold=None, max_threshold=None,
                 l1=0., l2=0.):
        super().__init__(coef, learning_rate_init,
                         min_threshold=min_threshold,
                         max_threshold=max_threshold,
                         l1=l1, l2=l2)
        self.lr_schedule = lr_schedule
        self.momentum = momentum
        self.power_t = power_t
        self.early_th = early_th
        self.velocity = numpy.zeros_like(coef)
        self.velocity_grad = numpy.zeros_like(coef)

    def iteration_ends(self, time_step):
        """
        Performs updates to the learning rate and potentially other states
        at the end of an iteration.

        :param time_step: int
            number of training samples trained on so far, used to update
            the learning rate for 'invscaling'
        """
        if self.lr_schedule == 'invscaling':
            self.learning_rate = (float(self.learning_rate_init) /
                                  (time_step + 1) ** self.power_t)
        elif self.lr_schedule == 'constant':
            pass
        else:
            raise ValueError(  # pragma: no cover
                f"Unexpected value: lr_schedule='{self.lr_schedule}'.")

    def _get_updates(self, grad):
        """
        Gets the values used to update the parameters with the given gradient.

        :param grad: array, gradient
        :return: updates, array, the values to add to the parameters
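
        The update follows the momentum rule
        ``velocity = momentum * velocity - learning_rate * grad``.
        A small numeric sketch (the values below are only an illustration):

        ::

            import numpy

            velocity = numpy.array([0., 0.])
            grad = numpy.array([1., 2.])
            # with momentum=0.9 and learning_rate=0.1
            update = 0.9 * velocity - 0.1 * grad  # -> [-0.1, -0.2]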

        """
        update = self.momentum * self.velocity - self.learning_rate * grad
        self.velocity = update
        return update

    def _display_progress(self, it, max_iter, loss, losses=None, msg='loss'):
        'Displays training progress.'
        mxc = numpy.abs(self.coef.ravel()).max()
        l1 = numpy.sum(numpy.abs(self.coef))
        l2 = numpy.sum(self.coef * self.coef)
        vl1 = numpy.sum(numpy.abs(self.velocity_grad))
        vl2 = numpy.sum(self.velocity_grad * self.velocity_grad)
        if losses is None:
            print('{}/{}: {}: {:1.4g} lr={:1.3g} max(coef): {:1.2g} '
                  'l1={:1.2g}/{:1.2g} l2={:1.2g}/{:1.2g}'.format(
                      it, max_iter, msg, loss, self.learning_rate, mxc,
                      vl1, l1, vl2, l2))
        else:
            print('{}/{}: {}: {:1.4g} lr={:1.3g} {}es: {} '  # pylint: disable=W1308
                  'max(coef): {:1.2g} l1={:1.2g}/{:1.2g} l2={:1.2g}/{:1.2g}'.format(  # pylint: disable=W1308
                      it, max_iter, msg, loss, self.learning_rate, msg, losses,
                      mxc, vl1, l1, vl2, l2))