Coverage for src/mlstatpy/optim/sgd.py: 82%

132 statements  

coverage.py v7.1.0, created at 2023-02-27 05:59 +0100

1""" 

2@file 

3@brief Implements simple stochastic gradient optimisation. 

4It is inspired from `_stochastic_optimizers.py 

5<https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/ 

6neural_network/_stochastic_optimizers.py>`_. 

7""" 

8import numpy 

9from numpy.core._exceptions import UFuncTypeError 

10 

11 

12class BaseOptimizer: 

13 """ 

14 Base stochastic gradient descent optimizer. 

15 

16 :param coef: array, initial coefficient 

17 :param learning_rate_init: float 

18 The initial learning rate used. It controls the step-size 

19 in updating the weights. 

20 :param min_threshold: coefficients must be higher than *min_thresold* 

21 :param max_threshold: coefficients must be below than *max_thresold* 

22 

23 The class holds the following attributes: 

24 

25 * *learning_rate*: float, the current learning rate 

26 * *coef*: optimized coefficients 

27 * *min_threshold*, *max_threshold*: coefficients thresholds 

28 * *l2*: L2 regularization 

29 * *l1*: L1 regularization 
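
    A subclass only needs to implement :meth:`_get_updates`.
    A minimal sketch (``PlainGradientDescent`` is only an illustration,
    it is not part of this module, and the import assumes the module is
    used as ``mlstatpy.optim.sgd``):

    ::

        import numpy
        from mlstatpy.optim.sgd import BaseOptimizer


        class PlainGradientDescent(BaseOptimizer):
            "Steps in the direction opposite to the gradient."

            def _get_updates(self, grad):
                # value added to the coefficients by update_coef
                return -self.learning_rate * grad


        opt = PlainGradientDescent(numpy.zeros(3), learning_rate_init=0.05)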

    """

    def __init__(self, coef, learning_rate_init=0.1,
                 min_threshold=None, max_threshold=None,
                 l1=0., l2=0.):
        if not isinstance(coef, numpy.ndarray):
            raise TypeError("coef must be an array.")
        self.coef = coef
        self.learning_rate_init = learning_rate_init
        self.learning_rate = float(learning_rate_init)
        if (min_threshold is not None and
                not isinstance(min_threshold, (float, numpy.float64, numpy.float32))):
            raise TypeError("min_threshold must be a float")
        if (max_threshold is not None and
                not isinstance(max_threshold, (float, numpy.float64, numpy.float32))):
            raise TypeError("max_threshold must be a float")
        self.min_threshold = min_threshold
        self.max_threshold = max_threshold
        self.l1 = l1
        self.l2 = l2

    def _get_updates(self, grad):
        raise NotImplementedError("Must be overwritten.")  # pragma: no cover

    def update_coef(self, grad):
        """
        Updates coefficients with a given gradient.

        :param grad: array, gradient
        """
        if self.coef.shape != grad.shape:
            raise ValueError(
                "coef and grad must have the same shape coef {} != gradient {}."
                "".format(self.coef.shape, grad.shape))
        update = self._get_updates(grad)
        self.coef += update
        if self.min_threshold is not None:
            try:
                self.coef = numpy.maximum(self.coef, self.min_threshold)
            except UFuncTypeError:  # pragma: no cover
                raise RuntimeError(  # pylint: disable=W0707
                    "Unable to compute a lower bound with coef={} "
                    "min_threshold={}".format(self.coef, self.min_threshold))
        if self.max_threshold is not None:
            try:
                self.coef = numpy.minimum(self.coef, self.max_threshold)
            except UFuncTypeError:  # pragma: no cover
                raise RuntimeError(  # pylint: disable=W0707
                    "Unable to compute an upper bound with coef={} "
                    "max_threshold={}".format(self.coef, self.max_threshold))

    def iteration_ends(self, time_step):
        """
        Performs updates to the learning rate and potentially other states
        at the end of an iteration.
        """
        pass  # pragma: no cover

    def train(self, X, y, fct_loss, fct_grad, max_iter=100,
              early_th=None, verbose=False):
        """
        Optimizes the coefficients.

        :param X: dataset (array)
        :param y: expected target
        :param fct_loss: loss function, signature: `f(coef, X, y) -> float`
        :param fct_grad: gradient function,
            signature: `g(coef, x, y, i) -> array`
        :param max_iter: maximum number of iterations
        :param early_th: stops the training if the error goes below
            this threshold
        :param verbose: display information
        :return: loss

        The method keeps the best coefficients for the
        minimal loss.
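
        For instance, for a least squares problem, the two functions could
        look like the following sketch (``fct_loss`` and ``fct_grad`` below
        are only illustrations, any pair with these signatures works):

        ::

            import numpy

            def fct_loss(coef, X, y):
                # loss over the whole dataset
                return numpy.linalg.norm(X @ coef - y) ** 2

            def fct_grad(coef, x, y, i=0):
                # gradient contributed by a single sample x (one row of X)
                return 2 * x * (x @ coef - y)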

        """
        if not isinstance(X, numpy.ndarray):
            raise TypeError("X must be an array.")
        if not isinstance(y, numpy.ndarray):
            raise TypeError("y must be an array.")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must have the same number of rows.")
        if any(numpy.isnan(X.ravel())):
            raise ValueError("X contains nan values.")
        if any(numpy.isnan(y.ravel())):
            raise ValueError("y contains nan values.")

        loss = fct_loss(self.coef, X, y)
        losses = [loss]
        best_coef = None
        best_loss = None
        if verbose:
            self._display_progress(0, max_iter, loss)
        n_samples = 0
        for it in range(max_iter):
            irows = numpy.random.choice(X.shape[0], X.shape[0])
            for irow in irows:
                grad = fct_grad(self.coef, X[irow, :], y[irow], irow)
                self._regularize_gradient(grad)
                if isinstance(verbose, int) and verbose >= 10:
                    self._display_progress(0, max_iter, loss, grad, 'grad')
                if numpy.isnan(grad).sum() > 0:
                    raise RuntimeError(  # pragma: no cover
                        "The gradient has nan values.")
                self.update_coef(grad)
                n_samples += 1

            self.iteration_ends(n_samples)
            loss = fct_loss(self.coef, X, y) + \
                self.loss_regularization(self.coef)
            if verbose:
                self._display_progress(it + 1, max_iter, loss)
            self.iter_ = it + 1
            losses.append(loss)
            if best_loss is None or loss < best_loss:
                best_loss = loss
                best_coef = self.coef.copy()
            if self._evaluate_early_stopping(
                    it, max_iter, losses, early_th, verbose=verbose):
                break
        self.coef = best_coef
        return best_loss

    def loss_regularization(self, coef):
        """
        Returns the regularization term
        `l1 * sum(|coef|) + l2 * sum(coef ** 2)` added to the loss.
        """
        loss = 0
        if self.l1 > 0:
            loss += numpy.sum(numpy.abs(coef)) * self.l1
        if self.l2 > 0:
            loss += numpy.sum(coef ** 2) * self.l2
        return loss

    def _regularize_gradient(self, grad):
        """
        Adds the regularization terms to the gradient *grad* (in place).
        """
        self.velocity_grad = grad
        if self.l2 > 0:
            grad += self.coef * self.l2
        if self.l1 > 0:
            grad += numpy.sign(self.coef) * self.l1

    def _evaluate_early_stopping(
            self, it, max_iter, losses, early_th, verbose=False):
        if len(losses) < 5 or early_th is None:
            return False
        if numpy.isnan(losses[-5]):
            if numpy.isnan(losses[-1]):
                if verbose:  # pragma: no cover
                    self._display_progress(it + 1, max_iter, losses[-1],
                                           losses=losses[-5:])
                return True
            return False
        if numpy.isnan(losses[-1]):
            if verbose:  # pragma: no cover
                self._display_progress(it + 1, max_iter, losses[-1],
                                       losses=losses[-5:])
            return True
        if abs(losses[-1] - losses[-5]) <= early_th:
            if verbose:  # pragma: no cover
                self._display_progress(it + 1, max_iter, losses[-1],
                                       losses=losses[-5:])
            return True
        return False

    def _display_progress(self, it, max_iter, loss, losses=None, msg=None):
        'Displays training progress.'
        mxc = numpy.abs(self.coef.ravel()).max()
        l1 = numpy.sum(numpy.abs(self.coef))
        l2 = numpy.sum(self.coef * self.coef)
        vl1 = numpy.sum(numpy.abs(self.velocity_grad))
        vl2 = numpy.sum(self.velocity_grad * self.velocity_grad)
        if losses is None:
            print('{}/{}: loss: {:1.4g} max(coef): {:1.2g} '
                  'l1={:1.2g}/{:1.2g} l2={:1.2g}/{:1.2g}'.format(
                      it, max_iter, loss, mxc, vl1, l1, vl2, l2))
        else:
            print('{}/{}: loss: {:1.4g} losses: {} max(coef): {:1.4g} '
                  'l1={:1.2g}/{:1.2g} l2={:1.2g}/{:1.2g}'.format(
                      it, max_iter, loss, losses, mxc, vl1, l1, vl2, l2))


class SGDOptimizer(BaseOptimizer):
    """
    Stochastic gradient descent optimizer with momentum.

    :param coef: array, initial coefficients
    :param learning_rate_init: float
        The initial learning rate used. It controls the step-size
        in updating the weights.
    :param lr_schedule: `{'constant', 'invscaling'}`,
        learning rate schedule for weight updates,
        `'constant'` for a constant learning rate given by
        *learning_rate_init*, `'invscaling'` gradually decreases
        the learning rate *learning_rate_* at each time step *t*
        using an inverse scaling exponent of *power_t*:
        `learning_rate_ = learning_rate_init / pow(t, power_t)`.
        The `'adaptive'` schedule of *scikit-learn*, which keeps the
        learning rate constant as long as the training loss keeps
        decreasing, is not implemented here and raises an exception.
    :param momentum: float
        Value of momentum used, must be larger than or equal to 0.
    :param power_t: double
        The exponent for inverse scaling learning rate.
    :param early_th: stops if the error goes below that threshold
    :param min_threshold: lower bound for parameters (can be None)
    :param max_threshold: upper bound for parameters (can be None)
    :param l1: L1 regularization
    :param l2: L2 regularization

    The class holds the following attributes:

    * *learning_rate*: float, the current learning rate
    * *velocity*: array, velocities used to update the coefficients

    .. exref::
        :title: Stochastic Gradient Descent applied to linear regression

        The following example shows how to optimize a simple linear regression.

        .. runpython::
            :showcode:

            import numpy
            from mlstatpy.optim import SGDOptimizer


            def fct_loss(c, X, y):
                return numpy.linalg.norm(X @ c - y) ** 2


            def fct_grad(c, x, y, i=0):
                return x * (x @ c - y) * 0.1


            coef = numpy.array([0.5, 0.6, -0.7])
            X = numpy.random.randn(10, 3)
            y = X @ coef

            sgd = SGDOptimizer(numpy.random.randn(3))
            sgd.train(X, y, fct_loss, fct_grad, max_iter=15, verbose=True)
            print('optimized coefficients:', sgd.coef)
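
    Bounds on the coefficients and regularization can be specified through
    the constructor. A short sketch (the values below are arbitrary, only
    meant to show the parameters):

    ::

        import numpy
        from mlstatpy.optim import SGDOptimizer

        sgd = SGDOptimizer(numpy.random.randn(3),
                           min_threshold=-1.0, max_threshold=1.0,
                           l2=0.01)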

    """

    def __init__(self, coef, learning_rate_init=0.1, lr_schedule='invscaling',
                 momentum=0.9, power_t=0.5, early_th=None,
                 min_threshold=None, max_threshold=None,
                 l1=0., l2=0.):
        super().__init__(coef, learning_rate_init,
                         min_threshold=min_threshold,
                         max_threshold=max_threshold,
                         l1=l1, l2=l2)
        self.lr_schedule = lr_schedule
        self.momentum = momentum
        self.power_t = power_t
        self.early_th = early_th
        self.velocity = numpy.zeros_like(coef)
        self.velocity_grad = numpy.zeros_like(coef)

    def iteration_ends(self, time_step):
        """
        Performs updates to the learning rate and potentially other states
        at the end of an iteration.

        :param time_step: int
            number of training samples trained on so far, used to update
            the learning rate for 'invscaling'
        """
        if self.lr_schedule == 'invscaling':
            self.learning_rate = (float(self.learning_rate_init) /
                                  (time_step + 1) ** self.power_t)
        elif self.lr_schedule == 'constant':
            pass
        else:
            raise ValueError(  # pragma: no cover
                f"Unexpected value: lr_schedule='{self.lr_schedule}'.")

    def _get_updates(self, grad):
        """
        Gets the values used to update the parameters with the given gradient.

        :param grad: array, gradient
        :return: updates, array, the values to add to the parameters
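
        The update follows the momentum rule
        ``velocity = momentum * velocity - learning_rate * grad``.
        A small numeric sketch (the values below are only an illustration):

        ::

            import numpy

            velocity = numpy.array([0., 0.])
            grad = numpy.array([1., 2.])
            # with momentum=0.9 and learning_rate=0.1
            update = 0.9 * velocity - 0.1 * grad  # -> [-0.1, -0.2]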

        """
        update = self.momentum * self.velocity - self.learning_rate * grad
        self.velocity = update
        return update

    def _display_progress(self, it, max_iter, loss, losses=None, msg='loss'):
        'Displays training progress.'
        mxc = numpy.abs(self.coef.ravel()).max()
        l1 = numpy.sum(numpy.abs(self.coef))
        l2 = numpy.sum(self.coef * self.coef)
        vl1 = numpy.sum(numpy.abs(self.velocity_grad))
        vl2 = numpy.sum(self.velocity_grad * self.velocity_grad)
        if losses is None:
            print('{}/{}: {}: {:1.4g} lr={:1.3g} max(coef): {:1.2g} '
                  'l1={:1.2g}/{:1.2g} l2={:1.2g}/{:1.2g}'.format(
                      it, max_iter, msg, loss, self.learning_rate, mxc,
                      vl1, l1, vl2, l2))
        else:
            print('{}/{}: {}: {:1.4g} lr={:1.3g} {}es: {} '  # pylint: disable=W1308
                  'max(coef): {:1.2g} l1={:1.2g}/{:1.2g} l2={:1.2g}/{:1.2g}'.format(  # pylint: disable=W1308
                      it, max_iter, msg, loss, self.learning_rate, msg, losses,
                      mxc, vl1, l1, vl2, l2))