Coverage for src/mlstatpy/optim/sgd.py: 82% (132 statements)
coverage.py v7.1.0, created at 2023-02-27 05:59 +0100
1"""
2@file
3@brief Implements simple stochastic gradient optimisation.
4It is inspired from `_stochastic_optimizers.py
5<https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/
6neural_network/_stochastic_optimizers.py>`_.
7"""
8import numpy
9from numpy.core._exceptions import UFuncTypeError


class BaseOptimizer:
    """
    Base stochastic gradient descent optimizer.

    :param coef: array, initial coefficients
    :param learning_rate_init: float
        The initial learning rate used. It controls the step size
        used when updating the weights.
    :param min_threshold: coefficients must be higher than *min_threshold*
    :param max_threshold: coefficients must be lower than *max_threshold*

    The class holds the following attributes:

    * *learning_rate*: float, the current learning rate
    * *coef*: optimized coefficients
    * *min_threshold*, *max_threshold*: coefficient thresholds
    * *l2*: L2 regularization
    * *l1*: L1 regularization
    """

    def __init__(self, coef, learning_rate_init=0.1,
                 min_threshold=None, max_threshold=None,
                 l1=0., l2=0.):
        if not isinstance(coef, numpy.ndarray):
            raise TypeError("coef must be an array.")
        self.coef = coef
        self.learning_rate_init = learning_rate_init
        self.learning_rate = float(learning_rate_init)
        if (min_threshold is not None and
                not isinstance(min_threshold,
                               (float, numpy.float64, numpy.float32))):
            raise TypeError("min_threshold must be a float")
        if (max_threshold is not None and
                not isinstance(max_threshold,
                               (float, numpy.float64, numpy.float32))):
            raise TypeError("max_threshold must be a float")
        self.min_threshold = min_threshold
        self.max_threshold = max_threshold
        self.l1 = l1
        self.l2 = l2

    def _get_updates(self, grad):
        raise NotImplementedError("Must be overridden.")  # pragma: no cover

    def update_coef(self, grad):
        """
        Updates coefficients with a given gradient.

        :param grad: array, gradient
        """
        if self.coef.shape != grad.shape:
            raise ValueError(
                "coef and grad must have the same shape, coef {} != gradient {}."
                "".format(self.coef.shape, grad.shape))
        update = self._get_updates(grad)
        self.coef += update
        if self.min_threshold is not None:
            try:
                self.coef = numpy.maximum(self.coef, self.min_threshold)
            except UFuncTypeError:  # pragma: no cover
                raise RuntimeError(  # pylint: disable=W0707
                    "Unable to compute a lower bound with coef={} "
                    "min_threshold={}".format(self.coef, self.min_threshold))
        if self.max_threshold is not None:
            try:
                self.coef = numpy.minimum(self.coef, self.max_threshold)
            except UFuncTypeError:  # pragma: no cover
                raise RuntimeError(  # pylint: disable=W0707
                    "Unable to compute an upper bound with coef={} "
                    "max_threshold={}".format(self.coef, self.max_threshold))
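
    # A minimal sketch of the clamping above, with hypothetical values:
    #   coef = [-2., 0.5, 3.], min_threshold = 0., max_threshold = 1.
    #   numpy.maximum(coef, 0.) -> [0., 0.5, 3.]  (enforces the lower bound)
    #   numpy.minimum(..., 1.)  -> [0., 0.5, 1.]  (enforces the upper bound)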

    def iteration_ends(self, time_step):
        """
        Performs updates to the learning rate and potentially other states
        at the end of an iteration.
        """
        pass  # pragma: no cover

    def train(self, X, y, fct_loss, fct_grad, max_iter=100,
              early_th=None, verbose=False):
        """
        Optimizes the coefficients.

        :param X: dataset (array)
        :param y: expected target
        :param fct_loss: loss function, signature: `f(coef, X, y) -> float`
        :param fct_grad: gradient function,
            signature: `g(coef, x, y, i) -> array`
        :param max_iter: maximum number of iterations
        :param early_th: stops the training if the error goes below
            this threshold
        :param verbose: display information
        :return: loss

        The method keeps the best coefficients for the
        minimal loss.
        """
        if not isinstance(X, numpy.ndarray):
            raise TypeError("X must be an array.")
        if not isinstance(y, numpy.ndarray):
            raise TypeError("y must be an array.")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must have the same number of rows.")
        if numpy.isnan(X).any():
            raise ValueError("X contains nan values.")
        if numpy.isnan(y).any():
            raise ValueError("y contains nan values.")

        loss = fct_loss(self.coef, X, y)
        losses = [loss]
        best_coef = None
        best_loss = None
        if verbose:
            self._display_progress(0, max_iter, loss)
        n_samples = 0
        for it in range(max_iter):
            # one epoch of row indices, sampled with replacement
            irows = numpy.random.choice(X.shape[0], X.shape[0])
            for irow in irows:
                grad = fct_grad(self.coef, X[irow, :], y[irow], irow)
                self._regularize_gradient(grad)
                if isinstance(verbose, int) and verbose >= 10:
                    self._display_progress(0, max_iter, loss, grad, 'grad')
                if numpy.isnan(grad).sum() > 0:
                    raise RuntimeError(  # pragma: no cover
                        "The gradient has nan values.")
                self.update_coef(grad)
                n_samples += 1

            self.iteration_ends(n_samples)
            loss = fct_loss(self.coef, X, y) + \
                self.loss_regularization(self.coef)
            if verbose:
                self._display_progress(it + 1, max_iter, loss)
            self.iter_ = it + 1
            losses.append(loss)
            if best_loss is None or loss < best_loss:
                best_loss = loss
                best_coef = self.coef.copy()
            if self._evaluate_early_stopping(
                    it, max_iter, losses, early_th, verbose=verbose):
                break
        self.coef = best_coef
        return best_loss

    def loss_regularization(self, coef):
        """
        Computes the regularization part of the loss:
        ``l1 * sum(abs(coef)) + l2 * sum(coef ** 2)``.
        """
        loss = 0
        if self.l1 > 0:
            loss += numpy.sum(numpy.abs(coef)) * self.l1
        if self.l2 > 0:
            loss += numpy.sum(coef ** 2) * self.l2
        return loss
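
    # Worked example with hypothetical values l1=0.1, l2=0.01 and
    # coef=[1., -2.]: regularization = 0.1 * (1 + 2) + 0.01 * (1 + 4) = 0.35.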

    def _regularize_gradient(self, grad):
        """
        Adds the gradient of the regularization terms to *grad*,
        modified in place.
        """
        self.velocity_grad = grad
        if self.l2 > 0:
            grad += self.coef * self.l2
        if self.l1 > 0:
            grad += numpy.sign(self.coef) * self.l1
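
    # Note on the convention used above: the L2 term added to the gradient
    # is l2 * coef; the exact derivative of the penalty l2 * sum(coef ** 2)
    # computed in loss_regularization would be 2 * l2 * coef.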

    def _evaluate_early_stopping(self, it, max_iter, losses, early_th,
                                 verbose=False):
        """
        Returns True if the training should stop: the loss became nan
        or it moved by no more than *early_th* across the five most
        recent values.
        """
        if len(losses) < 5 or early_th is None:
            return False
        if numpy.isnan(losses[-5]):
            if numpy.isnan(losses[-1]):
                if verbose:  # pragma: no cover
                    self._display_progress(it + 1, max_iter, losses[-1],
                                           losses=losses[-5:])
                return True
            return False
        if numpy.isnan(losses[-1]):
            if verbose:  # pragma: no cover
                self._display_progress(it + 1, max_iter, losses[-1],
                                       losses=losses[-5:])
            return True
        if abs(losses[-1] - losses[-5]) <= early_th:
            if verbose:  # pragma: no cover
                self._display_progress(it + 1, max_iter, losses[-1],
                                       losses=losses[-5:])
            return True
        return False
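
    # Hypothetical example: with early_th=1e-3 and the last five losses
    # [0.5021, 0.5019, 0.5018, 0.5018, 0.5017],
    # abs(0.5017 - 0.5021) = 4e-4 <= 1e-3, so the training stops.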

    def _display_progress(self, it, max_iter, loss, losses=None, msg=None):
        'Displays training progress.'
        mxc = numpy.abs(self.coef.ravel()).max()
        l1 = numpy.sum(numpy.abs(self.coef))
        l2 = numpy.sum(self.coef * self.coef)
        vl1 = numpy.sum(numpy.abs(self.velocity_grad))
        vl2 = numpy.sum(self.velocity_grad * self.velocity_grad)
        if losses is None:
            print('{}/{}: loss: {:1.4g} max(coef): {:1.2g} '
                  'l1={:1.2g}/{:1.2g} l2={:1.2g}/{:1.2g}'.format(
                      it, max_iter, loss, mxc, vl1, l1, vl2, l2))
        else:
            print('{}/{}: loss: {:1.4g} losses: {} max(coef): {:1.4g} '
                  'l1={:1.2g}/{:1.2g} l2={:1.2g}/{:1.2g}'.format(
                      it, max_iter, loss, losses, mxc, vl1, l1, vl2, l2))


class SGDOptimizer(BaseOptimizer):
    """
    Stochastic gradient descent optimizer with momentum.

    :param coef: array, initial coefficients
    :param learning_rate_init: float
        The initial learning rate used. It controls the step size
        used when updating the weights.
    :param lr_schedule: `{'constant', 'adaptive', 'invscaling'}`,
        learning rate schedule for weight updates.
        `'constant'` keeps the learning rate constant, given by
        *learning_rate_init*. `'invscaling'` gradually decreases
        the learning rate *learning_rate_* at each time step *t*
        using an inverse scaling exponent of *power_t*:
        `learning_rate_ = learning_rate_init / pow(t, power_t)`.
        `'adaptive'` keeps the learning rate constant at
        *learning_rate_init* as long as the training loss keeps
        decreasing. Each time two consecutive epochs fail to decrease
        the training loss by *tol*, or fail to increase the validation
        score by *tol* if *early_stopping* is on, the current learning
        rate is divided by 5. (`'adaptive'` is described here, but
        `iteration_ends` below only implements `'constant'` and
        `'invscaling'`.)
    :param momentum: float
        Value of the momentum term, must be larger than or equal to 0.
    :param power_t: double
        The exponent for the inverse scaling learning rate.
    :param early_th: stops if the error goes below that threshold
    :param min_threshold: lower bound for parameters (can be None)
    :param max_threshold: upper bound for parameters (can be None)
    :param l1: L1 regularization
    :param l2: L2 regularization

    The class holds the following attributes:

    * *learning_rate*: float, the current learning rate
    * *velocity*: array, velocities used to update the parameters

    .. exref::
        :title: Stochastic Gradient Descent applied to linear regression

        The following example shows how to optimize a simple linear regression.

        .. runpython::
            :showcode:

            import numpy
            from mlstatpy.optim import SGDOptimizer


            def fct_loss(c, X, y):
                return numpy.linalg.norm(X @ c - y) ** 2


            def fct_grad(c, x, y, i=0):
                return x * (x @ c - y) * 0.1


            coef = numpy.array([0.5, 0.6, -0.7])
            X = numpy.random.randn(10, 3)
            y = X @ coef

            sgd = SGDOptimizer(numpy.random.randn(3))
            sgd.train(X, y, fct_loss, fct_grad, max_iter=15, verbose=True)
            print('optimized coefficients:', sgd.coef)
    """

    def __init__(self, coef, learning_rate_init=0.1, lr_schedule='invscaling',
                 momentum=0.9, power_t=0.5, early_th=None,
                 min_threshold=None, max_threshold=None,
                 l1=0., l2=0.):
        super().__init__(coef, learning_rate_init,
                         min_threshold=min_threshold,
                         max_threshold=max_threshold,
                         l1=l1, l2=l2)
        self.lr_schedule = lr_schedule
        self.momentum = momentum
        self.power_t = power_t
        self.early_th = early_th
        self.velocity = numpy.zeros_like(coef)
        self.velocity_grad = numpy.zeros_like(coef)

    def iteration_ends(self, time_step):
        """
        Performs updates to the learning rate and potentially other states
        at the end of an iteration.

        :param time_step: int
            number of training samples trained on so far, used to update
            the learning rate for 'invscaling'
        """
        if self.lr_schedule == 'invscaling':
            self.learning_rate = (float(self.learning_rate_init) /
                                  (time_step + 1) ** self.power_t)
        elif self.lr_schedule == 'constant':
            pass
        else:
            raise ValueError(  # pragma: no cover
                f"Unexpected value: lr_schedule='{self.lr_schedule}'.")
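
    # Worked example of the 'invscaling' schedule with hypothetical values
    # learning_rate_init=0.1 and power_t=0.5: after time_step=99 samples,
    # learning_rate = 0.1 / (99 + 1) ** 0.5 = 0.1 / 10 = 0.01.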

    def _get_updates(self, grad):
        """
        Gets the values used to update the parameters with a given gradient.

        :param grad: array, gradient
        :return: updates, array, the values to add to the parameters
        """
        update = self.momentum * self.velocity - self.learning_rate * grad
        self.velocity = update
        return update
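
    # The update above is the classical momentum recurrence:
    #   velocity <- momentum * velocity - learning_rate * grad
    #   coef     <- coef + velocity   (applied by update_coef)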

    def _display_progress(self, it, max_iter, loss, losses=None, msg='loss'):
        'Displays training progress.'
        mxc = numpy.abs(self.coef.ravel()).max()
        l1 = numpy.sum(numpy.abs(self.coef))
        l2 = numpy.sum(self.coef * self.coef)
        vl1 = numpy.sum(numpy.abs(self.velocity_grad))
        vl2 = numpy.sum(self.velocity_grad * self.velocity_grad)
        if losses is None:
            print('{}/{}: {}: {:1.4g} lr={:1.3g} max(coef): {:1.2g} '
                  'l1={:1.2g}/{:1.2g} l2={:1.2g}/{:1.2g}'.format(
                      it, max_iter, msg, loss, self.learning_rate, mxc,
                      vl1, l1, vl2, l2))
        else:
            print('{}/{}: {}: {:1.4g} lr={:1.3g} {}es: {} '  # pylint: disable=W1308
                  'max(coef): {:1.2g} l1={:1.2g}/{:1.2g} l2={:1.2g}/{:1.2g}'.format(
                      it, max_iter, msg, loss, self.learning_rate, msg, losses,
                      mxc, vl1, l1, vl2, l2))
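

# A minimal usage sketch (commented out; values and parameters are
# illustrative, reusing the loss and gradient from the docstring example):
#
#   import numpy
#   from mlstatpy.optim import SGDOptimizer
#
#   def fct_loss(c, X, y):
#       return numpy.linalg.norm(X @ c - y) ** 2
#
#   def fct_grad(c, x, y, i=0):
#       return x * (x @ c - y) * 0.1
#
#   coef = numpy.array([0.5, 0.6, -0.7])
#   X = numpy.random.randn(50, 3)
#   y = X @ coef
#
#   sgd = SGDOptimizer(numpy.random.randn(3), lr_schedule='constant',
#                      l2=0.01, min_threshold=-1., max_threshold=1.)
#   sgd.train(X, y, fct_loss, fct_grad, max_iter=20, early_th=1e-6)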