Coverage for mlinsights/mlmodel/_kmeans_constraint_.py: 96%

329 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-08-09 08:45 +0200

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Implémente la classe @see cl ConstraintKMeans. 

5""" 

6import bisect 

7from collections import Counter 

8from pandas import DataFrame 

9import numpy 

10import scipy.sparse 

11from scipy.spatial.distance import cdist 

12from sklearn.metrics.pairwise import euclidean_distances 

13from sklearn.utils.extmath import row_norms 

14from ._kmeans_022 import ( 

15 _centers_dense, _centers_sparse, _labels_inertia_skl) 

16 

17 

18def linearize_matrix(mat, *adds): 

19 """ 

20 Linearizes a matrix into a new one 

21 with 3 columns value, row, column. 

22 The output format is similar to 

23 :epkg:`csr_matrix` but null values are kept. 

24 

25 @param mat matrix 

26 @param adds additional square matrices 

27 @return new matrix 

28 

29 *adds* defines additional matrices, it adds 

30 columns on the right side and fill them with 

31 the corresponding value taken into the additional 

32 matrices. 

33 """ 

34 if scipy.sparse.issparse(mat): 

35 if isinstance(mat, scipy.sparse.csr_matrix): 

36 max_row = mat.shape[0] 

37 res = numpy.empty((len(mat.data), 3 + len(adds)), dtype=mat.dtype) 

38 row = 0 

39 for i, v in enumerate(mat.data): 

40 while row < max_row and i >= mat.indptr[row]: 

41 row += 1 

42 res[i, 0] = v 

43 a, b = row - 1, mat.indices[i] 

44 res[i, 1] = a 

45 res[i, 2] = b 

46 for k, am in enumerate(adds): 

47 res[i, k + 3] = am[a, b] 

48 return res 

49 raise NotImplementedError( # pragma: no cover 

50 f"This kind of sparse matrix is not handled: {type(mat)}") 

51 else: 

52 n = mat.shape[0] 

53 c = mat.shape[1] 

54 ic = numpy.arange(mat.shape[1]) 

55 res = numpy.empty((n * c, 3 + len(adds)), dtype=mat.dtype) 

56 for i in range(0, n): 

57 a = i * c 

58 b = (i + 1) * c 

59 res[a:b, 1] = i 

60 res[a:b, 2] = ic 

61 res[:, 0] = mat.ravel() 

62 for k, am in enumerate(adds): 

63 res[:, 3 + k] = am.ravel() 

64 return res 

65 

66 

67def constraint_kmeans(X, labels, sample_weight, centers, inertia, 

68 iter, max_iter, # pylint: disable=W0622 

69 strategy='gain', verbose=0, state=None, 

70 learning_rate=1., history=False, fLOG=None): 

71 """ 

72 Completes the constraint :epkg:`k-means`. 

73 

74 @param X features 

75 @param labels initialized labels (unused) 

76 @param sample_weight sample weight 

77 @param centers initialized centers 

78 @param inertia initialized inertia (unused) 

79 @param iter number of iteration already done 

80 @param max_iter maximum of number of iteration 

81 @param strategy strategy used to sort observations before 

82 mapping them to clusters 

83 @param verbose verbose 

84 @param state random state 

85 @param learning_rate used by strategy `'weights'` 

86 @param history return list of centers accross iterations 

87 @param fLOG logging function (needs to be specified otherwise 

88 verbose has no effects) 

89 @return tuple (best_labels, best_centers, best_inertia, 

90 iter, all_centers) 

91 """ 

92 if labels.dtype != numpy.int32: 

93 raise TypeError( # pragma: no cover 

94 f"Labels must be an array of int not '{labels.dtype}'") 

95 

96 if strategy == 'weights': 

97 return _constraint_kmeans_weights( 

98 X, labels, sample_weight, centers, inertia, iter, 

99 max_iter, verbose=verbose, state=state, 

100 learning_rate=learning_rate, history=history, fLOG=fLOG) 

101 else: 

102 if isinstance(X, DataFrame): 

103 X = X.values 

104 x_squared_norms = row_norms(X, squared=True) 

105 counters = numpy.empty((centers.shape[0],), dtype=numpy.int32) 

106 limit = X.shape[0] // centers.shape[0] 

107 leftover = X.shape[0] - limit * centers.shape[0] 

108 leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32) 

109 n_clusters = centers.shape[0] 

110 distances_close = numpy.empty((X.shape[0],), dtype=X.dtype) 

111 best_inertia = None 

112 best_iter = None 

113 all_centers = [] 

114 

115 # association 

116 _constraint_association(leftover, counters, labels, leftclose, distances_close, 

117 centers, X, x_squared_norms, limit, strategy, state=state) 

118 

119 if sample_weight is None: 

120 sw = numpy.ones((X.shape[0],)) 

121 else: 

122 sw = sample_weight 

123 

124 if scipy.sparse.issparse(X): 

125 _centers_fct = _centers_sparse 

126 else: 

127 _centers_fct = _centers_dense 

128 

129 while iter < max_iter: 

130 # compute new clusters 

131 centers = _centers_fct( 

132 X, sw, labels, n_clusters, distances_close) 

133 

134 if history: 

135 all_centers.append(centers) 

136 

137 # association 

138 _constraint_association( 

139 leftover, counters, labels, leftclose, distances_close, 

140 centers, X, x_squared_norms, limit, strategy, state=state) 

141 

142 # inertia 

143 _, inertia = _labels_inertia_skl( 

144 X=X, sample_weight=sw, x_squared_norms=x_squared_norms, 

145 centers=centers, distances=distances_close) 

146 

147 iter += 1 

148 if verbose and fLOG: # pragma: no cover 

149 fLOG("CKMeans %d/%d inertia=%f" % (iter, max_iter, inertia)) 

150 

151 # best option so far? 

152 if best_inertia is None or inertia < best_inertia: 

153 best_inertia = inertia 

154 best_centers = centers.copy() 

155 best_labels = labels.copy() 

156 best_iter = iter 

157 

158 # early stop 

159 if (best_inertia is not None and inertia >= best_inertia and 

160 iter > best_iter + 5): 

161 break 

162 

163 return (best_labels, best_centers, best_inertia, None, 

164 iter, all_centers) 

165 

166 

167def constraint_predictions(X, centers, strategy, state=None): 

168 """ 

169 Computes the predictions but tries 

170 to associates the same numbers of points 

171 in each cluster. 

172 

173 @param X features 

174 @param centers centers of each clusters 

175 @param strategy strategy used to sort point before 

176 mapping them to a cluster 

177 @param state random state 

178 @return labels, distances, distances_close 

179 """ 

180 if isinstance(X, DataFrame): 

181 X = X.values 

182 x_squared_norms = row_norms(X, squared=True) 

183 counters = numpy.empty((centers.shape[0],), dtype=numpy.int32) 

184 limit = X.shape[0] // centers.shape[0] 

185 leftover = X.shape[0] - limit * centers.shape[0] 

186 leftclose = numpy.empty((centers.shape[0],), dtype=numpy.int32) 

187 distances_close = numpy.empty((X.shape[0],), dtype=X.dtype) 

188 labels = numpy.empty((X.shape[0],), dtype=numpy.int32) 

189 

190 distances = _constraint_association( 

191 leftover, counters, labels, leftclose, 

192 distances_close, centers, X, x_squared_norms, 

193 limit, strategy, state=state) 

194 

195 return labels, distances, distances_close 

196 

197 

198def _constraint_association(leftover, counters, labels, leftclose, distances_close, 

199 centers, X, x_squared_norms, limit, strategy, state=None): 

200 """ 

201 Completes the constraint :epkg:`k-means`. 

202 

203 @param X features 

204 @param labels initialized labels (unused) 

205 @param centers initialized centers 

206 @param x_squared_norms norm of *X* 

207 @param limit number of point to associate per cluster 

208 @param leftover number of points to associate at the end 

209 @param counters allocated array 

210 @param leftclose allocated array 

211 @param labels allocated array 

212 @param distances_close allocated array 

213 @param strategy strategy used to sort point before 

214 mapping them to a cluster 

215 @param state random state 

216 """ 

217 if strategy in ('distance', 'distance_p'): 

218 return _constraint_association_distance( 

219 leftover, counters, labels, leftclose, distances_close, 

220 centers, X, x_squared_norms, limit, strategy, state=state) 

221 if strategy in ('gain', 'gain_p'): 

222 return _constraint_association_gain( 

223 leftover, counters, labels, leftclose, distances_close, 

224 centers, X, x_squared_norms, limit, strategy, state=state) 

225 raise ValueError(f"Unknwon strategy '{strategy}'.") # pragma: no cover 

226 

227 

228def _compute_strategy_coefficient(distances, strategy, labels): 

229 """ 

230 Creates a matrix 

231 """ 

232 if strategy in ('gain', 'gain_p'): 

233 ar = numpy.arange(distances.shape[0]) 

234 dist = distances[ar, labels] 

235 return distances - dist[:, numpy.newaxis] 

236 raise ValueError( # pragma: no cover 

237 f"Unknwon strategy '{strategy}'.") 

238 

239 

240def _randomize_index(index, weights): 

241 """ 

242 Randomizes index depending on the value. 

243 Swap indexes. 

244 Modifies *index*. 

245 """ 

246 maxi = weights.max() 

247 mini = weights.min() 

248 diff = max(maxi - mini, 1e-5) 

249 rand = numpy.random.rand(weights.shape[0]) 

250 for i in range(1, index.shape[0]): 

251 ind1 = index[i - 1] 

252 ind2 = index[i] 

253 w1 = weights[ind1] 

254 w2 = weights[ind2] 

255 ratio = abs(w2 - w1) / diff * 0.5 

256 if rand[i] >= ratio + 0.5: 

257 index[i - 1], index[i] = ind2, ind1 

258 weights[i - 1], weights[i] = w2, w1 

259 

260 

261def _switch_clusters(labels, distances): 

262 """ 

263 Tries to switch clusters. 

264 Modifies *labels* inplace. 

265 

266 @param labels labels 

267 @param distances distances 

268 """ 

269 perm = numpy.random.permutation(numpy.arange(0, labels.shape[0])) 

270 niter = 0 

271 modif = 1 

272 while modif > 0 and niter < 10: 

273 modif = 0 

274 niter += 1 

275 for i_ in range(labels.shape[0]): 

276 for j_ in range(i_ + 1, labels.shape[0]): 

277 i = perm[i_] 

278 j = perm[j_] 

279 c1 = labels[i] 

280 c2 = labels[j] 

281 if c1 == c2: 

282 continue 

283 d11 = distances[i, c1] 

284 d12 = distances[i, c2] 

285 d21 = distances[j, c1] 

286 d22 = distances[j, c2] 

287 if d11**2 + d22**2 > d21**2 + d12**2: 

288 labels[i], labels[j] = c2, c1 

289 modif += 1 

290 

291 

292def _constraint_association_distance(leftover, counters, labels, leftclose, distances_close, 

293 centers, X, x_squared_norms, limit, strategy, state=None): 

294 """ 

295 Completes the constraint *k-means*, 

296 the function sorts points by distance to the closest 

297 cluster and associates them into that order. 

298 It deals first with the further point and maps it to 

299 the closest center. 

300 

301 @param X features 

302 @param labels initialized labels (unused) 

303 @param centers initialized centers 

304 @param x_squared_norms norm of *X* 

305 @param limit number of point to associate per cluster 

306 @param leftover number of points to associate at the end 

307 @param counters allocated array 

308 @param leftclose allocated array 

309 @param labels allocated array 

310 @param distances_close allocated array 

311 @param strategy strategy used to sort point before 

312 mapping them to a cluster 

313 @param state random state (unused) 

314 """ 

315 

316 # initialisation 

317 counters[:] = 0 

318 leftclose[:] = -1 

319 labels[:] = -1 

320 

321 # distances 

322 distances = euclidean_distances( 

323 centers, X, Y_norm_squared=x_squared_norms, squared=True) 

324 distances = distances.T 

325 distances0 = distances.copy() 

326 maxi = distances.ravel().max() * 2 

327 centers_index = numpy.argsort(distances, axis=1) 

328 

329 while labels.min() == -1: 

330 mini = numpy.min(distances, axis=1) 

331 sorted_index = numpy.argsort(mini) 

332 _randomize_index(sorted_index, mini) 

333 

334 nover = leftover 

335 for ind in sorted_index: 

336 if labels[ind] >= 0: 

337 continue 

338 for c in centers_index[ind, :]: 

339 if counters[c] < limit: 

340 # The cluster still accepts new points. 

341 counters[c] += 1 

342 labels[ind] = c 

343 distances[ind, c] = maxi 

344 break 

345 if nover > 0 and leftclose[c] == -1: 

346 # The cluster may accept one point if the number 

347 # of clusters does not divide the number of points in X. 

348 counters[c] += 1 

349 labels[ind] = c 

350 nover -= 1 

351 leftclose[c] = 0 

352 distances[ind, c] = maxi 

353 break 

354 

355 _switch_clusters(labels, distances0) 

356 distances_close[:] = distances[numpy.arange(X.shape[0]), labels] 

357 return distances0 

358 

359 

360def _constraint_association_gain(leftover, counters, labels, leftclose, distances_close, 

361 centers, X, x_squared_norms, limit, strategy, state=None): 

362 """ 

363 Completes the constraint *k-means*. 

364 Follows the method described in `Same-size k-Means Variation 

365 <https://elki-project.github.io/tutorial/same-size_k_means>`_. 

366 

367 @param X features 

368 @param labels initialized labels (unused) 

369 @param centers initialized centers 

370 @param x_squared_norms norm of *X* 

371 @param limit number of points to associate per cluster 

372 @param leftover number of points to associate at the end 

373 @param counters allocated array 

374 @param leftclose allocated array 

375 @param labels allocated array 

376 @param distances_close allocated array 

377 @param strategy strategy used to sort point before 

378 mapping them to a cluster 

379 @param state random state 

380 

381 See `Same-size k-Means Variation <https://elki-project.github.io/tutorial/same-size_k_means>`_. 

382 """ 

383 # distances 

384 distances = euclidean_distances( 

385 centers, X, Y_norm_squared=x_squared_norms, squared=True) 

386 distances = distances.T 

387 

388 if strategy == 'gain_p': 

389 labels[:] = numpy.argmin(distances, axis=1) 

390 else: 

391 # We assume labels comes from a previous iteration. 

392 pass 

393 

394 strategy_coef = _compute_strategy_coefficient(distances, strategy, labels) 

395 distance_linear = linearize_matrix(distances, strategy_coef) 

396 sorted_distances = distance_linear[distance_linear[:, 3].argsort()] 

397 distances_close[:] = 0 

398 

399 # counters 

400 ave = limit 

401 counters[:] = 0 

402 for i in labels: 

403 counters[i] += 1 

404 leftclose[:] = counters[:] - ave 

405 leftclose[leftclose < 0] = 0 

406 nover = X.shape[0] - ave * counters.shape[0] 

407 sumi = nover - leftclose.sum() 

408 if sumi != 0: 

409 if state is None: 

410 state = numpy.random.RandomState() # pylint: disable=E1101 

411 

412 def loopf(h, sumi): 

413 if sumi < 0 and leftclose[h] > 0: # pylint: disable=R1716 

414 sumi -= leftclose[h] 

415 leftclose[h] = 0 

416 elif sumi > 0 and leftclose[h] == 0: 

417 leftclose[h] = 1 

418 sumi += 1 

419 return sumi 

420 

421 it = 0 

422 while sumi != 0: 

423 h = state.randint(0, counters.shape[0]) 

424 sumi = loopf(h, sumi) 

425 it += 1 

426 if it > counters.shape[0] * 2: 

427 break 

428 for h in range(counters.shape[0]): 

429 if sumi == 0: 

430 break 

431 sumi = loopf(h, sumi) 

432 

433 transfer = {} 

434 

435 for i in range(0, sorted_distances.shape[0]): 

436 gain = sorted_distances[i, 3] 

437 ind = int(sorted_distances[i, 1]) 

438 dest = int(sorted_distances[i, 2]) 

439 cur = labels[ind] 

440 if distances_close[ind]: 

441 continue 

442 if cur == dest: 

443 continue 

444 if ((counters[dest] < ave + leftclose[dest]) and 

445 (counters[cur] > ave + leftclose[cur])): 

446 labels[ind] = dest 

447 counters[cur] -= 1 

448 counters[dest] += 1 

449 distances_close[ind] = 1 # moved 

450 else: 

451 cp = transfer.get((dest, cur), []) 

452 while len(cp) > 0: 

453 g, destind = cp[0] 

454 if distances_close[destind]: 

455 del cp[0] 

456 else: 

457 break 

458 if len(cp) > 0: 

459 g, destind = cp[0] 

460 if g + gain < 0: 

461 del cp[0] 

462 labels[ind] = dest 

463 labels[destind] = cur 

464 add = False 

465 distances_close[ind] = 1 # moved 

466 distances_close[destind] = 1 # moved 

467 else: 

468 add = True 

469 else: 

470 add = True 

471 if add: 

472 # We add the point to the list of points willing to transfer. 

473 if (cur, dest) not in transfer: 

474 transfer[cur, dest] = [] 

475 gain = sorted_distances[i, 3] 

476 bisect.insort(transfer[cur, dest], (gain, ind)) 

477 

478 neg = (counters < ave).sum() 

479 if neg > 0: 

480 raise RuntimeError( # pragma: no cover 

481 f"The algorithm failed, counters={counters}") 

482 

483 _switch_clusters(labels, distances) 

484 distances_close[:] = distances[numpy.arange(X.shape[0]), labels] 

485 

486 return distances 

487 

488 

489def _constraint_kmeans_weights(X, labels, sample_weight, centers, inertia, it, 

490 max_iter, verbose=0, state=None, learning_rate=1., 

491 history=False, fLOG=None): 

492 """ 

493 Runs KMeans iterator but weights cluster among them. 

494 

495 @param X features 

496 @param labels initialized labels (unused) 

497 @param sample_weight sample weight 

498 @param centers initialized centers 

499 @param inertia initialized inertia (unused) 

500 @param it number of iteration already done 

501 @param max_iter maximum of number of iteration 

502 @param verbose verbose 

503 @param state random state 

504 @param learning_rate learning rate 

505 @param history keeps all centers accross iterations 

506 @param fLOG logging function (needs to be specified otherwise 

507 verbose has no effects) 

508 @return tuple (best_labels, best_centers, best_inertia, weights, it) 

509 """ 

510 if isinstance(X, DataFrame): 

511 X = X.values 

512 n_clusters = centers.shape[0] 

513 best_inertia = None 

514 best_iter = None 

515 weights = numpy.ones(centers.shape[0]) 

516 if sample_weight is None: 

517 sw = numpy.ones((X.shape[0],)) 

518 else: 

519 sw = sample_weight 

520 

521 if scipy.sparse.issparse(X): 

522 _centers_fct = _centers_sparse 

523 else: 

524 _centers_fct = _centers_dense 

525 

526 total_inertia = _inertia(X, sw) 

527 all_centers = [] 

528 

529 while it < max_iter: 

530 

531 # compute new clusters 

532 centers = _centers_fct( 

533 X, sw, labels, n_clusters, None) 

534 if history: 

535 all_centers.append(centers) 

536 

537 # association 

538 labels = _constraint_association_weights(X, centers, sw, weights) 

539 if len(set(labels)) != centers.shape[0]: 

540 if verbose and fLOG: # pragma: no cover 

541 if isinstance(verbose, int) and verbose >= 10: 

542 fLOG(f"CKMeans new weights: w={weights!r}") 

543 else: 

544 fLOG("CKMeans new weights") 

545 weights[:] = 1 

546 labels = _constraint_association_weights(X, centers, sw, weights) 

547 

548 # inertia 

549 inertia, diff = _labels_inertia_weights( 

550 X, centers, sw, weights, labels, total_inertia) 

551 if numpy.isnan(inertia): 

552 raise RuntimeError( # pragma: no cover 

553 f"nanNobs={X.shape[0]} Nclus={centers.shape[0]}\n" 

554 f"inertia={inertia}\nweights={weights}\ndiff={diff}\n" 

555 f"labels={set(labels)}") 

556 

557 # best option so far? 

558 if best_inertia is None or inertia < best_inertia: 

559 best_inertia = inertia 

560 best_centers = centers.copy() 

561 best_labels = labels.copy() 

562 best_weights = weights.copy() 

563 best_iter = it 

564 

565 # moves weights 

566 weights, _ = _adjust_weights(X, sw, weights, labels, 

567 learning_rate / (it + 10)) 

568 

569 it += 1 

570 if verbose and fLOG: 

571 if isinstance(verbose, int) and verbose >= 10: 

572 fLOG("CKMeans %d/%d inertia=%f (%f T=%f) dw=%r w=%r" % ( 

573 it, max_iter, inertia, best_inertia, total_inertia, 

574 diff, weights)) 

575 elif isinstance(verbose, int) and verbose >= 5: 

576 hist = Counter(labels) 

577 fLOG("CKMeans %d/%d inertia=%f (%f) hist=%r" % ( 

578 it, max_iter, inertia, best_inertia, hist)) 

579 else: 

580 fLOG("CKMeans %d/%d inertia=%f (%f T=%f)" % ( # pragma: no cover 

581 it, max_iter, inertia, best_inertia, total_inertia)) 

582 

583 # early stop 

584 if (best_inertia is not None and inertia >= best_inertia and 

585 it > best_iter + 5 and numpy.abs(diff).sum() <= weights.shape[0] / 2): 

586 break 

587 

588 return (best_labels, best_centers, best_inertia, best_weights, 

589 it, all_centers) 

590 

591 

592def _constraint_association_weights(X, centers, sw, weights): 

593 """ 

594 Associates points to clusters. 

595 

596 @param X features 

597 @param centers centers 

598 @param sw sample weights 

599 @param weights cluster weights 

600 @return labels 

601 """ 

602 dist = cdist(X, centers) * weights.reshape((1, -1)) 

603 index = numpy.argmin(dist, axis=1) 

604 return index 

605 

606 

607def _inertia(X, sw): 

608 """ 

609 Computes total weighted inertia. 

610 

611 @param X features 

612 @param sw sample weights 

613 @return inertia 

614 """ 

615 bary = numpy.mean(X, axis=0) 

616 diff = X - bary 

617 norm = numpy.linalg.norm(diff, axis=1) 

618 if sw is not None: 

619 norm *= sw 

620 return sw.sum() 

621 

622 

623def _labels_inertia_weights(X, centers, sw, weights, labels, total_inertia): 

624 """ 

625 Computes weighted inertia. It also adds a fraction 

626 of the whole inertia depending on how balanced the 

627 clusters are. 

628 

629 @param X features 

630 @param centers centers 

631 @param sw sample weights 

632 @param weights cluster weights 

633 @param labels labels 

634 @param total_inertia total inertia 

635 @return inertia 

636 """ 

637 www, exp, _ = _compute_balance(X, sw, labels, centers.shape[0]) 

638 www -= exp 

639 wwwa = numpy.square(www) 

640 ratio = wwwa.sum() ** 0.5 / X.shape[0] 

641 dist = cdist(X, centers) * weights.reshape((1, -1)) * sw.reshape((-1, 1)) 

642 return dist.sum() + ratio * total_inertia, www 

643 

644 

645def _compute_balance(X, sw, labels, nbc=None): 

646 """ 

647 Computes weights difference. 

648 

649 @param X features 

650 @param sw sample weights 

651 @param labels known labels 

652 @param nbc number of clusters 

653 @return (weights per cluster, expected weight, 

654 total weight) 

655 """ 

656 if nbc is None: 

657 nbc = labels.max() + 1 

658 N = numpy.float64(nbc) 

659 www = numpy.zeros((nbc, ), dtype=numpy.float64) 

660 if sw is None: 

661 for la in labels: 

662 www[la] += 1 

663 else: 

664 for w, la in zip(sw, labels): 

665 www[la] += w 

666 exp = www.sum() / N 

667 return www, exp, N 

668 

669 

670def _adjust_weights(X, sw, weights, labels, lr): 

671 """ 

672 Changes *weights* mapped to every cluster. 

673 *weights < 1* are used for big clusters, 

674 *weights > 1* are used for small clusters. 

675 

676 @param X features 

677 @param centers centers 

678 @param sw sample weights 

679 @param weights cluster weights 

680 @param lr learning rate 

681 @param labels known labels 

682 @return labels 

683 """ 

684 www, exp, N = _compute_balance(X, sw, labels, weights.shape[0]) 

685 

686 for i in range(0, weights.shape[0]): 

687 nw = (www[i] - exp) / exp 

688 delta = nw * lr 

689 weights[i] += delta 

690 N += delta 

691 

692 return weights / N * weights.shape[0], www