Coverage for mlinsights/mlmodel/kmeans_l1.py: 93%

194 statements  


# pylint: disable=C0302
"""
@file
@brief Implements k-means with norms L1 and L2.
"""
import warnings
import numpy
from scipy.sparse import issparse
from sklearn.cluster import KMeans
from sklearn.cluster._kmeans import _tolerance as _tolerance_skl
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics.pairwise import (
    euclidean_distances, manhattan_distances,
    pairwise_distances_argmin_min)
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import _num_samples, check_is_fitted
from sklearn.utils.extmath import stable_cumsum
try:
    from sklearn.cluster._kmeans import _check_sample_weight
except ImportError:  # pragma: no cover
    from sklearn.cluster._kmeans import (
        _check_normalize_sample_weight as _check_sample_weight)
try:
    from sklearn.utils._param_validation import StrOptions
except ImportError:
    def StrOptions(*args):
        "Dummy replacement for a class introduced in scikit-learn==1.1."
        return None

from ._kmeans_022 import (
    _labels_inertia_skl,
    _labels_inertia_precompute_dense)


def _k_init(norm, X, n_clusters, random_state, n_local_trials=None):
    """Init n_clusters seeds according to k-means++

    :param norm: `L1` or `L2`
        manhattan or euclidean distance
    :param X: array or sparse matrix, shape (n_samples, n_features)
        The data to pick seeds for. To avoid memory copy, the input data
        should be double precision (dtype=numpy.float64).
    :param n_clusters: integer
        The number of seeds to choose
    :param random_state: int, RandomState instance
        The generator used to initialize the centers. Use an int to make the
        randomness deterministic.
        See :term:`Glossary <random_state>`.
    :param n_local_trials: integer, optional
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.
    """
    n_samples, n_features = X.shape

    centers = numpy.empty((n_clusters, n_features), dtype=X.dtype)

    # Set the number of local seeding trials if none is given.
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(numpy.log(n_clusters))

    # Pick the first center randomly.
    center_id = random_state.randint(n_samples)
    if issparse(X):
        centers[0] = X[center_id].toarray()
    else:
        centers[0] = X[center_id]

    # Initialize the list of closest distances and compute
    # the current potential.
    if norm == 'L2':
        dist_fct = lambda x, y: euclidean_distances(x, y, squared=True)
    elif norm == 'L1':
        dist_fct = lambda x, y: manhattan_distances(x, y)
    else:
        raise NotImplementedError(  # pragma: no cover
            f"norm must be 'L1' or 'L2' not '{norm}'.")

    closest_dist_sq = dist_fct(centers[0, numpy.newaxis], X)
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points.
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the distance (squared for L2) to the closest existing center.
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = numpy.searchsorted(stable_cumsum(closest_dist_sq),
                                           rand_vals)
        numpy.clip(candidate_ids, None, closest_dist_sq.size - 1,
                   out=candidate_ids)

        # Compute the distances to the center candidates.
        distance_to_candidates = dist_fct(X[candidate_ids], X)

        # Update the closest distances and the potential for each candidate.
        numpy.minimum(closest_dist_sq, distance_to_candidates,
                      out=distance_to_candidates)
        candidates_pot = distance_to_candidates.sum(axis=1)

        # Decide which candidate is the best.
        best_candidate = numpy.argmin(candidates_pot)
        current_pot = candidates_pot[best_candidate]
        closest_dist_sq = distance_to_candidates[best_candidate]
        best_candidate = candidate_ids[best_candidate]

        # Permanently add the best center candidate found in local tries.
        if issparse(X):
            centers[c] = X[best_candidate].toarray()
        else:
            centers[c] = X[best_candidate]

    return centers
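
# Illustrative sketch (not part of the original module): seeding three
# centers with the L1 variant of k-means++. The toy data and seeds below
# are assumptions made for this doctest-style example.
#
# >>> import numpy
# >>> from sklearn.utils import check_random_state
# >>> X = check_random_state(0).randn(50, 2)
# >>> centers = _k_init('L1', X, 3, check_random_state(1))
# >>> centers.shape
# (3, 2)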

def _init_centroids(norm, X, k, init, random_state=None,
                    init_size=None):
    """Compute the initial centroids

    :param norm: 'L1' or 'L2'
    :param X: array, shape (n_samples, n_features)
    :param k: int
        number of centroids
    :param init: {'k-means++', 'random', ndarray, callable}, optional
        Method for initialization
    :param random_state: int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.
    :param init_size: int, optional
        Number of samples to randomly sample to speed up the
        initialization (sometimes at the expense of accuracy):
        initialization then runs on a random subset of the data.
        This needs to be larger than k.
    :return: centers, array, shape(k, n_features)
    """
    random_state = check_random_state(random_state)
    n_samples = X.shape[0]

    if init_size is not None and init_size < n_samples:
        if init_size < k:  # pragma: no cover
            warnings.warn(
                "init_size=%d should be larger than k=%d. "
                "Setting it to 3*k" % (init_size, k),
                RuntimeWarning, stacklevel=2)
            init_size = 3 * k
        init_indices = random_state.randint(0, n_samples, init_size)
        X = X[init_indices]
        n_samples = X.shape[0]
    elif n_samples < k:
        raise ValueError(  # pragma: no cover
            "n_samples=%d should be larger than k=%d" % (n_samples, k))

    if isinstance(init, str) and init == 'k-means++':
        centers = _k_init(norm, X, k, random_state=random_state)
    elif isinstance(init, str) and init == 'random':
        seeds = random_state.permutation(n_samples)[:k]
        centers = X[seeds]
    elif hasattr(init, '__array__'):
        # ensure that the centers have the same dtype as X,
        # this is a requirement of fused types of cython
        centers = numpy.array(init, dtype=X.dtype)
    elif callable(init):
        centers = init(norm, X, k, random_state=random_state)
        centers = numpy.asarray(centers, dtype=X.dtype)
    else:
        raise ValueError(  # pragma: no cover
            "init parameter for the k-means should "
            "be 'k-means++' or 'random' or an ndarray, "
            "'%s' (type '%s') was passed." % (init, type(init)))

    if issparse(centers):
        centers = centers.toarray()

    def _validate_center_shape(X, k, centers):
        """Check if centers is compatible with X and n_clusters."""
        if centers.shape[0] != k:
            raise ValueError(  # pragma: no cover
                f"The shape of the initial centers {centers.shape} does not "
                f"match the number of clusters {k}.")
        if centers.shape[1] != X.shape[1]:
            raise ValueError(  # pragma: no cover
                f"The shape of the initial centers {centers.shape} does not "
                f"match the number of features of the data {X.shape[1]}.")

    _validate_center_shape(X, k, centers)
    return centers
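
# Illustrative sketch (assumption: toy data): picking three random rows
# as initial centroids through _init_centroids.
#
# >>> import numpy
# >>> X = numpy.arange(20, dtype=numpy.float64).reshape(10, 2)
# >>> _init_centroids('L1', X, 3, 'random', random_state=0).shape
# (3, 2)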

def _centers_dense(X, sample_weight, labels, n_clusters, distances,
                   X_sort_index):
    """
    M step of the K-means EM algorithm.
    Computation of cluster centers / means.

    :param X: array-like, shape (n_samples, n_features)
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param labels: array of integers, shape (n_samples,)
        Current label assignment
    :param n_clusters: int
        Number of desired clusters
    :param distances: array-like, shape (n_samples,)
        Distance to the closest cluster for each sample.
    :param X_sort_index: array-like, shape (n_samples, n_features)
        precomputed ``numpy.argsort(X, axis=0)``; reserved for the
        weighted-median case, which is not implemented yet
    :return: centers, array, shape (n_clusters, n_features)
        The resulting centers
    """
    dtype = X.dtype
    n_features = X.shape[1]
    n_samples = X.shape[0]

    centers = numpy.zeros((n_clusters, n_features), dtype=dtype)
    weight_in_cluster = numpy.zeros((n_clusters,), dtype=dtype)

    for i in range(n_samples):
        c = labels[i]
        weight_in_cluster[c] += sample_weight[i]
    empty_clusters = numpy.where(weight_in_cluster == 0)[0]

    if len(empty_clusters) > 0:  # pragma: no cover
        # find points to reassign empty clusters to
        far_from_centers = distances.argsort()[::-1]

        for i, cluster_id in enumerate(empty_clusters):
            far_index = far_from_centers[i]
            new_center = X[far_index] * sample_weight[far_index]
            centers[cluster_id] = new_center
            weight_in_cluster[cluster_id] = sample_weight[far_index]

    if sample_weight.min() == sample_weight.max():
        # Uniform weights: the feature-wise median minimizes the sum of
        # absolute deviations within each cluster.
        for i in range(n_clusters):
            sub = X[labels == i]
            med = numpy.median(sub, axis=0)
            centers[i, :] = med
    else:
        raise NotImplementedError(  # pragma: no cover
            "Non uniform weights are not implemented yet as "
            "the cost would be very high. "
            "See https://en.wikipedia.org/wiki/Weighted_median#Algorithm.")
    return centers
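
# Illustrative sketch (assumption: toy data, uniform weights): with two
# well separated clusters, the M-step returns the per-cluster medians.
#
# >>> import numpy
# >>> X = numpy.array([[0.], [1.], [10.], [12.]])
# >>> labels = numpy.array([0, 0, 1, 1])
# >>> centers = _centers_dense(X, numpy.ones(4), labels, 2,
# ...                          numpy.zeros(4), numpy.argsort(X, axis=0))
# >>> centers.ravel().tolist()
# [0.5, 11.0]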

def _kmeans_single_lloyd(norm, X, sample_weight, n_clusters, max_iter=300,
                         init='k-means++', verbose=False,
                         random_state=None, tol=1e-4):
    """
    A single run of k-means, assumes preparation completed prior.

    :param norm: 'L1' or 'L2'
    :param X: array-like of floats, shape (n_samples, n_features)
        The observations to cluster.
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param n_clusters: int
        The number of clusters to form as well as the number of
        centroids to generate.
    :param max_iter: int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    :param init: {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    :param tol: float, optional
        The relative increment in the results before declaring convergence.
    :param verbose: boolean, optional
        Verbosity mode
    :param random_state: int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.
    :return: centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    :return: label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    :return: inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).
    :return: n_iter : int
        Number of iterations run.
    """
    random_state = check_random_state(random_state)

    sample_weight = _check_sample_weight(sample_weight, X)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(
        norm, X, n_clusters, init, random_state=random_state)
    if verbose:  # pragma: no cover
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties.
    distances = numpy.zeros(shape=(X.shape[0],), dtype=X.dtype)
    X_sort_index = numpy.argsort(X, axis=0)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = _labels_inertia(
            norm, X, sample_weight, centers, distances=distances)

        # computation of the means is also called the M-step of EM
        centers = _centers_dense(X, sample_weight, labels, n_clusters,
                                 distances, X_sort_index)

        if verbose:  # pragma: no cover
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = numpy.sum(
            numpy.abs(centers_old - centers).ravel())
        if center_shift_total <= tol:
            if verbose:  # pragma: no cover
                print("Converged at iteration %d: "
                      "center shift %r within tolerance %r"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = _labels_inertia(
            norm, X, sample_weight, best_centers, distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
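
# Illustrative sketch (assumption: toy data; relies on the internal
# _labels_inertia helpers being importable): one full Lloyd run with
# the L1 norm.
#
# >>> import numpy
# >>> X = numpy.array([[0.], [0.2], [10.], [10.2]])
# >>> labels, inertia, centers, n_iter = _kmeans_single_lloyd(
# ...     'L1', X, None, n_clusters=2, random_state=2)
# >>> labels.shape, centers.shape
# ((4,), (2, 1))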

def _labels_inertia(norm, X, sample_weight, centers, distances=None):
    """
    E step of the K-means EM algorithm.

    Computes the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.

    :param norm: 'L1' or 'L2'
    :param X: float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The input samples to assign to the labels.
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param centers: float array, shape (k, n_features)
        The cluster centers.
    :param distances: existing distances
    :return: labels : int array of shape (n_samples,)
        The resulting assignment
    :return: inertia : float
        Sum of squared distances of samples to their closest cluster center.
    """
    if norm == 'L2':
        return _labels_inertia_skl(
            X, sample_weight=sample_weight, centers=centers,
            x_squared_norms=None)

    sample_weight = _check_sample_weight(sample_weight, X)
    # Allocate an empty buffer if the caller did not provide one,
    # distances will be changed in-place.
    if distances is None:
        distances = numpy.zeros(shape=(0,), dtype=X.dtype)
    if issparse(X):
        raise NotImplementedError(  # pragma: no cover
            "Sparse matrix is not implemented for norm 'L1'.")
    return _labels_inertia_precompute_dense(
        norm=norm, X=X, sample_weight=sample_weight,
        centers=centers, distances=distances)
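
# Illustrative sketch (assumption: toy data; for the L1 path the inertia
# is the weighted sum of Manhattan distances to the closest center).
#
# >>> import numpy
# >>> X = numpy.array([[0.], [1.], [10.]])
# >>> centers = numpy.array([[0.], [10.]])
# >>> labels, inertia = _labels_inertia('L1', X, None, centers)
# >>> labels.tolist(), float(inertia)
# ([0, 0, 1], 1.0)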

def _tolerance(norm, X, tol):
    """Return an absolute tolerance scaled to the dataset."""
    if norm == 'L2':
        return _tolerance_skl(X, tol)
    if norm == 'L1':
        # Scale the relative tolerance by the mean absolute value of the
        # features, mirroring what scikit-learn does with the variances
        # for the L2 norm.
        variances = numpy.sum(numpy.abs(X), axis=0) / X.shape[0]
        return variances.sum() * tol
    raise NotImplementedError(  # pragma: no cover
        f"not implemented for norm '{norm}'.")

class KMeansL1L2(KMeans):
    """
    K-Means clustering with either norm L1 or L2.
    See notebook :ref:`kmeansl1rst` for an example.

    :param n_clusters: int, default=8
        The number of clusters to form as well as the number of
        centroids to generate.
    :param init: {'k-means++', 'random'} or ndarray of shape \
            (n_clusters, n_features), default='k-means++'
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.

    :param n_init: int, default=10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.
    :param max_iter: int, default=300
        Maximum number of iterations of the k-means algorithm for a
        single run.
    :param tol: float, default=1e-4
        Relative tolerance with regards to inertia to declare convergence.
    :param verbose: int, default=0
        Verbosity mode.
    :param random_state: int, RandomState instance, default=None
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.
    :param copy_x: bool, default=True
        When pre-computing distances it is more numerically accurate to center
        the data first. If copy_x is True (default), then the original data is
        not modified, ensuring X is C-contiguous. If False, the original data
        is modified, and put back before the function returns, but small
        numerical differences may be introduced by subtracting and then adding
        the data mean. In this case it will also not ensure that data is
        C-contiguous, which may cause a significant slowdown.
    :param algorithm: {"auto", "full", "elkan"}, default="full"
        K-means algorithm to use. The classical EM-style algorithm is "full".
        The "elkan" variation is more efficient by using the triangle
        inequality, but currently doesn't support sparse data. "auto" chooses
        "elkan" for dense data and "full" for sparse data. Norm 'L1' only
        supports "full".
    :param norm: {"L1", "L2"}
        The norm *L2* is identical to :epkg:`KMeans`.
        Norm *L1* uses a completely different path.

    Fitted attributes:

    * `cluster_centers_`: ndarray of shape (n_clusters, n_features)
        Coordinates of cluster centers. If the algorithm stops before fully
        converging (see ``tol`` and ``max_iter``), these will not be
        consistent with ``labels_``.
    * `labels_`: ndarray of shape (n_samples,)
        Labels of each point
    * `inertia_`: float
        Sum of squared distances of samples to their closest cluster center.
    * `n_iter_`: int
        Number of iterations run.
    """

    _parameter_constraints = {
        **getattr(KMeans, '_parameter_constraints', {}),
        "norm": [StrOptions({"L1", "L2"})],
    }

    def __init__(self, n_clusters=8, init='k-means++', n_init=10,
                 max_iter=300, tol=1e-4,
                 verbose=0, random_state=None, copy_x=True,
                 algorithm='full', norm='L2'):
        KMeans.__init__(self, n_clusters=n_clusters, init=init, n_init=n_init,
                        max_iter=max_iter, tol=tol,
                        verbose=verbose, random_state=random_state,
                        copy_x=copy_x, algorithm=algorithm)
        self.norm = norm
        if self.norm == 'L1' and self.algorithm != 'full':
            raise NotImplementedError(  # pragma: no cover
                "Only algorithm 'full' is implemented with norm 'L1'.")

    def fit(self, X, y=None, sample_weight=None):
        """
        Computes k-means clustering.

        :param X: array-like or sparse matrix, shape=(n_samples, n_features)
            Training instances to cluster. It must be noted that the data
            will be converted to C ordering, which will cause a memory
            copy if the given data is not C-contiguous.
        :param y: Ignored
            Not used, present here for API consistency by convention.
        :param sample_weight: array-like, shape (n_samples,), optional
            The weights for each observation in X. If None, all observations
            are assigned equal weight (default: None).
        :return: self
            Fitted estimator.
        """
        if self.norm == 'L2':
            KMeans.fit(self, X=X, y=y, sample_weight=sample_weight)
        elif self.norm == 'L1':
            self._fit_l1(X=X, y=y, sample_weight=sample_weight)
        else:
            raise NotImplementedError(  # pragma: no cover
                f"Norm is not 'L1' or 'L2' but '{self.norm}'.")
        return self

    def _fit_l1(self, X, y=None, sample_weight=None):
        """
        Computes k-means clustering with norm `'L1'`.

        :param X: array-like or sparse matrix, shape=(n_samples, n_features)
            Training instances to cluster. It must be noted that the data
            will be converted to C ordering, which will cause a memory
            copy if the given data is not C-contiguous.
        :param y: Ignored
            Not used, present here for API consistency by convention.
        :param sample_weight: array-like, shape (n_samples,), optional
            The weights for each observation in X. If None, all observations
            are assigned equal weight (default: None).
        :return: self
            Fitted estimator.
        """
        random_state = check_random_state(self.random_state)

        n_init = self.n_init
        if n_init <= 0:
            raise ValueError(  # pragma: no cover
                "Invalid number of initializations."
                " n_init=%d must be bigger than zero." % n_init)

        if self.max_iter <= 0:
            raise ValueError(  # pragma: no cover
                'Number of iterations should be a positive number,'
                ' got %d instead' % self.max_iter)

        # avoid forcing order when copy_x=False
        order = "C" if self.copy_x else None
        X = check_array(X, accept_sparse='csr',
                        dtype=[numpy.float64, numpy.float32],
                        order=order, copy=self.copy_x)
        # verify that the number of samples given is larger than k
        if _num_samples(X) < self.n_clusters:
            raise ValueError(  # pragma: no cover
                "n_samples=%d should be >= n_clusters=%d" % (
                    _num_samples(X), self.n_clusters))

        tol = _tolerance(self.norm, X, self.tol)

        # Validate init array
        init = self.init
        if hasattr(init, '__array__'):
            init = check_array(init, dtype=X.dtype.type, copy=True)
            if hasattr(self, '_validate_center_shape'):
                self._validate_center_shape(  # pylint: disable=E1101
                    X, init)

            if n_init != 1:
                warnings.warn(  # pragma: no cover
                    'Explicit initial center position passed: '
                    'performing only one init in k-means instead of n_init=%d'
                    % n_init, RuntimeWarning, stacklevel=2)
                n_init = 1

        best_labels, best_inertia, best_centers = None, None, None
        algorithm = self.algorithm
        if self.n_clusters == 1:
            # elkan doesn't make sense for a single cluster, full will produce
            # the right result.
            algorithm = "full"  # pragma: no cover
        if algorithm == "auto":
            algorithm = "full"  # pragma: no cover
        if algorithm == "full":
            kmeans_single = _kmeans_single_lloyd
        else:
            raise ValueError(  # pragma: no cover
                f"Algorithm must be 'auto', 'full' or 'elkan', "
                f"got {str(algorithm)}")

        seeds = random_state.randint(numpy.iinfo(numpy.int32).max, size=n_init)

        for seed in seeds:
            # run a k-means once
            labels, inertia, centers, n_iter_ = kmeans_single(
                self.norm, X, sample_weight, n_clusters=self.n_clusters,
                max_iter=self.max_iter, init=init, verbose=self.verbose,
                tol=tol, random_state=seed)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_

        distinct_clusters = len(set(best_labels))
        if distinct_clusters < self.n_clusters:
            warnings.warn(  # pragma: no cover
                f"Number of distinct clusters ({distinct_clusters}) "
                f"found smaller than "
                f"n_clusters ({self.n_clusters}). Possibly "
                f"due to duplicate points in X.",
                ConvergenceWarning, stacklevel=2)

        self.cluster_centers_ = best_centers
        self.labels_ = best_labels
        self.inertia_ = best_inertia
        self.n_iter_ = best_n_iter
        return self

    def transform(self, X):
        """
        Transforms *X* to a cluster-distance space.

        In the new space, each dimension is the distance to the cluster
        centers. Note that even if X is sparse, the array returned by
        `transform` will typically be dense.

        :param X: {array-like, sparse matrix} of shape (n_samples, n_features)
            New data to transform.
        :return: X_new : array, shape [n_samples, k]
            X transformed in the new space.
        """
        if self.norm == 'L2':
            return KMeans.transform(self, X)
        if self.norm == 'L1':
            return self._transform_l1(X)
        raise NotImplementedError(  # pragma: no cover
            f"Norm is not L1 or L2 but '{self.norm}'.")

    def _transform_l1(self, X):
        """
        Returns the distance of each point in *X* to
        every fitted cluster.
        """
        check_is_fitted(self)
        X = self._check_test_data(X)
        return manhattan_distances(X, self.cluster_centers_)

    def predict(self, X, sample_weight=None):
        """
        Predicts the closest cluster each sample in X belongs to.

        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.

        :param X: {array-like, sparse matrix} of shape (n_samples, n_features)
            New data to predict.
        :param sample_weight: array-like, shape (n_samples,), optional
            The weights for each observation in X. If None, all observations
            are assigned equal weight (default: None), unused here
        :return: labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        if self.norm == 'L2':
            return KMeans.predict(self, X)
        if self.norm == 'L1':
            return self._predict_l1(X, sample_weight=sample_weight)
        raise NotImplementedError(  # pragma: no cover
            f"Norm is not L1 or L2 but '{self.norm}'.")

    def _predict_l1(self, X, sample_weight=None, return_distances=False):
        """
        Returns the label of the closest cluster for every sample in *X*
        and, optionally, the distance to that cluster.

        :param X: features
        :param sample_weight: (unused)
        :param return_distances: returns distances as well
        :return: labels or `labels, distances`
        """
        labels, mindist = pairwise_distances_argmin_min(
            X=X, Y=self.cluster_centers_, metric='manhattan')
        labels = labels.astype(numpy.int32, copy=False)
        if return_distances:
            return labels, mindist
        return labels
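
# Usage sketch (assumption: toy data): the public estimator mirrors
# scikit-learn's KMeans API; norm='L1' switches to the median-based path.
#
# >>> import numpy
# >>> X = numpy.array([[0., 0.], [0., 1.], [10., 10.], [10., 11.]])
# >>> km = KMeansL1L2(n_clusters=2, norm='L1', random_state=0)
# >>> sorted(km.fit(X).predict(X).tolist())
# [0, 0, 1, 1]
# >>> km.transform(X).shape
# (4, 2)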