# pylint: disable=C0302
"""
@file
@brief Implements k-means with norms L1 and L2.
"""
import warnings
import numpy
from scipy.sparse import issparse
from sklearn.cluster import KMeans
from sklearn.cluster._kmeans import _tolerance as _tolerance_skl
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics.pairwise import (
    euclidean_distances, manhattan_distances,
    pairwise_distances_argmin_min)
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import _num_samples, check_is_fitted
from sklearn.utils.extmath import stable_cumsum
try:
    from sklearn.cluster._kmeans import _check_sample_weight
except ImportError:  # pragma: no cover
    from sklearn.cluster._kmeans import (
        _check_normalize_sample_weight as _check_sample_weight)
from ._kmeans_022 import (
    _labels_inertia_skl,
    _labels_inertia_precompute_dense)


def _k_init(norm, X, n_clusters, random_state, n_local_trials=None):
    """Init n_clusters seeds according to k-means++

    :param norm: `l1` or `l2`
        manhattan or euclidean distance
    :param X: array or sparse matrix, shape (n_samples, n_features)
        The data to pick seeds for. To avoid memory copy, the input data
        should be double precision (dtype=numpy.float64).
    :param n_clusters: integer
        The number of seeds to choose
    :param random_state: int, RandomState instance
        The generator used to initialize the centers. Use an int to make the
        randomness deterministic.
        See :term:`Glossary <random_state>`.
    :param n_local_trials: integer, optional
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.

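    A minimal doctest-style sketch added for illustration (toy data; only
    the shape of the result is checked because the seeding is random):

    >>> import numpy
    >>> from numpy.random import RandomState
    >>> X = numpy.array([[0.], [0.1], [10.], [10.1]])
    >>> _k_init('l1', X, 2, RandomState(0)).shape
    (2, 1)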
    """
    n_samples, n_features = X.shape

    centers = numpy.empty((n_clusters, n_features), dtype=X.dtype)

    # Set the number of local seeding trials if none is given
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(numpy.log(n_clusters))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    if issparse(X):
        centers[0] = X[center_id].toarray()
    else:
        centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    if norm.lower() == 'l2':
        dist_fct = lambda x, y: euclidean_distances(x, y, squared=True)
    elif norm.lower() == 'l1':
        dist_fct = lambda x, y: manhattan_distances(x, y)
    else:
        raise NotImplementedError(  # pragma: no cover
            "norm must be 'l1' or 'l2' not '{}'.".format(norm))

    closest_dist_sq = dist_fct(centers[0, numpy.newaxis], X)
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = numpy.searchsorted(stable_cumsum(closest_dist_sq),
                                           rand_vals)
        numpy.clip(candidate_ids, None, closest_dist_sq.size - 1,
                   out=candidate_ids)

        # Compute distances to center candidates
        distance_to_candidates = dist_fct(X[candidate_ids], X)

        # update closest distances squared and potential for each candidate
        numpy.minimum(closest_dist_sq, distance_to_candidates,
                      out=distance_to_candidates)
        candidates_pot = distance_to_candidates.sum(axis=1)

        # Decide which candidate is the best
        best_candidate = numpy.argmin(candidates_pot)
        current_pot = candidates_pot[best_candidate]
        closest_dist_sq = distance_to_candidates[best_candidate]
        best_candidate = candidate_ids[best_candidate]

        # Permanently add best center candidate found in local tries
        if issparse(X):
            centers[c] = X[best_candidate].toarray()
        else:
            centers[c] = X[best_candidate]

    return centers


def _init_centroids(norm, X, k, init, random_state=None,
                    init_size=None):
    """Compute the initial centroids

    :param norm: 'l1' or 'l2'
    :param X: array, shape (n_samples, n_features)
    :param k: int
        number of centroids
    :param init: {'k-means++', 'random', ndarray or callable}, optional
        Method for initialization
    :param random_state: int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.
    :param init_size: int, optional
        Number of samples to randomly sample for speeding up the
        initialization (sometimes at the expense of accuracy): the
        only algorithm is initialized by running a batch KMeans on a
        random subset of the data. This needs to be larger than k.
    :return: centers, array, shape(k, n_features)

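    A small doctest-style sketch added for illustration (toy data; with
    ``init='random'`` the function simply draws *k* distinct rows of *X*):

    >>> import numpy
    >>> X = numpy.array([[0.], [0.1], [10.], [10.1]])
    >>> _init_centroids('l1', X, 2, 'random', random_state=0).shape
    (2, 1)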
    """
    random_state = check_random_state(random_state)
    n_samples = X.shape[0]

    if init_size is not None and init_size < n_samples:
        if init_size < k:  # pragma: no cover
            warnings.warn(
                "init_size=%d should be larger than k=%d. "
                "Setting it to 3*k" % (init_size, k),
                RuntimeWarning, stacklevel=2)
            init_size = 3 * k
        init_indices = random_state.randint(0, n_samples, init_size)
        X = X[init_indices]
        n_samples = X.shape[0]
    elif n_samples < k:
        raise ValueError(  # pragma: no cover
            "n_samples=%d should be larger than k=%d" % (n_samples, k))

    if isinstance(init, str) and init == 'k-means++':
        centers = _k_init(norm, X, k, random_state=random_state)
    elif isinstance(init, str) and init == 'random':
        seeds = random_state.permutation(n_samples)[:k]
        centers = X[seeds]
    elif hasattr(init, '__array__'):
        # ensure that the centers have the same dtype as X
        # this is a requirement of fused types of cython
        centers = numpy.array(init, dtype=X.dtype)
    elif callable(init):
        centers = init(norm, X, k, random_state=random_state)
        centers = numpy.asarray(centers, dtype=X.dtype)
    else:
        raise ValueError(  # pragma: no cover
            "init parameter for the k-means should "
            "be 'k-means++' or 'random' or an ndarray, "
            "'%s' (type '%s') was passed." % (init, type(init)))

    if issparse(centers):
        centers = centers.toarray()

    def _validate_center_shape(X, k, centers):
        """Check if centers is compatible with X and n_clusters"""
        if centers.shape[0] != k:
            raise ValueError(  # pragma: no cover
                f"The shape of the initial centers {centers.shape} does not "
                f"match the number of clusters {k}.")
        if centers.shape[1] != X.shape[1]:
            raise ValueError(  # pragma: no cover
                f"The shape of the initial centers {centers.shape} does not "
                f"match the number of features of the data {X.shape[1]}.")

    _validate_center_shape(X, k, centers)
    return centers


def _centers_dense(X, sample_weight, labels, n_clusters, distances,
                   X_sort_index):
    """
    M step of the K-means EM algorithm.
    Computation of cluster centers (the median of each cluster
    for the L1 norm).

    :param X: array-like, shape (n_samples, n_features)
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param labels: array of integers, shape (n_samples)
        Current label assignment
    :param n_clusters: int
        Number of desired clusters
    :param distances: array-like, shape (n_samples)
        Distance to closest cluster for each sample.
    :param X_sort_index: array-like, shape (n_samples, n_features)
        argsort of X along axis 0 (currently unused)
    :return: centers, array, shape (n_clusters, n_features)
        The resulting centers

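    A doctest-style sketch added for illustration (toy data; with uniform
    weights each center is the median of its cluster):

    >>> import numpy
    >>> X = numpy.array([[0.], [2.], [10.]])
    >>> weights = numpy.ones(3)
    >>> labels = numpy.array([0, 0, 1])
    >>> _centers_dense(X, weights, labels, 2, numpy.zeros(3),
    ...                None).ravel().tolist()
    [1.0, 10.0]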
    """
    dtype = X.dtype
    n_features = X.shape[1]
    n_samples = X.shape[0]

    centers = numpy.zeros((n_clusters, n_features), dtype=dtype)
    weight_in_cluster = numpy.zeros((n_clusters,), dtype=dtype)

    for i in range(n_samples):
        c = labels[i]
        weight_in_cluster[c] += sample_weight[i]
    empty_clusters = numpy.where(weight_in_cluster == 0)[0]

    if len(empty_clusters) > 0:  # pragma: no cover
        # find points to reassign empty clusters to
        far_from_centers = distances.argsort()[::-1]

        for i, cluster_id in enumerate(empty_clusters):
            far_index = far_from_centers[i]
            new_center = X[far_index] * sample_weight[far_index]
            centers[cluster_id] = new_center
            weight_in_cluster[cluster_id] = sample_weight[far_index]

    if sample_weight.min() == sample_weight.max():
        # to optimize
        for i in range(n_clusters):
            sub = X[labels == i]
            med = numpy.median(sub, axis=0)
            centers[i, :] = med
    else:
        raise NotImplementedError(  # pragma: no cover
            "Non uniform weights are not implemented yet as "
            "the cost would be very high. "
            "See https://en.wikipedia.org/wiki/Weighted_median#Algorithm.")
    return centers


def _kmeans_single_lloyd(norm, X, sample_weight, n_clusters, max_iter=300,
                         init='k-means++', verbose=False,
                         random_state=None, tol=1e-4):
    """
    A single run of k-means, assumes preparation completed prior.

    :param norm: 'l1' or 'l2'
    :param X: array-like of floats, shape (n_samples, n_features)
        The observations to cluster.
    :param n_clusters: int
        The number of clusters to form as well as the number of
        centroids to generate.
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param max_iter: int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    :param init: {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments norm, X, k and
        a random state and return an initialization.

    :param tol: float, optional
        The relative increment in the results before declaring convergence.
    :param verbose: boolean, optional
        Verbosity mode
    :param random_state: int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.
    :return: centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    :return: label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    :return: inertia : float
        The final value of the inertia criterion (sum of distances to
        the closest centroid for all observations in the training set,
        squared for norm 'l2', absolute for norm 'l1').
    :return: n_iter : int
        Number of iterations run.

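    A short usage sketch added for illustration (toy data; no doctest
    because labels and centers depend on the random seeding):

    ::

        import numpy
        X = numpy.array([[0.], [0.1], [10.], [10.1]])
        weights = numpy.ones(4)
        labels, inertia, centers, n_iter = _kmeans_single_lloyd(
            'l1', X, weights, n_clusters=2, random_state=0)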
    """
    random_state = check_random_state(random_state)

    sample_weight = _check_sample_weight(sample_weight, X)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(
        norm, X, n_clusters, init, random_state=random_state)
    if verbose:  # pragma: no cover
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = numpy.zeros(shape=(X.shape[0],), dtype=X.dtype)
    X_sort_index = numpy.argsort(X, axis=0)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = _labels_inertia(
            norm, X, sample_weight, centers, distances=distances)

        # computation of the means is also called the M-step of EM
        centers = _centers_dense(X, sample_weight, labels, n_clusters,
                                 distances, X_sort_index)

        if verbose:  # pragma: no cover
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = numpy.sum(
            numpy.abs(centers_old - centers).ravel())
        if center_shift_total <= tol:
            if verbose:  # pragma: no cover
                print("Converged at iteration %d: "
                      "center shift %r within tolerance %r"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = _labels_inertia(
            norm, X, sample_weight, best_centers, distances=distances)

    return best_labels, best_inertia, best_centers, i + 1


def _labels_inertia(norm, X, sample_weight, centers, distances=None):
    """
    E step of the K-means EM algorithm.

    Computes the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.

    :param norm: 'l1' or 'l2'
    :param X: float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The input samples to assign to the labels.
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param centers: float array, shape (k, n_features)
        The cluster centers.
    :param distances: pre-allocated array in which the distance of each
        sample to its closest center is stored in-place, optional
    :return: labels : int array of shape(n)
        The resulting assignment
    :return: inertia : float
        Sum of distances of samples to their closest cluster center
        (squared for norm 'l2', absolute for norm 'l1').

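    A doctest-style sketch added for illustration (toy data; each sample
    is assigned to the closest of the two centers):

    >>> import numpy
    >>> X = numpy.array([[0.], [10.]])
    >>> centers = numpy.array([[1.], [9.]])
    >>> labels, inertia = _labels_inertia('l1', X, None, centers)
    >>> labels.tolist()
    [0, 1]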
    """
    if norm == 'l2':
        return _labels_inertia_skl(
            X, sample_weight=sample_weight, centers=centers,
            x_squared_norms=None)

    sample_weight = _check_sample_weight(sample_weight, X)
    if distances is None:
        # allocate an empty array, distances are only stored
        # when a non-empty array is given
        distances = numpy.zeros(shape=(0,), dtype=X.dtype)
    # distances will be changed in-place
    if issparse(X):
        raise NotImplementedError(  # pragma: no cover
            "Sparse matrix is not implemented for norm 'l1'.")
    return _labels_inertia_precompute_dense(
        norm=norm, X=X, sample_weight=sample_weight,
        centers=centers, distances=distances)


def _tolerance(norm, X, tol):
    """
    Returns a tolerance scaled to the dataset.

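    A doctest-style sketch of the 'l1' branch added for illustration:
    the mean absolute value of *X* per feature, summed over features,
    scaled by *tol* (assuming the scaling fix applied below):

    >>> import numpy
    >>> float(_tolerance('l1', numpy.array([[1.], [3.]]), 0.1))
    0.2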
    """
    if norm == 'l2':
        return _tolerance_skl(X, tol)
    if norm == 'l1':
        variances = numpy.sum(numpy.abs(X), axis=0) / X.shape[0]
        # The result is scaled by tol to mirror the l2 branch
        # (_tolerance_skl also multiplies by tol); without the scaling,
        # parameter tol would be unused and the threshold far too large.
        return variances.sum() * tol
    raise NotImplementedError(  # pragma: no cover
        "not implemented for norm '{}'.".format(norm))


class KMeansL1L2(KMeans):
    """
    K-Means clustering with either norm L1 or L2.
    See notebook :ref:`kmeansl1rst` for an example.

    :param n_clusters: int, default=8
        The number of clusters to form as well as the number of
        centroids to generate.
    :param init: {'k-means++', 'random'} or ndarray of shape \
        (n_clusters, n_features), default='k-means++'
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape
        (n_clusters, n_features) and gives the initial centers.

    :param n_init: int, default=10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.
    :param max_iter: int, default=300
        Maximum number of iterations of the k-means algorithm for a
        single run.
    :param tol: float, default=1e-4
        Relative tolerance with regards to inertia to declare convergence.
    :param verbose: int, default=0
        Verbosity mode.
    :param random_state: int, RandomState instance, default=None
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.
    :param copy_x: bool, default=True
        When pre-computing distances it is more numerically accurate to center
        the data first. If copy_x is True (default), then the original data is
        not modified, ensuring X is C-contiguous. If False, the original data
        is modified, and put back before the function returns, but small
        numerical differences may be introduced by subtracting and then adding
        the data mean. In this case it will also not ensure that data is
        C-contiguous, which may cause a significant slowdown.
    :param algorithm: {"auto", "full", "elkan"}, default="full"
        K-means algorithm to use. The classical EM-style algorithm is "full".
        The "elkan" variation is more efficient by using the triangle
        inequality, but it is only available with norm L2; norm L1
        requires "full".
    :param norm: {"L1", "L2"}
        The norm *L2* is identical to :epkg:`KMeans`.
        Norm *L1* uses a completely different path.

    Fitted attributes:

    * `cluster_centers_`: ndarray of shape (n_clusters, n_features)
        Coordinates of cluster centers. If the algorithm stops before fully
        converging (see ``tol`` and ``max_iter``), these will not be
        consistent with ``labels_``.
    * `labels_`: ndarray of shape (n_samples,)
        Labels of each point
    * `inertia_`: float
        Sum of distances of samples to their closest cluster center
        (squared for norm L2, absolute for norm L1).
    * `n_iter_`: int
        Number of iterations run.

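    A short usage sketch added for illustration (toy data; the import
    path assumes the class is exposed by the package's ``mlmodel``
    submodule):

    ::

        import numpy
        from mlinsights.mlmodel import KMeansL1L2

        X = numpy.array([[-10., 1.], [-10., 2.],
                         [10., 1.], [10., 2.]])
        km = KMeansL1L2(n_clusters=2, norm='L1')
        km.fit(X)
        print(km.cluster_centers_)   # one center per cluster
        print(km.predict(X))         # cluster index for each row
        print(km.transform(X))       # manhattan distance to each center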
    """

    def __init__(self, n_clusters=8, init='k-means++', n_init=10,
                 max_iter=300, tol=1e-4,
                 verbose=0, random_state=None, copy_x=True,
                 algorithm='full', norm='L2'):

        KMeans.__init__(self, n_clusters=n_clusters, init=init, n_init=n_init,
                        max_iter=max_iter, tol=tol,
                        verbose=verbose, random_state=random_state,
                        copy_x=copy_x, algorithm=algorithm)
        self.norm = norm.lower()
        if self.norm == 'l1' and self.algorithm != 'full':
            raise NotImplementedError(  # pragma: no cover
                "Only algorithm 'full' is implemented with norm 'l1'.")

    def fit(self, X, y=None, sample_weight=None):
        """
        Computes k-means clustering.

        :param X: array-like or sparse matrix, shape=(n_samples, n_features)
            Training instances to cluster. It must be noted that the data
            will be converted to C ordering, which will cause a memory
            copy if the given data is not C-contiguous.
        :param y: Ignored
            Not used, present here for API consistency by convention.
        :param sample_weight: array-like, shape (n_samples,), optional
            The weights for each observation in X. If None, all observations
            are assigned equal weight (default: None).
        :return: self
            Fitted estimator.
        """
        if self.norm == 'l2':
            KMeans.fit(self, X=X, y=y, sample_weight=sample_weight)
        elif self.norm == 'l1':
            self._fit_l1(X=X, y=y, sample_weight=sample_weight)
        else:
            raise NotImplementedError(  # pragma: no cover
                "Norm is not L1 or L2 but '{}'.".format(self.norm))
        return self

    def _fit_l1(self, X, y=None, sample_weight=None):
        """
        Computes k-means clustering with norm `'l1'`.

        :param X: array-like or sparse matrix, shape=(n_samples, n_features)
            Training instances to cluster. It must be noted that the data
            will be converted to C ordering, which will cause a memory
            copy if the given data is not C-contiguous.
        :param y: Ignored
            Not used, present here for API consistency by convention.
        :param sample_weight: array-like, shape (n_samples,), optional
            The weights for each observation in X. If None, all observations
            are assigned equal weight (default: None).
        :return: self
            Fitted estimator.
        """
        random_state = check_random_state(self.random_state)

        n_init = self.n_init
        if n_init <= 0:
            raise ValueError(  # pragma: no cover
                "Invalid number of initializations."
                " n_init=%d must be bigger than zero." % n_init)

        if self.max_iter <= 0:
            raise ValueError(  # pragma: no cover
                'Number of iterations should be a positive number,'
                ' got %d instead' % self.max_iter)

        # avoid forcing order when copy_x=False
        order = "C" if self.copy_x else None
        X = check_array(X, accept_sparse='csr',
                        dtype=[numpy.float64, numpy.float32],
                        order=order, copy=self.copy_x)
        # verify that the number of samples given is larger than k
        if _num_samples(X) < self.n_clusters:
            raise ValueError(  # pragma: no cover
                "n_samples=%d should be >= n_clusters=%d" % (
                    _num_samples(X), self.n_clusters))

        tol = _tolerance(self.norm, X, self.tol)

        # Validate init array
        init = self.init
        if hasattr(init, '__array__'):
            init = check_array(init, dtype=X.dtype.type, copy=True)
            if hasattr(self, '_validate_center_shape'):
                self._validate_center_shape(  # pylint: disable=E1101
                    X, init)

            if n_init != 1:
                warnings.warn(  # pragma: no cover
                    'Explicit initial center position passed: '
                    'performing only one init in k-means instead of n_init=%d'
                    % n_init, RuntimeWarning, stacklevel=2)
                n_init = 1

        best_labels, best_inertia, best_centers = None, None, None
        algorithm = self.algorithm
        if self.n_clusters == 1:
            # elkan doesn't make sense for a single cluster, full will produce
            # the right result.
            algorithm = "full"  # pragma: no cover
        if algorithm == "auto":
            algorithm = "full"  # pragma: no cover
        if algorithm == "full":
            kmeans_single = _kmeans_single_lloyd
        else:
            raise ValueError(  # pragma: no cover
                "Algorithm must be 'auto', 'full' or 'elkan', got"
                " %s" % str(algorithm))

        seeds = random_state.randint(
            numpy.iinfo(numpy.int32).max, size=n_init)

        for seed in seeds:
            # run a k-means once
            labels, inertia, centers, n_iter_ = kmeans_single(
                self.norm, X, sample_weight, n_clusters=self.n_clusters,
                max_iter=self.max_iter, init=init, verbose=self.verbose,
                tol=tol, random_state=seed)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_

        distinct_clusters = len(set(best_labels))
        if distinct_clusters < self.n_clusters:
            warnings.warn(  # pragma: no cover
                "Number of distinct clusters ({}) found smaller than "
                "n_clusters ({}). Possibly due to duplicate points "
                "in X.".format(distinct_clusters, self.n_clusters),
                ConvergenceWarning, stacklevel=2)

        self.cluster_centers_ = best_centers
        self.labels_ = best_labels
        self.inertia_ = best_inertia
        self.n_iter_ = best_n_iter
        return self

    def transform(self, X):
        """
        Transforms *X* to a cluster-distance space.

        In the new space, each dimension is the distance to the cluster
        centers. Note that even if X is sparse, the array returned by
        `transform` will typically be dense.

        :param X: {array-like, sparse matrix} of shape (n_samples, n_features)
            New data to transform.
        :return: X_new : array, shape [n_samples, k]
            X transformed in the new space.
        """
        if self.norm == 'l2':
            return KMeans.transform(self, X)
        if self.norm == 'l1':
            return self._transform_l1(X)
        raise NotImplementedError(  # pragma: no cover
            "Norm is not L1 or L2 but '{}'.".format(self.norm))

    def _transform_l1(self, X):
        """
        Returns the manhattan distance of each point in *X* to
        every fitted cluster center.
        """
        check_is_fitted(self)
        X = self._check_test_data(X)
        return manhattan_distances(X, self.cluster_centers_)

    def predict(self, X, sample_weight=None):
        """
        Predicts the closest cluster each sample in X belongs to.

        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.

        :param X: {array-like, sparse matrix} of shape (n_samples, n_features)
            New data to predict.
        :param sample_weight: array-like, shape (n_samples,), optional
            The weights for each observation in X. If None, all observations
            are assigned equal weight (default: None). Unused here.
        :return: labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        if self.norm == 'l2':
            return KMeans.predict(self, X)
        if self.norm == 'l1':
            return self._predict_l1(X, sample_weight=sample_weight)
        raise NotImplementedError(  # pragma: no cover
            "Norm is not L1 or L2 but '{}'.".format(self.norm))

    def _predict_l1(self, X, sample_weight=None, return_distances=False):
        """
        Returns the label of each point in *X*, i.e. the index of the
        closest fitted cluster center for the manhattan distance.

        :param X: features
        :param sample_weight: (unused)
        :param return_distances: returns distances as well
        :return: labels or `labels, distances`
        """
        labels, mindist = pairwise_distances_argmin_min(
            X=X, Y=self.cluster_centers_, metric='manhattan')
        labels = labels.astype(numpy.int32, copy=False)
        if return_distances:
            return labels, mindist
        return labels