Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# pylint: disable=C0302 

2""" 

3@file 

4@brief Implements k-means with norms L1 and L2. 

5""" 

6import numpy 

7from scipy.sparse import issparse 

8# Source: https://github.com/scikit-learn/scikit-learn/blob/95d4f0841d57e8b5f6b2a570312e9d832e69debc/sklearn/cluster/_k_means_fast.pyx 

9from sklearn.utils.sparsefuncs_fast import assign_rows_csr # pylint: disable=W0611,E0611 

10try: 

11 from sklearn.cluster._kmeans import _check_sample_weight 

12except ImportError: # pragma: no cover 

13 from sklearn.cluster._kmeans import ( 

14 _check_normalize_sample_weight as _check_sample_weight) 

15from sklearn.metrics.pairwise import pairwise_distances_argmin_min 

16 

17 

18def _labels_inertia_precompute_dense(norm, X, sample_weight, centers, distances): 

19 """ 

20 Computes labels and inertia using a full distance matrix. 

21 

22 This will overwrite the 'distances' array in-place. 

23 

24 :param norm: 'l1' or 'l2' 

25 :param X: numpy array, shape (n_sample, n_features) 

26 Input data. 

27 :param sample_weight: array-like, shape (n_samples,) 

28 The weights for each observation in X. 

29 :param centers: numpy array, shape (n_clusters, n_features) 

30 Cluster centers which data is assigned to. 

31 :param distances: numpy array, shape (n_samples,) 

32 Pre-allocated array in which distances are stored. 

33 :return: labels : numpy array, dtype=numpy.int, shape (n_samples,) 

34 Indices of clusters that samples are assigned to. 

35 :return: inertia : float 

36 Sum of squared distances of samples to their closest 

37 cluster center. 

38 """ 

39 n_samples = X.shape[0] 

40 if norm == 'l2': 

41 labels, mindist = pairwise_distances_argmin_min( 

42 X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True}) 

43 elif norm == 'l1': 

44 labels, mindist = pairwise_distances_argmin_min( 

45 X=X, Y=centers, metric='manhattan') 

46 else: # pragma no cover 

47 raise NotImplementedError( 

48 "Not implemented for norm '{}'.".format(norm)) 

49 # cython k-means code assumes int32 inputs 

50 labels = labels.astype(numpy.int32, copy=False) 

51 if n_samples == distances.shape[0]: 

52 # distances will be changed in-place 

53 distances[:] = mindist 

54 inertia = (mindist * sample_weight).sum() 

55 return labels, inertia 

56 

57 

58def _assign_labels_csr(X, sample_weight, x_squared_norms, centers, 

59 labels, distances): 

60 """Compute label assignment and inertia for a CSR input 

61 Return the inertia (sum of squared distances to the centers). 

62 """ 

63 if (distances is not None and 

64 distances.shape != (X.shape[0], )): 

65 raise ValueError( # pragma: no cover 

66 "Dimension mismatch for distance got {}, expecting {}." 

67 "".format(distances.shape, (X.shape[0], centers.shape[0]))) 

68 n_clusters = centers.shape[0] 

69 n_samples = X.shape[0] 

70 store_distances = 0 

71 inertia = 0.0 

72 

73 if centers.dtype == numpy.float32: 

74 center_squared_norms = numpy.zeros(n_clusters, dtype=numpy.float32) 

75 else: 

76 center_squared_norms = numpy.zeros(n_clusters, dtype=numpy.float64) 

77 

78 if n_samples == distances.shape[0]: 

79 store_distances = 1 

80 

81 for center_idx in range(n_clusters): 

82 center_squared_norms[center_idx] = numpy.dot( 

83 centers[center_idx, :], centers[center_idx, :]) 

84 

85 for sample_idx in range(n_samples): 

86 min_dist = -1 

87 for center_idx in range(n_clusters): 

88 dist = 0.0 

89 # hardcoded: minimize euclidean distance to cluster center: 

90 # ||a - b||^2 = ||a||^2 + ||b||^2 -2 <a, b> 

91 dist += X[sample_idx, :] @ centers[center_idx, :].reshape((-1, 1)) 

92 dist *= -2 

93 dist += center_squared_norms[center_idx] 

94 dist += x_squared_norms[sample_idx] 

95 dist *= sample_weight[sample_idx] 

96 if min_dist == -1 or dist < min_dist: 

97 min_dist = dist 

98 labels[sample_idx] = center_idx 

99 if store_distances: 

100 distances[sample_idx] = dist 

101 inertia += min_dist 

102 

103 return inertia 

104 

105 

106def _assign_labels_array(X, sample_weight, x_squared_norms, centers, 

107 labels, distances): 

108 """Compute label assignment and inertia for a dense array 

109 Return the inertia (sum of squared distances to the centers). 

110 """ 

111 n_clusters = centers.shape[0] 

112 n_samples = X.shape[0] 

113 store_distances = 0 

114 inertia = 0.0 

115 

116 dtype = numpy.float32 if centers.dtype == numpy.float32 else numpy.float64 

117 center_squared_norms = numpy.zeros(n_clusters, dtype=dtype) 

118 

119 if n_samples == distances.shape[0]: 

120 store_distances = 1 

121 

122 for center_idx in range(n_clusters): 

123 center_squared_norms[center_idx] = numpy.dot( 

124 centers[center_idx, :], centers[center_idx, :]) 

125 

126 for sample_idx in range(n_samples): 

127 min_dist = -1 

128 for center_idx in range(n_clusters): 

129 dist = 0.0 

130 # hardcoded: minimize euclidean distance to cluster center: 

131 # ||a - b||^2 = ||a||^2 + ||b||^2 -2 <a, b> 

132 dist += numpy.dot(X[sample_idx, :], centers[center_idx, :]) 

133 dist *= -2 

134 dist += center_squared_norms[center_idx] 

135 dist += x_squared_norms[sample_idx] 

136 dist *= sample_weight[sample_idx] 

137 if min_dist == -1 or dist < min_dist: 

138 min_dist = dist 

139 labels[sample_idx] = center_idx 

140 

141 if store_distances: 

142 distances[sample_idx] = min_dist 

143 inertia += min_dist 

144 

145 return inertia 

146 

147 

148def _labels_inertia_skl(X, sample_weight, x_squared_norms, centers, 

149 distances=None): 

150 """E step of the K-means EM algorithm. 

151 Compute the labels and the inertia of the given samples and centers. 

152 This will compute the distances in-place. 

153 

154 :param X: float64 array-like or CSR sparse matrix, shape (n_samples, n_features) 

155 The input samples to assign to the labels. 

156 :param sample_weight: array-like, shape (n_samples,) 

157 The weights for each observation in X. 

158 :param x_squared_norms: array, shape (n_samples,) 

159 Precomputed squared euclidean norm of each data point, to speed up 

160 computations. 

161 :param centers: float array, shape (k, n_features) 

162 The cluster centers. 

163 :param distances: float array, shape (n_samples,) 

164 Pre-allocated array to be filled in with each sample's distance 

165 to the closest center. 

166 :return: labels, int array of shape(n) 

167 The resulting assignment 

168 :return: inertia, float 

169 Sum of squared distances of samples to their closest cluster center. 

170 """ 

171 n_samples = X.shape[0] 

172 sample_weight = _check_sample_weight(sample_weight, X) 

173 # set the default value of centers to -1 to be able to detect any anomaly 

174 # easily 

175 labels = numpy.full(n_samples, -1, numpy.int32) 

176 if distances is None: 

177 distances = numpy.zeros(shape=(0,), dtype=X.dtype) 

178 # distances will be changed in-place 

179 if issparse(X): 

180 inertia = _assign_labels_csr( 

181 X, sample_weight, x_squared_norms, centers, labels, 

182 distances=distances) 

183 else: 

184 inertia = _assign_labels_array( 

185 X, sample_weight, x_squared_norms, centers, labels, 

186 distances=distances) 

187 return labels, inertia 

188 

189 

190def _centers_dense(X, sample_weight, labels, n_clusters, distances): 

191 """ 

192 M step of the K-means EM algorithm 

193 Computation of cluster centers / means. 

194 

195 :param X: array-like, shape (n_samples, n_features) 

196 :param sample_weight: array-like, shape (n_samples,) 

197 The weights for each observation in X. 

198 :param labels: array of integers, shape (n_samples) 

199 Current label assignment 

200 :param n_clusters: int 

201 Number of desired clusters 

202 :param distances: array-like, shape (n_samples) 

203 Distance to closest cluster for each sample. 

204 :return: centers : array, shape (n_clusters, n_features) 

205 The resulting centers 

206 """ 

207 n_samples = X.shape[0] 

208 n_features = X.shape[1] 

209 

210 dtype = X.dtype 

211 centers = numpy.zeros((n_clusters, n_features), dtype=dtype) 

212 weight_in_cluster = numpy.zeros((n_clusters,), dtype=dtype) 

213 

214 for i in range(n_samples): 

215 c = labels[i] 

216 weight_in_cluster[c] += sample_weight[i] 

217 empty_clusters = numpy.where(weight_in_cluster == 0)[0] 

218 # maybe also relocate small clusters? 

219 

220 if distances is not None and len(empty_clusters): 

221 # find points to reassign empty clusters to 

222 far_from_centers = distances.argsort()[::-1] 

223 

224 for i, cluster_id in enumerate(empty_clusters): 

225 far_index = far_from_centers[i] 

226 new_center = X[far_index] * sample_weight[far_index] 

227 centers[cluster_id] = new_center 

228 weight_in_cluster[cluster_id] = sample_weight[far_index] 

229 

230 for i in range(n_samples): 

231 for j in range(n_features): 

232 centers[labels[i], j] += X[i, j] * sample_weight[i] 

233 

234 centers /= weight_in_cluster[:, numpy.newaxis] 

235 

236 return centers 

237 

238 

239def _centers_sparse(X, sample_weight, labels, n_clusters, distances): 

240 """ 

241 M step of the K-means EM algorithm 

242 Computation of cluster centers / means. 

243 

244 :param X: scipy.sparse.csr_matrix, shape (n_samples, n_features) 

245 :param sample_weight: array-like, shape (n_samples,) 

246 The weights for each observation in X. 

247 :param labels: array of integers, shape (n_samples) 

248 Current label assignment 

249 :param n_clusters: int 

250 Number of desired clusters 

251 :param distances: array-like, shape (n_samples) 

252 Distance to closest cluster for each sample. 

253 :return: centers, array, shape (n_clusters, n_features) 

254 The resulting centers 

255 """ 

256 n_samples = X.shape[0] 

257 n_features = X.shape[1] 

258 

259 data = X.data 

260 indices = X.indices 

261 indptr = X.indptr 

262 

263 dtype = X.dtype 

264 centers = numpy.zeros((n_clusters, n_features), dtype=dtype) 

265 weight_in_cluster = numpy.zeros((n_clusters,), dtype=dtype) 

266 for i in range(n_samples): 

267 c = labels[i] 

268 weight_in_cluster[c] += sample_weight[i] 

269 empty_clusters = numpy.where(weight_in_cluster == 0)[0] 

270 n_empty_clusters = empty_clusters.shape[0] 

271 

272 # maybe also relocate small clusters? 

273 

274 if n_empty_clusters > 0: 

275 # find points to reassign empty clusters to 

276 far_from_centers = distances.argsort()[::-1][:n_empty_clusters] 

277 assign_rows_csr(X, far_from_centers, empty_clusters, centers) 

278 

279 for i in range(n_empty_clusters): 

280 weight_in_cluster[empty_clusters[i]] = 1 

281 

282 for i in range(labels.shape[0]): 

283 curr_label = labels[i] 

284 for ind in range(indptr[i], indptr[i + 1]): 

285 j = indices[ind] 

286 centers[curr_label, j] += data[ind] * sample_weight[i] 

287 

288 centers /= weight_in_cluster[:, numpy.newaxis] 

289 

290 return centers