Coverage for mlinsights/mlmodel/_kmeans_022.py: 95% (132 statements)


# pylint: disable=C0302
"""
@file
@brief Implements k-means with norms L1 and L2.
"""
import numpy
from scipy.sparse import issparse
# Source: https://github.com/scikit-learn/scikit-learn/blob/95d4f0841d57e8b5f6b2a570312e9d832e69debc/sklearn/cluster/_k_means_fast.pyx
from sklearn.utils.sparsefuncs_fast import assign_rows_csr  # pylint: disable=W0611,E0611
try:
    from sklearn.cluster._kmeans import _check_sample_weight
except ImportError:  # pragma: no cover
    from sklearn.cluster._kmeans import (
        _check_normalize_sample_weight as _check_sample_weight)
from sklearn.metrics.pairwise import pairwise_distances_argmin_min

def _labels_inertia_precompute_dense(norm, X, sample_weight, centers, distances):
    """
    Computes labels and inertia using a full distance matrix.

    This will overwrite the 'distances' array in-place.

    :param norm: 'L1' or 'L2'
    :param X: numpy array, shape (n_samples, n_features)
        Input data.
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param centers: numpy array, shape (n_clusters, n_features)
        Cluster centers which data is assigned to.
    :param distances: numpy array, shape (n_samples,)
        Pre-allocated array in which distances are stored.
    :return: labels : numpy array, dtype=numpy.int32, shape (n_samples,)
        Indices of clusters that samples are assigned to.
    :return: inertia : float
        Sum of squared distances of samples to their closest
        cluster center.
    """
    n_samples = X.shape[0]
    if norm == 'L2':
        labels, mindist = pairwise_distances_argmin_min(
            X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})
    elif norm == 'L1':
        labels, mindist = pairwise_distances_argmin_min(
            X=X, Y=centers, metric='manhattan')
    else:  # pragma: no cover
        raise NotImplementedError(
            f"Not implemented for norm '{norm}'.")
    # cython k-means code assumes int32 inputs
    labels = labels.astype(numpy.int32, copy=False)
    if n_samples == distances.shape[0]:
        # distances will be changed in-place
        distances[:] = mindist
    inertia = (mindist * sample_weight).sum()
    return labels, inertia
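# Illustrative usage sketch, not part of the original module: it shows how the
# helper above could be called on a small random dense dataset, reusing two
# rows of the data as centers. The name `_example_labels_inertia_precompute_dense`
# is hypothetical and only serves as documentation.
def _example_labels_inertia_precompute_dense(norm='L2'):
    rnd = numpy.random.RandomState(0)
    X = rnd.randn(20, 3)
    centers = X[:2].copy()
    sample_weight = numpy.ones(X.shape[0], dtype=X.dtype)
    # pre-allocated buffer, overwritten in-place by the helper
    distances = numpy.zeros(X.shape[0], dtype=X.dtype)
    labels, inertia = _labels_inertia_precompute_dense(
        norm, X, sample_weight, centers, distances)
    return labels, inertia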


def _assign_labels_csr(X, sample_weight, x_squared_norms, centers,
                       labels, distances):
    """Compute label assignment and inertia for a CSR input.
    Return the inertia (sum of squared distances to the centers).
    """
    if (distances is not None and
            distances.shape != (X.shape[0], )):
        raise ValueError(  # pragma: no cover
            f"Dimension mismatch for distances: got "
            f"{distances.shape}, expecting "
            f"{(X.shape[0], )}.")
    n_clusters = centers.shape[0]
    n_samples = X.shape[0]
    store_distances = 0
    inertia = 0.0

    if centers.dtype == numpy.float32:
        center_squared_norms = numpy.zeros(n_clusters, dtype=numpy.float32)
    else:
        center_squared_norms = numpy.zeros(n_clusters, dtype=numpy.float64)

    if n_samples == distances.shape[0]:
        store_distances = 1

    for center_idx in range(n_clusters):
        center_squared_norms[center_idx] = numpy.dot(
            centers[center_idx, :], centers[center_idx, :])

    for sample_idx in range(n_samples):
        min_dist = -1
        for center_idx in range(n_clusters):
            dist = 0.0
            # hardcoded: minimize euclidean distance to cluster center:
            # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 <a, b>
            dist += X[sample_idx, :] @ centers[center_idx, :].reshape((-1, 1))
            dist *= -2
            dist += center_squared_norms[center_idx]
            dist += x_squared_norms[sample_idx]
            dist *= sample_weight[sample_idx]
            if min_dist == -1 or dist < min_dist:
                min_dist = dist
                labels[sample_idx] = center_idx
                if store_distances:
                    distances[sample_idx] = dist
        inertia += min_dist

    return inertia
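# Illustrative usage sketch, not part of the original module: calls the CSR
# helper above on a small sparse matrix built from thresholded random data.
# The name `_example_assign_labels_csr` is hypothetical.
def _example_assign_labels_csr():
    from scipy.sparse import csr_matrix
    rnd = numpy.random.RandomState(0)
    dense = rnd.rand(10, 4)
    dense[dense < 0.5] = 0.0            # make the matrix genuinely sparse
    X = csr_matrix(dense)
    centers = dense[:3].copy()
    sample_weight = numpy.ones(dense.shape[0])
    x_squared_norms = (dense ** 2).sum(axis=1)
    labels = numpy.full(dense.shape[0], -1, dtype=numpy.int32)
    distances = numpy.zeros(dense.shape[0], dtype=dense.dtype)
    inertia = _assign_labels_csr(
        X, sample_weight, x_squared_norms, centers, labels,
        distances=distances)
    return labels, inertia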


def _assign_labels_array(X, sample_weight, x_squared_norms, centers,
                         labels, distances):
    """Compute label assignment and inertia for a dense array.
    Return the inertia (sum of squared distances to the centers).
    """
    n_clusters = centers.shape[0]
    n_samples = X.shape[0]
    store_distances = 0
    inertia = 0.0

    dtype = numpy.float32 if centers.dtype == numpy.float32 else numpy.float64
    center_squared_norms = numpy.zeros(n_clusters, dtype=dtype)

    if n_samples == distances.shape[0]:
        store_distances = 1

    for center_idx in range(n_clusters):
        center_squared_norms[center_idx] = numpy.dot(
            centers[center_idx, :], centers[center_idx, :])

    for sample_idx in range(n_samples):
        min_dist = -1
        for center_idx in range(n_clusters):
            dist = 0.0
            # hardcoded: minimize euclidean distance to cluster center:
            # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 <a, b>
            dist += numpy.dot(X[sample_idx, :], centers[center_idx, :])
            dist *= -2
            dist += center_squared_norms[center_idx]
            dist += x_squared_norms[sample_idx]
            dist *= sample_weight[sample_idx]
            if min_dist == -1 or dist < min_dist:
                min_dist = dist
                labels[sample_idx] = center_idx

        if store_distances:
            distances[sample_idx] = min_dist
        inertia += min_dist

    return inertia
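# Illustrative usage sketch, not part of the original module: the same call as
# the CSR case but on a dense array. The name `_example_assign_labels_array`
# is hypothetical.
def _example_assign_labels_array():
    rnd = numpy.random.RandomState(0)
    X = rnd.randn(15, 2)
    centers = X[:3].copy()
    sample_weight = numpy.ones(X.shape[0])
    x_squared_norms = (X ** 2).sum(axis=1)
    labels = numpy.full(X.shape[0], -1, dtype=numpy.int32)
    distances = numpy.zeros(X.shape[0], dtype=X.dtype)
    inertia = _assign_labels_array(
        X, sample_weight, x_squared_norms, centers, labels,
        distances=distances)
    return labels, inertia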


def _labels_inertia_skl(X, sample_weight, x_squared_norms, centers,
                        distances=None):
    """E step of the K-means EM algorithm.
    Compute the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.

    :param X: float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The input samples to assign to the labels.
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param x_squared_norms: array, shape (n_samples,)
        Precomputed squared euclidean norm of each data point, to speed up
        computations.
    :param centers: float array, shape (k, n_features)
        The cluster centers.
    :param distances: float array, shape (n_samples,)
        Pre-allocated array to be filled in with each sample's distance
        to the closest center.
    :return: labels, int array of shape (n_samples,)
        The resulting assignment
    :return: inertia, float
        Sum of squared distances of samples to their closest cluster center.
    """
    n_samples = X.shape[0]
    sample_weight = _check_sample_weight(sample_weight, X)
    # set the default value of labels to -1 to be able to detect any anomaly
    # easily
    labels = numpy.full(n_samples, -1, numpy.int32)
    if distances is None:
        distances = numpy.zeros(shape=(0,), dtype=X.dtype)
    # distances will be changed in-place
    if issparse(X):
        inertia = _assign_labels_csr(
            X, sample_weight, x_squared_norms, centers, labels,
            distances=distances)
    else:
        inertia = _assign_labels_array(
            X, sample_weight, x_squared_norms, centers, labels,
            distances=distances)
    return labels, inertia
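# Illustrative usage sketch, not part of the original module: runs the E step
# end to end on dense data; the same call works with a CSR matrix thanks to
# the issparse dispatch above. The name `_example_labels_inertia_skl` is
# hypothetical.
def _example_labels_inertia_skl():
    rnd = numpy.random.RandomState(0)
    X = rnd.randn(30, 4)
    centers = X[:3].copy()
    sample_weight = numpy.ones(X.shape[0])
    x_squared_norms = (X ** 2).sum(axis=1)
    distances = numpy.zeros(X.shape[0], dtype=X.dtype)
    labels, inertia = _labels_inertia_skl(
        X, sample_weight, x_squared_norms, centers, distances=distances)
    return labels, inertia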


def _centers_dense(X, sample_weight, labels, n_clusters, distances):
    """
    M step of the K-means EM algorithm:
    computation of cluster centers / means.

    :param X: array-like, shape (n_samples, n_features)
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param labels: array of integers, shape (n_samples,)
        Current label assignment
    :param n_clusters: int
        Number of desired clusters
    :param distances: array-like, shape (n_samples,)
        Distance to closest cluster for each sample.
    :return: centers : array, shape (n_clusters, n_features)
        The resulting centers
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]

    dtype = X.dtype
    centers = numpy.zeros((n_clusters, n_features), dtype=dtype)
    weight_in_cluster = numpy.zeros((n_clusters,), dtype=dtype)

    for i in range(n_samples):
        c = labels[i]
        weight_in_cluster[c] += sample_weight[i]
    empty_clusters = numpy.where(weight_in_cluster == 0)[0]
    # maybe also relocate small clusters?

    if distances is not None and len(empty_clusters):
        # find points to reassign empty clusters to
        far_from_centers = distances.argsort()[::-1]

        for i, cluster_id in enumerate(empty_clusters):
            far_index = far_from_centers[i]
            new_center = X[far_index] * sample_weight[far_index]
            centers[cluster_id] = new_center
            weight_in_cluster[cluster_id] = sample_weight[far_index]

    for i in range(n_samples):
        for j in range(n_features):
            centers[labels[i], j] += X[i, j] * sample_weight[i]

    centers /= weight_in_cluster[:, numpy.newaxis]

    return centers
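# Illustrative usage sketch, not part of the original module: one E step /
# M step round on dense data, recomputing the centers from the labels
# returned by _labels_inertia_skl. The name `_example_centers_dense` is
# hypothetical.
def _example_centers_dense():
    rnd = numpy.random.RandomState(0)
    X = rnd.randn(30, 4)
    centers = X[:3].copy()
    sample_weight = numpy.ones(X.shape[0])
    x_squared_norms = (X ** 2).sum(axis=1)
    distances = numpy.zeros(X.shape[0], dtype=X.dtype)
    labels, _ = _labels_inertia_skl(
        X, sample_weight, x_squared_norms, centers, distances=distances)
    new_centers = _centers_dense(X, sample_weight, labels, centers.shape[0],
                                 distances)
    return new_centers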


def _centers_sparse(X, sample_weight, labels, n_clusters, distances):
    """
    M step of the K-means EM algorithm:
    computation of cluster centers / means.

    :param X: scipy.sparse.csr_matrix, shape (n_samples, n_features)
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param labels: array of integers, shape (n_samples,)
        Current label assignment
    :param n_clusters: int
        Number of desired clusters
    :param distances: array-like, shape (n_samples,)
        Distance to closest cluster for each sample.
    :return: centers, array, shape (n_clusters, n_features)
        The resulting centers
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]

    data = X.data
    indices = X.indices
    indptr = X.indptr

    dtype = X.dtype
    centers = numpy.zeros((n_clusters, n_features), dtype=dtype)
    weight_in_cluster = numpy.zeros((n_clusters,), dtype=dtype)
    for i in range(n_samples):
        c = labels[i]
        weight_in_cluster[c] += sample_weight[i]
    empty_clusters = numpy.where(weight_in_cluster == 0)[0]
    n_empty_clusters = empty_clusters.shape[0]

    # maybe also relocate small clusters?

    if n_empty_clusters > 0:
        # find points to reassign empty clusters to
        far_from_centers = distances.argsort()[::-1][:n_empty_clusters]
        assign_rows_csr(X, far_from_centers, empty_clusters, centers)

        for i in range(n_empty_clusters):
            weight_in_cluster[empty_clusters[i]] = 1

    for i in range(labels.shape[0]):
        curr_label = labels[i]
        for ind in range(indptr[i], indptr[i + 1]):
            j = indices[ind]
            centers[curr_label, j] += data[ind] * sample_weight[i]

    centers /= weight_in_cluster[:, numpy.newaxis]

    return centers
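# Illustrative usage sketch, not part of the original module: recomputes the
# centers from a CSR matrix after an E step run on the dense copy of the same
# data (done on the dense copy purely to keep the example short). The name
# `_example_centers_sparse` is hypothetical.
def _example_centers_sparse():
    from scipy.sparse import csr_matrix
    rnd = numpy.random.RandomState(0)
    dense = rnd.rand(25, 5)
    dense[dense < 0.5] = 0.0            # make the matrix genuinely sparse
    X = csr_matrix(dense)
    centers = dense[:4].copy()
    sample_weight = numpy.ones(dense.shape[0])
    x_squared_norms = (dense ** 2).sum(axis=1)
    distances = numpy.zeros(dense.shape[0], dtype=dense.dtype)
    labels, _ = _labels_inertia_skl(
        dense, sample_weight, x_squared_norms, centers, distances=distances)
    new_centers = _centers_sparse(X, sample_weight, labels, centers.shape[0],
                                  distances)
    return new_centers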