Coverage for mlinsights/mlmodel/_kmeans

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1# pylint: disable=C0302

2"""

3@file

4@brief Implements k-means with norms L1 and L2.

5"""

6import numpy

7from scipy.sparse import issparse

8# Source: https://github.com/scikit-learn/scikit-learn/blob/95d4f0841d57e8b5f6b2a570312e9d832e69debc/sklearn/cluster/_k_means_fast.pyx

9from sklearn.utils.sparsefuncs_fast import assign_rows_csr # pylint: disable=W0611,E0611

10try:

11 from sklearn.cluster._kmeans import _check_sample_weight

12except ImportError: # pragma: no cover

13 from sklearn.cluster._kmeans import (

14 _check_normalize_sample_weight as _check_sample_weight)

15from sklearn.metrics.pairwise import pairwise_distances_argmin_min

18def _labels_inertia_precompute_dense(norm, X, sample_weight, centers, distances):

19 """

20 Computes labels and inertia using a full distance matrix.

22 This will overwrite the 'distances' array in-place.

24 :param norm: 'l1' or 'l2'

25 :param X: numpy array, shape (n_sample, n_features)

26 Input data.

27 :param sample_weight: array-like, shape (n_samples,)

28 The weights for each observation in X.

29 :param centers: numpy array, shape (n_clusters, n_features)

30 Cluster centers which data is assigned to.

31 :param distances: numpy array, shape (n_samples,)

32 Pre-allocated array in which distances are stored.

33 :return: labels : numpy array, dtype=numpy.int, shape (n_samples,)

34 Indices of clusters that samples are assigned to.

35 :return: inertia : float

36 Sum of squared distances of samples to their closest

37 cluster center.

38 """

39 n_samples = X.shape[0]

40 if norm == 'l2':

41 labels, mindist = pairwise_distances_argmin_min(

42 X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})

43 elif norm == 'l1':

44 labels, mindist = pairwise_distances_argmin_min(

45 X=X, Y=centers, metric='manhattan')

46 else: # pragma no cover

47 raise NotImplementedError(

48 "Not implemented for norm '{}'.".format(norm))

49 # cython k-means code assumes int32 inputs

50 labels = labels.astype(numpy.int32, copy=False)

51 if n_samples == distances.shape[0]:

52 # distances will be changed in-place

53 distances[:] = mindist

54 inertia = (mindist * sample_weight).sum()

55 return labels, inertia

58def _assign_labels_csr(X, sample_weight, x_squared_norms, centers,

59 labels, distances):

60 """Compute label assignment and inertia for a CSR input

61 Return the inertia (sum of squared distances to the centers).

62 """

63 if (distances is not None and

64 distances.shape != (X.shape[0], )):

65 raise ValueError( # pragma: no cover

66 "Dimension mismatch for distance got {}, expecting {}."

67 "".format(distances.shape, (X.shape[0], centers.shape[0])))

68 n_clusters = centers.shape[0]

69 n_samples = X.shape[0]

70 store_distances = 0

71 inertia = 0.0

73 if centers.dtype == numpy.float32:

74 center_squared_norms = numpy.zeros(n_clusters, dtype=numpy.float32)

75 else:

76 center_squared_norms = numpy.zeros(n_clusters, dtype=numpy.float64)

78 if n_samples == distances.shape[0]:

79 store_distances = 1

81 for center_idx in range(n_clusters):

82 center_squared_norms[center_idx] = numpy.dot(

83 centers[center_idx, :], centers[center_idx, :])

85 for sample_idx in range(n_samples):

86 min_dist = -1

87 for center_idx in range(n_clusters):

88 dist = 0.0

89 # hardcoded: minimize euclidean distance to cluster center:

90 # ||a - b||^2 = ||a||^2 + ||b||^2 -2 <a, b>

91 dist += X[sample_idx, :] @ centers[center_idx, :].reshape((-1, 1))

92 dist *= -2

93 dist += center_squared_norms[center_idx]

94 dist += x_squared_norms[sample_idx]

95 dist *= sample_weight[sample_idx]

96 if min_dist == -1 or dist < min_dist:

97 min_dist = dist

98 labels[sample_idx] = center_idx

99 if store_distances:

100 distances[sample_idx] = dist

101 inertia += min_dist

102

103 return inertia

104

105

106def _assign_labels_array(X, sample_weight, x_squared_norms, centers,

107 labels, distances):

108 """Compute label assignment and inertia for a dense array

109 Return the inertia (sum of squared distances to the centers).

110 """

111 n_clusters = centers.shape[0]

112 n_samples = X.shape[0]

113 store_distances = 0

114 inertia = 0.0

115

116 dtype = numpy.float32 if centers.dtype == numpy.float32 else numpy.float64

117 center_squared_norms = numpy.zeros(n_clusters, dtype=dtype)

118

119 if n_samples == distances.shape[0]:

120 store_distances = 1

121

122 for center_idx in range(n_clusters):

123 center_squared_norms[center_idx] = numpy.dot(

124 centers[center_idx, :], centers[center_idx, :])

125

126 for sample_idx in range(n_samples):

127 min_dist = -1

128 for center_idx in range(n_clusters):

129 dist = 0.0

130 # hardcoded: minimize euclidean distance to cluster center:

131 # ||a - b||^2 = ||a||^2 + ||b||^2 -2 <a, b>

132 dist += numpy.dot(X[sample_idx, :], centers[center_idx, :])

133 dist *= -2

134 dist += center_squared_norms[center_idx]

135 dist += x_squared_norms[sample_idx]

136 dist *= sample_weight[sample_idx]

137 if min_dist == -1 or dist < min_dist:

138 min_dist = dist

139 labels[sample_idx] = center_idx

140

141 if store_distances:

142 distances[sample_idx] = min_dist

143 inertia += min_dist

144

145 return inertia

146

147

148def _labels_inertia_skl(X, sample_weight, x_squared_norms, centers,

149 distances=None):

150 """E step of the K-means EM algorithm.

151 Compute the labels and the inertia of the given samples and centers.

152 This will compute the distances in-place.

153

154 :param X: float64 array-like or CSR sparse matrix, shape (n_samples, n_features)

155 The input samples to assign to the labels.

156 :param sample_weight: array-like, shape (n_samples,)

157 The weights for each observation in X.

158 :param x_squared_norms: array, shape (n_samples,)

159 Precomputed squared euclidean norm of each data point, to speed up

160 computations.

161 :param centers: float array, shape (k, n_features)

162 The cluster centers.

163 :param distances: float array, shape (n_samples,)

164 Pre-allocated array to be filled in with each sample's distance

165 to the closest center.

166 :return: labels, int array of shape(n)

167 The resulting assignment

168 :return: inertia, float

169 Sum of squared distances of samples to their closest cluster center.

170 """

171 n_samples = X.shape[0]

172 sample_weight = _check_sample_weight(sample_weight, X)

173 # set the default value of centers to -1 to be able to detect any anomaly

174 # easily

175 labels = numpy.full(n_samples, -1, numpy.int32)

176 if distances is None:

177 distances = numpy.zeros(shape=(0,), dtype=X.dtype)

178 # distances will be changed in-place

179 if issparse(X):

180 inertia = _assign_labels_csr(

181 X, sample_weight, x_squared_norms, centers, labels,

182 distances=distances)

183 else:

184 inertia = _assign_labels_array(

185 X, sample_weight, x_squared_norms, centers, labels,

186 distances=distances)

187 return labels, inertia

188

189

190def _centers_dense(X, sample_weight, labels, n_clusters, distances):

191 """

192 M step of the K-means EM algorithm

193 Computation of cluster centers / means.

194

195 :param X: array-like, shape (n_samples, n_features)

196 :param sample_weight: array-like, shape (n_samples,)

197 The weights for each observation in X.

198 :param labels: array of integers, shape (n_samples)

199 Current label assignment

200 :param n_clusters: int

201 Number of desired clusters

202 :param distances: array-like, shape (n_samples)

203 Distance to closest cluster for each sample.

204 :return: centers : array, shape (n_clusters, n_features)

205 The resulting centers

206 """

207 n_samples = X.shape[0]

208 n_features = X.shape[1]

209

210 dtype = X.dtype

211 centers = numpy.zeros((n_clusters, n_features), dtype=dtype)

212 weight_in_cluster = numpy.zeros((n_clusters,), dtype=dtype)

213

214 for i in range(n_samples):

215 c = labels[i]

216 weight_in_cluster[c] += sample_weight[i]

217 empty_clusters = numpy.where(weight_in_cluster == 0)[0]

218 # maybe also relocate small clusters?

219

220 if distances is not None and len(empty_clusters):

221 # find points to reassign empty clusters to

222 far_from_centers = distances.argsort()[::-1]

223

224 for i, cluster_id in enumerate(empty_clusters):

225 far_index = far_from_centers[i]

226 new_center = X[far_index] * sample_weight[far_index]

227 centers[cluster_id] = new_center

228 weight_in_cluster[cluster_id] = sample_weight[far_index]

229

230 for i in range(n_samples):

231 for j in range(n_features):

232 centers[labels[i], j] += X[i, j] * sample_weight[i]

233

234 centers /= weight_in_cluster[:, numpy.newaxis]

235

236 return centers

237

238

239def _centers_sparse(X, sample_weight, labels, n_clusters, distances):

240 """

241 M step of the K-means EM algorithm

242 Computation of cluster centers / means.

243

244 :param X: scipy.sparse.csr_matrix, shape (n_samples, n_features)

245 :param sample_weight: array-like, shape (n_samples,)

246 The weights for each observation in X.

247 :param labels: array of integers, shape (n_samples)

248 Current label assignment

249 :param n_clusters: int

250 Number of desired clusters

251 :param distances: array-like, shape (n_samples)

252 Distance to closest cluster for each sample.

253 :return: centers, array, shape (n_clusters, n_features)

254 The resulting centers

255 """

256 n_samples = X.shape[0]

257 n_features = X.shape[1]

258

259 data = X.data

260 indices = X.indices

261 indptr = X.indptr

262

263 dtype = X.dtype

264 centers = numpy.zeros((n_clusters, n_features), dtype=dtype)

265 weight_in_cluster = numpy.zeros((n_clusters,), dtype=dtype)

266 for i in range(n_samples):

267 c = labels[i]

268 weight_in_cluster[c] += sample_weight[i]

269 empty_clusters = numpy.where(weight_in_cluster == 0)[0]

270 n_empty_clusters = empty_clusters.shape[0]

271

272 # maybe also relocate small clusters?

273

274 if n_empty_clusters > 0:

275 # find points to reassign empty clusters to

276 far_from_centers = distances.argsort()[::-1][:n_empty_clusters]

277 assign_rows_csr(X, far_from_centers, empty_clusters, centers)

278

279 for i in range(n_empty_clusters):

280 weight_in_cluster[empty_clusters[i]] = 1

281

282 for i in range(labels.shape[0]):

283 curr_label = labels[i]

284 for ind in range(indptr[i], indptr[i + 1]):

285 j = indices[ind]

286 centers[curr_label, j] += data[ind] * sample_weight[i]

287

288 centers /= weight_in_cluster[:, numpy.newaxis]

289

290 return centers

Coverage for mlinsights/mlmodel/_kmeans_022.py : 95%

132 statements

Coverage for mlinsights/mlmodel/_kmeans_022.py : 95%

132 statements 126 run 6 missing 5 excluded

132 statements