Coverage for mlinsights/mlmodel/_kmeans_022.py: 95%
132 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-28 08:46 +0100
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-28 08:46 +0100
1# pylint: disable=C0302
2"""
3@file
4@brief Implements k-means with norms L1 and L2.
5"""
6import numpy
7from scipy.sparse import issparse
8# Source: https://github.com/scikit-learn/scikit-learn/blob/95d4f0841d57e8b5f6b2a570312e9d832e69debc/sklearn/cluster/_k_means_fast.pyx
9from sklearn.utils.sparsefuncs_fast import assign_rows_csr # pylint: disable=W0611,E0611
10try:
11 from sklearn.cluster._kmeans import _check_sample_weight
12except ImportError: # pragma: no cover
13 from sklearn.cluster._kmeans import (
14 _check_normalize_sample_weight as _check_sample_weight)
15from sklearn.metrics.pairwise import pairwise_distances_argmin_min
18def _labels_inertia_precompute_dense(norm, X, sample_weight, centers, distances):
19 """
20 Computes labels and inertia using a full distance matrix.
22 This will overwrite the 'distances' array in-place.
24 :param norm: 'L1' or 'L2'
25 :param X: numpy array, shape (n_sample, n_features)
26 Input data.
27 :param sample_weight: array-like, shape (n_samples,)
28 The weights for each observation in X.
29 :param centers: numpy array, shape (n_clusters, n_features)
30 Cluster centers which data is assigned to.
31 :param distances: numpy array, shape (n_samples,)
32 Pre-allocated array in which distances are stored.
33 :return: labels : numpy array, dtype=numpy.int, shape (n_samples,)
34 Indices of clusters that samples are assigned to.
35 :return: inertia : float
36 Sum of squared distances of samples to their closest
37 cluster center.
38 """
39 n_samples = X.shape[0]
40 if norm == 'L2':
41 labels, mindist = pairwise_distances_argmin_min(
42 X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})
43 elif norm == 'L1':
44 labels, mindist = pairwise_distances_argmin_min(
45 X=X, Y=centers, metric='manhattan')
46 else: # pragma no cover
47 raise NotImplementedError(
48 f"Not implemented for norm '{norm}'.")
49 # cython k-means code assumes int32 inputs
50 labels = labels.astype(numpy.int32, copy=False)
51 if n_samples == distances.shape[0]:
52 # distances will be changed in-place
53 distances[:] = mindist
54 inertia = (mindist * sample_weight).sum()
55 return labels, inertia
58def _assign_labels_csr(X, sample_weight, x_squared_norms, centers,
59 labels, distances):
60 """Compute label assignment and inertia for a CSR input
61 Return the inertia (sum of squared distances to the centers).
62 """
63 if (distances is not None and
64 distances.shape != (X.shape[0], )):
65 raise ValueError( # pragma: no cover
66 f"Dimension mismatch for distance got "
67 f"{distances.shape}, expecting "
68 f"{(X.shape[0], centers.shape[0])}.")
69 n_clusters = centers.shape[0]
70 n_samples = X.shape[0]
71 store_distances = 0
72 inertia = 0.0
74 if centers.dtype == numpy.float32:
75 center_squared_norms = numpy.zeros(n_clusters, dtype=numpy.float32)
76 else:
77 center_squared_norms = numpy.zeros(n_clusters, dtype=numpy.float64)
79 if n_samples == distances.shape[0]:
80 store_distances = 1
82 for center_idx in range(n_clusters):
83 center_squared_norms[center_idx] = numpy.dot(
84 centers[center_idx, :], centers[center_idx, :])
86 for sample_idx in range(n_samples):
87 min_dist = -1
88 for center_idx in range(n_clusters):
89 dist = 0.0
90 # hardcoded: minimize euclidean distance to cluster center:
91 # ||a - b||^2 = ||a||^2 + ||b||^2 -2 <a, b>
92 dist += X[sample_idx, :] @ centers[center_idx, :].reshape((-1, 1))
93 dist *= -2
94 dist += center_squared_norms[center_idx]
95 dist += x_squared_norms[sample_idx]
96 dist *= sample_weight[sample_idx]
97 if min_dist == -1 or dist < min_dist:
98 min_dist = dist
99 labels[sample_idx] = center_idx
100 if store_distances:
101 distances[sample_idx] = dist
102 inertia += min_dist
104 return inertia
107def _assign_labels_array(X, sample_weight, x_squared_norms, centers,
108 labels, distances):
109 """Compute label assignment and inertia for a dense array
110 Return the inertia (sum of squared distances to the centers).
111 """
112 n_clusters = centers.shape[0]
113 n_samples = X.shape[0]
114 store_distances = 0
115 inertia = 0.0
117 dtype = numpy.float32 if centers.dtype == numpy.float32 else numpy.float64
118 center_squared_norms = numpy.zeros(n_clusters, dtype=dtype)
120 if n_samples == distances.shape[0]:
121 store_distances = 1
123 for center_idx in range(n_clusters):
124 center_squared_norms[center_idx] = numpy.dot(
125 centers[center_idx, :], centers[center_idx, :])
127 for sample_idx in range(n_samples):
128 min_dist = -1
129 for center_idx in range(n_clusters):
130 dist = 0.0
131 # hardcoded: minimize euclidean distance to cluster center:
132 # ||a - b||^2 = ||a||^2 + ||b||^2 -2 <a, b>
133 dist += numpy.dot(X[sample_idx, :], centers[center_idx, :])
134 dist *= -2
135 dist += center_squared_norms[center_idx]
136 dist += x_squared_norms[sample_idx]
137 dist *= sample_weight[sample_idx]
138 if min_dist == -1 or dist < min_dist:
139 min_dist = dist
140 labels[sample_idx] = center_idx
142 if store_distances:
143 distances[sample_idx] = min_dist
144 inertia += min_dist
146 return inertia
149def _labels_inertia_skl(X, sample_weight, x_squared_norms, centers,
150 distances=None):
151 """E step of the K-means EM algorithm.
152 Compute the labels and the inertia of the given samples and centers.
153 This will compute the distances in-place.
155 :param X: float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
156 The input samples to assign to the labels.
157 :param sample_weight: array-like, shape (n_samples,)
158 The weights for each observation in X.
159 :param x_squared_norms: array, shape (n_samples,)
160 Precomputed squared euclidean norm of each data point, to speed up
161 computations.
162 :param centers: float array, shape (k, n_features)
163 The cluster centers.
164 :param distances: float array, shape (n_samples,)
165 Pre-allocated array to be filled in with each sample's distance
166 to the closest center.
167 :return: labels, int array of shape(n)
168 The resulting assignment
169 :return: inertia, float
170 Sum of squared distances of samples to their closest cluster center.
171 """
172 n_samples = X.shape[0]
173 sample_weight = _check_sample_weight(sample_weight, X)
174 # set the default value of centers to -1 to be able to detect any anomaly
175 # easily
176 labels = numpy.full(n_samples, -1, numpy.int32)
177 if distances is None:
178 distances = numpy.zeros(shape=(0,), dtype=X.dtype)
179 # distances will be changed in-place
180 if issparse(X):
181 inertia = _assign_labels_csr(
182 X, sample_weight, x_squared_norms, centers, labels,
183 distances=distances)
184 else:
185 inertia = _assign_labels_array(
186 X, sample_weight, x_squared_norms, centers, labels,
187 distances=distances)
188 return labels, inertia
191def _centers_dense(X, sample_weight, labels, n_clusters, distances):
192 """
193 M step of the K-means EM algorithm
194 Computation of cluster centers / means.
196 :param X: array-like, shape (n_samples, n_features)
197 :param sample_weight: array-like, shape (n_samples,)
198 The weights for each observation in X.
199 :param labels: array of integers, shape (n_samples)
200 Current label assignment
201 :param n_clusters: int
202 Number of desired clusters
203 :param distances: array-like, shape (n_samples)
204 Distance to closest cluster for each sample.
205 :return: centers : array, shape (n_clusters, n_features)
206 The resulting centers
207 """
208 n_samples = X.shape[0]
209 n_features = X.shape[1]
211 dtype = X.dtype
212 centers = numpy.zeros((n_clusters, n_features), dtype=dtype)
213 weight_in_cluster = numpy.zeros((n_clusters,), dtype=dtype)
215 for i in range(n_samples):
216 c = labels[i]
217 weight_in_cluster[c] += sample_weight[i]
218 empty_clusters = numpy.where(weight_in_cluster == 0)[0]
219 # maybe also relocate small clusters?
221 if distances is not None and len(empty_clusters):
222 # find points to reassign empty clusters to
223 far_from_centers = distances.argsort()[::-1]
225 for i, cluster_id in enumerate(empty_clusters):
226 far_index = far_from_centers[i]
227 new_center = X[far_index] * sample_weight[far_index]
228 centers[cluster_id] = new_center
229 weight_in_cluster[cluster_id] = sample_weight[far_index]
231 for i in range(n_samples):
232 for j in range(n_features):
233 centers[labels[i], j] += X[i, j] * sample_weight[i]
235 centers /= weight_in_cluster[:, numpy.newaxis]
237 return centers
240def _centers_sparse(X, sample_weight, labels, n_clusters, distances):
241 """
242 M step of the K-means EM algorithm
243 Computation of cluster centers / means.
245 :param X: scipy.sparse.csr_matrix, shape (n_samples, n_features)
246 :param sample_weight: array-like, shape (n_samples,)
247 The weights for each observation in X.
248 :param labels: array of integers, shape (n_samples)
249 Current label assignment
250 :param n_clusters: int
251 Number of desired clusters
252 :param distances: array-like, shape (n_samples)
253 Distance to closest cluster for each sample.
254 :return: centers, array, shape (n_clusters, n_features)
255 The resulting centers
256 """
257 n_samples = X.shape[0]
258 n_features = X.shape[1]
260 data = X.data
261 indices = X.indices
262 indptr = X.indptr
264 dtype = X.dtype
265 centers = numpy.zeros((n_clusters, n_features), dtype=dtype)
266 weight_in_cluster = numpy.zeros((n_clusters,), dtype=dtype)
267 for i in range(n_samples):
268 c = labels[i]
269 weight_in_cluster[c] += sample_weight[i]
270 empty_clusters = numpy.where(weight_in_cluster == 0)[0]
271 n_empty_clusters = empty_clusters.shape[0]
273 # maybe also relocate small clusters?
275 if n_empty_clusters > 0:
276 # find points to reassign empty clusters to
277 far_from_centers = distances.argsort()[::-1][:n_empty_clusters]
278 assign_rows_csr(X, far_from_centers, empty_clusters, centers)
280 for i in range(n_empty_clusters):
281 weight_in_cluster[empty_clusters[i]] = 1
283 for i in range(labels.shape[0]):
284 curr_label = labels[i]
285 for ind in range(indptr[i], indptr[i + 1]):
286 j = indices[ind]
287 centers[curr_label, j] += data[ind] * sample_weight[i]
289 centers /= weight_in_cluster[:, numpy.newaxis]
291 return centers