Coverage for mlinsights/mlmodel/kmeans

1# -*- coding: utf-8 -*-

2"""

3@file

4@brief Implémente la classe @see cl ConstraintKMeans.

5"""

6import numpy

7from scipy.spatial import Delaunay # pylint: disable=E0611

8from sklearn.cluster import KMeans

9from sklearn.metrics.pairwise import euclidean_distances

10from ._kmeans_constraint_ import constraint_kmeans, constraint_predictions

13class ConstraintKMeans(KMeans):

14 """

15 Defines a constraint :epkg:`k-means`.

16 Clusters are modified to have an equal size.

17 The algorithm is initialized with a regular :epkg:`KMeans`

18 and continues with a modified version of it.

20 Computing the predictions offer a choice.

21 The first one is to keep the predictions

22 from the regular :epkg:`k-means`

23 algorithm but with the balanced clusters.

24 The second is to compute balanced predictions

25 over the test set. That implies that the predictions

26 for the same observations might change depending

27 on the set it belongs to.

29 The parameter *strategy* determines how

30 obseervations should be assigned to a cluster.

31 The value can be:

33 * ``'distance'``: observations are ranked by distance to a cluster,

34 the algorithm assigns first point to the closest center unless it reached

35 the maximum size, it deals first with the further point and maps it to

36 the closest center

37 * ``'gain'``: follows the algorithm described at

38 see `Same-size k-Means Variation

39 <https://elki-project.github.io/tutorial/same-size_k_means>`_,

40 * ``'weights'``: estimates weights attached to each cluster,

41 it weights the distance to each cluster in order

42 to balance the number of points mapped to every cluster,

43 the strategy uses a learning rate.

45 The first two strategies cannot reach a good compromise

46 without using function @see fn _switch_clusters which

47 tries every switch between clusters: two points

48 change clusters. It keeps the number of points and checks

49 that the inertia is reduced.

50 """

52 _strategy_value = {'distance', 'gain', 'weights'}

54 def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=500,

55 tol=0.0001, verbose=0,

56 random_state=None, copy_x=True, algorithm='auto',

57 balanced_predictions=False, strategy='gain', kmeans0=True,

58 learning_rate=1., history=False):

59 """

60 @param n_clusters number of clusters

61 @param init used by :epkg:`k-means`

62 @param n_init used by :epkg:`k-means`

63 @param max_iter used by :epkg:`k-means`

64 @param tol used by :epkg:`k-means`

65 @param verbose used by :epkg:`k-means`

66 @param random_state used by :epkg:`k-means`

67 @param copy_x used by :epkg:`k-means`

68 @param algorithm used by :epkg:`k-means`

69 @param balanced_predictions produced balanced prediction

70 or the regular ones

71 @param strategy strategy or algorithm used to abide

72 by the constraint

73 @param kmeans0 if True, applies *k-means* algorithm first

74 @param history keeps centers accress iterations

75 @param learning_rate learning rate, used by strategy `'weights'`

76 """

77 self._n_threads = 1

78 KMeans.__init__(self, n_clusters=n_clusters, init=init, n_init=n_init,

79 max_iter=max_iter, tol=tol,

80 verbose=verbose, random_state=random_state, copy_x=copy_x,

81 algorithm=algorithm)

82 self.balanced_predictions = balanced_predictions

83 self.strategy = strategy

84 self.kmeans0 = kmeans0

85 self.history = history

86 self.learning_rate = learning_rate

87 if strategy not in ConstraintKMeans._strategy_value:

88 raise ValueError(

89 f'strategy must be in {ConstraintKMeans._strategy_value}')

91 def fit(self, X, y=None, sample_weight=None, fLOG=None):

92 """

93 Compute k-means clustering.

95 :param X: array-like or sparse matrix, shape=(n_samples, n_features)

96 Training instances to cluster. It must be noted that the data

97 will be converted to C ordering, which will cause a memory

98 copy if the given data is not C-contiguous.

99 :param y: Ignored

100 :param sample_weight: sample weight

101 :param fLOG: logging function

102 """

103 max_iter = self.max_iter

104 self.max_iter //= 2

105 if self.kmeans0:

106 KMeans.fit(self, X, y, sample_weight=sample_weight)

107 state = None

108 else:

109 state = numpy.random.RandomState( # pylint: disable=E1101

110 self.random_state)

111 labels = state.randint(

112 0, self.n_clusters, X.shape[0], dtype=numpy.int32)

113 centers = numpy.empty((self.n_clusters, X.shape[1]), dtype=X.dtype)

114 choice = state.randint(0, self.n_clusters, self.n_clusters)

115 for i, c in enumerate(choice):

116 centers[i, :] = X[c, :]

117 self.labels_ = labels

118 self.cluster_centers_ = centers

119 self.inertia_ = float(X.shape[0])

120 self.n_iter_ = 0

121

122 self.max_iter = max_iter

123 return self.constraint_kmeans(

124 X, sample_weight=sample_weight, state=state,

125 learning_rate=self.learning_rate,

126 history=self.history, fLOG=fLOG)

127

128 def constraint_kmeans(self, X, sample_weight=None, state=None,

129 learning_rate=1., history=False, fLOG=None):

130 """

131 Completes the constraint k-means.

132

133 @param X features

134 @param sample_weight sample weight

135 @param state state

136 @param history keeps evolution of centers

137 @param fLOG logging function

138 """

139 labels, centers, inertia, weights, iter_, all_centers = constraint_kmeans(

140 X, self.labels_, sample_weight, self.cluster_centers_,

141 inertia=self.inertia_, iter=self.n_iter_,

142 max_iter=self.max_iter, verbose=self.verbose,

143 strategy=self.strategy, state=state,

144 learning_rate=learning_rate, history=history,

145 fLOG=fLOG)

146 self.labels_ = labels

147 self.cluster_centers_ = centers

148 self.cluster_centers_iter_ = (

149 None if len(all_centers) == 0 else numpy.dstack(all_centers))

150 self.inertia_ = inertia

151 self.n_iter_ = iter_

152 self.weights_ = weights

153 return self

154

155 def predict(self, X, sample_weight=None):

156 """

157 Computes the predictions.

158

159 @param X features.

160 @return prediction

161 """

162 if self.weights_ is None:

163 if self.balanced_predictions:

164 labels, _, __ = constraint_predictions(

165 X, self.cluster_centers_, strategy=self.strategy + '_p')

166 return labels

167 return KMeans.predict(self, X, sample_weight=sample_weight)

168 else:

169 if self.balanced_predictions:

170 raise RuntimeError( # pragma: no cover

171 "balanced_predictions and weights_ cannot be used together.")

172 return KMeans.predict(self, X, sample_weight=sample_weight)

173

174 def transform(self, X):

175 """

176 Computes the predictions.

177

178 @param X features.

179 @return prediction

180 """

181 if self.weights_ is None:

182 if self.balanced_predictions:

183 labels, distances, __ = constraint_predictions(

184 X, self.cluster_centers_, strategy=self.strategy)

185 # We remove small distances than the chosen clusters

186 # due to the constraint, we choose max*2 instead.

187 mx = distances.max() * 2

188 for i, l in enumerate(labels):

189 mi = distances[i, l]

190 mmi = distances[i, :].min()

191 if mi > mmi:

192 # numpy.nan would be best

193 distances[i, distances[i, :] < mi] = mx

194 return distances

195 return KMeans.transform(self, X)

196 else:

197 if self.balanced_predictions:

198 raise RuntimeError( # pragma: no cover

199 "balanced_predictions and weights_ cannot be used together.")

200 res = KMeans.transform(self, X)

201 res *= self.weights_.reshape((1, -1))

202 return res

203

204 def score(self, X, y=None, sample_weight=None):

205 """

206 Returns the distances to all clusters.

207

208 @param X features

209 @param y unused

210 @param sample_weight sample weight

211 @return distances

212 """

213 if self.weights_ is None:

214 if self.balanced_predictions:

215 _, __, dist_close = constraint_predictions(

216 X, self.cluster_centers_, strategy=self.strategy)

217 return dist_close

218 res = euclidean_distances(self.cluster_centers_, X, squared=True)

219 else:

220 if self.balanced_predictions:

221 raise RuntimeError( # pragma: no cover

222 "balanced_predictions and weights_ cannot be used together.")

223 res = euclidean_distances(X, self.cluster_centers_, squared=True)

224 res *= self.weights_.reshape((1, -1))

225 return res.max(axis=1)

226

227 def cluster_edges(self):

228 """

229 Computes edges between clusters based on a

230 `Delaunay <https://docs.scipy.org/doc/scipy/reference/

231 generated/scipy.spatial.Delaunay.html>`_

232 graph.

233 """

234 tri = Delaunay(self.cluster_centers_)

235 triangles = tri.simplices # pylint: disable=E1101

236 edges = set()

237 for row in triangles:

238 for j in range(1, row.shape[-1]):

239 a, b = row[j - 1:j + 1]

240 if a < b:

241 edges.add((a, b))

242 else:

243 edges.add((b, a))

244 a, b = row[0], row[-1]

245 if a < b:

246 edges.add((a, b))

247 else:

248 edges.add((b, a))

249 return edges

Coverage for mlinsights/mlmodel/kmeans_constraint.py: 98%

94 statements