Coverage for mlinsights/mlmodel/kmeans_constraint.py: 98%

94 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-08-09 08:45 +0200

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Implémente la classe @see cl ConstraintKMeans. 

5""" 

6import numpy 

7from scipy.spatial import Delaunay # pylint: disable=E0611 

8from sklearn.cluster import KMeans 

9from sklearn.metrics.pairwise import euclidean_distances 

10from ._kmeans_constraint_ import constraint_kmeans, constraint_predictions 

11 

12 

13class ConstraintKMeans(KMeans): 

14 """ 

15 Defines a constraint :epkg:`k-means`. 

16 Clusters are modified to have an equal size. 

17 The algorithm is initialized with a regular :epkg:`KMeans` 

18 and continues with a modified version of it. 

19 

20 Computing the predictions offer a choice. 

21 The first one is to keep the predictions 

22 from the regular :epkg:`k-means` 

23 algorithm but with the balanced clusters. 

24 The second is to compute balanced predictions 

25 over the test set. That implies that the predictions 

26 for the same observations might change depending 

27 on the set it belongs to. 

28 

29 The parameter *strategy* determines how 

30 obseervations should be assigned to a cluster. 

31 The value can be: 

32 

33 * ``'distance'``: observations are ranked by distance to a cluster, 

34 the algorithm assigns first point to the closest center unless it reached 

35 the maximum size, it deals first with the further point and maps it to 

36 the closest center 

37 * ``'gain'``: follows the algorithm described at 

38 see `Same-size k-Means Variation 

39 <https://elki-project.github.io/tutorial/same-size_k_means>`_, 

40 * ``'weights'``: estimates weights attached to each cluster, 

41 it weights the distance to each cluster in order 

42 to balance the number of points mapped to every cluster, 

43 the strategy uses a learning rate. 

44 

45 The first two strategies cannot reach a good compromise 

46 without using function @see fn _switch_clusters which 

47 tries every switch between clusters: two points 

48 change clusters. It keeps the number of points and checks 

49 that the inertia is reduced. 

50 """ 

51 

52 _strategy_value = {'distance', 'gain', 'weights'} 

53 

54 def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=500, 

55 tol=0.0001, verbose=0, 

56 random_state=None, copy_x=True, algorithm='auto', 

57 balanced_predictions=False, strategy='gain', kmeans0=True, 

58 learning_rate=1., history=False): 

59 """ 

60 @param n_clusters number of clusters 

61 @param init used by :epkg:`k-means` 

62 @param n_init used by :epkg:`k-means` 

63 @param max_iter used by :epkg:`k-means` 

64 @param tol used by :epkg:`k-means` 

65 @param verbose used by :epkg:`k-means` 

66 @param random_state used by :epkg:`k-means` 

67 @param copy_x used by :epkg:`k-means` 

68 @param algorithm used by :epkg:`k-means` 

69 @param balanced_predictions produced balanced prediction 

70 or the regular ones 

71 @param strategy strategy or algorithm used to abide 

72 by the constraint 

73 @param kmeans0 if True, applies *k-means* algorithm first 

74 @param history keeps centers accress iterations 

75 @param learning_rate learning rate, used by strategy `'weights'` 

76 """ 

77 self._n_threads = 1 

78 KMeans.__init__(self, n_clusters=n_clusters, init=init, n_init=n_init, 

79 max_iter=max_iter, tol=tol, 

80 verbose=verbose, random_state=random_state, copy_x=copy_x, 

81 algorithm=algorithm) 

82 self.balanced_predictions = balanced_predictions 

83 self.strategy = strategy 

84 self.kmeans0 = kmeans0 

85 self.history = history 

86 self.learning_rate = learning_rate 

87 if strategy not in ConstraintKMeans._strategy_value: 

88 raise ValueError( 

89 f'strategy must be in {ConstraintKMeans._strategy_value}') 

90 

91 def fit(self, X, y=None, sample_weight=None, fLOG=None): 

92 """ 

93 Compute k-means clustering. 

94 

95 :param X: array-like or sparse matrix, shape=(n_samples, n_features) 

96 Training instances to cluster. It must be noted that the data 

97 will be converted to C ordering, which will cause a memory 

98 copy if the given data is not C-contiguous. 

99 :param y: Ignored 

100 :param sample_weight: sample weight 

101 :param fLOG: logging function 

102 """ 

103 max_iter = self.max_iter 

104 self.max_iter //= 2 

105 if self.kmeans0: 

106 KMeans.fit(self, X, y, sample_weight=sample_weight) 

107 state = None 

108 else: 

109 state = numpy.random.RandomState( # pylint: disable=E1101 

110 self.random_state) 

111 labels = state.randint( 

112 0, self.n_clusters, X.shape[0], dtype=numpy.int32) 

113 centers = numpy.empty((self.n_clusters, X.shape[1]), dtype=X.dtype) 

114 choice = state.randint(0, self.n_clusters, self.n_clusters) 

115 for i, c in enumerate(choice): 

116 centers[i, :] = X[c, :] 

117 self.labels_ = labels 

118 self.cluster_centers_ = centers 

119 self.inertia_ = float(X.shape[0]) 

120 self.n_iter_ = 0 

121 

122 self.max_iter = max_iter 

123 return self.constraint_kmeans( 

124 X, sample_weight=sample_weight, state=state, 

125 learning_rate=self.learning_rate, 

126 history=self.history, fLOG=fLOG) 

127 

128 def constraint_kmeans(self, X, sample_weight=None, state=None, 

129 learning_rate=1., history=False, fLOG=None): 

130 """ 

131 Completes the constraint k-means. 

132 

133 @param X features 

134 @param sample_weight sample weight 

135 @param state state 

136 @param history keeps evolution of centers 

137 @param fLOG logging function 

138 """ 

139 labels, centers, inertia, weights, iter_, all_centers = constraint_kmeans( 

140 X, self.labels_, sample_weight, self.cluster_centers_, 

141 inertia=self.inertia_, iter=self.n_iter_, 

142 max_iter=self.max_iter, verbose=self.verbose, 

143 strategy=self.strategy, state=state, 

144 learning_rate=learning_rate, history=history, 

145 fLOG=fLOG) 

146 self.labels_ = labels 

147 self.cluster_centers_ = centers 

148 self.cluster_centers_iter_ = ( 

149 None if len(all_centers) == 0 else numpy.dstack(all_centers)) 

150 self.inertia_ = inertia 

151 self.n_iter_ = iter_ 

152 self.weights_ = weights 

153 return self 

154 

155 def predict(self, X, sample_weight=None): 

156 """ 

157 Computes the predictions. 

158 

159 @param X features. 

160 @return prediction 

161 """ 

162 if self.weights_ is None: 

163 if self.balanced_predictions: 

164 labels, _, __ = constraint_predictions( 

165 X, self.cluster_centers_, strategy=self.strategy + '_p') 

166 return labels 

167 return KMeans.predict(self, X, sample_weight=sample_weight) 

168 else: 

169 if self.balanced_predictions: 

170 raise RuntimeError( # pragma: no cover 

171 "balanced_predictions and weights_ cannot be used together.") 

172 return KMeans.predict(self, X, sample_weight=sample_weight) 

173 

174 def transform(self, X): 

175 """ 

176 Computes the predictions. 

177 

178 @param X features. 

179 @return prediction 

180 """ 

181 if self.weights_ is None: 

182 if self.balanced_predictions: 

183 labels, distances, __ = constraint_predictions( 

184 X, self.cluster_centers_, strategy=self.strategy) 

185 # We remove small distances than the chosen clusters 

186 # due to the constraint, we choose max*2 instead. 

187 mx = distances.max() * 2 

188 for i, l in enumerate(labels): 

189 mi = distances[i, l] 

190 mmi = distances[i, :].min() 

191 if mi > mmi: 

192 # numpy.nan would be best 

193 distances[i, distances[i, :] < mi] = mx 

194 return distances 

195 return KMeans.transform(self, X) 

196 else: 

197 if self.balanced_predictions: 

198 raise RuntimeError( # pragma: no cover 

199 "balanced_predictions and weights_ cannot be used together.") 

200 res = KMeans.transform(self, X) 

201 res *= self.weights_.reshape((1, -1)) 

202 return res 

203 

204 def score(self, X, y=None, sample_weight=None): 

205 """ 

206 Returns the distances to all clusters. 

207 

208 @param X features 

209 @param y unused 

210 @param sample_weight sample weight 

211 @return distances 

212 """ 

213 if self.weights_ is None: 

214 if self.balanced_predictions: 

215 _, __, dist_close = constraint_predictions( 

216 X, self.cluster_centers_, strategy=self.strategy) 

217 return dist_close 

218 res = euclidean_distances(self.cluster_centers_, X, squared=True) 

219 else: 

220 if self.balanced_predictions: 

221 raise RuntimeError( # pragma: no cover 

222 "balanced_predictions and weights_ cannot be used together.") 

223 res = euclidean_distances(X, self.cluster_centers_, squared=True) 

224 res *= self.weights_.reshape((1, -1)) 

225 return res.max(axis=1) 

226 

227 def cluster_edges(self): 

228 """ 

229 Computes edges between clusters based on a 

230 `Delaunay <https://docs.scipy.org/doc/scipy/reference/ 

231 generated/scipy.spatial.Delaunay.html>`_ 

232 graph. 

233 """ 

234 tri = Delaunay(self.cluster_centers_) 

235 triangles = tri.simplices # pylint: disable=E1101 

236 edges = set() 

237 for row in triangles: 

238 for j in range(1, row.shape[-1]): 

239 a, b = row[j - 1:j + 1] 

240 if a < b: 

241 edges.add((a, b)) 

242 else: 

243 edges.add((b, a)) 

244 a, b = row[0], row[-1] 

245 if a < b: 

246 edges.add((a, b)) 

247 else: 

248 edges.add((b, a)) 

249 return edges