Coverage for src/ensae_teaching_cs/special/elections.py: 86%

226 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-01-27 05:44 +0100

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Contains a class to process elections results (France) 

5""" 

6import random 

7import numpy 

8import pandas 

9 

10 

class ElectionResults:
    """
    Processes data coming from
    `data.gouv.fr <http://www.data.gouv.fr/content/search?SortBy=Pertinence&SortOrder=0&SearchText=%C3%A9lections+2012>`_.

    The class uses `pandas <http://pandas.pydata.org/>`_ to process the data.
    See `Elections françaises <http://www.xavierdupre.fr/blog/2013-12-06_nojs.html>`_.
    See `Optimisation sous contraintes appliquée au calcul du report des voix <http://www.xavierdupre.fr/blog/2013-12-07_nojs.html>`_.

    The two rounds are stored in ``self.tours`` (a list of two dataframes),
    one row per region, one column per candidate plus the level columns,
    ``Abstentions`` and ``Blancs et nuls``.
    """

    def __init__(self, file, year=None, level="Départements"):
        """
        Loads the data downloaded from
        `data.gouv.fr <http://www.data.gouv.fr/content/search?SortBy=Pertinence&SortOrder=0&SearchText=%C3%A9lections+2012>`_.

        @param      file    xls file, or a list of two dataframes
                            (first round, second round) which are stored
                            as they are, without any processing
        @param      year    year (optional)
        @param      level   ``Départements`` or ``Cantons``
        """
        self.year = year
        # The singular, lower case form is used in the column names
        # ("Code du département", "Libellé du canton", ...).
        self.level = level.lower().replace("s", "")
        if isinstance(file, list):
            # Dataframes given by the caller (see resample) are already
            # processed: processing them again would drop the candidates.
            self.tours = file
            return

        self.tours = [
            pandas.read_excel(file, sheet_name=f"{level} T{number}",
                              engine='openpyxl')
            for number in (1, 2)]
        for i, t in enumerate(self.tours):
            if len(t) == 0:
                raise Exception("no data for tour %d" % (i + 1))

        self.tours = [self.process_tour(_) for _ in self.tours]
        for i, t in enumerate(self.tours):
            if len(t) == 0:
                # same 1-based numbering as the check made before processing
                raise Exception("no data for tour %d" % (i + 1))

        label = f"Libellé du {self.level}"
        try:
            self.tours = [_.sort_values(label, inplace=False)
                          for _ in self.tours]
        except Exception as e:
            first = self.tours[0]
            message = (f"unable to sort, shape={first.shape} "
                       f"columns={','.join(first.columns)}")
            raise Exception(message) from e

    def _vote_columns(self, round):
        """
        Returns the columns of a round which are not level columns
        (candidates, ``Abstentions``, ``Blancs et nuls``).

        @param      round   0 or 1
        @return             list of column names
        """
        level_columns = self.LevelCol
        return [_ for _ in self.tours[round].columns
                if _ not in level_columns]

    def get_candidates_votes(self, round):
        """
        Returns the numbers of voters for each candidate.

        @param      round   0 or 1
        @return             dictionary ``{column: sum over all regions}``
        """
        frame = self.tours[round]
        return {c: frame[c].sum() for c in self._vote_columns(round)}

    def correct(self, method=None):
        """
        Corrects the second round in a way there is the same number of voters.

        @param      method      some preprocess before going on (see below)

        About ``method``:

        - *'N'* --> correct the number of voters for each regions
        - *'cand'* --> gives the same weights to every candidates

        The dataframes are modified inplace. Any other value (including
        the default ``None``) raises ``NotImplementedError``.
        """
        if method == "N":
            if len(self.T0) != len(self.T1):
                raise Exception(
                    "unable to proceed because of different numbers of regions")
            cols0 = self._vote_columns(0)
            cols1 = self._vote_columns(1)
            # One coefficient per region; the rows of both rounds are
            # matched on their index labels.
            coef = self.T0[cols0].sum(axis=1) / self.T1[cols1].sum(axis=1)
            second = self.tours[1]
            for c in cols1:
                second[c] = second[c] * coef
        elif method == "cand":
            first = self.tours[0]
            cols0 = self._vote_columns(0)
            sums = [first[c].sum() for c in cols0]
            total = sum(sums)
            for c, s in zip(cols0, sums):
                first[c] = first[c] * total / s
            self.correct("N")
        else:
            # an f-string also works when method is None
            raise NotImplementedError(f"unknown method: {method}")

    def __str__(self):
        """usual"""
        return f"Year: {self.Year} T1: {len(self.tours[0])} T2: {len(self.tours[1])}"

    def GetNbCandidates(self, round):
        """
        Returns the number of candidates.

        @param      round   round (0 or 1)
        @return             number of candidates
        """
        # 4 = the two level columns + "Abstentions" + "Blancs et nuls"
        return len(self.tours[round].columns) - 4

    @property
    def Year(self):
        """
        Returns the year.
        """
        return self.year

    @property
    def Level(self):
        """
        Returns the level (``département`` or ``canton``).
        """
        return self.level

    @property
    def LevelCol(self):
        """
        Returns the column associated to the level (their name depends on the level).
        """
        return [f"Code du {self.level}", f"Libellé du {self.level}"]

    @property
    def T0(self):
        """
        Returns the dataframe for the first round.
        """
        return self.tours[0]

    @property
    def T1(self):
        """
        Returns the dataframe for the second round.
        """
        return self.tours[1]

    def process_tour(self, tour):
        """
        Keeps the interesting columns, move candidates name as column name.

        Rows whose ``Abstentions`` is not a number are dropped. Every column
        starting with ``Nom`` holds a candidate name, the votes of this
        candidate are read two columns further.

        @param      tour        dataframe
        @return                 dataframe
        """
        numbers = (float, int, numpy.int64, numpy.float64)
        keep = [isinstance(_, numbers) and not numpy.isnan(_)
                for _ in tour["Abstentions"]]
        tour = tour.loc[keep, :]

        # The names are only sorted to check they can be compared
        # (it fails if strings and missing values are mixed).
        names = [_ for _ in tour.columns if _.startswith("Nom")]
        unique = list({value for n in names for value in tour[n]})
        try:
            unique.sort()
        except TypeError as e:
            raise Exception("unable to sort " + str(unique) +
                            f"\ncolumns:{','.join(tour.columns)}") from e

        kept = {f"Code du {self.level}", f"Libellé du {self.level}",
                "Abstentions", "Blancs et nuls"}
        columns = list(tour.columns)

        rows = []
        for values in tour.values:
            record = {}
            for i, (k, v) in enumerate(zip(columns, values)):
                if k in kept:
                    record[k] = v
                elif k.startswith("Nom"):
                    # columns are laid out as: name, first name, votes
                    record[v] = values[i + 2]
            if any(len(_) == 0 for _ in record):
                # a candidate without any name, the row is skipped
                continue
            rows.append(record)
        return pandas.DataFrame(rows)

    def vote_transfer(self):
        """
        Computes the votes between the two rounds using
        contrainsts optimization, the optimization
        requires :epkg:`cvxopt`.

        See `Optimisation sous contraintes appliquée au calcul du report des voix <http://www.xavierdupre.fr/blog/2013-12-07_nojs.html>`_.

        With *X* the first round and *Y* the second round (one row per
        region), the method minimizes :math:`\\sum_i ||X \\beta_i - Y_i||^2`
        with every coefficient positive and, for every column of the first
        round, coefficients summing to one over the columns of the second
        round.

        @return     results (as a DataFrame), one row per column of the
                    first round, one column per column of the second round
        """
        from cvxopt import solvers
        from cvxopt import matrix

        rown = self._vote_columns(0)
        coln = self._vote_columns(1)
        X = numpy.asarray(self.tours[0][rown].values, dtype=float)
        Y = numpy.asarray(self.tours[1][coln].values, dtype=float)

        nbC = Y.shape[1]
        col = X.shape[1]
        size = col * nbC

        # cvxopt minimizes 1/2 x'Qx + p'x: every block of Q must be
        # doubled to be consistent with the factor -2 used in p.
        # The coefficients are ordered by column of Y (block i <-> Y[:, i]).
        Q = 2.0 * numpy.kron(numpy.eye(nbC), X.T @ X)
        p = numpy.ascontiguousarray((-2.0 * (X.T @ Y)).T).reshape(size, 1)

        # G x <= h: every coefficient is positive
        G = -numpy.eye(size)
        h = numpy.zeros((size, 1))

        # C x = b: the coefficients of a column of X sum to one
        C = numpy.kron(numpy.ones((1, nbC)), numpy.eye(col))
        b = numpy.ones((col, 1))

        old = solvers.options.get("show_progress", True)
        solvers.options["show_progress"] = False
        try:
            sol = solvers.qp(matrix(Q), matrix(p), matrix(G), matrix(h),
                             matrix(C), matrix(b))
        finally:
            # the global option is restored even if the solver fails
            solvers.options["show_progress"] = old

        coef = numpy.array(sol['x'])
        res = coef.reshape(nbC, col).T
        return pandas.DataFrame(data=res, index=rown, columns=coln)

    def resample(self, method="uniform"):
        """
        Builds a new sample: it produces a results with the same number of
        rows, but each rows is randomly drawn from the current data.
        This is needed for the bootstrap procedures.

        The same positions are drawn in both rounds, the returned
        dataframes get a new index ``0..n-1``.

        @param      method  ``weight`` or ``uniform``, any value different
                            from ``uniform`` draws a region with a
                            probability given by ``WeightsNorm``
        @return             a new ElectionResults
        """
        if len(self.T0) != len(self.T1):
            raise Exception(
                "unable to proceeed, we need to draw the same regions, assuming both matrices are sorted in the same order")

        n = len(self.T0)
        if method == "uniform":
            h = [random.randint(0, n - 1) for _ in range(n)]
        else:
            # first position whose cumulated weight is above the draw
            cumulated = numpy.cumsum(self.WeightsNorm)
            h = [min(int(numpy.searchsorted(cumulated, random.random(),
                                            side="right")), n - 1)
                 for _ in range(n)]

        # Positions (iloc) and not labels (loc): the weights follow the
        # order of the rows and sorting the dataframes shuffled the labels.
        tours = [t.iloc[h, :].reset_index(drop=True)
                 for t in (self.T0, self.T1)]
        return ElectionResults(tours, year=self.year, level=self.level)

    def get_people(self, round=0):
        """
        Returns the number of people per regions.

        @param      round   first (0) or second (1) round
        @return             series
        """
        # skipna=False: a missing value gives a missing total
        return self.tours[round][self._vote_columns(round)].sum(
            axis=1, skipna=False)

    @property
    def WeightsNorm(self):
        """
        Returns the proportion of voters for each regions.
        The result is computed once from the first round and cached
        in ``self.weightsnorm``.
        """
        if "weightsnorm" not in self.__dict__:
            people = list(self.get_people())
            total = sum(people)
            self.weightsnorm = [_ * 1.0 / total for _ in people]
        return self.weightsnorm

    @staticmethod
    def min_max_mean_std(series, alpha=0.05):
        """
        Returns the mean, the standard deviation,
        the bounds of the confidence interval.

        @param      series      list of numbers
        @param      alpha       confidence level, the interval leaves out
                                ``alpha / 2`` of the values on each side
        @return                 mean, std, low, high
        """
        ordered = sorted(series)
        size = len(ordered)
        a = int(size * alpha / 2)
        low, high = ordered[a], ordered[-a - 1]
        mean = sum(ordered) / size
        variance = sum([(x - mean) ** 2 for x in ordered]) / size
        return mean, variance ** 0.5, low, high

    def bootstrap(self, iter=1000, method="vote_transfer", alpha=0.05, fLOG=None, **params):
        """
        Uses the bootstrap method to compute confidence intervals
        see `bootstrap <http://fr.wikipedia.org/wiki/Bootstrap_%28statistiques%29>`_.

        @param      iter        number of iteration (at least one)
        @param      method      method to bootstrap, only ``vote_transfer``
                                is implemented
        @param      alpha       confidence level
        @param      fLOG        logging function or none
        @param      params      parameters to give to ``method``
        @return                 four matrices, averaged results, sigma, lower bound, higher bound
        """
        # Checks the parameters before the expensive sampling.
        if method != "vote_transfer":
            raise NotImplementedError(f"unknown method: {method}")
        if iter < 1:
            raise ValueError(f"iter must be at least 1, not {iter}")
        if fLOG is None:
            def fLOG(*x):
                return ""

        fLOG("sampling", iter)
        samples = [self.resample() for _ in range(iter)]
        matrices = [_.vote_transfer(**params) for _ in samples]

        # The first matrix gives the shape, the index and the columns.
        mean = matrices[0].copy()
        std = matrices[0].copy()
        low = matrices[0].copy()
        high = matrices[0].copy()

        shape = mean.shape
        fLOG("level for each coefficient", shape)
        for i in range(shape[0]):
            for j in range(shape[1]):
                series = [m.iloc[i, j] for m in matrices]
                xmean, xstd, xlow, xhigh = ElectionResults.min_max_mean_std(
                    series, alpha=alpha)
                mean.iloc[i, j] = xmean
                std.iloc[i, j] = xstd
                low.iloc[i, j] = xlow
                high.iloc[i, j] = xhigh
        return mean, std, low, high

    @staticmethod
    def combine_into_string(matrices, float_format=str, agg_format=str):
        """
        Combines several matrices into one before displaying it.

        @param      matrices        list of matrices (same dimension)
        @param      float_format    to format each float of all matrices
        @param      agg_format      to build the aggregated string, it
                                    receives the list of formatted values
        @return                     matrixes (dataframe), index and columns
                                    are the ones of the first matrix

        Example:

        ::

            def pour(x) :
                if x < 0.01 : return ""
                else : return "%2.0f" % (x*100) + "%"

            boot = el.bootstrap(iter=10)
            comb = el.combine_into_string( [boot[2],boot[3]], pour, lambda v : "%s-%s" % tuple(v))
        """
        first = matrices[0]
        nrows, ncols = first.shape
        res = [[agg_format([float_format(m.iloc[i, j]) for m in matrices])
                for j in range(ncols)]
               for i in range(nrows)]
        return pandas.DataFrame(data=res, columns=list(first.columns), index=list(first.index))

418 return pandas.DataFrame(data=res, columns=list(matrices[0].columns), index=list(matrices[0].index))