Coverage for src/ensae_projects/hackathon/perf2018.py: 0%

1"""

2@file

3@brief Compute the performance for the hackathon 2018.

4"""

5import os

6import time

7import pandas

8import numpy

9from PIL import Image

10from lightmlrestapi.mlapp.mlstorage import MLStorage

11try:

12 from ..hackathon.image_helper import enumerate_image_class

13except (ImportError, ValueError):

14 from ensae_projects.hackathon.image_helper import enumerate_image_class

class MLStoragePerf2018:
    """
    Computes the performances of the models submitted to a hackathon.

    The models are read from a storage folder, every subclass
    implements @see me compute_perf for one kind of problem.
    Results are cached in a CSV file so that a model which was
    already evaluated is not evaluated again.
    """

    def __init__(self, storage, examples, cache_file="cache_file.csv"):
        """
        @param      storage     storage location (an existing folder)
        @param      examples    data used to evaluate the models, kept as is,
                                its interpretation is left to the subclass
        @param      cache_file  CSV file used to cache the performances
        """
        self._storage = self._load_ml_storage(storage)
        self._examples = examples
        self._cache_file = cache_file

    def _load_ml_storage(self, root):
        """
        Creates an instance of a
        `MLStorage <http://www.xavierdupre.fr/app/lightmlrestapi/helpsphinx/lightmlrestapi/mlapp/mlstorage.html
        # lightmlrestapi.mlapp.mlstorage.MLStorage>`_
        based on a folder.

        @param      root        folder
        @return                 instance of *MLStorage*
        @raises     FileNotFoundError if *root* does not exist
        """
        if not os.path.exists(root):
            raise FileNotFoundError("Unable to find '{0}'".format(root))
        return MLStorage(root)

    def _load_cached_performance(self, cache_file=None):
        """
        Retrieves performances already computed.

        @param      cache_file  file, the one given to the constructor
                                if None
        @return                 dataframe or None if the file does not exist
        """
        if cache_file is None:
            cache_file = self._cache_file
        if not os.path.exists(cache_file):
            return None
        return pandas.read_csv(cache_file, sep=",", encoding="utf-8")

    def _save_performance(self, df, cache_file=None):
        """
        Saves cached performance.

        @param      df          dataframe
        @param      cache_file  destination, the one given to the
                                constructor if None
        """
        if cache_file is None:
            cache_file = self._cache_file
        df.to_csv(cache_file, sep=',', encoding='utf-8', index=False)

    def compute_performance(self, use_cache=True, fLOG=None):
        """
        Computes the performance for the not cached one if
        *use_cache* is True.

        @param      use_cache   use cache
        @param      fLOG        logging function, nothing is logged if None
        @return                 dataframe, one row per model, sorted by name

        The returned dataframe is also saved into the cache file.
        Nothing is saved if there is no model and no cached result.
        """
        cache = self._load_cached_performance() if use_cache else None
        already = set() if cache is None else set(cache["name"])

        rows = []
        for i, name in enumerate(sorted(self._storage.enumerate_names())):
            if i % 30 == 0:
                print('.')
            if name in already:
                continue
            t0 = time.perf_counter()
            if fLOG:
                fLOG(
                    "[MLStoragePerf2018] compute perf for {0}: '{1}'".format(i, name))
            res = self.compute_perf(name)  # pylint: disable=E1111
            if fLOG:
                fLOG(
                    "[MLStoragePerf2018] Done for {0}: {1}".format(name, res))
                if 'exc' in res:
                    fLOG("[MLStoragePerf2018] exception for {0}: {1}".format(
                        name, res['exc']))
            res["name"] = name
            # os.stat only accepts one positional argument, the location
            # of the model must be built first.
            # NOTE(review): assumes a model is stored in <folder>/<name>.
            location = os.path.join(self._storage._folder, name)
            res["stime"] = os.stat(location).st_mtime
            res["time"] = time.perf_counter() - t0
            rows.append(res)
            already.add(name)

        frames = []
        if rows:
            new_perf = pandas.DataFrame(rows)
            frames.append(new_perf[sorted(new_perf.columns)])
        if cache is not None:
            frames.append(cache)
        if not frames:
            # Nothing computed, nothing cached: an empty CSV file could
            # not be read back by pandas.read_csv, it is not written.
            return pandas.DataFrame()

        df = pandas.concat(frames)
        if "name" in df.columns:
            df = df.sort_values("name").copy()
        self._save_performance(df)
        return df

    def compute_perf(self, name):
        """
        Computes the performances for one particular model.
        Must be overloaded.

        @param      name        model name
        @return                 dictionary
        @raises     NotImplementedError always in this base class
        """
        raise NotImplementedError()

128

129

class MLStoragePerf2018Image(MLStoragePerf2018):
    """
    Overloads *compute_perf* for images.
    Example of use:

    ::

        from ensae_projects.hackathon.perf2018 import MLStoragePerf2018Image
        mstorage = "storage_brgm"
        mexample = "hackathon_test/sample_labelled_test"
        mpref = MLStoragePerf2018Image(mstorage, mexample)
        mres = mpref.compute_performance(fLOG=print, use_cache=True)
        mres = mres.sort_values("precision", ascending=False)
        print(mres)
        mbody = "<html><body><h1>Hackathon EY-ENSAE 2018 - BRGM</h1>\\n"
        mcontent = "{0}{1}</body></html>".format(mbody, mres.to_html())
        with open("brgm.html", "w", encoding="utf-8") as f:
            f.write(mcontent)
    """

    def __init__(self, storage, examples, cache_file="cache_file.csv"):
        """
        @param      storage     storage location
        @param      examples    location of the labelled images, it is given
                                to *enumerate_image_class*
        @param      cache_file  CSV file used to cache the performances
        """
        MLStoragePerf2018.__init__(self, storage, examples, cache_file)

    def _label_mapping(self, subs):
        """
        Computes the label based on a subfolder name.

        @param      subs        subfolder name
        @return                 1 if the name ends with ``'1'``, 0 otherwise
        """
        return 1 if subs.endswith('1') else 0

    @staticmethod
    def _interpret_prediction(pred):
        """
        Converts the output of a model into a label and a score.

        @param      pred        float, list or numpy array
        @return                 tuple *(predicted label, score)*
        @raises     ValueError  if the prediction is empty
        @raises     TypeError   if the prediction has an unexpected type

        A single value is compared to 0.5, for several values
        the label is the position of the highest one.
        """
        if isinstance(pred, (float, numpy.floating)):
            return (1 if pred > 0.5 else 0), pred
        if isinstance(pred, list):
            pred = numpy.array(pred)
        if isinstance(pred, numpy.ndarray):
            flat = pred.ravel()
            if len(flat) == 1:
                return (1 if flat[0] > 0.5 else 0), flat[0]
            if len(flat) > 1:
                best = numpy.argmax(flat)
                return best, flat[best]
            raise ValueError("No prediction")
        raise TypeError(
            "Prediction with the wrong type {0}".format(type(pred)))

    def compute_perf(self, name):
        """
        Computes the performances for every image and one
        particular model.

        @param      name        model name
        @return                 dictionary with *precision* (ratio of
                                correctly labelled images), ``p_<sub>``
                                (same ratio per subfolder), *exc*
                                (last exception, as a string, if any),
                                *version* (if available)

        An image for which the prediction fails is counted as
        incorrectly labelled, it does not prevent the other
        images from being evaluated.
        """
        from keras import backend as K
        K.clear_session()  # pylint: disable=E1101
        folder = self._examples

        try:
            model = self._storage.load_model(name)
            vers = self._storage.call_version(name)
            exc = None
        except Exception as e:  # pylint: disable=W0703
            model = None
            exc = e
            vers = None

        if model is None:
            # Nothing can be predicted without a model.
            if exc is None:
                exc = Exception("model is None")
            res = dict(exc=str(exc), precision=0)
            if vers is not None:
                res["version"] = vers
            return res

        rows = []
        for img, sub in enumerate_image_class(folder):
            obs = dict(image=img, sub=sub, label=self._label_mapping(sub))
            # The context manager closes the file once the pixels
            # are copied into the array.
            with Image.open(img) as handle:
                X = numpy.array(handle)
            try:
                pred = self._storage.call_predict(
                    name, X, loaded_model=model)
                plabel, score = self._interpret_prediction(pred)
            except Exception as e:  # pylint: disable=W0703
                # The failure is local to this image.
                obs['exc'] = e
                exc = e
            else:
                obs.update(dict(predicted_label=plabel, score=score))
            rows.append(obs)

        final = pandas.DataFrame(rows)
        res = {}
        if exc is not None:
            res["exc"] = str(exc)
        if 'score' in final.columns:
            # Missing values come from the images which failed.
            final["score"] = final["score"].fillna(0)
            final["predicted_label"] = final["predicted_label"].fillna(-1)
            final["correct"] = final["predicted_label"] == final["label"]
            final["correcti"] = final["correct"].astype(int)
            res["precision"] = final["correcti"].sum() / final.shape[0]
            # Ratio of correct answers per subfolder.
            ratios = final.groupby("sub")["correcti"].mean()
            for sub, ratio in ratios.items():
                res["p_%s" % sub] = ratio
        else:
            # No image or no successful prediction.
            res["precision"] = 0

        if vers is not None:
            res["version"] = vers
        return res

260

261

class MLStoragePerf2018TimeSeries(MLStoragePerf2018):
    """
    Overloads *compute_perf* for timeseries.

    Example of use:

    ::

        from ensae_projects.hackathon.perf2018 import MLStoragePerf2018TimeSeries
        mstorage = "storage_microdon"
        mexample = "hackathon_test/sample_labelled_test"
        mpref = MLStoragePerf2018TimeSeries(mstorage, mexample)
        mres = mpref.compute_performance(fLOG=print, use_cache=True)
        mres = mres.sort_values("cor", ascending=False)
        print(mres)
        mbody = "<html><body><h1>Hackathon EY-ENSAE 2018 - Microdon</h1>\\n"
        mcontent = "{0}{1}</body></html>".format(mbody, mres.to_html())
        with open("microdon.html", "w", encoding="utf-8") as f:
            f.write(mcontent)
    """

    def __init__(self, storage, examples, cache_file="cache_file.csv"):
        """
        @param      storage     storage location
        @param      examples    CSV file read with *pandas.read_csv*, it must
                                contain the columns *year*, *week*,
                                *campaigns_campaign_id*, *collecteur_id*,
                                *montant_total*, *nb_dons_total*,
                                *nb_transac_total*
        @param      cache_file  CSV file used to cache the performances

        The expected values are computed once: rows are summed by
        *(year, week, campaigns_campaign_id)* and *PARTICIPATION* is
        the ratio *nb_dons_total / nb_transac_total*.
        """
        MLStoragePerf2018.__init__(self, storage, examples, cache_file)

        df = pandas.read_csv(examples)

        sub = df[["year", "week", "campaigns_campaign_id", "collecteur_id",
                  "montant_total", "nb_dons_total", "nb_transac_total"]].copy()
        dsub = sub.fillna(0)
        gr = dsub.groupby(
            ["year", "week", "campaigns_campaign_id"], as_index=False).sum()
        gr["PARTICIPATION"] = gr["nb_dons_total"] / gr["nb_transac_total"]
        self._expected = gr

    def _label_mapping(self, subs):
        """
        Computes the label based on a subfolder name.
        It is not used by *compute_perf* in this class.

        @param      subs        subfolder name
        @return                 1 if the name ends with ``'1'``, 0 otherwise
        """
        return 1 if subs.endswith('1') else 0

    @staticmethod
    def _as_scalar(pred):
        """
        Converts the output of a model into a single number.

        @param      pred        number, list or numpy array
        @return                 number
        @raises     ValueError  if the prediction does not hold exactly
                                one value
        @raises     TypeError   if the prediction is not a number
        """
        if isinstance(pred, (list, numpy.ndarray)):
            flat = numpy.array(pred).ravel()
            if len(flat) != 1:
                raise ValueError(
                    "Returned a list of value when expecting one")
            pred = flat[0]
        if not isinstance(pred, (int, float, numpy.number)):
            raise TypeError(
                "Prediction with the wrong type {0}".format(type(pred)))
        return pred

    def compute_perf(self, name):
        """
        Computes the performances for every couple
        *(week, campaign)* and one particular model.

        @param      name        model name
        @return                 dictionary with *score*, *score10*,
                                *score100*, *score1000* (root mean squared
                                error for the campaigns with at least 1, 10,
                                100, 1000 transactions, NaN if there
                                is none), *cor* (correlation between
                                predictions and expected values), *pmin*,
                                *pmax* (lowest and highest predictions),
                                *best*, *worst* (couples with the smallest
                                and the highest errors), *exc* (last
                                exception if any), *version* (if available)

        Rows with an expected participation above 1 or without any
        transaction are ignored. A failing prediction is replaced by 0.
        """
        try:
            model = self._storage.load_model(name)
            vers = self._storage.call_version(name)
            exc = None
        except Exception as e:  # pylint: disable=W0703
            model = None
            exc = e
            vers = None

        cols = ["year", "week", "campaigns_campaign_id",
                "PARTICIPATION", "nb_transac_total"]
        X = self._expected[cols]

        # Suffix of the score and minimum number of transactions,
        # None means every row which is kept.
        thresholds = (("", None), ("10", 10), ("100", 100), ("1000", 1000))
        sums = {suffix: 0 for suffix, _ in thresholds}
        counts = {suffix: 0 for suffix, _ in thresholds}

        preds = []
        exps = []
        diffs = {}

        for i in range(0, X.shape[0]):
            week = X.iloc[i, 1]
            camp = X.iloc[i, 2]
            exp = X.iloc[i, 3]
            if exp > 1:
                continue
            nb_transac_total = X.iloc[i, 4]
            if not nb_transac_total > 0:
                continue
            try:
                pred = self._as_scalar(self._storage.call_predict(
                    name, (week, camp), loaded_model=model))
            except Exception as e:  # pylint: disable=W0703
                exc = e
                pred = 0
            exps.append(exp)
            preds.append(pred)
            diffs[week, camp] = abs(exp - pred)
            err2 = (pred - exp) ** 2
            for suffix, minimum in thresholds:
                if minimum is None or nb_transac_total >= minimum:
                    sums[suffix] += err2
                    counts[suffix] += 1

        res = {}
        if vers is not None:
            res["version"] = vers
        if exc is not None:
            res["exc"] = exc

        if counts[""] > 0:
            for suffix, _ in thresholds:
                # A threshold may not be reached by any campaign.
                if counts[suffix] > 0:
                    score = (sums[suffix] / counts[suffix]) ** 0.5
                else:
                    score = numpy.nan
                res["score" + suffix] = score

        res["cor"] = numpy.nan
        if len(preds) >= 2:
            try:
                res["cor"] = numpy.corrcoef(numpy.array(preds),
                                            numpy.array(exps))[0, 1]
            except (AttributeError, KeyError, TypeError, ValueError):
                pass

        if preds:
            res["pmin"] = numpy.min(preds)
            res["pmax"] = numpy.max(preds)
        else:
            res["pmin"] = numpy.nan
            res["pmax"] = numpy.nan

        if diffs:
            resort = [(v, k) for k, v in diffs.items()]
            try:
                resort.sort()
            except (TypeError, ValueError):
                if "exc" not in res:
                    res["exc"] = Exception(
                        "Unable to sort differences {0}".format(resort[0]))
            else:
                last = resort[-1]
                res["worst"] = "{0}:{1}".format(last[1], last[0])
                best = resort[0]
                res["best"] = "{0}:{1}".format(best[1], best[0])
        return res

416

417

if __name__ == "__main__":
    # Evaluates every model stored in *mstorage* on the labelled images
    # found in *mexample*, then dumps the results as RST and HTML.
    mstorage = r'/home/jbr/hack35/'
    mexample = r'./sample_labelled_test'
    mpref = MLStoragePerf2018Image(mstorage, mexample)
    mres = mpref.compute_performance(fLOG=print, use_cache=True)
    mres = mres.sort_values("precision", ascending=False)
    print(mres)
    if 'exc' in mres.columns:
        print(list(mres['exc']))
    mbody = "<html><body><h1>Hackathon EY-ENSAE 2018 - BRGM</h1>\n"
    mcontent = "{0}{1}</body></html>".format(mbody, mres.to_html())
    from pyquickhelper.pandashelper.tblformat import df2rst
    # The destination folder may not exist yet, the computation is too
    # long to be lost on a missing directory.
    os.makedirs("hackathon2018", exist_ok=True)
    with open("hackathon2018/brgm.rst", "w", encoding="utf-8") as f:
        f.write(df2rst(mres))
    with open("hackathon2018/brgm.html", "w", encoding="utf-8") as f:
        f.write(mcontent)