Coverage for src/ensae_projects/hackathon/perf2018.py: 0%
261 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-07-20 04:37 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2023-07-20 04:37 +0200
1"""
2@file
3@brief Compute the performance for the hackathon 2018.
4"""
5import os
6import time
7import pandas
8import numpy
9from PIL import Image
10from lightmlrestapi.mlapp.mlstorage import MLStorage
11try:
12 from ..hackathon.image_helper import enumerate_image_class
13except (ImportError, ValueError):
14 from ensae_projects.hackathon.image_helper import enumerate_image_class
class MLStoragePerf2018:
    """
    Computes the performances of the models submitted to a hackathon.

    Every model found in the storage is evaluated once with
    :meth:`compute_perf` (implemented by subclasses) and the results
    are cached in a CSV file so that a model already evaluated
    is not evaluated again.
    """

    def __init__(self, storage, examples, cache_file="cache_file.csv"):
        """
        @param      storage     storage location (an existing folder)
        @param      examples    examples used to evaluate the models,
                                stored as is, subclasses decide how to read them
        @param      cache_file  CSV file used to cache the performances
        """
        self._storage = self._load_ml_storage(storage)
        self._examples = examples
        self._cache_file = cache_file

    def _load_ml_storage(self, root):
        """
        Creates an instance of a
        `MLStorage <http://www.xavierdupre.fr/app/lightmlrestapi/helpsphinx/lightmlrestapi/mlapp/mlstorage.html
        # lightmlrestapi.mlapp.mlstorage.MLStorage>`_
        based on a folder.

        @param      root        folder
        @return                 the storage
        @raises     FileNotFoundError if *root* does not exist
        """
        if not os.path.exists(root):
            raise FileNotFoundError("Unable to find '{0}'".format(root))
        return MLStorage(root)

    def _load_cached_performance(self, cache_file=None):
        """
        Retrieves performances already computed.

        @param      cache_file  file, None to use the one given
                                to the constructor
        @return                 dataframe or None if the file does not exist
        """
        if cache_file is None:
            cache_file = self._cache_file
        if not os.path.exists(cache_file):
            return None
        return pandas.read_csv(cache_file, sep=",", encoding="utf-8")

    def _save_performance(self, df, cache_file=None):
        """
        Saves cached performance.

        @param      df          dataframe
        @param      cache_file  destination, None to use the one given
                                to the constructor
        """
        if cache_file is None:
            cache_file = self._cache_file
        df.to_csv(cache_file, sep=',', encoding='utf-8', index=False)

    def compute_performance(self, use_cache=True, fLOG=None):
        """
        Computes the performance for the not cached one if
        *use_cache* is True.

        @param      use_cache   use cache
        @param      fLOG        logging function, None to disable logging
        @return                 dataframe, one row per model, sorted by name

        The dataframe is also written to the cache file.
        """
        cache = self._load_cached_performance() if use_cache else None
        already = set() if cache is None else set(cache["name"])

        rows = []
        for i, name in enumerate(sorted(self._storage.enumerate_names())):
            if i % 30 == 0:
                print('.')
            if name in already:
                continue
            t0 = time.perf_counter()
            if fLOG:
                fLOG(
                    "[MLStoragePerf2018] compute perf for {0}: '{1}'".format(i, name))
            res = self.compute_perf(name)  # pylint: disable=E1111
            if fLOG:
                fLOG(
                    "[MLStoragePerf2018] Done for {0}: {1}".format(name, res))
                if 'exc' in res:
                    fLOG("[MLStoragePerf2018] exception for {0}: {1}".format(
                        name, res['exc']))
            res["name"] = name
            # os.stat only accepts one path: the model location
            # must be joined with the storage folder first.
            location = os.path.join(
                self._storage._folder, name)  # pylint: disable=W0212
            res["stime"] = os.stat(location).st_mtime
            res["time"] = time.perf_counter() - t0
            rows.append(res)
            already.add(name)

        frames = []
        if rows:
            fresh = pandas.DataFrame(rows)
            frames.append(fresh[sorted(fresh.columns)])
        if cache is not None:
            frames.append(cache)

        if not frames:
            # No model and no cache: returns an empty table
            # which still has the column used to sort.
            df = pandas.DataFrame(columns=["name"])
        elif len(frames) == 1:
            df = frames[0]
        else:
            df = pandas.concat(frames)

        df = df.sort_values("name").copy()
        self._save_performance(df)
        return df

    def compute_perf(self, name):
        """
        Computes the performances for every example and one
        particular model. Must be overloaded.

        @param      name        model name
        @return                 dictionary
        """
        raise NotImplementedError()
class MLStoragePerf2018Image(MLStoragePerf2018):
    """
    Overloads *compute_perf* for images.
    Example of use:

    ::

        from ensae_projects.hackathon.perf2018 import MLStoragePerf2018Image
        mstorage = "storage_brgm"
        mexample = "hackathon_test/sample_labelled_test"
        mpref = MLStoragePerf2018Image(mstorage, mexample)
        mres = mpref.compute_performance(fLOG=print, use_cache=True)
        mres = mres.sort_values("precision", ascending=False)
        print(mres)
        mbody = "<html><body><h1>Hackathon EY-ENSAE 2018 - BRGM</h1>\\n"
        mcontent = "{0}{1}</body></html>".format(mbody, mres.to_html())
        with open("brgm.html", "w", encoding="utf-8") as f:
            f.write(mcontent)
    """

    def __init__(self, storage, examples, cache_file="cache_file.csv"):
        """
        @param      storage     storage location
        @param      examples    location of the labelled images, it is given
                                to *enumerate_image_class*
        @param      cache_file  CSV file used to cache the performances
        """
        MLStoragePerf2018.__init__(self, storage, examples, cache_file)

    def _label_mapping(self, subs):
        """
        Computes the label based on a subfolder name.

        @param      subs        subfolder name
        @return                 1 if the name ends with ``'1'``, 0 otherwise
        """
        return 1 if subs.endswith('1') else 0

    @staticmethod
    def _decode_prediction(pred):
        """
        Converts the output of a model into a label and a score.

        @param      pred        float, list or numpy array
        @return                 tuple *(predicted label, score)*
        @raises     ValueError  if there is no prediction
        @raises     TypeError   if the type of *pred* is not supported

        A single value is compared to 0.5, several values are
        considered as one score per class.
        """
        if pred is None:
            raise ValueError("No prediction")
        if isinstance(pred, list):
            pred = numpy.array(pred)
        if isinstance(pred, numpy.ndarray):
            flat = pred.ravel()
            if len(flat) == 0:
                raise ValueError("No prediction")
            if len(flat) == 1:
                return (1 if flat[0] > 0.5 else 0), flat[0]
            best = numpy.argmax(flat)
            return best, flat[best]
        if isinstance(pred, (float, numpy.floating)):
            return (1 if pred > 0.5 else 0), pred
        raise TypeError(
            "Prediction with the wrong type {0}".format(type(pred)))

    def compute_perf(self, name):
        """
        Computes the performances for every image and one
        particular model.

        @param      name        model name
        @return                 dictionary with the overall ``precision``
                                (ratio of images with the right label),
                                one ratio ``p_<sub>`` per subfolder,
                                ``exc`` if something failed,
                                ``version`` if the model has one
        """
        from keras import backend as K
        K.clear_session()  # pylint: disable=E1101
        folder = self._examples

        try:
            model = self._storage.load_model(name)
            vers = self._storage.call_version(name)
            exc = None
        except Exception as e:  # pylint: disable=W0703
            model = None
            exc = e
            vers = None
        if model is None and exc is None:
            exc = Exception("model is None")

        rows = []
        if model is not None:
            for img, sub in enumerate_image_class(folder):
                obs = dict(image=img, sub=sub, label=self._label_mapping(sub))
                # The context manager closes the file once the pixels are read.
                with Image.open(img) as content:
                    X = numpy.array(content)
                try:
                    pred = self._storage.call_predict(
                        name, X, loaded_model=model)
                    plabel, score = self._decode_prediction(pred)
                    obs.update(dict(predicted_label=plabel, score=score))
                except Exception as e:  # pylint: disable=W0703
                    # The failure only concerns this image, the next ones
                    # are still evaluated. The last one is reported.
                    exc = e
                    obs['exc'] = e
                rows.append(obs)

        final = pandas.DataFrame(rows)
        if 'score' in final.columns:
            final["score"] = final["score"].fillna(0)
            final["predicted_label"] = final["predicted_label"].fillna(-1)
            final["correct"] = final["predicted_label"] == final["label"]
            final["correcti"] = final["correct"].astype(int)

            res = {}
            if exc:
                res["exc"] = str(exc)
            if len(final) > 0:
                res["precision"] = final["correcti"].sum() / final.shape[0]

                gr = final[["sub", "correcti", "correct"]]
                gr = gr.groupby("sub", as_index=False)
                gr = gr.agg({"correct": "count", 'correcti': "sum"})
                ratios = gr["correcti"] / gr["correct"]
                for sub, ratio in zip(gr["sub"], ratios):
                    res["p_%s" % sub] = ratio
            else:
                res["precision"] = 0
        else:
            # No image could be scored.
            res = dict(precision=0)
            if exc is not None:
                res["exc"] = exc

        if vers is not None:
            res["version"] = vers
        return res
class MLStoragePerf2018TimeSeries(MLStoragePerf2018):
    """
    Overloads *compute_perf* for timeseries.

    Example of use:

    ::

        from ensae_projects.hackathon.perf2018 import MLStoragePerf2018TimeSeries
        mstorage = "storage_microdon"
        mexample = "hackathon_test/sample_labelled_test"
        mpref = MLStoragePerf2018TimeSeries(mstorage, mexample)
        mres = mpref.compute_performance(fLOG=print, use_cache=True)
        mres = mres.sort_values("cor", ascending=False)
        print(mres)
        mbody = "<html><body><h1>Hackathon EY-ENSAE 2018 - Microdon</h1>\\n"
        mcontent = "{0}{1}</body></html>".format(mbody, mres.to_html())
        with open("microdon.html", "w", encoding="utf-8") as f:
            f.write(mcontent)
    """

    def __init__(self, storage, examples, cache_file="cache_file.csv"):
        """
        @param      storage     storage location
        @param      examples    CSV file read with *pandas.read_csv*,
                                it contains the expected values
        @param      cache_file  CSV file used to cache the performances

        The constructor aggregates the data per year, week, campaign
        and computes the expected value
        ``PARTICIPATION = nb_dons_total / nb_transac_total``.
        """
        MLStoragePerf2018.__init__(self, storage, examples, cache_file)

        df = pandas.read_csv(examples)

        sub = df[["year", "week", "campaigns_campaign_id", "collecteur_id",
                  "montant_total", "nb_dons_total", "nb_transac_total"]].copy()
        dsub = sub.fillna(0)
        gr = dsub.groupby(
            ["year", "week", "campaigns_campaign_id"], as_index=False).sum()
        gr["PARTICIPATION"] = gr["nb_dons_total"] / gr["nb_transac_total"]
        self._expected = gr

    def _label_mapping(self, subs):
        """
        Computes the label based on a subfolder name.

        @param      subs        subfolder name
        @return                 1 if the name ends with ``'1'``, 0 otherwise
        """
        return 1 if subs.endswith('1') else 0

    @staticmethod
    def _scalar_prediction(pred):
        """
        Converts the output of a model into a single value.

        @param      pred        prediction (scalar, list or numpy array)
        @return                 tuple *(value, error)*, *error* is None
                                if the conversion succeeded, otherwise
                                the value is 0 as for a failing prediction
        """
        if pred is None:
            return 0, Exception("No prediction")
        if isinstance(pred, list):
            values = pred
        elif isinstance(pred, numpy.ndarray):
            values = pred.ravel()
        else:
            return pred, None
        if len(values) == 1:
            return values[0], None
        return 0, Exception("Returned a list of value when expecting one")

    @staticmethod
    def _rmse(total, count):
        """
        Returns the root mean squared error or nan if nothing was counted.
        """
        return (total / count) ** 0.5 if count > 0 else numpy.nan

    def compute_perf(self, name):
        """
        Computes the performances for every couple (week, campaign)
        and one particular model.

        @param      name        model name
        @return                 dictionary with the root mean squared
                                errors (``score`` for every campaign,
                                ``score10``, ``score100``, ``score1000``
                                for the campaigns with at least that many
                                transactions), the correlation ``cor``,
                                ``pmin``, ``pmax``, ``best``, ``worst``,
                                ``exc`` if something failed,
                                ``version`` if the model has one
        """
        try:
            model = self._storage.load_model(name)
            vers = self._storage.call_version(name)
            exc = None
        except Exception as e:  # pylint: disable=W0703
            model = None
            exc = e
            vers = None

        X = self._expected
        thresholds = (10, 100, 1000)
        total = 0
        n1 = 0
        totals = dict.fromkeys(thresholds, 0)
        counts = dict.fromkeys(thresholds, 0)

        preds = []
        exps = []
        diffs = {}

        observations = zip(X["week"].values,
                           X["campaigns_campaign_id"].values,
                           X["PARTICIPATION"].values,
                           X["nb_transac_total"].values)
        for week, camp, exp, nb_transac_total in observations:
            if exp > 1:
                continue
            if not nb_transac_total > 0:
                continue
            try:
                pred = self._storage.call_predict(
                    name, (week, camp), loaded_model=model)
            except Exception as e:  # pylint: disable=W0703
                exc = e
                pred = 0
            # A prediction which is not a single value cannot be
            # compared to the expected value, it is replaced by 0.
            pred, error = self._scalar_prediction(pred)
            if error is not None:
                exc = error

            err2 = (pred - exp) ** 2
            n1 += 1
            exps.append(exp)
            preds.append(pred)
            diffs[week, camp] = abs(exp - pred)
            total += err2
            for th in thresholds:
                if nb_transac_total >= th:
                    totals[th] += err2
                    counts[th] += 1

        res = {}
        if vers is not None:
            res["version"] = vers
        if n1 > 0:
            res["score"] = self._rmse(total, n1)
            for th in thresholds:
                # nan if no campaign reaches the threshold
                res["score%d" % th] = self._rmse(totals[th], counts[th])
            try:
                res["cor"] = numpy.corrcoef(numpy.array(preds),
                                            numpy.array(exps))[0, 1]
            except (AttributeError, KeyError):
                res["cor"] = numpy.nan
            try:
                res["pmin"] = numpy.min(preds)
                res["pmax"] = numpy.max(preds)
            except (KeyError, ValueError):
                res["pmin"] = numpy.nan
                res["pmax"] = numpy.nan

            resort = [(v, k) for k, v in diffs.items()]
            try:
                resort.sort()
            except (TypeError, ValueError):
                exc = Exception(
                    "Unable to sort differences {0}".format(resort[0]))
                resort = []
            if resort:
                last = resort[-1]
                res["worst"] = "{0}:{1}".format(last[1], last[0])
                best = resort[0]
                res["best"] = "{0}:{1}".format(best[1], best[0])

        # Filled last so that any failure is reported.
        if exc is not None:
            res["exc"] = exc
        return res
if __name__ == "__main__":
    # Evaluates every model stored for the BRGM challenge, prints the
    # ranking and dumps it as RST and HTML.
    mstorage = r'/home/jbr/hack35/'
    mexample = r'./sample_labelled_test'
    mres = MLStoragePerf2018Image(mstorage, mexample).compute_performance(
        fLOG=print, use_cache=True)
    mres = mres.sort_values("precision", ascending=False)
    print(mres)
    if 'exc' in mres.columns:
        print(list(mres['exc']))

    header = "<html><body><h1>Hackathon EY-ENSAE 2018 - BRGM</h1>\n"
    page = "{0}{1}</body></html>".format(header, mres.to_html())

    from pyquickhelper.pandashelper.tblformat import df2rst

    # The content is rendered once the destination is opened,
    # RST first, then HTML.
    outputs = [
        ("hackathon2018/brgm.rst", lambda: df2rst(mres)),
        ("hackathon2018/brgm.html", lambda: page),
    ]
    for destination, render in outputs:
        with open(destination, "w", encoding="utf-8") as f:
            f.write(render())