Coverage for src/ensae_projects/ml/competitions.py: 67%
123 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-07-20 04:37 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2023-07-20 04:37 +0200
1"""
2@file
3@brief Compute metrics for a competition
4"""
5import os
def main_codalab_wrapper_binary_classification(fct, metric_name, argv, truth_file="truth.txt",
                                               submission_file="answer.txt", output_file="scores.txt"):
    """
    Entry point of a scoring program for a binary classification
    competition, adapted from the template
    `evaluate.py <https://github.com/Tivix/competition-examples/blob/master/hello_world/competition/scoring_program/evaluate.py>`_.

    @param      fct                 metric function, forwarded to
                                    *private_codalab_wrapper_binary_classification*
    @param      metric_name         metric name, forwarded as well
    @param      argv                command line, ``argv[1]`` is the input folder,
                                    ``argv[2]`` is the output folder
    @param      truth_file          name of the truth file, looked up in ``<input>/ref``
    @param      submission_file     name of the submitted file, looked up in ``<input>/res``
    @param      output_file         name of the file written in the output folder

    The function raises *FileNotFoundError* if ``<input>/res`` or
    ``<input>/ref`` is not a folder. The output folder is created when missing.
    """
    input_dir, output_dir = argv[1], argv[2]
    submit_dir = os.path.join(input_dir, 'res')
    truth_dir = os.path.join(input_dir, 'ref')

    # Guard clauses: both folders must exist before anything is written.
    if not os.path.isdir(submit_dir):
        raise FileNotFoundError("%s doesn't exist" % submit_dir)
    if not (os.path.isdir(submit_dir) and os.path.isdir(truth_dir)):
        raise FileNotFoundError(
            "{0} or {1} is not a folder".format(submit_dir, truth_dir))

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    score_path = os.path.join(output_dir, output_file)
    private_codalab_wrapper_binary_classification(fct, metric_name,
                                                  fold1=truth_dir, f1=truth_file,
                                                  fold2=submit_dir, f2=submission_file,
                                                  output=score_path)
def private_codalab_wrapper_binary_classification(fct, metric_name, fold1, fold2, f1="answer.txt", f2="answer.txt",
                                                  output="scores.txt", use_print=False):
    """
    Wraps the function following the guidelines
    `User_Building a Scoring Program for a Competition
    <https://github.com/codalab/codalab-competitions/wiki/User_Building-a-Scoring-Program-for-a-Competition>`_.
    It replicates the example available at
    `competition-examples/hello_world <https://github.com/Tivix/competition-examples/tree/master/hello_world/competition>`_.

    @param      fct             function to wrap, called as ``fct(answers, scores)``
                                with two lists of floats
    @param      metric_name     metric name, used as a prefix in the output file
    @param      fold1           folder containing the truth
    @param      fold2           folder containing the produced answers
    @param      f1              filename for the truth
    @param      f2              filename for the produced answers
    @param      output          file receiving ``<metric_name>:<metric>``
    @param      use_print       display intermediate results
    @return                     metric

    Both files hold one float per line, blank lines (for example
    a trailing empty line) are skipped.
    The function raises *FileNotFoundError* if one of the two files is missing
    and *ValueError* if both names point to the same path.
    """
    f1 = os.path.join(fold1, f1)
    f2 = os.path.join(fold2, f2)
    if not os.path.exists(f1):
        raise FileNotFoundError("unable to find '{0}'".format(f1))
    if not os.path.exists(f2):
        raise FileNotFoundError("unable to find '{0}'".format(f2))
    if f1 == f2:
        raise ValueError(
            "answers and scores are the same file: '{0}'".format(f1))

    def read_floats(path):
        # Lines keep their end of line character, so emptiness must be
        # tested on the stripped content, otherwise float("\n") fails.
        with open(path, "r") as f:
            return [float(line) for line in f if line.strip()]

    answers = read_floats(f1)
    if use_print:
        print("Reading answers:", f1, len(answers), "rows")
        print("First answers:", answers[:10])

    scores = read_floats(f2)
    if use_print:
        # The scores come from f2, not f1.
        print("Reading scores:", f2, len(scores), "rows")
        print("First scores:", scores[:10])

    metric = fct(answers, scores)
    res = "{0}:{1}".format(metric_name, metric)
    if use_print:
        print("Results=", res)
    with open(output, "w") as f:
        f.write(res)
    if use_print:
        print("Wrote", res, "in", output)
    return metric
def AUC(answers, scores):
    """
    Computes the proportion of (positive, negative) pairs in which the
    positive observation gets the higher score, a tie counts for half a pair.

    @param      answers     expected answers 0 (false), 1 (true)
    @param      scores      score obtained for class 1
    @return                 number

    Every answer different from 1 is considered as negative.
    If there is no negative observation, the function returns 1.0,
    if there are negative observations but no positive one, it returns 0.0.
    """
    pairs = list(zip(answers, scores))
    positives = [score for label, score in pairs if label == 1]
    negatives = [score for label, score in pairs if label != 1]

    nb_pairs = len(positives) * len(negatives)
    if nb_pairs == 0:
        # No pair to compare, the value is a convention.
        return 0.0 if negatives else 1.0

    # 2 points for a well ordered pair, 1 for a tie, 0 otherwise.
    points = sum(2 if pos > neg else (1 if pos == neg else 0)
                 for pos in positives for neg in negatives)
    return points * 1.0 / (nb_pairs * 2)
def AUC_multi(answers, scores, ignored=None):
    """
    Computes an AUC for a multi-class prediction: an observation is
    positive when the predicted class is the expected one, and the
    observations are ranked by the score attached to the prediction.

    @param      answers     expected answers `class` as a string
    @param      scores      prediction and score `(class, score)`
    @param      ignored     ignored class, observations whose expected class
                            belongs to this list are removed
    @return                 number

    The observations are filtered before both the labels and the scores are
    extracted, so that they remain aligned, and only the numerical score
    (second element of every pair) is sent to function *AUC*.
    """
    if ignored is None:
        ignored = []
    # Filter the pairs first so labels and scores stay in step.
    kept = [(a, s) for (a, s) in zip(answers, scores) if a not in ignored]
    new_answers = [(1 if s[0] == a else 0) for (a, s) in kept]
    # Rank on the score itself, not on the (class, score) tuple which
    # would be compared on the class name first.
    new_scores = [s[1] for (_, s) in kept]
    return AUC(new_answers, new_scores)
def AUC_multi_multi(nb, answers, scores, ignored=None):
    """
    Computes an error rate and an AUC (see *AUC_multi*)
    for each of the *nb* variables to predict.

    @param      nb          number of variables to predict, the first *nb* columns
                            of a row of *scores* are the predicted classes,
                            the next *nb* columns are the associated scores
    @param      answers     expected answers, list of tuple of classes as a string
    @param      scores      prediction and score `(class, score)`
    @param      ignored     ignored class, forwarded to *AUC_multi*
    @return                 list ``[err_0, auc_0, err_1, auc_1, ...]``

    Dummy expected classes (both classes):

    ::

        endettement 4.0
        surendettement 4.0
        surendettement 4.0
        surendettement 4.0

    Dummy predicted answers:

    ::

        2.0 endettement 0.48775936896183714 0.5033579692108108
        5.0 microcredit social 0.16592396695909017 0.8643847837801871
        5.0 microcredit personnel 0.7962830470795325 0.6233706526012659
        3.0 impayes 0.17370233487556486 0.779432954126955

    The error rate is computed on every observation, the parameter
    *ignored* only applies to the AUC.
    """
    metrics = []
    for col in range(nb):
        expected = [row[col] for row in answers]
        predicted = [(row[col], row[nb + col]) for row in scores]
        auc = AUC_multi(expected, predicted, ignored)
        mistakes = sum(1 for (exp, pred) in zip(expected, predicted)
                       if exp != pred[0])
        metrics.extend([mistakes * 1.0 / len(expected), auc])
    return metrics
def private_codalab_wrapper_multi_classification(fct, variables_name, fold1, fold2, f1="answer.txt", f2="answer.txt",
                                                 output="scores.txt", use_print=False, ignored=None):
    """
    Wraps the function following the guidelines
    `User_Building a Scoring Program for a Competition
    <https://github.com/codalab/codalab-competitions/wiki/User_Building-a-Scoring-Program-for-a-Competition>`_.
    It replicates the example available at
    `competition-examples/hello_world <https://github.com/Tivix/competition-examples/tree/master/hello_world/competition>`_.

    @param      fct             function to wrap, called as
                                ``fct(len(variables_name), answers, scores, ignored=ignored)``
    @param      variables_name  variables names, every variable produces two
                                lines ``<name>_ERR`` and ``<name>_AUC`` in the output
    @param      fold1           folder containing the truth
    @param      fold2           folder containing the produced answers
    @param      f1              filename for the truth
    @param      f2              filename for the produced answers
    @param      output          file receiving one line ``<name>:<value>`` per metric
    @param      use_print       display intermediate results
    @param      ignored         ignored labels
    @return                     metric

    Both files are tab separated, blank lines (for example a trailing
    empty line) are skipped. The second half of the columns of every
    row of the produced answers is converted into float.
    The function raises *FileNotFoundError* if one of the two files is missing
    and *ValueError* if both names point to the same path.
    """
    f1 = os.path.join(fold1, f1)
    f2 = os.path.join(fold2, f2)
    if not os.path.exists(f1):
        raise FileNotFoundError("unable to find '{0}'".format(f1))
    if not os.path.exists(f2):
        raise FileNotFoundError("unable to find '{0}'".format(f2))
    if f1 == f2:
        raise ValueError(
            "answers and scores are the same file: '{0}'".format(f1))

    def pair_process(row):
        # Converts the second half of the columns (the scores) in place.
        for i in range(len(row) // 2, len(row)):
            row[i] = float(row[i])
        return row

    def read_rows(path):
        # Lines keep their end of line character, so emptiness must be
        # tested after stripping, otherwise an empty line becomes [""].
        with open(path, "r") as f:
            stripped = (line.strip(" \r\n") for line in f)
            return [line.split("\t") for line in stripped if line]

    answers = read_rows(f1)
    if use_print:
        print("Reading answers:", f1, len(answers), "rows")
        print("First answers:", answers[:10])

    scores = [pair_process(row) for row in read_rows(f2)]
    if use_print:
        # The scores come from f2, not f1.
        print("Reading scores:", f2, len(scores), "rows")
        print("First scores:", scores[:10])

    metric = fct(len(variables_name), answers, scores, ignored=ignored)
    all_names = []
    for v in variables_name:
        all_names.append("%s_ERR" % v)
        all_names.append("%s_AUC" % v)

    res = "\n".join(["{0}:{1}".format(mn, m)
                     for (mn, m) in zip(all_names, metric)])
    if use_print:
        print("Results=", res)
    with open(output, "w") as f:
        f.write(res)
    if use_print:
        print("Wrote", res, "in", output)
    return metric
def main_codalab_wrapper_multi_classification(fct, variables_name, argv, truth_file="truth.txt",
                                              submission_file="answer.txt", output_file="scores.txt"):
    """
    Entry point of a scoring program for a multi-class classification
    competition, adapted from the template
    `evaluate.py <https://github.com/Tivix/competition-examples/blob/master/hello_world/competition/scoring_program/evaluate.py>`_.

    @param      fct                 metric function, forwarded to
                                    *private_codalab_wrapper_multi_classification*
    @param      variables_name      variables names, forwarded as well
    @param      argv                command line, ``argv[1]`` is the input folder,
                                    ``argv[2]`` is the output folder
    @param      truth_file          name of the truth file, looked up in ``<input>/ref``
    @param      submission_file     name of the submitted file, looked up in ``<input>/res``
    @param      output_file         name of the file written in the output folder

    The function raises *FileNotFoundError* if ``<input>/res`` or
    ``<input>/ref`` is not a folder. The output folder is created when missing.
    The label ``"nul"`` is always sent as the ignored label.
    """
    input_dir, output_dir = argv[1], argv[2]
    submit_dir = os.path.join(input_dir, 'res')
    truth_dir = os.path.join(input_dir, 'ref')

    # Guard clauses: both folders must exist before anything is written.
    if not os.path.isdir(submit_dir):
        raise FileNotFoundError("%s doesn't exist" % submit_dir)
    if not (os.path.isdir(submit_dir) and os.path.isdir(truth_dir)):
        raise FileNotFoundError(
            "{0} or {1} is not a folder".format(submit_dir, truth_dir))

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    score_path = os.path.join(output_dir, output_file)
    private_codalab_wrapper_multi_classification(fct, variables_name,
                                                 fold1=truth_dir, f1=truth_file,
                                                 fold2=submit_dir, f2=submission_file,
                                                 output=score_path,
                                                 ignored=["nul"])