Coverage for src/ensae_projects/ml/competitions.py: 67%

123 statements  

coverage.py v7.1.0, created at 2023-07-20 04:37 +0200

1""" 

2@file 

3@brief Compute metrics in for a competition 

4""" 

5import os 

6 

7 

def main_codalab_wrapper_binary_classification(fct, metric_name, argv, truth_file="truth.txt",
                                               submission_file="answer.txt", output_file="scores.txt"):
    """
    Adapts the template available at
    `evaluate.py <https://github.com/Tivix/competition-examples/blob/master/hello_world/competition/scoring_program/evaluate.py>`_.
    """
    input_dir = argv[1]
    output_dir = argv[2]

    submit_dir = os.path.join(input_dir, 'res')
    truth_dir = os.path.join(input_dir, 'ref')

    if not os.path.isdir(submit_dir):
        raise FileNotFoundError("%s doesn't exist" % submit_dir)

    if os.path.isdir(submit_dir) and os.path.isdir(truth_dir):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        private_codalab_wrapper_binary_classification(fct, metric_name,
                                                      fold1=truth_dir, f1=truth_file,
                                                      fold2=submit_dir, f2=submission_file,
                                                      output=os.path.join(output_dir, output_file))
    else:
        raise FileNotFoundError(
            "{0} or {1} is not a folder".format(submit_dir, truth_dir))

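A minimal sketch (not part of the original module) of a CodaLab ``evaluate.py``
entry point built on this wrapper, assuming the competition bundle provides
``ref/truth.txt`` and ``res/answer.txt``::

    import sys
    from ensae_projects.ml.competitions import (
        main_codalab_wrapper_binary_classification, AUC)

    if __name__ == "__main__":
        # argv[1] is the input folder (containing 'ref/' and 'res/'),
        # argv[2] is the output folder where 'scores.txt' is written
        main_codalab_wrapper_binary_classification(AUC, "AUC", sys.argv)
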
def private_codalab_wrapper_binary_classification(fct, metric_name, fold1, fold2, f1="answer.txt", f2="answer.txt",
                                                  output="scores.txt", use_print=False):
    """
    Wraps the function following the guidelines
    `User_Building a Scoring Program for a Competition
    <https://github.com/codalab/codalab-competitions/wiki/User_Building-a-Scoring-Program-for-a-Competition>`_.
    It replicates the example available at
    `competition-examples/hello_world <https://github.com/Tivix/competition-examples/tree/master/hello_world/competition>`_.

    @param      fct             function to wrap
    @param      metric_name     metric name
    @param      fold1           folder containing the truth file
    @param      fold2           folder containing the submitted answers
    @param      f1              filename for the truth
    @param      f2              filename for the produced answers
    @param      output          output file where the results are written
    @param      use_print       display intermediate results
    @return                     metric
    """
    f1 = os.path.join(fold1, f1)
    f2 = os.path.join(fold2, f2)
    if not os.path.exists(f1):
        raise FileNotFoundError("unable to find '{0}'".format(f1))
    if not os.path.exists(f2):
        raise FileNotFoundError("unable to find '{0}'".format(f2))
    if f1 == f2:
        raise ValueError(
            "answers and scores are the same file: '{0}'".format(f1))

    with open(f1, "r") as f:
        lines = f.readlines()
    answers = [float(_) for _ in lines if _]
    if use_print:
        print("Reading answers:", f1, len(answers), "rows")
        print("First answers:", answers[:10])

    with open(f2, "r") as f:
        lines = f.readlines()
    scores = [float(_) for _ in lines if _]
    if use_print:
        print("Reading scores:", f2, len(scores), "rows")
        print("First scores:", scores[:10])

    metric = fct(answers, scores)
    res = "{0}:{1}".format(metric_name, metric)
    if use_print:
        print("Results=", res)
    with open(output, "w") as f:
        f.write(res)
    if use_print:
        print("Wrote", res, "in", output)
    return metric

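The wrapper can also be exercised locally, outside CodaLab. A small sketch,
assuming hypothetical folders ``truth/`` and ``submission/`` created on the fly,
with one value per line in each file::

    import os
    from ensae_projects.ml.competitions import (
        private_codalab_wrapper_binary_classification, AUC)

    os.makedirs("truth", exist_ok=True)
    os.makedirs("submission", exist_ok=True)
    with open(os.path.join("truth", "truth.txt"), "w") as f:
        f.write("1\n0\n1\n0")
    with open(os.path.join("submission", "answer.txt"), "w") as f:
        f.write("0.9\n0.2\n0.6\n0.4")

    auc = private_codalab_wrapper_binary_classification(
        AUC, "AUC", fold1="truth", f1="truth.txt",
        fold2="submission", f2="answer.txt",
        output="scores.txt", use_print=True)
    # scores.txt now contains "AUC:1.0": every positive is ranked above every negative
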
def AUC(answers, scores):
    """
    Computes the `AUC
    <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>`_.

    @param      answers     expected answers 0 (false), 1 (true)
    @param      scores      score obtained for class 1
    @return                 number
    """
    ab = list(zip(answers, scores))
    plus = [s for a, s in ab if a == 1]
    moins = [s for a, s in ab if a != 1]
    auc = 0
    for p in plus:
        for m in moins:
            if p > m:
                auc += 2
            elif p == m:
                auc += 1
    den = len(plus) * len(moins)
    if den == 0:
        return 1.0 if len(moins) == 0 else 0.0
    return auc * 1.0 / (den * 2)

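A quick check of the pairwise definition above: each (positive, negative) pair
counts 1 when the positive gets the higher score and 1/2 for a tie::

    from ensae_projects.ml.competitions import AUC

    answers = [1, 1, 0, 0]
    scores = [0.8, 0.5, 0.5, 0.1]
    # pairs (positive, negative): (0.8, 0.5), (0.8, 0.1) and (0.5, 0.1) win,
    # (0.5, 0.5) is a tie -> (3 + 0.5) / 4 = 0.875
    print(AUC(answers, scores))  # 0.875
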
def AUC_multi(answers, scores, ignored=None):
    """
    Computes the `AUC
    <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>`_
    of correct versus incorrect predictions.

    @param      answers     expected answers, one class (string) per observation
    @param      scores      predictions, one ``(class, score)`` pair per observation
    @param      ignored     ignored classes
    @return                 number
    """
    if ignored is None:
        ignored = []
    # keep answers and scores aligned when some classes are ignored and
    # pass the numeric score (not the whole (class, score) tuple) to AUC
    pairs = [(1 if s[0] == a else 0, s[1])
             for (a, s) in zip(answers, scores) if a not in ignored]
    new_answers = [a for a, _ in pairs]
    new_scores = [s for _, s in pairs]
    return AUC(new_answers, new_scores)

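A small sketch of the expected input layout: one true class per row on one side,
one ``(predicted class, score)`` pair per row on the other; the AUC then measures
how well the score separates correct from incorrect predictions::

    from ensae_projects.ml.competitions import AUC_multi

    answers = ["spam", "ham", "spam"]
    scores = [("spam", 0.9), ("spam", 0.8), ("ham", 0.3)]
    # only the first prediction is correct and it carries the highest score
    print(AUC_multi(answers, scores))  # 1.0
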
def AUC_multi_multi(nb, answers, scores, ignored=None):
    """
    Computes the `AUC
    <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>`_
    and the error rate for several variables at once.

    @param      nb          number of variables (columns of classes in each row)
    @param      answers     expected answers, one tuple of ``nb`` classes (strings) per row
    @param      scores      predictions, one row of ``nb`` predicted classes followed by ``nb`` scores
    @param      ignored     ignored classes
    @return                 list of numbers, the error rate and the AUC for each variable

    Dummy expected classes (both classes):

    ::

        endettement 4.0
        surendettement 4.0
        surendettement 4.0
        surendettement 4.0

    Dummy predicted answers:

    ::

        2.0 endettement 0.48775936896183714 0.5033579692108108
        5.0 microcredit social 0.16592396695909017 0.8643847837801871
        5.0 microcredit personnel 0.7962830470795325 0.6233706526012659
        3.0 impayes 0.17370233487556486 0.779432954126955
    """
    res = []
    for i in range(0, nb):
        ta = [a[i] for a in answers]
        ts = [(a[i], a[nb + i]) for a in scores]
        auc = AUC_multi(ta, ts, ignored)
        err = sum(1 if a != s[0] else 0 for (a, s) in zip(ta, ts))
        res.append(err * 1.0 / len(ta))
        res.append(auc)
    return res

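A small sketch with two variables: each answer row holds the ``nb`` expected
classes, each score row holds the ``nb`` predicted classes followed by their
``nb`` scores, and the result alternates error rate and AUC per variable::

    from ensae_projects.ml.competitions import AUC_multi_multi

    answers = [["a", "x"], ["b", "x"], ["a", "y"]]
    scores = [["a", "x", 0.9, 0.8],
              ["a", "y", 0.7, 0.9],
              ["b", "y", 0.2, 0.6]]
    # returns [ERR_1, AUC_1, ERR_2, AUC_2], here roughly [0.667, 1.0, 0.333, 0.0]
    print(AUC_multi_multi(2, answers, scores))
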
def private_codalab_wrapper_multi_classification(fct, variables_name, fold1, fold2, f1="answer.txt", f2="answer.txt",
                                                 output="scores.txt", use_print=False, ignored=None):
    """
    Wraps the function following the guidelines
    `User_Building a Scoring Program for a Competition
    <https://github.com/codalab/codalab-competitions/wiki/User_Building-a-Scoring-Program-for-a-Competition>`_.
    It replicates the example available at
    `competition-examples/hello_world <https://github.com/Tivix/competition-examples/tree/master/hello_world/competition>`_.

    @param      fct                 function to wrap
    @param      variables_name      variable names
    @param      fold1               folder containing the truth file
    @param      fold2               folder containing the submitted answers
    @param      f1                  filename for the truth
    @param      f2                  filename for the produced answers
    @param      output              output file where the results are written
    @param      use_print           display intermediate results
    @param      ignored             ignored labels
    @return                         metric
    """
    f1 = os.path.join(fold1, f1)
    f2 = os.path.join(fold2, f2)
    if not os.path.exists(f1):
        raise FileNotFoundError("unable to find '{0}'".format(f1))
    if not os.path.exists(f2):
        raise FileNotFoundError("unable to find '{0}'".format(f2))
    if f1 == f2:
        raise ValueError(
            "answers and scores are the same file: '{0}'".format(f1))

    def pair_process(row):
        # the second half of each row contains the scores, converted to float
        for i in range(len(row) // 2, len(row)):
            row[i] = float(row[i])
        return row

    with open(f1, "r") as f:
        lines = f.readlines()
    answers = [_.strip(" \r\n").split("\t") for _ in lines if _]

    if use_print:
        print("Reading answers:", f1, len(answers), "rows")
        print("First answers:", answers[:10])

    with open(f2, "r") as f:
        lines = f.readlines()

    scores = [pair_process(_.strip(" \r\n").split("\t")) for _ in lines if _]
    if use_print:
        print("Reading scores:", f2, len(scores), "rows")
        print("First scores:", scores[:10])

    metric = fct(len(variables_name), answers, scores, ignored=ignored)
    all_names = []
    for v in variables_name:
        all_names.append("%s_ERR" % v)
        all_names.append("%s_AUC" % v)

    res = "\n".join(["{0}:{1}".format(mn, m)
                     for (mn, m) in zip(all_names, metric)])
    if use_print:
        print("Results=", res)
    with open(output, "w") as f:
        f.write(res)
    if use_print:
        print("Wrote", res, "in", output)
    return metric

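A local sketch of the expected tab-separated file format, assuming hypothetical
folders ``truth/`` and ``submission/`` and a single variable called ``label``:
the truth file holds one class per variable and per row, the submission holds
the predicted classes followed by their scores::

    import os
    from ensae_projects.ml.competitions import (
        private_codalab_wrapper_multi_classification, AUC_multi_multi)

    os.makedirs("truth", exist_ok=True)
    os.makedirs("submission", exist_ok=True)
    with open(os.path.join("truth", "truth.txt"), "w") as f:
        f.write("a\nb\na")
    with open(os.path.join("submission", "answer.txt"), "w") as f:
        f.write("a\t0.9\na\t0.7\nb\t0.2")

    metrics = private_codalab_wrapper_multi_classification(
        AUC_multi_multi, ["label"], fold1="truth", f1="truth.txt",
        fold2="submission", f2="answer.txt", output="scores.txt")
    # scores.txt now contains one line per metric: label_ERR:... and label_AUC:...
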
def main_codalab_wrapper_multi_classification(fct, variables_name, argv, truth_file="truth.txt",
                                              submission_file="answer.txt", output_file="scores.txt"):
    """
    Adapts the template available at
    `evaluate.py <https://github.com/Tivix/competition-examples/blob/master/hello_world/competition/scoring_program/evaluate.py>`_.
    """
    input_dir = argv[1]
    output_dir = argv[2]

    submit_dir = os.path.join(input_dir, 'res')
    truth_dir = os.path.join(input_dir, 'ref')

    if not os.path.isdir(submit_dir):
        raise FileNotFoundError("%s doesn't exist" % submit_dir)

    if os.path.isdir(submit_dir) and os.path.isdir(truth_dir):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        private_codalab_wrapper_multi_classification(fct, variables_name,
                                                     fold1=truth_dir, f1=truth_file,
                                                     fold2=submit_dir, f2=submission_file,
                                                     output=os.path.join(
                                                         output_dir, output_file),
                                                     ignored=["nul"])
    else:
        raise FileNotFoundError(
            "{0} or {1} is not a folder".format(submit_dir, truth_dir))