Coverage for src/ensae_projects/ml/competitions.py: 67%

123 statements  

coverage.py v7.1.0, created at 2023-07-20 04:37 +0200

1""" 

2@file 

3@brief Compute metrics in for a competition 

4""" 

5import os 

6 

7 

def main_codalab_wrapper_binary_classification(fct, metric_name, argv, truth_file="truth.txt",
                                               submission_file="answer.txt", output_file="scores.txt"):
    """
    Adapts the template available at
    `evaluate.py <https://github.com/Tivix/competition-examples/blob/master/hello_world/competition/scoring_program/evaluate.py>`_.
    """
    input_dir = argv[1]
    output_dir = argv[2]

    submit_dir = os.path.join(input_dir, 'res')
    truth_dir = os.path.join(input_dir, 'ref')

    if not os.path.isdir(submit_dir):
        raise FileNotFoundError("%s doesn't exist" % submit_dir)

    if os.path.isdir(submit_dir) and os.path.isdir(truth_dir):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        private_codalab_wrapper_binary_classification(fct, metric_name,
                                                      fold1=truth_dir, f1=truth_file,
                                                      fold2=submit_dir, f2=submission_file,
                                                      output=os.path.join(output_dir, output_file))
    else:
        raise FileNotFoundError(
            "{0} or {1} is not a folder".format(submit_dir, truth_dir))

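A minimal sketch (not part of the original module) of a CodaLab ``evaluate.py``
entry point built on this wrapper, assuming the competition bundle provides
``ref/truth.txt`` and ``res/answer.txt``::

    import sys
    from ensae_projects.ml.competitions import (
        main_codalab_wrapper_binary_classification, AUC)

    if __name__ == "__main__":
        # argv[1] is the input folder (containing 'ref/' and 'res/'),
        # argv[2] is the output folder where 'scores.txt' is written
        main_codalab_wrapper_binary_classification(AUC, "AUC", sys.argv)
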
def private_codalab_wrapper_binary_classification(fct, metric_name, fold1, fold2, f1="answer.txt", f2="answer.txt",
                                                  output="scores.txt", use_print=False):
    """
    Wraps the function following the guidelines
    `User_Building a Scoring Program for a Competition
    <https://github.com/codalab/codalab-competitions/wiki/User_Building-a-Scoring-Program-for-a-Competition>`_.
    It replicates the example available at
    `competition-examples/hello_world <https://github.com/Tivix/competition-examples/tree/master/hello_world/competition>`_.

    @param      fct             function to wrap
    @param      metric_name     metric name
    @param      fold1           folder containing the truth file
    @param      fold2           folder containing the submitted answers
    @param      f1              filename for the truth
    @param      f2              filename for the produced answers
    @param      output          output file where the results are written
    @param      use_print       display intermediate results
    @return                     metric
    """
    f1 = os.path.join(fold1, f1)
    f2 = os.path.join(fold2, f2)
    if not os.path.exists(f1):
        raise FileNotFoundError("unable to find '{0}'".format(f1))
    if not os.path.exists(f2):
        raise FileNotFoundError("unable to find '{0}'".format(f2))
    if f1 == f2:
        raise ValueError(
            "answers and scores are the same file: '{0}'".format(f1))

    with open(f1, "r") as f:
        lines = f.readlines()
    answers = [float(_) for _ in lines if _]
    if use_print:
        print("Reading answers:", f1, len(answers), "rows")
        print("First answers:", answers[:10])

    with open(f2, "r") as f:
        lines = f.readlines()
    scores = [float(_) for _ in lines if _]
    if use_print:
        print("Reading scores:", f2, len(scores), "rows")
        print("First scores:", scores[:10])

    metric = fct(answers, scores)
    res = "{0}:{1}".format(metric_name, metric)
    if use_print:
        print("Results=", res)
    with open(output, "w") as f:
        f.write(res)
    if use_print:
        print("Wrote", res, "in", output)
    return metric

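The wrapper can also be exercised locally, outside CodaLab. A small sketch,
assuming hypothetical folders ``truth/`` and ``submission/`` created on the fly,
with one value per line in each file::

    import os
    from ensae_projects.ml.competitions import (
        private_codalab_wrapper_binary_classification, AUC)

    os.makedirs("truth", exist_ok=True)
    os.makedirs("submission", exist_ok=True)
    with open(os.path.join("truth", "truth.txt"), "w") as f:
        f.write("1\n0\n1\n0")
    with open(os.path.join("submission", "answer.txt"), "w") as f:
        f.write("0.9\n0.2\n0.6\n0.4")

    auc = private_codalab_wrapper_binary_classification(
        AUC, "AUC", fold1="truth", f1="truth.txt",
        fold2="submission", f2="answer.txt",
        output="scores.txt", use_print=True)
    # scores.txt now contains "AUC:1.0": every positive is ranked above every negative
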
def AUC(answers, scores):
    """
    Computes the `AUC
    <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>`_.

    @param      answers     expected answers 0 (false), 1 (true)
    @param      scores      score obtained for class 1
    @return                 number
    """
    ab = list(zip(answers, scores))
    plus = [s for a, s in ab if a == 1]
    moins = [s for a, s in ab if a != 1]
    auc = 0
    for p in plus:
        for m in moins:
            if p > m:
                auc += 2
            elif p == m:
                auc += 1
    den = len(plus) * len(moins)
    if den == 0:
        return 1.0 if len(moins) == 0 else 0.0
    return auc * 1.0 / (den * 2)

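A quick check of the pairwise definition above: each (positive, negative) pair
counts 1 when the positive gets the higher score and 1/2 for a tie::

    from ensae_projects.ml.competitions import AUC

    answers = [1, 1, 0, 0]
    scores = [0.8, 0.5, 0.5, 0.1]
    # pairs (positive, negative): (0.8, 0.5), (0.8, 0.1) and (0.5, 0.1) win,
    # (0.5, 0.5) is a tie -> (3 + 0.5) / 4 = 0.875
    print(AUC(answers, scores))  # 0.875
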
def AUC_multi(answers, scores, ignored=None):
    """
    Computes the `AUC
    <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>`_
    of correct versus incorrect predictions.

    @param      answers     expected answers, one class (string) per observation
    @param      scores      predictions, one ``(class, score)`` pair per observation
    @param      ignored     ignored classes
    @return                 number
    """
    if ignored is None:
        ignored = []
    # keep answers and scores aligned when some classes are ignored and
    # pass the numeric score (not the whole (class, score) tuple) to AUC
    pairs = [(1 if s[0] == a else 0, s[1])
             for (a, s) in zip(answers, scores) if a not in ignored]
    new_answers = [a for a, _ in pairs]
    new_scores = [s for _, s in pairs]
    return AUC(new_answers, new_scores)

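A small sketch of the expected input layout: one true class per row on one side,
one ``(predicted class, score)`` pair per row on the other; the AUC then measures
how well the score separates correct from incorrect predictions::

    from ensae_projects.ml.competitions import AUC_multi

    answers = ["spam", "ham", "spam"]
    scores = [("spam", 0.9), ("spam", 0.8), ("ham", 0.3)]
    # only the first prediction is correct and it carries the highest score
    print(AUC_multi(answers, scores))  # 1.0
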
def AUC_multi_multi(nb, answers, scores, ignored=None):
    """
    Computes the `AUC
    <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>`_
    and the error rate for several variables at once.

    @param      nb          number of variables (columns of classes in each row)
    @param      answers     expected answers, one tuple of ``nb`` classes (strings) per row
    @param      scores      predictions, one row of ``nb`` predicted classes followed by ``nb`` scores
    @param      ignored     ignored classes
    @return                 list of numbers, the error rate and the AUC for each variable

    Dummy expected classes (both classes):

    ::

        endettement 4.0
        surendettement 4.0
        surendettement 4.0
        surendettement 4.0

    Dummy predicted answers:

    ::

        2.0 endettement 0.48775936896183714 0.5033579692108108
        5.0 microcredit social 0.16592396695909017 0.8643847837801871
        5.0 microcredit personnel 0.7962830470795325 0.6233706526012659
        3.0 impayes 0.17370233487556486 0.779432954126955
    """
    res = []
    for i in range(0, nb):
        ta = [a[i] for a in answers]
        ts = [(a[i], a[nb + i]) for a in scores]
        auc = AUC_multi(ta, ts, ignored)
        err = sum(1 if a != s[0] else 0 for (a, s) in zip(ta, ts))
        res.append(err * 1.0 / len(ta))
        res.append(auc)
    return res

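A small sketch with two variables: each answer row holds the ``nb`` expected
classes, each score row holds the ``nb`` predicted classes followed by their
``nb`` scores, and the result alternates error rate and AUC per variable::

    from ensae_projects.ml.competitions import AUC_multi_multi

    answers = [["a", "x"], ["b", "x"], ["a", "y"]]
    scores = [["a", "x", 0.9, 0.8],
              ["a", "y", 0.7, 0.9],
              ["b", "y", 0.2, 0.6]]
    # returns [ERR_1, AUC_1, ERR_2, AUC_2], here roughly [0.667, 1.0, 0.333, 0.0]
    print(AUC_multi_multi(2, answers, scores))
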
def private_codalab_wrapper_multi_classification(fct, variables_name, fold1, fold2, f1="answer.txt", f2="answer.txt",
                                                 output="scores.txt", use_print=False, ignored=None):
    """
    Wraps the function following the guidelines
    `User_Building a Scoring Program for a Competition
    <https://github.com/codalab/codalab-competitions/wiki/User_Building-a-Scoring-Program-for-a-Competition>`_.
    It replicates the example available at
    `competition-examples/hello_world <https://github.com/Tivix/competition-examples/tree/master/hello_world/competition>`_.

    @param      fct                 function to wrap
    @param      variables_name      variable names
    @param      fold1               folder containing the truth file
    @param      fold2               folder containing the submitted answers
    @param      f1                  filename for the truth
    @param      f2                  filename for the produced answers
    @param      output              output file where the results are written
    @param      use_print           display intermediate results
    @param      ignored             ignored labels
    @return                         metric
    """
    f1 = os.path.join(fold1, f1)
    f2 = os.path.join(fold2, f2)
    if not os.path.exists(f1):
        raise FileNotFoundError("unable to find '{0}'".format(f1))
    if not os.path.exists(f2):
        raise FileNotFoundError("unable to find '{0}'".format(f2))
    if f1 == f2:
        raise ValueError(
            "answers and scores are the same file: '{0}'".format(f1))

    def pair_process(row):
        # the second half of each row contains the scores, converted to float
        for i in range(len(row) // 2, len(row)):
            row[i] = float(row[i])
        return row

    with open(f1, "r") as f:
        lines = f.readlines()
    answers = [_.strip(" \r\n").split("\t") for _ in lines if _]

    if use_print:
        print("Reading answers:", f1, len(answers), "rows")
        print("First answers:", answers[:10])

    with open(f2, "r") as f:
        lines = f.readlines()

    scores = [pair_process(_.strip(" \r\n").split("\t")) for _ in lines if _]
    if use_print:
        print("Reading scores:", f2, len(scores), "rows")
        print("First scores:", scores[:10])

    metric = fct(len(variables_name), answers, scores, ignored=ignored)
    all_names = []
    for v in variables_name:
        all_names.append("%s_ERR" % v)
        all_names.append("%s_AUC" % v)

    res = "\n".join(["{0}:{1}".format(mn, m)
                     for (mn, m) in zip(all_names, metric)])
    if use_print:
        print("Results=", res)
    with open(output, "w") as f:
        f.write(res)
    if use_print:
        print("Wrote", res, "in", output)
    return metric

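A local sketch of the expected tab-separated file format, assuming hypothetical
folders ``truth/`` and ``submission/`` and a single variable called ``label``:
the truth file holds one class per variable and per row, the submission holds
the predicted classes followed by their scores::

    import os
    from ensae_projects.ml.competitions import (
        private_codalab_wrapper_multi_classification, AUC_multi_multi)

    os.makedirs("truth", exist_ok=True)
    os.makedirs("submission", exist_ok=True)
    with open(os.path.join("truth", "truth.txt"), "w") as f:
        f.write("a\nb\na")
    with open(os.path.join("submission", "answer.txt"), "w") as f:
        f.write("a\t0.9\na\t0.7\nb\t0.2")

    metrics = private_codalab_wrapper_multi_classification(
        AUC_multi_multi, ["label"], fold1="truth", f1="truth.txt",
        fold2="submission", f2="answer.txt", output="scores.txt")
    # scores.txt now contains one line per metric: label_ERR:... and label_AUC:...
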
def main_codalab_wrapper_multi_classification(fct, variables_name, argv, truth_file="truth.txt",
                                              submission_file="answer.txt", output_file="scores.txt"):
    """
    Adapts the template available at
    `evaluate.py <https://github.com/Tivix/competition-examples/blob/master/hello_world/competition/scoring_program/evaluate.py>`_.
    """
    input_dir = argv[1]
    output_dir = argv[2]

    submit_dir = os.path.join(input_dir, 'res')
    truth_dir = os.path.join(input_dir, 'ref')

    if not os.path.isdir(submit_dir):
        raise FileNotFoundError("%s doesn't exist" % submit_dir)

    if os.path.isdir(submit_dir) and os.path.isdir(truth_dir):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        private_codalab_wrapper_multi_classification(fct, variables_name,
                                                     fold1=truth_dir, f1=truth_file,
                                                     fold2=submit_dir, f2=submission_file,
                                                     output=os.path.join(
                                                         output_dir, output_file),
                                                     ignored=["nul"])
    else:
        raise FileNotFoundError(
            "{0} or {1} is not a folder".format(submit_dir, truth_dir))