Coverage for mlprodict/onnxrt/validate/validate_summary.py: 94%

238 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-04 02:28 +0100

1""" 

2@file 

3@brief Summarizes results produced by functions in *validate.py*. 

4""" 

5import decimal 

6import json 

7import numpy 

8import pandas 

9from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version 

10from ... import __version__ as ort_version 

11 

12 

def _clean_values_optim(val):
    """
    Cleans up a string describing a conversion optimisation so that it
    displays nicely in a summary table.

    @param val any value; non strings are returned unchanged
    @return cleaned value

    Composite values separated by ``/`` are cleaned component by
    component, text before a ``'>=`` marker is dropped, and the verbose
    ``{'optim': 'cdist'}`` option is shortened into ``cdist``.
    """
    if not isinstance(val, str):
        return val
    if '/' in val:
        # composite value: clean every component independently
        spl = val.split('/')
        return "/".join(_clean_values_optim(v) for v in spl)
    if "'>=" in val:
        # Keep only the text after the last marker. The previous code
        # only handled exactly one occurrence and left *val* as a list
        # otherwise, which made the .replace calls below fail.
        val = val.split("'>=")[-1]
    rep = {
        "{'optim': 'cdist'}": "cdist"
    }
    for k, v in rep.items():
        val = val.replace(k, v)
    return val

29 

30 

def _summary_report_indices(df, add_cols=None, add_index=None):
    """
    Builds the column/index lists used to pivot a benchmark dataframe.

    Normalizes *df* in place (missing grouping columns are added,
    NaN values are replaced by displayable defaults) and returns the
    triple *(columns, indices, col_values)* expected by
    :func:`pandas.pivot_table`.

    @param df benchmark dataframe, must contain an 'opset' column
    @param add_cols unused, kept for interface compatibility
    @param add_index additional columns appended to the index
    @return tuple *(columns, indices, col_values)*
    """
    if 'opset' not in df.columns:
        raise RuntimeError(  # pragma: no cover
            f"Unable to create summary (opset missing)\n{df.columns}\n--\n{df.head()}")

    col_values = ["available"]
    # make sure the usual grouping columns exist
    for col in ['problem', 'scenario', 'opset', 'optim']:
        if col not in df.columns:
            df[col] = '' if col != 'opset' else numpy.nan
    indices = ["name", "problem", "scenario", 'optim', 'method_name',
               'output_index', 'conv_options', 'inst']
    indices = [i for i in indices if i in df.columns]
    df["optim"] = df["optim"].fillna('')
    for c in ['n_features', 'runtime']:
        if c in df.columns:
            indices.append(c)
            if c == 'runtime':
                # plain assignment instead of fillna(inplace=True) on a
                # column selection, which is chained assignment and is
                # deprecated by recent pandas versions
                df[c] = df[c].fillna('-')
    for c in df.columns:
        if c.startswith('opset') or c in {'available'}:
            df[c] = df[c].fillna('?')

    # Adds information about the models in the index
    indices2 = []
    for c in df.columns:
        if (isinstance(c, str) and len(c) >= 5 and (
                c.startswith("onx_") or c.startswith("skl_"))):
            if c in {'onx_domain', 'onx_doc_string', 'onx_ir_version',
                     'onx_model_version'}:
                continue
            # numeric model attributes default to -1, others to ''
            if df[c].dtype in (numpy.float32, numpy.float64, float,
                               int, numpy.int32, numpy.int64):
                defval = -1
            else:
                defval = ''
            df[c] = df[c].fillna(defval)
            if c.startswith('skl_'):
                indices.append(c)
            else:
                indices2.append(c)

    columns = ['opset']
    indices = indices + indices2
    if add_index is not None:
        for i in add_index:  # pragma: no cover
            if i not in indices:
                indices.append(i)
    return columns, indices, col_values

79 

80 

class _MyEncoder(json.JSONEncoder):
    """
    JSON encoder able to serialize objects exposing *get_params*
    (scikit-learn estimators): the object is rendered as a sorted JSON
    string containing its class name and its parameters.
    """

    def default(self, o):  # pylint: disable=E0202
        if hasattr(o, 'get_params'):
            return json.dumps(
                {'clsname': o.__class__.__name__, **o.get_params()},
                sort_keys=True)
        return json.dumps(o, sort_keys=True)  # pragma: no cover

88 

89 

def _jsonify(x):
    """
    Converts *x* into a JSON string suitable for a dataframe cell.

    Dictionary keys that are types are replaced by their name and *NaN*
    values are mapped to an empty string before serialization.

    @param x any value (dictionary, scalar, estimator, ...)
    @return JSON string
    """

    def _l(k):
        # a type used as a dictionary key is serialized through its name
        if isinstance(k, type):
            return k.__name__
        return k

    if isinstance(x, dict):
        x = {str(_l(k)): v for k, v in x.items()}
        try:
            return json.dumps(x, sort_keys=True, cls=_MyEncoder)
        except TypeError:  # pragma: no cover
            # Cannot sort.
            return json.dumps(x, cls=_MyEncoder)
    try:
        # NaN has no portable JSON representation, use an empty string
        if numpy.isnan(x):
            x = ''
    except (ValueError, TypeError):
        # x is not a number (or is an array) - keep it as is
        pass
    # The previous code retried the same call in a try/except TypeError
    # block; since the fallback was identical to the protected call,
    # a single call is equivalent.
    return json.dumps(x, cls=_MyEncoder)

114 

115 

def summary_report(df, add_cols=None, add_index=None):
    """
    Finalizes the results computed by function
    @see fn enumerate_validated_operator_opsets.

    @param df dataframe
    @param add_cols additional columns to take into account
        as values
    @param add_index additional columns to take into account
        as index
    @return pivoted dataframe

    The outcome can be seen at page about :ref:`l-onnx-pyrun`.
    """
    # Work on a copy: _summary_report_indices mutates the dataframe.
    df = df.copy()
    # Estimators and option dictionaries must be hashable/displayable
    # before pivoting, hence the JSON serialization.
    if 'inst' in df.columns:
        df['inst'] = df['inst'].apply(_jsonify)
    if 'conv_options' in df.columns:
        df['conv_options'] = df['conv_options'].apply(_jsonify)
    num_types = (int, float, decimal.Decimal, numpy.number)

    def aggfunc(values):
        # Aggregation used by pivot_table: numeric groups are rendered
        # as a single value or an interval '[mi,ma]', other groups as a
        # ' // '-joined list of their distinct string representations.
        if len(values) != 1:
            if all(map(lambda x: isinstance(x, num_types),
                       values)):
                mi, ma = min(values), max(values)
                if numpy.isnan(mi) and numpy.isnan(ma):
                    return ""
                if mi == ma:
                    return mi
                return f'[{mi},{ma}]'
            values = [str(_).replace("\n", " ").replace('\r', '').strip(" ")
                      for _ in values]
            values = [_ for _ in values if _]
            vals = set(values)
            if len(vals) != 1:
                return " // ".join(map(str, values))
        # single value (or all values equal): values may be a pandas
        # Series or a plain list at this point
        val = values.iloc[0] if not isinstance(values, list) else values[0]
        if isinstance(val, float) and numpy.isnan(val):
            return ""
        return str(val)

    columns, indices, col_values = _summary_report_indices(
        df, add_cols=add_cols, add_index=add_index)
    try:
        piv = pandas.pivot_table(df, values=col_values,
                                 index=indices, columns=columns,
                                 aggfunc=aggfunc).reset_index(drop=False)
    except (KeyError, TypeError) as e:  # pragma: no cover
        raise RuntimeError(
            "Issue with keys={}, values={}\namong {}.".format(
                indices, col_values, df.columns)) from e

    # The pivot produces MultiIndex columns ('available', <opset>);
    # rebuild flat names 'opset<N>' from the numeric second level.
    cols = list(piv.columns)
    opsets = [c[1] for c in cols if isinstance(c[1], (int, float))]

    versions = ["opset%d" % i for i in opsets]
    last = piv.columns[-1]
    # column ('available', '?') gathers rows with an unknown opset,
    # i.e. models whose conversion failed
    if isinstance(last, tuple) and last == ('available', '?'):
        versions.append('FAIL')
    nbvalid = len(indices + versions)
    if len(piv.columns) != nbvalid:
        raise RuntimeError(  # pragma: no cover
            "Mismatch between {} != {}\n{}\n{}\n---\n{}\n{}\n{}".format(
                len(piv.columns), len(indices + versions),
                piv.columns, indices + versions,
                df.columns, indices, col_values))
    piv.columns = indices + versions
    # most recent opset columns first
    piv = piv[indices + list(reversed(versions))].copy()
    for c in versions:
        piv[c].fillna('-', inplace=True)

    if "available-ERROR" in df.columns:

        from skl2onnx.common.exceptions import MissingShapeCalculator  # delayed

        def replace_msg(text):
            # maps known error messages to short labels
            if isinstance(text, MissingShapeCalculator):
                return "NO CONVERTER"  # pragma: no cover
            if str(text).startswith("Unable to find a shape calculator for type '"):
                return "NO CONVERTER"
            if str(text).startswith("Unable to find problem for model '"):
                return "NO PROBLEM"  # pragma: no cover
            if "not implemented for float64" in str(text):
                return "NO RUNTIME 64"  # pragma: no cover
            return str(text)

        piv2 = pandas.pivot_table(
            df, values="available-ERROR", index=indices,
            columns='opset', aggfunc=aggfunc).reset_index(drop=False)

        # last column corresponds to the highest opset
        col = piv2.iloc[:, piv2.shape[1] - 1]
        piv["ERROR-msg"] = col.apply(replace_msg)

    if any('time-ratio-' in c for c in df.columns):
        cols = [c for c in df.columns if c.startswith('time-ratio')]
        cols.sort()

        # average the time ratios per index and attach them to the pivot
        df_sub = df[indices + cols]
        piv2 = df_sub.groupby(indices).mean()
        piv = piv.merge(piv2, on=indices, how='left')

        def rep(c):
            # 'N=1' columns (but not 'N=10') show the ONNX/sklearn ratio
            if 'N=1' in c and 'N=10' not in c:
                return c.replace("time-ratio-", "RT/SKL-")
            else:
                return c.replace("time-ratio-", "")
        cols = [rep(c) for c in piv.columns]
        piv.columns = cols

        # min, max: move the '-min'/'-max' columns (interleaved) to the end
        mins = [c for c in piv.columns if c.endswith('-min')]
        maxs = [c for c in piv.columns if c.endswith('-max')]
        combined = []
        for mi, ma in zip(mins, maxs):
            combined.append(mi)
            combined.append(ma)
        first = [c for c in piv.columns if c not in combined]
        piv = piv[first + combined]

    def clean_values(value):
        # shortens status strings displayed in the opset columns
        if not isinstance(value, str):
            return value  # pragma: no cover
        if "ERROR->=1000000" in value:
            value = "big-diff"
        elif "ERROR" in value:
            value = value.replace("ERROR-_", "")
            value = value.replace("_exc", "")
            value = "ERR: " + value
        elif "OK-" in value:
            value = value.replace("OK-", "OK ")
        elif "e<" in value:
            value = value.replace("-", " ")
        return value

    for c in piv.columns:
        if "opset" in c:
            piv[c] = piv[c].apply(clean_values)
        if 'optim' in c:
            piv[c] = piv[c].apply(_clean_values_optim)

    # adding versions
    def keep_values(x):
        # drops NaN before checking a version column is constant
        if isinstance(x, float) and numpy.isnan(x):
            return False  # pragma: no cover
        return True

    col_versions = [c for c in df.columns if c.startswith("v_")]
    if len(col_versions) > 0:
        for c in col_versions:
            vals = set(filter(keep_values, df[c]))
            if len(vals) != 1:
                raise RuntimeError(  # pragma: no cover
                    f"Columns '{c}' has multiple values {vals}.")
            piv[c] = list(vals)[0]

    return piv

273 

274 

def merge_benchmark(dfs, column='runtime', baseline=None, suffix='-base'):
    """
    Merges several benchmarks run with command line
    :ref:`validate_runtime <l-cmd-validate_runtime>`.

    @param dfs dictionary *{'prefix': dataframe}*
    @param column every value from this column is prefixed
        by the given key in *dfs*
    @param baseline add baseline
    @param suffix suffix to add when comparing to the baseline
    @return merged dataframe
    """
    def add_prefix(prefix, v):
        # prefixes string values only, anything else is left unchanged
        if isinstance(v, str):
            return prefix + v
        return v  # pragma: no cover

    # concatenate all dataframes, prefixing *column* with the dict key
    conc = []
    for k, df in dfs.items():
        if column not in df.columns:
            raise ValueError(
                f"Unable to find column '{column}' in {df.columns} (key='{k}')")
        df = df.copy()
        df[column] = df[column].apply(lambda x: add_prefix(k, x))
        if 'inst' in df.columns:
            df['inst'] = df['inst'].fillna('')
        else:
            df['inst'] = ''
        conc.append(df)
    merged = pandas.concat(conc).reset_index(drop=True)
    if baseline is not None:
        def get_key(index):
            # builds a hashable key from an index tuple, skipping NaN
            k = []
            for v in index:
                try:
                    if numpy.isnan(v):
                        continue  # pragma: no cover
                except (ValueError, TypeError):
                    pass
                k.append(v)
            return tuple(k)

        # NOTE(review): _summary_report_indices mutates *merged* in
        # place (fills NaN); presumably intended here, confirm.
        columns, indices, _ = _summary_report_indices(merged)
        indices = list(_ for _ in (indices + columns) if _ != 'runtime')
        try:
            # baseline rows indexed by everything but the runtime;
            # verify_integrity raises if the key is not unique
            bdata = merged[merged.runtime == baseline].drop(
                'runtime', axis=1).set_index(indices, verify_integrity=True)
        except ValueError as e:
            # build a diagnostic showing which keys are duplicated
            bdata2 = merged[indices + ['runtime']].copy()
            bdata2['count'] = 1
            n_rows = bdata2['count'].sum()
            gr = bdata2.groupby(indices + ['runtime'], as_index=False).sum(
                ).sort_values('count', ascending=False)
            n_rows2 = gr['count'].sum()
            one = gr.head()[:1]
            rows = merged.merge(one, on=indices + ['runtime'])[:2]
            for c in ['init-types', 'bench-skl', 'bench-batch', 'init_types', 'cl']:
                if c in rows.columns:
                    rows = rows.drop(c, axis=1)
            srows = rows.T.to_string(min_rows=100)
            raise ValueError(
                "(n_rows={}, n_rows2={}) Unable to group by {}.\n{}\n-------\n{}".format(
                    n_rows, n_rows2, indices, gr.T, srows)) from e
        if bdata.shape[0] == 0:
            raise RuntimeError(  # pragma: no cover
                f"No result for baseline '{baseline}'.")
        ratios = [c for c in merged.columns if c.startswith('time-ratio-')]
        # map baseline key -> its time-ratio values
        indexed = {}
        for index in bdata.index:
            row = bdata.loc[index, :]
            key = get_key(index)
            indexed[key] = row[ratios]

        # divide every row's ratios by the matching baseline row,
        # stored in new '<ratio><suffix>' columns
        for i in range(merged.shape[0]):
            key = get_key(tuple(merged.loc[i, indices]))
            if key not in indexed:
                continue  # pragma: no cover
            value = indexed[key]
            for r in ratios:
                if r.endswith('-min') or r.endswith('-max'):
                    continue
                value2 = merged.loc[i, r]
                new_r = value2 / value[r]
                new_col = r + suffix
                if new_col not in merged.columns:
                    merged[new_col] = numpy.nan
                merged.loc[i, new_col] = new_r

    return merged