Coverage for mlprodict/onnxrt/validate/validate_summary.py: 94%

238 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-04 02:28 +0100

1""" 

2@file 

3@brief Summarizes results produced by functions in *validate.py*. 

4""" 

5import decimal 

6import json 

7import numpy 

8import pandas 

9from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version 

10from ... import __version__ as ort_version 

11 

12 

def _clean_values_optim(val):
    """
    Cleans up a string describing a conversion optimisation so that it
    displays nicely in a summary table.

    @param val any value; non strings are returned unchanged
    @return cleaned value

    Composite values separated by ``/`` are cleaned component by
    component, text before a ``'>=`` marker is dropped, and the verbose
    ``{'optim': 'cdist'}`` option is shortened into ``cdist``.
    """
    if not isinstance(val, str):
        return val
    if '/' in val:
        # composite value: clean every component independently
        spl = val.split('/')
        return "/".join(_clean_values_optim(v) for v in spl)
    if "'>=" in val:
        # Keep only the text after the last marker. The previous code
        # only handled exactly one occurrence and left *val* as a list
        # otherwise, which made the .replace calls below fail.
        val = val.split("'>=")[-1]
    rep = {
        "{'optim': 'cdist'}": "cdist"
    }
    for k, v in rep.items():
        val = val.replace(k, v)
    return val

29 

30 

def _summary_report_indices(df, add_cols=None, add_index=None):
    """
    Builds the column/index lists used to pivot a benchmark dataframe.

    Normalizes *df* in place (missing grouping columns are added,
    NaN values are replaced by displayable defaults) and returns the
    triple *(columns, indices, col_values)* expected by
    :func:`pandas.pivot_table`.

    @param df benchmark dataframe, must contain an 'opset' column
    @param add_cols unused, kept for interface compatibility
    @param add_index additional columns appended to the index
    @return tuple *(columns, indices, col_values)*
    """
    if 'opset' not in df.columns:
        raise RuntimeError(  # pragma: no cover
            f"Unable to create summary (opset missing)\n{df.columns}\n--\n{df.head()}")

    col_values = ["available"]
    # make sure the usual grouping columns exist
    for col in ['problem', 'scenario', 'opset', 'optim']:
        if col not in df.columns:
            df[col] = '' if col != 'opset' else numpy.nan
    indices = ["name", "problem", "scenario", 'optim', 'method_name',
               'output_index', 'conv_options', 'inst']
    indices = [i for i in indices if i in df.columns]
    df["optim"] = df["optim"].fillna('')
    for c in ['n_features', 'runtime']:
        if c in df.columns:
            indices.append(c)
            if c == 'runtime':
                # plain assignment instead of fillna(inplace=True) on a
                # column selection, which is chained assignment and is
                # deprecated by recent pandas versions
                df[c] = df[c].fillna('-')
    for c in df.columns:
        if c.startswith('opset') or c in {'available'}:
            df[c] = df[c].fillna('?')

    # Adds information about the models in the index
    indices2 = []
    for c in df.columns:
        if (isinstance(c, str) and len(c) >= 5 and (
                c.startswith("onx_") or c.startswith("skl_"))):
            if c in {'onx_domain', 'onx_doc_string', 'onx_ir_version',
                     'onx_model_version'}:
                continue
            # numeric model attributes default to -1, others to ''
            if df[c].dtype in (numpy.float32, numpy.float64, float,
                               int, numpy.int32, numpy.int64):
                defval = -1
            else:
                defval = ''
            df[c] = df[c].fillna(defval)
            if c.startswith('skl_'):
                indices.append(c)
            else:
                indices2.append(c)

    columns = ['opset']
    indices = indices + indices2
    if add_index is not None:
        for i in add_index:  # pragma: no cover
            if i not in indices:
                indices.append(i)
    return columns, indices, col_values

79 

80 

class _MyEncoder(json.JSONEncoder):
    """
    JSON encoder able to serialize objects exposing *get_params*
    (scikit-learn estimators): the object is rendered as a sorted JSON
    string containing its class name and its parameters.
    """

    def default(self, o):  # pylint: disable=E0202
        if hasattr(o, 'get_params'):
            return json.dumps(
                {'clsname': o.__class__.__name__, **o.get_params()},
                sort_keys=True)
        return json.dumps(o, sort_keys=True)  # pragma: no cover

88 

89 

def _jsonify(x):
    """
    Converts *x* into a JSON string suitable for a dataframe cell.

    Dictionary keys that are types are replaced by their name and *NaN*
    values are mapped to an empty string before serialization.

    @param x any value (dictionary, scalar, estimator, ...)
    @return JSON string
    """

    def _l(k):
        # a type used as a dictionary key is serialized through its name
        if isinstance(k, type):
            return k.__name__
        return k

    if isinstance(x, dict):
        x = {str(_l(k)): v for k, v in x.items()}
        try:
            return json.dumps(x, sort_keys=True, cls=_MyEncoder)
        except TypeError:  # pragma: no cover
            # Cannot sort.
            return json.dumps(x, cls=_MyEncoder)
    try:
        # NaN has no portable JSON representation, use an empty string
        if numpy.isnan(x):
            x = ''
    except (ValueError, TypeError):
        # x is not a number (or is an array) - keep it as is
        pass
    # The previous code retried the same call in a try/except TypeError
    # block; since the fallback was identical to the protected call,
    # a single call is equivalent.
    return json.dumps(x, cls=_MyEncoder)

114 

115 

def summary_report(df, add_cols=None, add_index=None):
    """
    Finalizes the results computed by function
    @see fn enumerate_validated_operator_opsets.

    @param df dataframe
    @param add_cols additional columns to take into account
        as values
    @param add_index additional columns to take into account
        as index
    @return pivoted dataframe

    The outcome can be seen at page about :ref:`l-onnx-pyrun`.
    """
    # Work on a copy: _summary_report_indices mutates the dataframe.
    df = df.copy()
    # Estimators and option dictionaries must be hashable/displayable
    # before pivoting, hence the JSON serialization.
    if 'inst' in df.columns:
        df['inst'] = df['inst'].apply(_jsonify)
    if 'conv_options' in df.columns:
        df['conv_options'] = df['conv_options'].apply(_jsonify)
    num_types = (int, float, decimal.Decimal, numpy.number)

    def aggfunc(values):
        # Aggregation used by pivot_table: numeric groups are rendered
        # as a single value or an interval '[mi,ma]', other groups as a
        # ' // '-joined list of their distinct string representations.
        if len(values) != 1:
            if all(map(lambda x: isinstance(x, num_types),
                       values)):
                mi, ma = min(values), max(values)
                if numpy.isnan(mi) and numpy.isnan(ma):
                    return ""
                if mi == ma:
                    return mi
                return f'[{mi},{ma}]'
            values = [str(_).replace("\n", " ").replace('\r', '').strip(" ")
                      for _ in values]
            values = [_ for _ in values if _]
            vals = set(values)
            if len(vals) != 1:
                return " // ".join(map(str, values))
        # single value (or all values equal): values may be a pandas
        # Series or a plain list at this point
        val = values.iloc[0] if not isinstance(values, list) else values[0]
        if isinstance(val, float) and numpy.isnan(val):
            return ""
        return str(val)

    columns, indices, col_values = _summary_report_indices(
        df, add_cols=add_cols, add_index=add_index)
    try:
        piv = pandas.pivot_table(df, values=col_values,
                                 index=indices, columns=columns,
                                 aggfunc=aggfunc).reset_index(drop=False)
    except (KeyError, TypeError) as e:  # pragma: no cover
        raise RuntimeError(
            "Issue with keys={}, values={}\namong {}.".format(
                indices, col_values, df.columns)) from e

    # The pivot produces MultiIndex columns ('available', <opset>);
    # rebuild flat names 'opset<N>' from the numeric second level.
    cols = list(piv.columns)
    opsets = [c[1] for c in cols if isinstance(c[1], (int, float))]

    versions = ["opset%d" % i for i in opsets]
    last = piv.columns[-1]
    # column ('available', '?') gathers rows with an unknown opset,
    # i.e. models whose conversion failed
    if isinstance(last, tuple) and last == ('available', '?'):
        versions.append('FAIL')
    nbvalid = len(indices + versions)
    if len(piv.columns) != nbvalid:
        raise RuntimeError(  # pragma: no cover
            "Mismatch between {} != {}\n{}\n{}\n---\n{}\n{}\n{}".format(
                len(piv.columns), len(indices + versions),
                piv.columns, indices + versions,
                df.columns, indices, col_values))
    piv.columns = indices + versions
    # most recent opset columns first
    piv = piv[indices + list(reversed(versions))].copy()
    for c in versions:
        piv[c].fillna('-', inplace=True)

    if "available-ERROR" in df.columns:

        from skl2onnx.common.exceptions import MissingShapeCalculator  # delayed

        def replace_msg(text):
            # maps known error messages to short labels
            if isinstance(text, MissingShapeCalculator):
                return "NO CONVERTER"  # pragma: no cover
            if str(text).startswith("Unable to find a shape calculator for type '"):
                return "NO CONVERTER"
            if str(text).startswith("Unable to find problem for model '"):
                return "NO PROBLEM"  # pragma: no cover
            if "not implemented for float64" in str(text):
                return "NO RUNTIME 64"  # pragma: no cover
            return str(text)

        piv2 = pandas.pivot_table(
            df, values="available-ERROR", index=indices,
            columns='opset', aggfunc=aggfunc).reset_index(drop=False)

        # last column corresponds to the highest opset
        col = piv2.iloc[:, piv2.shape[1] - 1]
        piv["ERROR-msg"] = col.apply(replace_msg)

    if any('time-ratio-' in c for c in df.columns):
        cols = [c for c in df.columns if c.startswith('time-ratio')]
        cols.sort()

        # average the time ratios per index and attach them to the pivot
        df_sub = df[indices + cols]
        piv2 = df_sub.groupby(indices).mean()
        piv = piv.merge(piv2, on=indices, how='left')

        def rep(c):
            # 'N=1' columns (but not 'N=10') show the ONNX/sklearn ratio
            if 'N=1' in c and 'N=10' not in c:
                return c.replace("time-ratio-", "RT/SKL-")
            else:
                return c.replace("time-ratio-", "")
        cols = [rep(c) for c in piv.columns]
        piv.columns = cols

        # min, max: move the '-min'/'-max' columns (interleaved) to the end
        mins = [c for c in piv.columns if c.endswith('-min')]
        maxs = [c for c in piv.columns if c.endswith('-max')]
        combined = []
        for mi, ma in zip(mins, maxs):
            combined.append(mi)
            combined.append(ma)
        first = [c for c in piv.columns if c not in combined]
        piv = piv[first + combined]

    def clean_values(value):
        # shortens status strings displayed in the opset columns
        if not isinstance(value, str):
            return value  # pragma: no cover
        if "ERROR->=1000000" in value:
            value = "big-diff"
        elif "ERROR" in value:
            value = value.replace("ERROR-_", "")
            value = value.replace("_exc", "")
            value = "ERR: " + value
        elif "OK-" in value:
            value = value.replace("OK-", "OK ")
        elif "e<" in value:
            value = value.replace("-", " ")
        return value

    for c in piv.columns:
        if "opset" in c:
            piv[c] = piv[c].apply(clean_values)
        if 'optim' in c:
            piv[c] = piv[c].apply(_clean_values_optim)

    # adding versions
    def keep_values(x):
        # drops NaN before checking a version column is constant
        if isinstance(x, float) and numpy.isnan(x):
            return False  # pragma: no cover
        return True

    col_versions = [c for c in df.columns if c.startswith("v_")]
    if len(col_versions) > 0:
        for c in col_versions:
            vals = set(filter(keep_values, df[c]))
            if len(vals) != 1:
                raise RuntimeError(  # pragma: no cover
                    f"Columns '{c}' has multiple values {vals}.")
            piv[c] = list(vals)[0]

    return piv

273 

274 

def merge_benchmark(dfs, column='runtime', baseline=None, suffix='-base'):
    """
    Merges several benchmarks run with command line
    :ref:`validate_runtime <l-cmd-validate_runtime>`.

    @param dfs dictionary *{'prefix': dataframe}*
    @param column every value from this column is prefixed
        by the given key in *dfs*
    @param baseline add baseline
    @param suffix suffix to add when comparing to the baseline
    @return merged dataframe
    """
    def add_prefix(prefix, v):
        # prefixes string values only, anything else is left unchanged
        if isinstance(v, str):
            return prefix + v
        return v  # pragma: no cover

    # concatenate all dataframes, prefixing *column* with the dict key
    conc = []
    for k, df in dfs.items():
        if column not in df.columns:
            raise ValueError(
                f"Unable to find column '{column}' in {df.columns} (key='{k}')")
        df = df.copy()
        df[column] = df[column].apply(lambda x: add_prefix(k, x))
        if 'inst' in df.columns:
            df['inst'] = df['inst'].fillna('')
        else:
            df['inst'] = ''
        conc.append(df)
    merged = pandas.concat(conc).reset_index(drop=True)
    if baseline is not None:
        def get_key(index):
            # builds a hashable key from an index tuple, skipping NaN
            k = []
            for v in index:
                try:
                    if numpy.isnan(v):
                        continue  # pragma: no cover
                except (ValueError, TypeError):
                    pass
                k.append(v)
            return tuple(k)

        # NOTE(review): _summary_report_indices mutates *merged* in
        # place (fills NaN); presumably intended here, confirm.
        columns, indices, _ = _summary_report_indices(merged)
        indices = list(_ for _ in (indices + columns) if _ != 'runtime')
        try:
            # baseline rows indexed by everything but the runtime;
            # verify_integrity raises if the key is not unique
            bdata = merged[merged.runtime == baseline].drop(
                'runtime', axis=1).set_index(indices, verify_integrity=True)
        except ValueError as e:
            # build a diagnostic showing which keys are duplicated
            bdata2 = merged[indices + ['runtime']].copy()
            bdata2['count'] = 1
            n_rows = bdata2['count'].sum()
            gr = bdata2.groupby(indices + ['runtime'], as_index=False).sum(
                ).sort_values('count', ascending=False)
            n_rows2 = gr['count'].sum()
            one = gr.head()[:1]
            rows = merged.merge(one, on=indices + ['runtime'])[:2]
            for c in ['init-types', 'bench-skl', 'bench-batch', 'init_types', 'cl']:
                if c in rows.columns:
                    rows = rows.drop(c, axis=1)
            srows = rows.T.to_string(min_rows=100)
            raise ValueError(
                "(n_rows={}, n_rows2={}) Unable to group by {}.\n{}\n-------\n{}".format(
                    n_rows, n_rows2, indices, gr.T, srows)) from e
        if bdata.shape[0] == 0:
            raise RuntimeError(  # pragma: no cover
                f"No result for baseline '{baseline}'.")
        ratios = [c for c in merged.columns if c.startswith('time-ratio-')]
        # map baseline key -> its time-ratio values
        indexed = {}
        for index in bdata.index:
            row = bdata.loc[index, :]
            key = get_key(index)
            indexed[key] = row[ratios]

        # divide every row's ratios by the matching baseline row,
        # stored in new '<ratio><suffix>' columns
        for i in range(merged.shape[0]):
            key = get_key(tuple(merged.loc[i, indices]))
            if key not in indexed:
                continue  # pragma: no cover
            value = indexed[key]
            for r in ratios:
                if r.endswith('-min') or r.endswith('-max'):
                    continue
                value2 = merged.loc[i, r]
                new_r = value2 / value[r]
                new_col = r + suffix
                if new_col not in merged.columns:
                    merged[new_col] = numpy.nan
                merged.loc[i, new_col] = new_r

    return merged