Coverage for mlprodict/onnxrt/validate/validate

1"""

2@file

3@brief Validates runtime for many :epkg:`scikit-learn` operators.

4The submodule relies on :epkg:`onnxconverter_common`,

5:epkg:`sklearn-onnx`.

6"""

7import math

8import copy

9import os

10import warnings

11from importlib import import_module

12import pickle

13from time import perf_counter

14import numpy

15from cpyquickhelper.numbers import measure_time as _c_measure_time

16from sklearn.base import BaseEstimator

17from sklearn.linear_model._base import LinearModel

18from sklearn.model_selection import train_test_split

19from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version

20from .validate_problems import _problems

23class RuntimeBadResultsError(RuntimeError):

24 """

25 Raised when the results are too different from

26 :epkg:`scikit-learn`.

27 """

29 def __init__(self, msg, obs):

30 """

31 :param msg: to display

32 :param obs: observations

33 """

34 RuntimeError.__init__(self, msg)

35 self.obs = obs

38def _dictionary2str(di):

39 el = []

40 for k in sorted(di):

41 el.append(f'{k}={di[k]}')

42 return '/'.join(el)

45def modules_list():

46 """

47 Returns modules and versions currently used.

49 .. runpython::

50 :showcode:

51 :rst:

52 :warningout: DeprecationWarning

54 from mlprodict.onnxrt.validate.validate_helper import modules_list

55 from pyquickhelper.pandashelper import df2rst

56 from pandas import DataFrame

57 print(df2rst(DataFrame(modules_list())))

58 """

59 def try_import(name):

60 try:

61 mod = import_module(name)

62 except ImportError: # pragma: no cover

63 return None

64 return (dict(name=name, version=mod.__version__)

65 if hasattr(mod, '__version__') else dict(name=name))

67 rows = []

68 for name in sorted(['pandas', 'numpy', 'sklearn', 'mlprodict',

69 'skl2onnx', 'onnxmltools', 'onnx', 'onnxruntime',

70 'scipy']):

71 res = try_import(name)

72 if res is not None:

73 rows.append(res)

74 return rows

77def _dispsimple(arr, fLOG):

78 if isinstance(arr, (tuple, list)):

79 for i, a in enumerate(arr):

80 fLOG("output %d" % i)

81 _dispsimple(a, fLOG)

82 elif hasattr(arr, 'shape'):

83 if len(arr.shape) == 1:

84 threshold = 8

85 else:

86 threshold = min(

87 50, min(50 // arr.shape[1], 8) * arr.shape[1])

88 fLOG(numpy.array2string(arr, max_line_width=120,

89 suppress_small=True,

90 threshold=threshold))

91 else: # pragma: no cover

92 s = str(arr)

93 if len(s) > 50:

94 s = s[:50] + "..."

95 fLOG(s)

98def _merge_options(all_conv_options, aoptions):

99 if aoptions is None:

100 return copy.deepcopy(all_conv_options)

101 if not isinstance(aoptions, dict):

102 return copy.deepcopy(aoptions) # pragma: no cover

103 merged = {}

104 for k, v in all_conv_options.items():

105 if k in aoptions:

106 merged[k] = _merge_options(v, aoptions[k])

107 else:

108 merged[k] = copy.deepcopy(v)

109 for k, v in aoptions.items():

110 if k in all_conv_options:

111 continue

112 merged[k] = copy.deepcopy(v)

113 return merged

114

115

116def sklearn_operators(subfolder=None, extended=False,

117 experimental=True):

118 """

119 Builds the list of operators from :epkg:`scikit-learn`.

120 The function goes through the list of submodule

121 and get the list of class which inherit from

122 :epkg:`scikit-learn:base:BaseEstimator`.

123

124 :param subfolder: look into only one subfolder

125 :param extended: extends the list to the list of operators

126 this package implements a converter for

127 :param experimental: includes experimental module from

128 :epkg:`scikit-learn` (see `sklearn.experimental

129 <https://github.com/scikit-learn/scikit-learn/

130 tree/master/sklearn/experimental>`_)

131 :return: the list of found operators

132 """

133 if experimental:

134 from sklearn.experimental import ( # pylint: disable=W0611

135 enable_hist_gradient_boosting,

136 enable_iterative_imputer)

137

138 subfolders = sklearn__all__ + ['mlprodict.onnx_conv']

139 found = []

140 for subm in sorted(subfolders):

141 if isinstance(subm, list):

142 continue # pragma: no cover

143 if subfolder is not None and subm != subfolder:

144 continue

145

146 if subm == 'feature_extraction':

147 subs = [subm, 'feature_extraction.text']

148 else:

149 subs = [subm]

150

151 for sub in subs:

152 if '.' in sub and sub not in {'feature_extraction.text'}:

153 name_sub = sub

154 else:

155 name_sub = f"sklearn.{sub}"

156 try:

157 mod = import_module(name_sub)

158 except ModuleNotFoundError:

159 continue

160

161 if hasattr(mod, "register_converters"):

162 fct = getattr(mod, "register_converters")

163 cls = fct()

164 else:

165 cls = getattr(mod, "__all__", None)

166 if cls is None:

167 cls = list(mod.__dict__)

168 cls = [mod.__dict__[cl] for cl in cls]

169

170 for cl in cls:

171 try:

172 issub = issubclass(cl, BaseEstimator)

173 except TypeError:

174 continue

175 if cl.__name__ in {'Pipeline', 'ColumnTransformer',

176 'FeatureUnion', 'BaseEstimator',

177 'BaseEnsemble', 'BaseDecisionTree'}:

178 continue

179 if cl.__name__ in {'CustomScorerTransform'}:

180 continue

181 if (sub in {'calibration', 'dummy', 'manifold'} and

182 'Calibrated' not in cl.__name__):

183 continue

184 if issub:

185 pack = "sklearn" if sub in sklearn__all__ else cl.__module__.split('.')[

186 0]

187 found.append(

188 dict(name=cl.__name__, subfolder=sub, cl=cl, package=pack))

189

190 if extended:

191 from ...onnx_conv import register_converters

192 with warnings.catch_warnings():

193 warnings.simplefilter("ignore", ResourceWarning)

194 models = register_converters(True)

195

196 done = set(_['name'] for _ in found)

197 for m in models:

198 try:

199 name = m.__module__.split('.')

200 except AttributeError as e: # pragma: no cover

201 raise AttributeError(f"Unexpected value, m={m}") from e

202 sub = '.'.join(name[1:])

203 pack = name[0]

204 if m.__name__ not in done:

205 found.append(

206 dict(name=m.__name__, cl=m, package=pack, sub=sub))

207

208 # let's remove models which cannot predict

209 all_found = found

210 found = []

211 for mod in all_found:

212 cl = mod['cl']

213 if hasattr(cl, 'fit_predict') and not hasattr(cl, 'predict'):

214 continue

215 if hasattr(cl, 'fit_transform') and not hasattr(cl, 'transform'):

216 continue

217 if (not hasattr(cl, 'transform') and

218 not hasattr(cl, 'predict') and

219 not hasattr(cl, 'decision_function')):

220 continue

221 found.append(mod)

222 return found

223

224

225def _measure_time(fct, repeat=1, number=1, first_run=True):

226 """

227 Measures the execution time for a function.

228

229 :param fct: function to measure

230 :param repeat: number of times to repeat

231 :param number: number of times between two measures

232 :param first_run: if True, runs the function once before measuring

233 :return: last result, average, values

234 """

235 res = None

236 values = []

237 if first_run:

238 fct()

239 for __ in range(repeat):

240 begin = perf_counter()

241 for _ in range(number):

242 res = fct()

243 end = perf_counter()

244 values.append(end - begin)

245 if repeat * number == 1:

246 return res, values[0], values

247 return res, sum(values) / (repeat * number), values # pragma: no cover

248

249

250def _shape_exc(obj):

251 if hasattr(obj, 'shape'):

252 return obj.shape

253 if isinstance(obj, (list, dict, tuple)):

254 return "[{%d}]" % len(obj)

255 return None

256

257

258def dump_into_folder(dump_folder, obs_op=None, is_error=True,

259 **kwargs):

260 """

261 Dumps information when an error was detected

262 using :epkg:`*py:pickle`.

263

264 :param dump_folder: dump_folder

265 :param obs_op: obs_op (information)

266 :param is_error: is it an error or not?

267 :param kwargs: additional parameters

268 :return: name

269 """

270 if dump_folder is None:

271 raise ValueError("dump_folder cannot be None.")

272 optim = obs_op.get('optim', '')

273 optim = str(optim)

274 optim = optim.replace("<class 'sklearn.", "")

275 optim = optim.replace("<class '", "")

276 optim = optim.replace(" ", "")

277 optim = optim.replace(">", "")

278 optim = optim.replace("=", "")

279 optim = optim.replace("{", "")

280 optim = optim.replace("}", "")

281 optim = optim.replace(":", "")

282 optim = optim.replace("'", "")

283 optim = optim.replace("/", "")

284 optim = optim.replace("\\", "")

285 parts = (obs_op['runtime'], obs_op['name'], obs_op['scenario'],

286 obs_op['problem'], optim,

287 "op" + str(obs_op.get('opset', '-')),

288 "nf" + str(obs_op.get('n_features', '-')))

289 name = f"dump-{'ERROR' if is_error else 'i'}-{'-'.join(map(str, parts))}.pkl"

290 name = os.path.join(dump_folder, name)

291 obs_op = obs_op.copy()

292 fcts = [k for k in obs_op if k.startswith('lambda')]

293 for fct in fcts:

294 del obs_op[fct]

295 kwargs.update({'obs_op': obs_op})

296 with open(name, "wb") as f:

297 pickle.dump(kwargs, f)

298 return name

299

300

301def default_time_kwargs():

302 """

303 Returns default values *number* and *repeat* to measure

304 the execution of a function.

305

306 .. runpython::

307 :showcode:

308 :warningout: DeprecationWarning

309

310 from mlprodict.onnxrt.validate.validate_helper import default_time_kwargs

311 import pprint

312 pprint.pprint(default_time_kwargs())

313

314 keys define the number of rows,

315 values defines *number* and *repeat*.

316 """

317 return {

318 1: dict(number=15, repeat=20),

319 10: dict(number=10, repeat=20),

320 100: dict(number=4, repeat=10),

321 1000: dict(number=4, repeat=4),

322 10000: dict(number=2, repeat=2),

323 }

324

325

326def measure_time(stmt, x, repeat=10, number=50, div_by_number=False,

327 first_run=True, max_time=None):

328 """

329 Measures a statement and returns the results as a dictionary.

330

331 :param stmt: string

332 :param x: matrix

333 :param repeat: average over *repeat* experiment

334 :param number: number of executions in one row

335 :param div_by_number: divide by the number of executions

336 :param first_run: if True, runs the function once before measuring

337 :param max_time: execute the statement until the total goes

338 beyond this time (approximatively), *repeat* is ignored,

339 *div_by_number* must be set to True

340 :return: dictionary

341

342 See `Timer.repeat <https://docs.python.org/3/library/timeit.html?timeit.Timer.repeat>`_

343 for a better understanding of parameter *repeat* and *number*.

344 The function returns a duration corresponding to

345 *number* times the execution of the main statement.

346 """

347 if x is None:

348 raise ValueError("x cannot be None") # pragma: no cover

349

350 def fct():

351 stmt(x)

352

353 if first_run:

354 try:

355 fct()

356 except RuntimeError as e: # pragma: no cover

357 raise RuntimeError(f"{type(x)}-{x.dtype}") from e

358

359 return _c_measure_time(fct, context={}, repeat=repeat, number=number,

360 div_by_number=div_by_number, max_time=max_time)

361

362

363def _multiply_time_kwargs(time_kwargs, time_kwargs_fact, inst):

364 """

365 Multiplies values in *time_kwargs* following strategy

366 *time_kwargs_fact* for a given model *inst*.

367

368 :param time_kwargs: see below

369 :param time_kwargs_fact: see below

370 :param inst: :epkg:`scikit-learn` model

371 :return: new *time_kwargs*

372

373 Possible values for *time_kwargs_fact*:

374

375 - a integer: multiplies *number* by this number

376 - `'lin'`: multiplies value *number* for linear models depending

377 on the number of rows to process (:math:`\\propto 1/\\log_{10}(n)`)

378

379 .. runpython::

380 :showcode:

381 :warningout: DeprecationWarning

382

383 from pprint import pprint

384 from sklearn.linear_model import LinearRegression

385 from mlprodict.onnxrt.validate.validate_helper import (

386 default_time_kwargs, _multiply_time_kwargs)

387

388 lr = LinearRegression()

389 kw = default_time_kwargs()

390 pprint(kw)

391

392 kw2 = _multiply_time_kwargs(kw, 'lin', lr)

393 pprint(kw2)

394 """

395 if time_kwargs is None:

396 raise ValueError("time_kwargs cannot be None.") # pragma: no cover

397 if time_kwargs_fact in ('', None):

398 return time_kwargs

399 try:

400 vi = int(time_kwargs_fact)

401 time_kwargs_fact = vi

402 except (TypeError, ValueError):

403 pass

404 if isinstance(time_kwargs_fact, int):

405 time_kwargs_modified = copy.deepcopy(time_kwargs)

406 for k in time_kwargs_modified:

407 time_kwargs_modified[k]['number'] *= time_kwargs_fact

408 return time_kwargs_modified

409 if time_kwargs_fact == 'lin':

410 if isinstance(inst, LinearModel):

411 time_kwargs_modified = copy.deepcopy(time_kwargs)

412 for k in time_kwargs_modified:

413 kl = max(int(math.log(k) / math.log(10) + 1e-5), 1)

414 f = max(int(10 / kl + 0.5), 1)

415 time_kwargs_modified[k]['number'] *= f

416 time_kwargs_modified[k]['repeat'] *= 1

417 return time_kwargs_modified

418 return time_kwargs

419 raise ValueError( # pragma: no cover

420 f"Unable to interpret time_kwargs_fact='{time_kwargs_fact}'.")

421

422

423def _get_problem_data(prob, n_features):

424 data_problem = _problems[prob](n_features=n_features)

425 if len(data_problem) == 6:

426 X_, y_, init_types, method, output_index, Xort_ = data_problem

427 dofit = True

428 elif len(data_problem) == 7:

429 X_, y_, init_types, method, output_index, Xort_, dofit = data_problem

430 else:

431 raise RuntimeError( # pragma: no cover

432 f"Unable to interpret problem '{prob}'.")

433 if (len(X_.shape) == 2 and X_.shape[1] != n_features and

434 n_features is not None):

435 raise RuntimeError( # pragma: no cover

436 "Problem '{}' with n_features={} returned {} features"

437 "(func={}).".format(prob, n_features, X_.shape[1],

438 _problems[prob]))

439 if y_ is None:

440 (X_train, X_test, Xort_train, # pylint: disable=W0612

441 Xort_test) = train_test_split(

442 X_, Xort_, random_state=42)

443 y_train, y_test = None, None

444 else:

445 (X_train, X_test, y_train, y_test, # pylint: disable=W0612

446 Xort_train, Xort_test) = train_test_split(

447 X_, y_, Xort_, random_state=42)

448 if isinstance(init_types, tuple):

449 init_types, conv_options = init_types

450 else:

451 conv_options = None

452

453 if isinstance(method, tuple):

454 method_name, predict_kwargs = method

455 else:

456 method_name = method

457 predict_kwargs = {}

458

459 return (X_train, X_test, y_train,

460 y_test, Xort_test,

461 init_types, conv_options, method_name,

462 output_index, dofit, predict_kwargs)

Coverage for mlprodict/onnxrt/validate/validate_helper.py: 97%

224 statements