Coverage for mlprodict/onnxrt/validate/validate_helper.py: 97%
224 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-04 02:28 +0100
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-04 02:28 +0100
1"""
2@file
3@brief Validates runtime for many :epkg:`scikit-learn` operators.
4The submodule relies on :epkg:`onnxconverter_common`,
5:epkg:`sklearn-onnx`.
6"""
7import math
8import copy
9import os
10import warnings
11from importlib import import_module
12import pickle
13from time import perf_counter
14import numpy
15from cpyquickhelper.numbers import measure_time as _c_measure_time
16from sklearn.base import BaseEstimator
17from sklearn.linear_model._base import LinearModel
18from sklearn.model_selection import train_test_split
19from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version
20from .validate_problems import _problems
class RuntimeBadResultsError(RuntimeError):
    """
    Exception signalling that the results are too different
    from the ones :epkg:`scikit-learn` produces.
    """

    def __init__(self, msg, obs):
        """
        :param msg: message to display
        :param obs: observations, stored in attribute *obs*
        """
        super().__init__(msg)
        self.obs = obs
def _dictionary2str(di):
    """
    Converts a dictionary into a string ``key=value`` where pairs
    are sorted by key and separated by ``/``.

    :param di: dictionary
    :return: string
    """
    return '/'.join(f'{key}={di[key]}' for key in sorted(di))
def modules_list():
    """
    Lists a fixed set of modules with their versions.

    .. runpython::
        :showcode:
        :rst:
        :warningout: DeprecationWarning

        from mlprodict.onnxrt.validate.validate_helper import modules_list
        from pyquickhelper.pandashelper import df2rst
        from pandas import DataFrame
        print(df2rst(DataFrame(modules_list())))

    :return: list of dictionaries sorted by module name, every dictionary
        has a key *name* and a key *version* if the module defines
        ``__version__``, modules which cannot be imported are skipped
    """
    candidates = ['pandas', 'numpy', 'sklearn', 'mlprodict',
                  'skl2onnx', 'onnxmltools', 'onnx', 'onnxruntime',
                  'scipy']
    rows = []
    for name in sorted(candidates):
        try:
            mod = import_module(name)
        except ImportError:  # pragma: no cover
            # the module is not installed, it is left out of the list
            continue
        row = dict(name=name)
        if hasattr(mod, '__version__'):
            row['version'] = mod.__version__
        rows.append(row)
    return rows
def _dispsimple(arr, fLOG):
    """
    Logs a short text representation of *arr* through *fLOG*.

    :param arr: a tuple or a list (every item is displayed recursively
        after a header ``output <index>``), an object with an attribute
        *shape* (displayed with :func:`numpy.array2string`) or anything
        else (converted with ``str`` and truncated after 50 characters)
    :param fLOG: logging function, it receives one string per call
    """
    if isinstance(arr, (tuple, list)):
        for i, a in enumerate(arr):
            fLOG("output %d" % i)
            _dispsimple(a, fLOG)
    elif hasattr(arr, 'shape'):
        shape = arr.shape
        if len(shape) < 2 or shape[1] == 0:
            # 0-d arrays have no second dimension to look at and an array
            # without any column would trigger a division by zero below.
            threshold = 8
        else:
            # The threshold is a multiple of the number of columns,
            # at most 8 rows and never more than 50 elements.
            threshold = min(
                50, min(50 // shape[1], 8) * shape[1])
        fLOG(numpy.array2string(arr, max_line_width=120,
                                suppress_small=True,
                                threshold=threshold))
    else:  # pragma: no cover
        s = str(arr)
        if len(s) > 50:
            s = s[:50] + "..."
        fLOG(s)
def _merge_options(all_conv_options, aoptions):
    """
    Merges two sets of options recursively, values in *aoptions*
    take precedence over the values in *all_conv_options*.

    :param all_conv_options: base options
    :param aoptions: overriding options, None means no override
    :return: a deep copy of the merged options, inputs are not modified

    When both arguments are dictionaries, they are merged key by key.
    In any other case, the result is a copy of *aoptions* unless it is
    None, then the result is a copy of *all_conv_options*.
    """
    if aoptions is None:
        return copy.deepcopy(all_conv_options)
    if not isinstance(aoptions, dict):
        return copy.deepcopy(aoptions)  # pragma: no cover
    if not isinstance(all_conv_options, dict):
        # Nothing to merge key by key (base is None or a plain value),
        # the overriding dictionary simply replaces it.
        return copy.deepcopy(aoptions)
    merged = {}
    for k, v in all_conv_options.items():
        if k in aoptions:
            merged[k] = _merge_options(v, aoptions[k])
        else:
            merged[k] = copy.deepcopy(v)
    for k, v in aoptions.items():
        if k not in all_conv_options:
            merged[k] = copy.deepcopy(v)
    return merged
def sklearn_operators(subfolder=None, extended=False,
                      experimental=True):
    """
    Builds the list of operators from :epkg:`scikit-learn`.
    The function goes through the list of submodules
    and gets the list of classes which inherit from
    :epkg:`scikit-learn:base:BaseEstimator`.

    :param subfolder: look into only one subfolder
    :param extended: extends the list to the list of operators
        this package implements a converter for
    :param experimental: includes experimental module from
        :epkg:`scikit-learn` (see `sklearn.experimental
        <https://github.com/scikit-learn/scikit-learn/
        tree/master/sklearn/experimental>`_)
    :return: the list of found operators, every operator is described
        by a dictionary with keys *name*, *subfolder*, *cl*, *package*,
        operators added because of *extended* also have a key *sub*
        holding the same value as *subfolder*

    Classes exposing ``fit_predict`` without ``predict``,
    ``fit_transform`` without ``transform`` or none of ``transform``,
    ``predict``, ``decision_function`` are removed from the list.
    """
    if experimental:
        # importing these modules is what matters, the names are not used
        from sklearn.experimental import (  # pylint: disable=W0611
            enable_hist_gradient_boosting,
            enable_iterative_imputer)

    subfolders = sklearn__all__ + ['mlprodict.onnx_conv']
    found = []
    for subm in sorted(subfolders):
        if isinstance(subm, list):
            continue  # pragma: no cover
        if subfolder is not None and subm != subfolder:
            continue

        if subm == 'feature_extraction':
            subs = [subm, 'feature_extraction.text']
        else:
            subs = [subm]

        for sub in subs:
            # a dotted name is already a full module name except
            # for 'feature_extraction.text' which belongs to sklearn
            if '.' in sub and sub not in {'feature_extraction.text'}:
                name_sub = sub
            else:
                name_sub = f"sklearn.{sub}"
            try:
                mod = import_module(name_sub)
            except ModuleNotFoundError:
                continue

            if hasattr(mod, "register_converters"):
                fct = getattr(mod, "register_converters")
                cls = fct()
            else:
                cls = getattr(mod, "__all__", None)
                if cls is None:
                    cls = list(mod.__dict__)
                cls = [mod.__dict__[cl] for cl in cls]

            for cl in cls:
                try:
                    issub = issubclass(cl, BaseEstimator)
                except TypeError:
                    # cl is not a class
                    continue
                if cl.__name__ in {'Pipeline', 'ColumnTransformer',
                                   'FeatureUnion', 'BaseEstimator',
                                   'BaseEnsemble', 'BaseDecisionTree'}:
                    continue
                if cl.__name__ in {'CustomScorerTransform'}:
                    continue
                # in these submodules, only classes whose name contains
                # 'Calibrated' are kept
                if (sub in {'calibration', 'dummy', 'manifold'} and
                        'Calibrated' not in cl.__name__):
                    continue
                if issub:
                    pack = ("sklearn" if sub in sklearn__all__
                            else cl.__module__.split('.')[0])
                    found.append(
                        dict(name=cl.__name__, subfolder=sub, cl=cl,
                             package=pack))

    if extended:
        from ...onnx_conv import register_converters
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ResourceWarning)
            models = register_converters(True)

        done = set(_['name'] for _ in found)
        for m in models:
            try:
                name = m.__module__.split('.')
            except AttributeError as e:  # pragma: no cover
                raise AttributeError(f"Unexpected value, m={m}") from e
            sub = '.'.join(name[1:])
            pack = name[0]
            if m.__name__ not in done:
                # 'subfolder' makes these entries consistent with the
                # ones built above, 'sub' is kept for existing consumers
                found.append(
                    dict(name=m.__name__, cl=m, package=pack, sub=sub,
                         subfolder=sub))

    # let's remove models which cannot predict
    all_found = found
    found = []
    for mod in all_found:
        cl = mod['cl']
        if hasattr(cl, 'fit_predict') and not hasattr(cl, 'predict'):
            continue
        if hasattr(cl, 'fit_transform') and not hasattr(cl, 'transform'):
            continue
        if (not hasattr(cl, 'transform') and
                not hasattr(cl, 'predict') and
                not hasattr(cl, 'decision_function')):
            continue
        found.append(mod)
    return found
def _measure_time(fct, repeat=1, number=1, first_run=True):
    """
    Measures the execution time of a function.

    :param fct: function to measure, called with no argument
    :param repeat: number of measures
    :param number: number of calls within one measure
    :param first_run: if True, runs the function once before measuring
    :return: last result, average time of one call, list of the
        *repeat* measured durations
    """
    if first_run:
        fct()
    last = None
    durations = []
    for _ in range(repeat):
        start = perf_counter()
        for _ in range(number):
            last = fct()
        durations.append(perf_counter() - start)
    total_calls = repeat * number
    if total_calls == 1:
        average = durations[0]
    else:
        average = sum(durations) / total_calls  # pragma: no cover
    return last, average, durations
def _shape_exc(obj):
    """
    Returns a description of the size of *obj*: its attribute *shape*
    if it has one, a string ``[{n}]`` with its length for a list,
    a dictionary or a tuple, None otherwise.
    """
    if hasattr(obj, 'shape'):
        return obj.shape
    return (f"[{{{len(obj)}}}]"
            if isinstance(obj, (list, dict, tuple)) else None)
def dump_into_folder(dump_folder, obs_op=None, is_error=True,
                     **kwargs):
    """
    Dumps information when an error was detected
    using :epkg:`*py:pickle`.

    :param dump_folder: folder where the file is written
    :param obs_op: dictionary of information, keys *runtime*, *name*,
        *scenario*, *problem* are mandatory, keys *optim*, *opset*,
        *n_features* are optional, they are all used to build
        the filename
    :param is_error: is it an error or not? (changes the filename)
    :param kwargs: additional parameters, pickled along *obs_op*
    :return: name of the written file
    :raises ValueError: if *dump_folder* or *obs_op* is None

    Entries of *obs_op* whose key starts with ``'lambda'`` are removed
    from the pickled copy, *obs_op* itself is not modified.
    """
    if dump_folder is None:
        raise ValueError("dump_folder cannot be None.")
    if obs_op is None:
        raise ValueError("obs_op cannot be None.")
    optim = str(obs_op.get('optim', ''))
    # class prefixes first, then every character not wanted in a filename
    optim = optim.replace("<class 'sklearn.", "").replace("<class '", "")
    optim = optim.translate(str.maketrans('', '', " >={}:'/\\"))
    parts = (obs_op['runtime'], obs_op['name'], obs_op['scenario'],
             obs_op['problem'], optim,
             "op" + str(obs_op.get('opset', '-')),
             "nf" + str(obs_op.get('n_features', '-')))
    name = f"dump-{'ERROR' if is_error else 'i'}-{'-'.join(map(str, parts))}.pkl"
    name = os.path.join(dump_folder, name)
    # works on a copy to leave the caller's dictionary untouched
    obs_op = obs_op.copy()
    for key in [k for k in obs_op if k.startswith('lambda')]:
        del obs_op[key]
    kwargs['obs_op'] = obs_op
    with open(name, "wb") as f:
        pickle.dump(kwargs, f)
    return name
def default_time_kwargs():
    """
    Returns default values *number* and *repeat* to measure
    the execution of a function.

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning

        from mlprodict.onnxrt.validate.validate_helper import default_time_kwargs
        import pprint
        pprint.pprint(default_time_kwargs())

    Keys define the number of rows,
    values define *number* and *repeat*.
    """
    # (number of rows, number, repeat)
    settings = [
        (1, 15, 20),
        (10, 10, 20),
        (100, 4, 10),
        (1000, 4, 4),
        (10000, 2, 2),
    ]
    return {n_rows: {'number': number, 'repeat': repeat}
            for n_rows, number, repeat in settings}
def measure_time(stmt, x, repeat=10, number=50, div_by_number=False,
                 first_run=True, max_time=None):
    """
    Measures a statement and returns the results as a dictionary.

    :param stmt: function to measure, called as ``stmt(x)``
    :param x: matrix
    :param repeat: average over *repeat* experiment
    :param number: number of executions in one row
    :param div_by_number: divide by the number of executions
    :param first_run: if True, runs the function once before measuring
    :param max_time: execute the statement until the total goes
        beyond this time (approximately), *repeat* is ignored,
        *div_by_number* must be set to True
    :return: dictionary
    :raises ValueError: if *x* is None
    :raises RuntimeError: if the first run raises a RuntimeError,
        the message holds the type of *x* and its dtype if it has one

    See `Timer.repeat <https://docs.python.org/3/library/timeit.html?timeit.Timer.repeat>`_
    for a better understanding of parameter *repeat* and *number*.
    The function returns a duration corresponding to
    *number* times the execution of the main statement.
    """
    if x is None:
        raise ValueError("x cannot be None")  # pragma: no cover

    def fct():
        stmt(x)

    if first_run:
        try:
            fct()
        except RuntimeError as e:  # pragma: no cover
            # x may not have a dtype (list, dict, ...), failing on that
            # attribute would hide the error being reported
            raise RuntimeError(
                f"{type(x)}-{getattr(x, 'dtype', None)}") from e

    return _c_measure_time(fct, context={}, repeat=repeat, number=number,
                           div_by_number=div_by_number, max_time=max_time)
def _multiply_time_kwargs(time_kwargs, time_kwargs_fact, inst):
    """
    Multiplies values in *time_kwargs* following strategy
    *time_kwargs_fact* for a given model *inst*.

    :param time_kwargs: see below
    :param time_kwargs_fact: see below
    :param inst: :epkg:`scikit-learn` model
    :return: new *time_kwargs*, the input itself if nothing changes

    Possible values for *time_kwargs_fact*:

    - None or an empty string: *time_kwargs* is returned unchanged
    - an integer (or anything ``int`` converts): multiplies *number*
      by this number
    - `'lin'`: multiplies value *number* for linear models depending
      on the number of rows to process (:math:`\\propto 1/\\log_{10}(n)`),
      other models get *time_kwargs* unchanged

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning

        from pprint import pprint
        from sklearn.linear_model import LinearRegression
        from mlprodict.onnxrt.validate.validate_helper import (
            default_time_kwargs, _multiply_time_kwargs)

        lr = LinearRegression()
        kw = default_time_kwargs()
        pprint(kw)

        kw2 = _multiply_time_kwargs(kw, 'lin', lr)
        pprint(kw2)
    """
    if time_kwargs is None:
        raise ValueError("time_kwargs cannot be None.")  # pragma: no cover
    if time_kwargs_fact in ('', None):
        return time_kwargs
    try:
        time_kwargs_fact = int(time_kwargs_fact)
    except (TypeError, ValueError):
        # not a number, the value is interpreted as a strategy name below
        pass

    if isinstance(time_kwargs_fact, int):
        scaled = copy.deepcopy(time_kwargs)
        for settings in scaled.values():
            settings['number'] *= time_kwargs_fact
        return scaled

    if time_kwargs_fact != 'lin':
        raise ValueError(  # pragma: no cover
            f"Unable to interpret time_kwargs_fact='{time_kwargs_fact}'.")
    if not isinstance(inst, LinearModel):
        return time_kwargs

    scaled = copy.deepcopy(time_kwargs)
    for n_rows, settings in scaled.items():
        # integer part of log10(number of rows), at least 1
        magnitude = max(int(math.log(n_rows) / math.log(10) + 1e-5), 1)
        factor = max(int(10 / magnitude + 0.5), 1)
        settings['number'] *= factor
        # multiplying by 1 leaves *repeat* unchanged
        settings['repeat'] *= 1
    return scaled
def _get_problem_data(prob, n_features):
    """
    Builds the data associated to problem *prob* and splits it
    into a train and a test part.

    :param prob: problem name, key of ``_problems``
    :param n_features: number of features given to the function
        creating the data, None to skip the verification of
        the number of columns
    :return: tuple *(X_train, X_test, y_train, y_test, Xort_test,
        init_types, conv_options, method_name, output_index,
        dofit, predict_kwargs)*
    :raises RuntimeError: if the problem does not return 6 or 7 items
        or if the number of features is not the expected one
    """
    data_problem = _problems[prob](n_features=n_features)
    n_items = len(data_problem)
    if n_items == 6:
        X_, y_, init_types, method, output_index, Xort_ = data_problem
        dofit = True
    elif n_items == 7:
        (X_, y_, init_types, method,
         output_index, Xort_, dofit) = data_problem
    else:
        raise RuntimeError(  # pragma: no cover
            f"Unable to interpret problem '{prob}'.")

    if (len(X_.shape) == 2 and X_.shape[1] != n_features and
            n_features is not None):
        raise RuntimeError(  # pragma: no cover
            "Problem '{}' with n_features={} returned {} features"
            "(func={}).".format(prob, n_features, X_.shape[1],
                                _problems[prob]))

    if y_ is None:
        # no target, only the features are split
        (X_train, X_test, Xort_train,  # pylint: disable=W0612
         Xort_test) = train_test_split(
            X_, Xort_, random_state=42)
        y_train = y_test = None
    else:
        (X_train, X_test, y_train, y_test,  # pylint: disable=W0612
         Xort_train, Xort_test) = train_test_split(
            X_, y_, Xort_, random_state=42)

    # init_types may come along with converting options
    conv_options = None
    if isinstance(init_types, tuple):
        init_types, conv_options = init_types

    # method may come along with parameters for the prediction function
    if isinstance(method, tuple):
        method_name, predict_kwargs = method
    else:
        method_name, predict_kwargs = method, {}

    return (X_train, X_test, y_train,
            y_test, Xort_test,
            init_types, conv_options, method_name,
            output_index, dofit, predict_kwargs)