Coverage for mlprodict/onnxrt/validate/validate_helper.py: 97%

224 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-04 02:28 +0100

1""" 

2@file 

3@brief Validates runtime for many :epkg:`scikit-learn` operators. 

4The submodule relies on :epkg:`onnxconverter_common`, 

5:epkg:`sklearn-onnx`. 

6""" 

7import math 

8import copy 

9import os 

10import warnings 

11from importlib import import_module 

12import pickle 

13from time import perf_counter 

14import numpy 

15from cpyquickhelper.numbers import measure_time as _c_measure_time 

16from sklearn.base import BaseEstimator 

17from sklearn.linear_model._base import LinearModel 

18from sklearn.model_selection import train_test_split 

19from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version 

20from .validate_problems import _problems 

21 

22 

class RuntimeBadResultsError(RuntimeError):
    """
    Raised when the results are too different from
    :epkg:`scikit-learn`.

    The exception carries the observations collected so far
    in attribute *obs*.
    """

    def __init__(self, msg, obs):
        """
        :param msg: to display
        :param obs: observations
        """
        super().__init__(msg)
        self.obs = obs

    def __reduce__(self):
        """
        Supports :epkg:`*py:pickle` and :mod:`copy`.

        The default implementation rebuilds the exception from
        ``self.args`` only, which holds *msg* but not *obs*:
        the constructor would then fail because *obs* is mandatory.

        :return: class, constructor arguments, instance state
        """
        msg = self.args[0] if self.args else None
        return (type(self), (msg, self.obs), dict(self.__dict__))

36 

37 

38def _dictionary2str(di): 

39 el = [] 

40 for k in sorted(di): 

41 el.append(f'{k}={di[k]}') 

42 return '/'.join(el) 

43 

44 

def modules_list():
    """
    Returns modules and versions currently used.

    Every module which can be imported produces one dictionary
    with key ``name`` and, if the module defines ``__version__``,
    key ``version``. Modules which cannot be imported are skipped.
    Rows are sorted by module name.

    .. runpython::
        :showcode:
        :rst:
        :warningout: DeprecationWarning

        from mlprodict.onnxrt.validate.validate_helper import modules_list
        from pyquickhelper.pandashelper import df2rst
        from pandas import DataFrame
        print(df2rst(DataFrame(modules_list())))
    """
    candidates = ['pandas', 'numpy', 'sklearn', 'mlprodict',
                  'skl2onnx', 'onnxmltools', 'onnx', 'onnxruntime',
                  'scipy']

    def describe(name):
        # None tells the caller the module is not available.
        try:
            module = import_module(name)
        except ImportError:  # pragma: no cover
            return None
        info = dict(name=name)
        if hasattr(module, '__version__'):
            info['version'] = module.__version__
        return info

    described = (describe(name) for name in sorted(candidates))
    return [info for info in described if info is not None]

75 

76 

77def _dispsimple(arr, fLOG): 

78 if isinstance(arr, (tuple, list)): 

79 for i, a in enumerate(arr): 

80 fLOG("output %d" % i) 

81 _dispsimple(a, fLOG) 

82 elif hasattr(arr, 'shape'): 

83 if len(arr.shape) == 1: 

84 threshold = 8 

85 else: 

86 threshold = min( 

87 50, min(50 // arr.shape[1], 8) * arr.shape[1]) 

88 fLOG(numpy.array2string(arr, max_line_width=120, 

89 suppress_small=True, 

90 threshold=threshold)) 

91 else: # pragma: no cover 

92 s = str(arr) 

93 if len(s) > 50: 

94 s = s[:50] + "..." 

95 fLOG(s) 

96 

97 

98def _merge_options(all_conv_options, aoptions): 

99 if aoptions is None: 

100 return copy.deepcopy(all_conv_options) 

101 if not isinstance(aoptions, dict): 

102 return copy.deepcopy(aoptions) # pragma: no cover 

103 merged = {} 

104 for k, v in all_conv_options.items(): 

105 if k in aoptions: 

106 merged[k] = _merge_options(v, aoptions[k]) 

107 else: 

108 merged[k] = copy.deepcopy(v) 

109 for k, v in aoptions.items(): 

110 if k in all_conv_options: 

111 continue 

112 merged[k] = copy.deepcopy(v) 

113 return merged 

114 

115 

def sklearn_operators(subfolder=None, extended=False,
                      experimental=True):
    """
    Builds the list of operators from :epkg:`scikit-learn`.
    The function goes through the list of submodules
    and gets the list of classes which inherit from
    :epkg:`scikit-learn:base:BaseEstimator`.

    :param subfolder: look into only one subfolder
    :param extended: extends the list to the list of operators
        this package implements a converter for
    :param experimental: includes experimental module from
        :epkg:`scikit-learn` (see `sklearn.experimental
        <https://github.com/scikit-learn/scikit-learn/
        tree/master/sklearn/experimental>`_)
    :return: the list of found operators

    Every operator is described by a dictionary with keys
    ``name``, ``subfolder``, ``cl`` (the class) and ``package``.
    Operators added because of *extended* also define key ``sub``
    (same value as ``subfolder``).
    Classes which expose neither ``transform``, ``predict``
    nor ``decision_function`` are removed from the list.
    """
    if experimental:
        # The imports are only done for their side effect.
        from sklearn.experimental import (  # pylint: disable=W0611
            enable_hist_gradient_boosting,
            enable_iterative_imputer)

    subfolders = sklearn__all__ + ['mlprodict.onnx_conv']
    found = []
    for subm in sorted(subfolders):
        if isinstance(subm, list):
            continue  # pragma: no cover
        if subfolder is not None and subm != subfolder:
            continue

        if subm == 'feature_extraction':
            subs = [subm, 'feature_extraction.text']
        else:
            subs = [subm]

        for sub in subs:
            # A name with a dot is already a full module name
            # unless it is a known submodule of scikit-learn.
            if '.' in sub and sub not in {'feature_extraction.text'}:
                name_sub = sub
            else:
                name_sub = f"sklearn.{sub}"
            try:
                mod = import_module(name_sub)
            except ModuleNotFoundError:
                continue

            if hasattr(mod, "register_converters"):
                # The module returns the candidates itself.
                fct = getattr(mod, "register_converters")
                cls = fct()
            else:
                cls = getattr(mod, "__all__", None)
                if cls is None:
                    cls = list(mod.__dict__)
                cls = [mod.__dict__[cl] for cl in cls]

            for cl in cls:
                try:
                    issub = issubclass(cl, BaseEstimator)
                except TypeError:
                    # not a class
                    continue
                if cl.__name__ in {'Pipeline', 'ColumnTransformer',
                                   'FeatureUnion', 'BaseEstimator',
                                   'BaseEnsemble', 'BaseDecisionTree'}:
                    continue
                if cl.__name__ in {'CustomScorerTransform'}:
                    continue
                if (sub in {'calibration', 'dummy', 'manifold'} and
                        'Calibrated' not in cl.__name__):
                    continue
                if issub:
                    pack = ("sklearn" if sub in sklearn__all__
                            else cl.__module__.split('.')[0])
                    found.append(
                        dict(name=cl.__name__, subfolder=sub, cl=cl,
                             package=pack))

    if extended:
        from ...onnx_conv import register_converters
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ResourceWarning)
            models = register_converters(True)

        done = set(_['name'] for _ in found)
        for m in models:
            try:
                name = m.__module__.split('.')
            except AttributeError as e:  # pragma: no cover
                raise AttributeError(f"Unexpected value, m={m}") from e
            sub = '.'.join(name[1:])
            pack = name[0]
            if m.__name__ not in done:
                # Key 'subfolder' keeps these rows consistent with the
                # rows built above, key 'sub' is kept for existing users.
                found.append(
                    dict(name=m.__name__, cl=m, package=pack, sub=sub,
                         subfolder=sub))

    # let's remove models which cannot predict
    all_found = found
    found = []
    for mod in all_found:
        cl = mod['cl']
        if hasattr(cl, 'fit_predict') and not hasattr(cl, 'predict'):
            continue
        if hasattr(cl, 'fit_transform') and not hasattr(cl, 'transform'):
            continue
        if (not hasattr(cl, 'transform') and
                not hasattr(cl, 'predict') and
                not hasattr(cl, 'decision_function')):
            continue
        found.append(mod)
    return found

223 

224 

225def _measure_time(fct, repeat=1, number=1, first_run=True): 

226 """ 

227 Measures the execution time for a function. 

228 

229 :param fct: function to measure 

230 :param repeat: number of times to repeat 

231 :param number: number of times between two measures 

232 :param first_run: if True, runs the function once before measuring 

233 :return: last result, average, values 

234 """ 

235 res = None 

236 values = [] 

237 if first_run: 

238 fct() 

239 for __ in range(repeat): 

240 begin = perf_counter() 

241 for _ in range(number): 

242 res = fct() 

243 end = perf_counter() 

244 values.append(end - begin) 

245 if repeat * number == 1: 

246 return res, values[0], values 

247 return res, sum(values) / (repeat * number), values # pragma: no cover 

248 

249 

250def _shape_exc(obj): 

251 if hasattr(obj, 'shape'): 

252 return obj.shape 

253 if isinstance(obj, (list, dict, tuple)): 

254 return "[{%d}]" % len(obj) 

255 return None 

256 

257 

def dump_into_folder(dump_folder, obs_op=None, is_error=True,
                     **kwargs):
    """
    Dumps information when an error was detected
    using :epkg:`*py:pickle`.

    :param dump_folder: folder where the file is written
    :param obs_op: dictionary of observations, it must define keys
        ``runtime``, ``name``, ``scenario``, ``problem``, keys
        ``optim``, ``opset``, ``n_features`` are optional,
        the dictionary is left unchanged
    :param is_error: is it an error or not? It only changes
        the name of the file.
    :param kwargs: additional data to dump
    :return: name of the written file
    :raises ValueError: if *dump_folder* or *obs_op* is None

    The function dumps *kwargs* with an additional key ``obs_op``
    which contains a copy of *obs_op* without the keys starting
    with ``lambda``.
    """
    if dump_folder is None:
        raise ValueError("dump_folder cannot be None.")
    if obs_op is None:
        raise ValueError("obs_op cannot be None.")
    # The optimisation is part of the filename:
    # class prefixes and punctuation are removed.
    optim = str(obs_op.get('optim', ''))
    optim = optim.replace("<class 'sklearn.", "").replace("<class '", "")
    optim = optim.translate(str.maketrans('', '', " >={}:'/\\"))
    parts = (obs_op['runtime'], obs_op['name'], obs_op['scenario'],
             obs_op['problem'], optim,
             "op" + str(obs_op.get('opset', '-')),
             "nf" + str(obs_op.get('n_features', '-')))
    name = f"dump-{'ERROR' if is_error else 'i'}-{'-'.join(map(str, parts))}.pkl"
    name = os.path.join(dump_folder, name)
    # Entries starting with 'lambda' are not dumped.
    obs_op = {k: v for k, v in obs_op.items()
              if not k.startswith('lambda')}
    kwargs.update({'obs_op': obs_op})
    with open(name, "wb") as f:
        pickle.dump(kwargs, f)
    return name

299 

300 

def default_time_kwargs():
    """
    Returns default values *number* and *repeat* to measure
    the execution of a function.

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning

        from mlprodict.onnxrt.validate.validate_helper import default_time_kwargs
        import pprint
        pprint.pprint(default_time_kwargs())

    keys define the number of rows,
    values defines *number* and *repeat*.
    A new dictionary is built for every call.
    """
    # (number of rows, number, repeat)
    settings = (
        (1, 15, 20),
        (10, 10, 20),
        (100, 4, 10),
        (1000, 4, 4),
        (10000, 2, 2),
    )
    return {rows: dict(number=number, repeat=repeat)
            for rows, number, repeat in settings}

324 

325 

def measure_time(stmt, x, repeat=10, number=50, div_by_number=False,
                 first_run=True, max_time=None):
    """
    Measures a statement and returns the results as a dictionary.

    :param stmt: function to measure, it is called as ``stmt(x)``
    :param x: matrix, the only argument given to *stmt*
    :param repeat: average over *repeat* experiment
    :param number: number of executions in one row
    :param div_by_number: divide by the number of executions
    :param first_run: if True, runs the function once before measuring
    :param max_time: execute the statement until the total goes
        beyond this time (approximately), *repeat* is ignored,
        *div_by_number* must be set to True
    :return: dictionary
    :raises ValueError: if *x* is None
    :raises RuntimeError: if the first run raises a RuntimeError,
        the message gives the type of *x* and its dtype if it has one

    See `Timer.repeat <https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat>`_
    for a better understanding of parameter *repeat* and *number*.
    The function returns a duration corresponding to
    *number* times the execution of the main statement.
    Parameters *repeat*, *number*, *div_by_number*, *max_time*
    are forwarded to function *measure_time* from
    :epkg:`cpyquickhelper`.
    """
    if x is None:
        raise ValueError("x cannot be None")  # pragma: no cover

    def fct():
        stmt(x)

    if first_run:
        try:
            fct()
        except RuntimeError as e:  # pragma: no cover
            # x is not always an array: a missing dtype
            # must not hide the original error.
            raise RuntimeError(
                f"{type(x)}-{getattr(x, 'dtype', None)}") from e

    return _c_measure_time(fct, context={}, repeat=repeat, number=number,
                           div_by_number=div_by_number, max_time=max_time)

361 

362 

363def _multiply_time_kwargs(time_kwargs, time_kwargs_fact, inst): 

364 """ 

365 Multiplies values in *time_kwargs* following strategy 

366 *time_kwargs_fact* for a given model *inst*. 

367 

368 :param time_kwargs: see below 

369 :param time_kwargs_fact: see below 

370 :param inst: :epkg:`scikit-learn` model 

371 :return: new *time_kwargs* 

372 

373 Possible values for *time_kwargs_fact*: 

374 

375 - a integer: multiplies *number* by this number 

376 - `'lin'`: multiplies value *number* for linear models depending 

377 on the number of rows to process (:math:`\\propto 1/\\log_{10}(n)`) 

378 

379 .. runpython:: 

380 :showcode: 

381 :warningout: DeprecationWarning 

382 

383 from pprint import pprint 

384 from sklearn.linear_model import LinearRegression 

385 from mlprodict.onnxrt.validate.validate_helper import ( 

386 default_time_kwargs, _multiply_time_kwargs) 

387 

388 lr = LinearRegression() 

389 kw = default_time_kwargs() 

390 pprint(kw) 

391 

392 kw2 = _multiply_time_kwargs(kw, 'lin', lr) 

393 pprint(kw2) 

394 """ 

395 if time_kwargs is None: 

396 raise ValueError("time_kwargs cannot be None.") # pragma: no cover 

397 if time_kwargs_fact in ('', None): 

398 return time_kwargs 

399 try: 

400 vi = int(time_kwargs_fact) 

401 time_kwargs_fact = vi 

402 except (TypeError, ValueError): 

403 pass 

404 if isinstance(time_kwargs_fact, int): 

405 time_kwargs_modified = copy.deepcopy(time_kwargs) 

406 for k in time_kwargs_modified: 

407 time_kwargs_modified[k]['number'] *= time_kwargs_fact 

408 return time_kwargs_modified 

409 if time_kwargs_fact == 'lin': 

410 if isinstance(inst, LinearModel): 

411 time_kwargs_modified = copy.deepcopy(time_kwargs) 

412 for k in time_kwargs_modified: 

413 kl = max(int(math.log(k) / math.log(10) + 1e-5), 1) 

414 f = max(int(10 / kl + 0.5), 1) 

415 time_kwargs_modified[k]['number'] *= f 

416 time_kwargs_modified[k]['repeat'] *= 1 

417 return time_kwargs_modified 

418 return time_kwargs 

419 raise ValueError( # pragma: no cover 

420 f"Unable to interpret time_kwargs_fact='{time_kwargs_fact}'.") 

421 

422 

def _get_problem_data(prob, n_features):
    """
    Builds the data associated to problem *prob*
    and splits it into a train and a test set.

    :param prob: problem name, a key of ``_problems``
    :param n_features: number of features, given to the function
        which builds the problem
    :return: tuple *(X_train, X_test, y_train, y_test, Xort_test,
        init_types, conv_options, method_name, output_index,
        dofit, predict_kwargs)*

    The function registered in ``_problems`` returns 6 or 7 values,
    the seventh one is *dofit* which is True if missing.
    *init_types* may be a tuple *(init_types, conv_options)*,
    *method* may be a tuple *(method_name, predict_kwargs)*.
    """
    generated = _problems[prob](n_features=n_features)
    n_items = len(generated)
    if n_items == 6:
        X_, y_, init_types, method, output_index, Xort_ = generated
        dofit = True
    elif n_items == 7:
        (X_, y_, init_types, method,
         output_index, Xort_, dofit) = generated
    else:
        raise RuntimeError(  # pragma: no cover
            f"Unable to interpret problem '{prob}'.")

    if (len(X_.shape) == 2 and X_.shape[1] != n_features and
            n_features is not None):
        raise RuntimeError(  # pragma: no cover
            f"Problem '{prob}' with n_features={n_features} "
            f"returned {X_.shape[1]} features"
            f"(func={_problems[prob]}).")

    # The same random state guarantees the same split for every call.
    if y_ is None:
        y_train, y_test = None, None
        split = train_test_split(X_, Xort_, random_state=42)
        (X_train, X_test,
         Xort_train, Xort_test) = split  # pylint: disable=W0612
    else:
        split = train_test_split(X_, y_, Xort_, random_state=42)
        (X_train, X_test, y_train, y_test,
         Xort_train, Xort_test) = split  # pylint: disable=W0612

    conv_options = None
    if isinstance(init_types, tuple):
        init_types, conv_options = init_types

    if isinstance(method, tuple):
        method_name, predict_kwargs = method
    else:
        method_name, predict_kwargs = method, {}

    return (X_train, X_test, y_train,
            y_test, Xort_test,
            init_types, conv_options, method_name,
            output_index, dofit, predict_kwargs)