Coverage for mlprodict/asv_benchmark/common_asv_skl.py: 95%

312 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-04 02:28 +0100

1# pylint: disable=E1101 

2""" 

3Common class for all benchmarks testing 

4converted models from :epkg:`scikit-learn` 

5with :epkg:`asv`. The benchmark can be run through 

6file :epkg:`run_asv.sh` on Linux or :epkg:`run_asv.bat` on 

7Windows. 

8 

9.. warning:: 

10 On Windows, you should avoid cloning the repository 

11 on a folder with a long full name. Visual Studio tends to 

12 abide by the rule of the maximum path length even though 

13 the system is told otherwise. 

14""" 

15import os 

16from datetime import datetime 

17import pickle 

18from logging import getLogger 

19import numpy 

20from sklearn import set_config 

21from sklearn.datasets import load_iris 

22from sklearn.metrics import ( 

23 accuracy_score, mean_absolute_error, silhouette_score) 

24from sklearn.model_selection import train_test_split 

25from mlprodict import get_ir_version, __max_supported_opset__ 

26from mlprodict.onnxrt import OnnxInference 

27from mlprodict.onnx_conv import ( 

28 to_onnx, register_rewritten_operators, register_converters, 

29 register_new_operators) 

30from mlprodict.onnxrt.validate.validate_benchmark import make_n_rows 

31from mlprodict.onnxrt.validate.validate_problems import _modify_dimension 

32from mlprodict.onnx_tools.optim import onnx_statistics 

33from mlprodict.tools.asv_options_helper import ( 

34 expand_onnx_options, version2number) 

35from mlprodict.tools.model_info import set_random_state 

36 

37 

class _CommonAsvSklBenchmark:
    """
    Common tests to all benchmarks testing converted
    :epkg:`scikit-learn` models. See `benchmark attributes
    <https://asv.readthedocs.io/en/stable/benchmarks.html#general>`_.
    """

    # Part which changes.
    # params and param_names may be changed too.

    params = [
        ['skl', 'pyrtc', 'ort'],  # values for runtime
        [1, 10, 100, 10000],  # values for N
        [4, 20],  # values for nf
        [__max_supported_opset__],  # values for opset
        ["float", "double"],  # values for dtype
        [None],  # values for optim
    ]
    param_names = ['rt', 'N', 'nf', 'opset', 'dtype', 'optim']
    chk_method_name = None
    version = datetime.now().isoformat()
    pretty_source = "disabled"

    par_ydtype = numpy.int64  # dtype the expected labels are cast to
    par_dofit = True  # fits the model in setup_cache when True
    par_convopts = None  # fixed conversion options, see _to_onnx

    def _create_model(self):  # pragma: no cover
        "Creates the model to benchmark. Must be overwritten."
        raise NotImplementedError("This method must be overwritten.")

    def _create_onnx_and_runtime(self, runtime, model, X, opset, dtype, optim):  # pragma: no cover
        "Converts *model* and instantiates the runtime. Must be overwritten."
        raise NotImplementedError("This method must be overwritten.")

    def _score_metric(self, X, y_exp, y_pred):  # pragma: no cover
        "Scores the predictions. Must be overwritten."
        raise NotImplementedError("This method must be overwritten.")

    def _optimize_onnx(self, onx):
        "Hook to optimize the converted ONNX graph, identity by default."
        return onx

    def _get_xdtype(self, dtype):
        "Maps a dtype name ('float', 'double', 64, ...) to a numpy float type."
        if dtype in ('float', numpy.float32):
            return numpy.float32
        if dtype in ('double', '64', 64, numpy.float64):
            return numpy.float64
        raise ValueError(  # pragma: no cover
            f"Unknown dtype '{dtype}'.")

    def _get_dataset(self, nf, dtype):
        """
        Builds a noisy iris dataset extended to *nf* features.
        Returns ``(X_train, y_train), (X_test, y_test)`` where the test
        part is cast to the requested *dtype* and to :attr:`par_ydtype`.
        """
        xdtype = self._get_xdtype(dtype)
        data = load_iris()
        X, y = data.data, data.target
        state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
        rnd = state.randn(*X.shape) / 3
        X += rnd
        X = _modify_dimension(X, nf)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=42)
        Xt = X_test.astype(xdtype)
        yt = y_test.astype(self.par_ydtype)
        if X_train.shape[0] < X_train.shape[1]:
            raise RuntimeError(  # pragma: no cover
                "Unable to train a model with less observations than features "
                "shape=%r." % (X_train.shape, ))
        return (X_train, y_train), (Xt, yt)

    def _to_onnx(self, model, X, opset, dtype, optim):
        """
        Converts *model* into ONNX. Conversion options come either from
        :attr:`par_convopts` (when *optim* is empty) or from *optim*;
        both being set at the same time is not supported.
        """
        if optim is None or len(optim) == 0:
            options = self.par_convopts
        elif self.par_convopts and len(self.par_convopts) > 0:
            raise NotImplementedError(  # pragma: no cover
                f"Conflict between par_convopts={self.par_convopts} and optim={optim}")
        else:
            # Expand common onnx options, see _nick_name_options.
            options = expand_onnx_options(model, optim)

        return to_onnx(model, X, options=options, target_opset=opset)

    def _create_onnx_inference(self, onx, runtime):
        """
        Instantiates :epkg:`OnnxInference` for *onx* and *runtime*.
        For onnxruntime based runtimes, *ir_version* is temporarily
        downgraded to a value the installed onnxruntime supports and
        restored afterwards.
        """
        if 'onnxruntime' in runtime:
            old = onx.ir_version
            onx.ir_version = get_ir_version(__max_supported_opset__)
        else:
            old = None

        try:
            res = OnnxInference(
                onx, runtime=runtime,
                runtime_options=dict(log_severity_level=3))
        except RuntimeError as e:  # pragma: no cover
            if "[ONNXRuntimeError]" in str(e):
                # Bug fix: the original code *returned* the RuntimeError
                # instance instead of raising it; callers would then fail
                # later on rt.run(...) with a confusing TypeError.
                raise RuntimeError(f"onnxruntime fails due to {str(e)}") from e
            raise e
        if old is not None:
            onx.ir_version = old
        return res

    # Part which does not change.

    def _check_rt(self, rt, meth):
        """
        Checks that runtime has the appropriate method.
        """
        if rt is None:
            raise ValueError("rt cannot be empty.")  # pragma: no cover
        if not hasattr(rt, meth):
            raise TypeError(  # pragma: no cover
                f"rt of type {type(rt)!r} has no method {meth!r}.")

    def runtime_name(self, runtime):
        """
        Returns the runtime shortname.
        """
        if runtime == 'skl':
            name = runtime
        elif runtime == 'ort':
            name = 'onnxruntime1'
        elif runtime == 'ort2':
            name = 'onnxruntime2'  # pragma: no cover
        elif runtime == 'pyrt':
            name = 'python'
        elif runtime == 'pyrtc':
            name = 'python_compiled'
        else:
            raise ValueError(  # pragma: no cover
                f"Unknown runtime '{runtime}'.")
        return name

    def _name(self, nf, opset, dtype):
        "Returns the pickle filename caching the fitted model for one config."
        last = f'cache-{self.__class__.__name__}-nf{nf}-op{opset}-dt{dtype}.pickle'
        return last

    def setup_cache(self):
        "asv API"
        # Fits one model per (dtype, opset, nf) combination and pickles it
        # so that setup() only has to load it back.
        for dtype in self.params[4]:
            for opv in self.params[3]:
                for nf in self.params[2]:
                    (X_train, y_train), (X, y) = self._get_dataset(nf, dtype)
                    model = self._create_model()
                    if self.par_dofit:
                        set_random_state(model)
                        model.fit(X_train, y_train)
                    stored = {'model': model, 'X': X, 'y': y}
                    filename = self._name(nf, opv, dtype)
                    with open(filename, "wb") as f:
                        pickle.dump(stored, f)
                    if not os.path.exists(filename):
                        raise RuntimeError(  # pragma: no cover
                            f"Unable to dump model {model!r} into {filename!r}.")

    def setup(self, runtime, N, nf, opset, dtype, optim):
        "asv API"
        logger = getLogger('skl2onnx')
        logger.disabled = True
        register_converters()
        register_rewritten_operators()
        register_new_operators()
        with open(self._name(nf, opset, dtype), "rb") as f:
            stored = pickle.load(f)
        self.stored = stored
        self.model = stored['model']
        self.X, self.y = make_n_rows(stored['X'], N, stored['y'])
        onx, rt_, rt_fct_, rt_fct_track_ = self._create_onnx_and_runtime(
            runtime, self.model, self.X, opset, dtype, optim)
        self.onx = onx
        # One attribute set per runtime so asv can benchmark them separately.
        setattr(self, "rt_" + runtime, rt_)
        setattr(self, "rt_fct_" + runtime, rt_fct_)
        setattr(self, "rt_fct_track_" + runtime, rt_fct_track_)
        set_config(assume_finite=True)

    def time_predict(self, runtime, N, nf, opset, dtype, optim):
        "asv API"
        return getattr(self, "rt_fct_" + runtime)(self.X)

    def peakmem_predict(self, runtime, N, nf, opset, dtype, optim):
        "asv API"
        return getattr(self, "rt_fct_" + runtime)(self.X)

    def track_score(self, runtime, N, nf, opset, dtype, optim):
        "asv API"
        yp = getattr(self, "rt_fct_track_" + runtime)(self.X)
        return self._score_metric(self.X, self.y, yp)

    def track_onnxsize(self, runtime, N, nf, opset, dtype, optim):
        "asv API"
        return len(self.onx.SerializeToString())

    def track_nbnodes(self, runtime, N, nf, opset, dtype, optim):
        "asv API"
        stats = onnx_statistics(self.onx)
        return stats.get('nnodes', 0)

    def track_vmlprodict(self, runtime, N, nf, opset, dtype, optim):
        "asv API"
        from mlprodict import __version__
        return version2number(__version__)

    def track_vsklearn(self, runtime, N, nf, opset, dtype, optim):
        "asv API"
        from sklearn import __version__
        return version2number(__version__)

    def track_vort(self, runtime, N, nf, opset, dtype, optim):
        "asv API"
        from onnxruntime import __version__ as onnxrt_version
        return version2number(onnxrt_version)

    def check_method_name(self, method_name):
        "Does some verifications. Fails if inconsistencies."
        if getattr(self, 'chk_method_name', None) not in (None, method_name):
            raise RuntimeError(  # pragma: no cover
                f"Method name must be '{method_name}'.")
        if getattr(self, 'chk_method_name', None) is None:
            # chk_method_name must be set on the subclass so the check
            # above is meaningful.
            raise RuntimeError(  # pragma: no cover
                "Unable to check that the method name is correct "
                "(expected is '{}')".format(
                    method_name))

254 

255 

class _CommonAsvSklBenchmarkClassifier(_CommonAsvSklBenchmark):
    """
    Common class for a classifier.
    """
    chk_method_name = 'predict_proba'

    def _score_metric(self, X, y_exp, y_pred):
        # Accuracy on the predicted labels.
        return accuracy_score(y_exp, y_pred)

    def _create_onnx_and_runtime(self, runtime, model, X, opset, dtype, optim):
        # Converts the model, optimizes the graph and wraps the chosen
        # runtime in two callables: one benchmarked, one used for scoring.
        self.check_method_name('predict_proba')
        converted = self._to_onnx(model, X, opset, dtype, optim)
        onx = self._optimize_onnx(converted)
        name = self.runtime_name(runtime)
        if name != 'skl':
            rt_ = self._create_onnx_inference(onx, name)
            self._check_rt(rt_, 'run')

            def rt_fct_(pX):
                return rt_.run({'X': pX})

            def rt_fct_track_(pX):
                return rt_fct_(pX)['output_label']
        else:
            rt_ = None

            def rt_fct_(X):
                return model.predict_proba(X)

            def rt_fct_track_(X):
                return model.predict(X)
        return onx, rt_, rt_fct_, rt_fct_track_

280 

281 

class _CommonAsvSklBenchmarkClassifierRawScore(_CommonAsvSklBenchmark):
    """
    Common class for a classifier benchmarked on its raw scores
    (*decision_function*).
    """
    chk_method_name = 'decision_function'

    def _score_metric(self, X, y_exp, y_pred):
        # Accuracy on the predicted labels.
        return accuracy_score(y_exp, y_pred)

    def _create_onnx_and_runtime(self, runtime, model, X, opset, dtype, optim):
        # Converts the model, optimizes the graph and wraps the chosen
        # runtime in two callables: one benchmarked, one used for scoring.
        self.check_method_name('decision_function')
        converted = self._to_onnx(model, X, opset, dtype, optim)
        onx = self._optimize_onnx(converted)
        name = self.runtime_name(runtime)
        if name != 'skl':
            rt_ = self._create_onnx_inference(onx, name)
            self._check_rt(rt_, 'run')

            def rt_fct_(X):
                return rt_.run({'X': X})

            def rt_fct_track_(X):
                return rt_fct_(X)['output_label']
        else:
            rt_ = None

            def rt_fct_(X):
                return model.decision_function(X)

            def rt_fct_track_(X):
                return model.predict(X)
        return onx, rt_, rt_fct_, rt_fct_track_

306 

307 

class _CommonAsvSklBenchmarkClustering(_CommonAsvSklBenchmark):
    """
    Common class for a clustering algorithm.
    """
    chk_method_name = 'predict'

    def _score_metric(self, X, y_exp, y_pred):
        """
        Silhouette score of the predicted clusters, 0 when the score
        is undefined (single observation or a single cluster).
        """
        if X.shape[0] == 1:
            return 0.  # pragma: no cover
        if len(set(y_pred)) == 1:
            # Bug fix: the original tested ``set(y_pred) == 1`` which is
            # always False (a set never equals an int), so silhouette_score
            # was still called with a single label and raised ValueError.
            return 0.  # pragma: no cover
        return silhouette_score(X, y_pred)

    def _create_onnx_and_runtime(self, runtime, model, X, opset, dtype, optim):
        """
        Converts *model*, optimizes the graph and builds the requested
        runtime. Returns ``(onx, rt, fct, fct_track)``.
        """
        self.check_method_name('predict')
        onx_ = self._to_onnx(model, X, opset, dtype, optim)
        onx = self._optimize_onnx(onx_)
        name = self.runtime_name(runtime)
        if name == 'skl':
            rt_ = None
            # scikit-learn clustering models expect double precision input.
            rt_fct_ = lambda X: model.predict(X.astype(numpy.float64))
            rt_fct_track_ = lambda X: model.predict(X.astype(numpy.float64))
        else:
            rt_ = self._create_onnx_inference(onx, name)
            self._check_rt(rt_, 'run')
            rt_fct_ = lambda X: rt_.run({'X': X})
            rt_fct_track_ = lambda X: rt_fct_(X)['label']
        return onx, rt_, rt_fct_, rt_fct_track_

336 

337 

class _CommonAsvSklBenchmarkMultiClassifier(_CommonAsvSklBenchmark):
    """
    Common class for a multi-classifier.
    """
    chk_method_name = 'predict_proba'

    def _get_dataset(self, nf, dtype):
        # Noisy iris dataset with the labels one-hot encoded so that the
        # model is trained as a multi-label classifier.
        xdtype = self._get_xdtype(dtype)
        data = load_iris()
        X, y = data.data, data.target
        state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
        X = X + state.randn(*X.shape) / 3
        nbclass = len(set(y))
        onehot = numpy.zeros((y.shape[0], nbclass), dtype=y.dtype)
        for row, label in enumerate(y):
            onehot[row, label] = 1
        X = _modify_dimension(X, nf)
        X_train, X_test, y_train, y_test = train_test_split(
            X, onehot, random_state=42)
        return ((X_train, y_train),
                (X_test.astype(xdtype), y_test.astype(self.par_ydtype)))

    def _score_metric(self, X, y_exp, y_pred):
        # Element-wise accuracy over the flattened one-hot matrices.
        return accuracy_score(y_exp.ravel(), y_pred.ravel())

    def _create_onnx_and_runtime(self, runtime, model, X, opset, dtype, optim):
        # Converts the model, optimizes the graph and wraps the chosen
        # runtime in two callables: one benchmarked, one used for scoring.
        self.check_method_name('predict_proba')
        converted = self._to_onnx(model, X, opset, dtype, optim)
        onx = self._optimize_onnx(converted)
        name = self.runtime_name(runtime)
        if name != 'skl':
            rt_ = self._create_onnx_inference(onx, name)
            self._check_rt(rt_, 'run')

            def rt_fct_(X):
                return rt_.run({'X': X})

            def rt_fct_track_(X):
                return rt_fct_(X)['output_label']
        else:
            rt_ = None

            def rt_fct_(X):
                return model.predict_proba(X)

            def rt_fct_track_(X):
                return model.predict(X)
        return onx, rt_, rt_fct_, rt_fct_track_

381 

382 

class _CommonAsvSklBenchmarkOutlier(_CommonAsvSklBenchmark):
    """
    Common class for outlier detection.
    """
    chk_method_name = 'predict'

    def _score_metric(self, X, y_exp, y_pred):
        # Average of the predicted values (fraction of inliers minus
        # outliers when predictions are in {-1, 1}).
        return numpy.sum(y_pred) / y_pred.shape[0]

    def _create_onnx_and_runtime(self, runtime, model, X, opset, dtype, optim):
        # Converts the model, optimizes the graph and wraps the chosen
        # runtime in two callables: one benchmarked, one used for scoring.
        self.check_method_name('predict')
        converted = self._to_onnx(model, X, opset, dtype, optim)
        onx = self._optimize_onnx(converted)
        name = self.runtime_name(runtime)
        if name != 'skl':
            rt_ = self._create_onnx_inference(onx, name)
            self._check_rt(rt_, 'run')

            def rt_fct_(X):
                return rt_.run({'X': X})

            def rt_fct_track_(X):
                return rt_fct_(X)['scores']
        else:
            rt_ = None

            def rt_fct_(X):
                return model.predict(X)

            def rt_fct_track_(X):
                return model.predict(X)
        return onx, rt_, rt_fct_, rt_fct_track_

407 

408 

class _CommonAsvSklBenchmarkRegressor(_CommonAsvSklBenchmark):
    """
    Common class for a regressor.
    """
    chk_method_name = 'predict'

    def _score_metric(self, X, y_exp, y_pred):
        "Mean absolute error between expected and predicted values."
        return mean_absolute_error(y_exp, y_pred)

    def _create_onnx_and_runtime(self, runtime, model, X, opset, dtype, optim):
        """
        Converts *model*, optimizes the graph and builds the requested
        runtime. Returns ``(onx, rt, fct, fct_track)``.
        """
        self.check_method_name('predict')
        onx_ = self._to_onnx(model, X, opset, dtype, optim)
        # Consistency fix: every sibling benchmark class runs the converted
        # graph through _optimize_onnx; this one skipped it. The default
        # implementation is the identity, so behaviour is unchanged for
        # classes which do not override the hook.
        onx = self._optimize_onnx(onx_)
        name = self.runtime_name(runtime)
        if name == 'skl':
            rt_ = None
            rt_fct_ = lambda X: model.predict(X)
            rt_fct_track_ = lambda X: model.predict(X)
        else:
            rt_ = self._create_onnx_inference(onx, name)
            self._check_rt(rt_, 'run')
            rt_fct_ = lambda X: rt_.run({'X': X})
            rt_fct_track_ = lambda X: rt_fct_(X)['variable']
        return onx, rt_, rt_fct_, rt_fct_track_

432 

433 

class _CommonAsvSklBenchmarkTrainableTransform(_CommonAsvSklBenchmark):
    """
    Common class for a trainable transformer.
    """
    chk_method_name = 'transform'

    def _score_metric(self, X, y_exp, y_pred):
        # Average of the transformed values, used only as a tracking metric.
        return numpy.sum(y_pred) / y_pred.shape[0]

    def _create_onnx_and_runtime(self, runtime, model, X, opset, dtype, optim):
        # Converts the model, optimizes the graph and wraps the chosen
        # runtime in two callables: one benchmarked, one used for scoring.
        self.check_method_name('transform')
        converted = self._to_onnx(model, X, opset, dtype, optim)
        onx = self._optimize_onnx(converted)
        name = self.runtime_name(runtime)
        if name != 'skl':
            rt_ = self._create_onnx_inference(onx, name)
            self._check_rt(rt_, 'run')

            def rt_fct_(X):
                return rt_.run({'X': X})

            def rt_fct_track_(X):
                return rt_fct_(X)['variable']
        else:
            rt_ = None

            def rt_fct_(X):
                return model.transform(X)

            def rt_fct_track_(X):
                return model.transform(X)
        return onx, rt_, rt_fct_, rt_fct_track_

458 

459 

class _CommonAsvSklBenchmarkTransform(_CommonAsvSklBenchmark):
    """
    Common class for a transformer.
    """
    chk_method_name = 'transform'

    def _score_metric(self, X, y_exp, y_pred):
        # Average of the transformed values, used only as a tracking metric.
        return numpy.sum(y_pred) / y_pred.shape[0]

    def _create_onnx_and_runtime(self, runtime, model, X, opset, dtype, optim):
        # Converts the model, optimizes the graph and wraps the chosen
        # runtime in two callables: one benchmarked, one used for scoring.
        self.check_method_name('transform')
        converted = self._to_onnx(model, X, opset, dtype, optim)
        onx = self._optimize_onnx(converted)
        name = self.runtime_name(runtime)
        if name != 'skl':
            rt_ = self._create_onnx_inference(onx, name)
            self._check_rt(rt_, 'run')

            def rt_fct_(X):
                return rt_.run({'X': X})

            def rt_fct_track_(X):
                return rt_fct_(X)['variable']
        else:
            rt_ = None

            def rt_fct_(X):
                return model.transform(X)

            def rt_fct_track_(X):
                return model.transform(X)
        return onx, rt_, rt_fct_, rt_fct_track_

484 

485 

class _CommonAsvSklBenchmarkTransformPositive(_CommonAsvSklBenchmarkTransform):
    """
    Common class for a transformer for positive features.
    """
    chk_method_name = 'transform'

    def _get_dataset(self, nf, dtype):
        # Same noisy iris dataset as the base class but with the absolute
        # value applied so every feature is non-negative.
        xdtype = self._get_xdtype(dtype)
        data = load_iris()
        X, y = data.data, data.target
        state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
        X = X + state.randn(*X.shape) / 3
        X = numpy.abs(_modify_dimension(X, nf))
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=42)
        return ((X_train, y_train),
                (X_test.astype(xdtype), y_test.astype(self.par_ydtype)))