Coverage for mlprodict/onnxrt/validate/validate_problems.py: 99%

444 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-04 02:28 +0100

1# pylint: disable=E1101 

2""" 

3@file 

4@brief Validates runtime for many :scikit-learn: operators. 

5The submodule relies on :epkg:`onnxconverter_common`, 

6:epkg:`sklearn-onnx`. 

7""" 

8import numpy 

9from sklearn.base import ( 

10 ClusterMixin, BiclusterMixin, OutlierMixin, 

11 RegressorMixin, ClassifierMixin) 

12from sklearn.calibration import CalibratedClassifierCV 

13from sklearn.cross_decomposition import PLSSVD 

14from sklearn.datasets import load_iris 

15from sklearn.decomposition import LatentDirichletAllocation, NMF 

16from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis 

17from sklearn.ensemble import ( 

18 AdaBoostRegressor, GradientBoostingRegressor, AdaBoostClassifier, 

19 BaggingClassifier, VotingClassifier, GradientBoostingClassifier, 

20 RandomForestClassifier) 

21try: 

22 from sklearn.ensemble import StackingClassifier, StackingRegressor 

23except ImportError: # pragma: no cover 

24 # new in 0.22 

25 StackingClassifier, StackingRegressor = None, None 

26from sklearn.feature_extraction import DictVectorizer, FeatureHasher 

27from sklearn.feature_extraction.text import ( 

28 CountVectorizer, TfidfVectorizer, TfidfTransformer) 

29from sklearn.ensemble import ( 

30 HistGradientBoostingRegressor, 

31 HistGradientBoostingClassifier) 

32from sklearn.feature_selection import ( 

33 RFE, RFECV, GenericUnivariateSelect, 

34 SelectPercentile, SelectFwe, SelectKBest, 

35 SelectFdr, SelectFpr, SelectFromModel) 

36from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor 

37from sklearn.isotonic import IsotonicRegression 

38from sklearn.linear_model import ( 

39 ARDRegression, ElasticNetCV, 

40 LarsCV, LassoCV, LassoLarsCV, LassoLarsIC, 

41 SGDRegressor, OrthogonalMatchingPursuitCV, 

42 TheilSenRegressor, BayesianRidge, MultiTaskElasticNet, 

43 MultiTaskElasticNetCV, MultiTaskLassoCV, MultiTaskLasso, 

44 PassiveAggressiveClassifier, RidgeClassifier, 

45 RidgeClassifierCV, PassiveAggressiveRegressor, 

46 HuberRegressor, LogisticRegression, SGDClassifier, 

47 LogisticRegressionCV, Perceptron) 

48from sklearn.mixture._base import BaseMixture 

49from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 

50from sklearn.multiclass import ( 

51 OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier) 

52from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier 

53from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB 

54from sklearn.neighbors import ( 

55 NearestCentroid, RadiusNeighborsClassifier, 

56 NeighborhoodComponentsAnalysis) 

57from sklearn.preprocessing import ( 

58 LabelBinarizer, LabelEncoder, 

59 OneHotEncoder, PowerTransformer) 

60from sklearn.semi_supervised import LabelPropagation, LabelSpreading 

61from sklearn.svm import LinearSVC, LinearSVR, NuSVR, SVR, SVC, NuSVC 

62from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, ExtraTreeClassifier 

63from sklearn.utils import shuffle 

64from ._validate_problems_helper import ( 

65 _noshapevar, _1d_problem, text_alpha_num) 

66 

67 

68def _modify_dimension(X, n_features, seed=19): 

69 """ 

70 Modifies the number of features to increase 

71 or reduce the number of features. 

72 

73 @param X features matrix 

74 @param n_features number of features 

75 @param seed random seed (to get the same 

76 dataset at each call) 

77 @return new featurs matrix 

78 """ 

79 if n_features is None or n_features == X.shape[1]: 

80 return X 

81 if n_features < X.shape[1]: 

82 return X[:, :n_features] 

83 rstate = numpy.random.RandomState(seed) # pylint: disable=E1101 

84 res = numpy.empty((X.shape[0], n_features), dtype=X.dtype) 

85 res[:, :X.shape[1]] = X[:, :] 

86 div = max((n_features // X.shape[1]) + 1, 2) 

87 for i in range(X.shape[1], res.shape[1]): 

88 j = i % X.shape[1] 

89 col = X[:, j] 

90 if X.dtype in (numpy.float32, numpy.float64): 

91 sigma = numpy.var(col) ** 0.5 

92 rnd = rstate.randn(len(col)) * sigma / div 

93 col2 = col + rnd 

94 res[:, j] -= col2 / div 

95 res[:, i] = col2 

96 elif X.dtype in (numpy.int32, numpy.int64): 

97 perm = rstate.permutation(col) 

98 h = rstate.randint(0, div) % X.shape[0] 

99 col2 = col.copy() 

100 col2[h::div] = perm[h::div] # pylint: disable=E1136 

101 res[:, i] = col2 

102 h = (h + 1) % X.shape[0] 

103 res[h, j] = perm[h] # pylint: disable=E1136 

104 else: # pragma: no cover 

105 raise NotImplementedError( # pragma: no cover 

106 f"Unable to add noise to a feature for this type {X.dtype}") 

107 return res 

108 

109 

110########### 

111# datasets 

112########### 

113 

114 

115def _problem_for_predictor_binary_classification( 

116 dtype=numpy.float32, n_features=None, add_nan=False): 

117 """ 

118 Returns *X, y, intial_types, method, node name, X runtime* for a 

119 binary classification problem. 

120 It is based on Iris dataset. 

121 """ 

122 data = load_iris() 

123 X = data.data 

124 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

125 rnd = state.randn(*X.shape) / 3 

126 X += rnd 

127 X = _modify_dimension(X, n_features) 

128 y = data.target 

129 y[y == 2] = 1 

130 if add_nan: 

131 rows = numpy.random.randint(0, X.shape[0] - 1, X.shape[0] // 3) 

132 cols = numpy.random.randint(0, X.shape[1] - 1, X.shape[0] // 3) 

133 X[rows, cols] = numpy.nan 

134 X = X.astype(dtype) 

135 y = y.astype(numpy.int64) 

136 return (X, y, [('X', X[:1].astype(dtype))], 

137 'predict_proba', 1, X.astype(dtype)) 

138 

139 

140def _problem_for_predictor_multi_classification(dtype=numpy.float32, n_features=None): 

141 """ 

142 Returns *X, y, intial_types, method, node name, X runtime* for a 

143 m-cl classification problem. 

144 It is based on Iris dataset. 

145 """ 

146 data = load_iris() 

147 X = data.data 

148 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

149 rnd = state.randn(*X.shape) / 3 

150 X += rnd 

151 X = _modify_dimension(X, n_features) 

152 y = data.target 

153 X = X.astype(dtype) 

154 y = y.astype(numpy.int64) 

155 return (X, y, [('X', X[:1].astype(dtype))], 

156 'predict_proba', 1, X.astype(dtype)) 

157 

158 

159def _problem_for_mixture(dtype=numpy.float32, n_features=None): 

160 """ 

161 Returns *X, y, intial_types, method, node name, X runtime* for a 

162 m-cl classification problem. 

163 It is based on Iris dataset. 

164 """ 

165 data = load_iris() 

166 X = data.data 

167 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

168 rnd = state.randn(*X.shape) / 3 

169 X += rnd 

170 X = _modify_dimension(X, n_features) 

171 y = data.target 

172 X = X.astype(dtype) 

173 y = y.astype(numpy.int64) 

174 return (X, None, [('X', X[:1].astype(dtype))], 

175 'predict_proba', 1, X.astype(dtype)) 

176 

177 

178def _problem_for_predictor_multi_classification_label(dtype=numpy.float32, n_features=None): 

179 """ 

180 Returns *X, y, intial_types, method, node name, X runtime* for a 

181 m-cl classification problem. 

182 It is based on Iris dataset. 

183 """ 

184 data = load_iris() 

185 X = data.data 

186 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

187 rnd = state.randn(*X.shape) / 3 

188 X += rnd 

189 X = _modify_dimension(X, n_features) 

190 y = data.target 

191 y2 = numpy.zeros((y.shape[0], 3), dtype=numpy.int64) 

192 for i, _ in enumerate(y): 

193 y2[i, _] = 1 

194 for i in range(0, y.shape[0], 5): 

195 y2[i, (y[i] + 1) % 3] = 1 

196 X = X.astype(dtype) 

197 y2 = y2.astype(numpy.int64) 

198 return (X, y2, [('X', X[:1].astype(dtype))], 

199 'predict_proba', 1, X.astype(dtype)) 

200 

201 

202def _problem_for_predictor_regression(many_output=False, options=None, 

203 n_features=None, nbrows=None, 

204 dtype=numpy.float32, add_nan=False, 

205 **kwargs): 

206 """ 

207 Returns *X, y, intial_types, method, name, X runtime* for a 

208 regression problem. 

209 It is based on Iris dataset. 

210 """ 

211 data = load_iris() 

212 X = data.data 

213 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

214 rnd = state.randn(*X.shape) / 3 

215 X += rnd 

216 X = _modify_dimension(X, n_features) 

217 y = data.target + numpy.arange(len(data.target)) / 100 

218 meth = 'predict' if kwargs is None else ('predict', kwargs) 

219 itt = [('X', X[:1].astype(dtype))] 

220 if n_features is not None: 

221 X = X[:, :n_features] 

222 itt = [('X', X[:1].astype(dtype))] 

223 if nbrows is not None: 

224 X = X[:nbrows, :] 

225 y = y[:nbrows] 

226 itt = [('X', X[:1].astype(dtype))] 

227 if options is not None: 

228 itt = itt, options 

229 if add_nan: 

230 rows = numpy.random.randint(0, X.shape[0] - 1, X.shape[0] // 3) 

231 cols = numpy.random.randint(0, X.shape[1] - 1, X.shape[0] // 3) 

232 X[rows, cols] = numpy.nan 

233 X = X.astype(dtype) 

234 y = y.astype(dtype) 

235 return (X, y, itt, 

236 meth, 'all' if many_output else 0, X.astype(dtype)) 

237 

238 

239def _problem_for_predictor_multi_regression(many_output=False, options=None, 

240 n_features=None, nbrows=None, 

241 dtype=numpy.float32, **kwargs): 

242 """ 

243 Returns *X, y, intial_types, method, name, X runtime* for a 

244 mregression problem. 

245 It is based on Iris dataset. 

246 """ 

247 data = load_iris() 

248 X = data.data 

249 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

250 rnd = state.randn(*X.shape) / 3 

251 X += rnd 

252 X = _modify_dimension(X, n_features) 

253 y = data.target.astype(float) + numpy.arange(len(data.target)) / 100 

254 meth = 'predict' if kwargs is None else ('predict', kwargs) 

255 itt = [('X', X[:1].astype(dtype))] 

256 if n_features is not None: 

257 X = X[:, :n_features] 

258 itt = [('X', X[:1].astype(dtype))] 

259 if nbrows is not None: 

260 X = X[:nbrows, :] 

261 y = y[:nbrows] 

262 itt = [('X', X[:1].astype(dtype))] 

263 if options is not None: 

264 itt = itt, options 

265 y2 = numpy.empty((y.shape[0], 2)) 

266 y2[:, 0] = y 

267 y2[:, 1] = y + 0.5 

268 X = X.astype(dtype) 

269 y2 = y2.astype(dtype) 

270 return (X, y2, itt, 

271 meth, 'all' if many_output else 0, X.astype(dtype)) 

272 

273 

274def _problem_for_numerical_transform(dtype=numpy.float32, n_features=None): 

275 """ 

276 Returns *X, intial_types, method, name, X runtime* for a 

277 transformation problem. 

278 It is based on Iris dataset. 

279 """ 

280 data = load_iris() 

281 X = data.data 

282 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

283 rnd = state.randn(*X.shape) / 3 

284 X += rnd 

285 X = _modify_dimension(X, n_features) 

286 X = X.astype(dtype) 

287 return (X, None, [('X', X[:1].astype(dtype))], 

288 'transform', 0, X.astype(dtype=numpy.float32)) 

289 

290 

291def _problem_for_numerical_transform_positive(dtype=numpy.float32, n_features=None): 

292 """ 

293 Returns *X, intial_types, method, name, X runtime* for a 

294 transformation problem. 

295 It is based on Iris dataset. 

296 """ 

297 data = load_iris() 

298 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

299 rnd = state.randn(*data.data.shape) / 3 

300 X = numpy.abs(data.data + rnd) 

301 X = _modify_dimension(X, n_features) 

302 X = X.astype(dtype) 

303 return (X, None, [('X', X[:1].astype(dtype))], 

304 'transform', 0, X.astype(dtype=numpy.float32)) 

305 

306 

307def _problem_for_numerical_trainable_transform(dtype=numpy.float32, n_features=None): 

308 """ 

309 Returns *X, intial_types, method, name, X runtime* for a 

310 transformation problem. 

311 It is based on Iris dataset. 

312 """ 

313 data = load_iris() 

314 X = data.data 

315 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

316 rnd = state.randn(*X.shape) / 3 

317 X += rnd 

318 X = _modify_dimension(X, n_features) 

319 y = data.target + numpy.arange(len(data.target)) / 100 

320 X = X.astype(dtype) 

321 y = y.astype(dtype) 

322 return (X, y, [('X', X[:1].astype(dtype))], 

323 'transform', 0, X.astype(dtype)) 

324 

325 

326def _problem_for_numerical_trainable_transform_cl(dtype=numpy.float32, n_features=None): 

327 """ 

328 Returns *X, intial_types, method, name, X runtime* for a 

329 transformation problem. 

330 It is based on Iris dataset. 

331 """ 

332 data = load_iris() 

333 X = data.data 

334 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

335 rnd = state.randn(*X.shape) / 3 

336 X += rnd 

337 X = _modify_dimension(X, n_features) 

338 y = data.target 

339 X = X.astype(dtype) 

340 y = y.astype(numpy.int64) 

341 return (X, y, [('X', X[:1].astype(dtype))], 

342 'transform', 0, X.astype(dtype)) 

343 

344 

345def _problem_for_clustering(dtype=numpy.float32, n_features=None): 

346 """ 

347 Returns *X, intial_types, method, name, X runtime* for a 

348 clustering problem. 

349 It is based on Iris dataset. 

350 """ 

351 data = load_iris() 

352 X = data.data 

353 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

354 rnd = state.randn(*X.shape) / 3 

355 X += rnd 

356 X = _modify_dimension(X, n_features) 

357 X = X.astype(dtype) 

358 return (X, None, [('X', X[:1].astype(dtype))], 

359 'predict', 0, X.astype(dtype)) 

360 

361 

362def _problem_for_clustering_scores(dtype=numpy.float32, n_features=None): 

363 """ 

364 Returns *X, intial_types, method, name, X runtime* for a 

365 clustering problem, the score part, not the cluster. 

366 It is based on Iris dataset. 

367 """ 

368 data = load_iris() 

369 X = data.data 

370 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

371 rnd = state.randn(*X.shape) / 3 

372 X += rnd 

373 X = _modify_dimension(X, n_features) 

374 X = X.astype(dtype) 

375 return (X, None, [('X', X[:1].astype(dtype))], 

376 'transform', 1, X.astype(dtype)) 

377 

378 

379def _problem_for_outlier(dtype=numpy.float32, n_features=None): 

380 """ 

381 Returns *X, intial_types, method, name, X runtime* for a 

382 transformation problem. 

383 It is based on Iris dataset. 

384 """ 

385 data = load_iris() 

386 X = data.data 

387 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

388 rnd = state.randn(*X.shape) / 3 

389 X += rnd 

390 X = _modify_dimension(X, n_features) 

391 X = X.astype(dtype) 

392 return (X, None, [('X', X[:1].astype(dtype))], 

393 'predict', 0, X.astype(dtype)) 

394 

395 

396def _problem_for_numerical_scoring(dtype=numpy.float32, n_features=None): 

397 """ 

398 Returns *X, y, intial_types, method, name, X runtime* for a 

399 scoring problem. 

400 It is based on Iris dataset. 

401 """ 

402 data = load_iris() 

403 X = data.data 

404 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

405 rnd = state.randn(*X.shape) / 3 

406 X += rnd 

407 y = data.target.astype(dtype) + numpy.arange(len(data.target)) / 100 

408 y /= numpy.max(y) 

409 X = X.astype(dtype) 

410 y = y.astype(dtype) 

411 return (X, y, [('X', X[:1].astype(dtype))], 

412 'score', 0, X.astype(dtype)) 

413 

414 

415def _problem_for_clnoproba(dtype=numpy.float32, n_features=None): 

416 """ 

417 Returns *X, y, intial_types, method, name, X runtime* for a 

418 scoring problem. 

419 It is based on Iris dataset. 

420 """ 

421 data = load_iris() 

422 X = data.data 

423 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

424 rnd = state.randn(*X.shape) / 3 

425 X += rnd 

426 X = _modify_dimension(X, n_features) 

427 y = data.target 

428 X = X.astype(dtype) 

429 y = y.astype(numpy.int64) 

430 return (X, y, [('X', X[:1].astype(dtype))], 

431 'predict', 0, X.astype(dtype)) 

432 

433 

434def _problem_for_clnoproba_binary(dtype=numpy.float32, n_features=None, add_nan=False): 

435 """ 

436 Returns *X, y, intial_types, method, name, X runtime* for a 

437 scoring problem. Binary classification. 

438 It is based on Iris dataset. 

439 """ 

440 data = load_iris() 

441 X = data.data 

442 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

443 rnd = state.randn(*X.shape) / 3 

444 X += rnd 

445 X = _modify_dimension(X, n_features) 

446 y = data.target 

447 y[y == 2] = 1 

448 if add_nan: 

449 rows = numpy.random.randint(0, X.shape[0] - 1, X.shape[0] // 3) 

450 cols = numpy.random.randint(0, X.shape[1] - 1, X.shape[0] // 3) 

451 X[rows, cols] = numpy.nan 

452 X = X.astype(dtype) 

453 y = y.astype(numpy.int64) 

454 return (X, y, [('X', X[:1].astype(dtype))], 

455 'predict', 0, X.astype(dtype)) 

456 

457 

458def _problem_for_cl_decision_function(dtype=numpy.float32, n_features=None): 

459 """ 

460 Returns *X, y, intial_types, method, name, X runtime* for a 

461 scoring problem. 

462 It is based on Iris dataset. 

463 """ 

464 data = load_iris() 

465 X = data.data 

466 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

467 rnd = state.randn(*X.shape) / 3 

468 X += rnd 

469 X = _modify_dimension(X, n_features) 

470 y = data.target 

471 X = X.astype(dtype) 

472 y = y.astype(numpy.int64) 

473 return (X, y, [('X', X[:1].astype(dtype))], 

474 'decision_function', 1, X.astype(dtype)) 

475 

476 

477def _problem_for_cl_decision_function_binary(dtype=numpy.float32, n_features=None): 

478 """ 

479 Returns *X, y, intial_types, method, name, X runtime* for a 

480 scoring problem. Binary classification. 

481 It is based on Iris dataset. 

482 """ 

483 data = load_iris() 

484 X = data.data 

485 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

486 rnd = state.randn(*X.shape) / 3 

487 X += rnd 

488 X = _modify_dimension(X, n_features) 

489 y = data.target 

490 y[y == 2] = 1 

491 X = X.astype(dtype) 

492 y = y.astype(numpy.int64) 

493 return (X, y, [('X', X[:1].astype(dtype))], 

494 'decision_function', 1, X.astype(dtype)) 

495 

496 

497def _problem_for_label_encoder(dtype=numpy.int64, n_features=None): 

498 """ 

499 Returns a problem for the :epkg:`sklearn:preprocessing:LabelEncoder`. 

500 """ 

501 data = load_iris() 

502 # X = data.data 

503 y = data.target.astype(dtype) 

504 itt = [('X', y[:1].astype(dtype))] 

505 y = y.astype(dtype) 

506 return (y, None, itt, 'transform', 0, y) 

507 

508 

509def _problem_for_dict_vectorizer(dtype=numpy.float32, n_features=None): 

510 """ 

511 Returns a problem for the :epkg:`sklearn:feature_extraction:DictVectorizer`. 

512 """ 

513 from skl2onnx.common.data_types import ( # delayed 

514 FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType) 

515 data = load_iris() 

516 # X = data.data 

517 y = data.target 

518 y2 = [{_: dtype(1000 + i)} for i, _ in enumerate(y)] 

519 y2[0][2] = -2 

520 cltype = FloatTensorType if dtype == numpy.float32 else DoubleTensorType 

521 itt = [("X", DictionaryType(StringTensorType([1]), cltype([1])))] 

522 y2 = numpy.array(y2) 

523 y = y.astype(numpy.int64) 

524 return (y2, y, itt, 'transform', 0, y2) 

525 

526 

527def _problem_for_tfidf_vectorizer(dtype=numpy.float32, n_features=None): 

528 """ 

529 Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfVectorizer`. 

530 """ 

531 from skl2onnx.common.data_types import ( # delayed 

532 StringTensorType) 

533 X = numpy.array([_[0] for _ in text_alpha_num]) 

534 y = numpy.array([_[1] for _ in text_alpha_num], dtype=dtype) 

535 itt = [("X", StringTensorType([None]))] 

536 return (X, y, itt, 'transform', 0, X) 

537 

538 

539def _problem_for_tfidf_transformer(dtype=numpy.float32, n_features=None): 

540 """ 

541 Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfTransformer`. 

542 """ 

543 from skl2onnx.common.data_types import ( # delayed 

544 FloatTensorType, DoubleTensorType) 

545 X = numpy.array([_[0] for _ in text_alpha_num]) 

546 y = numpy.array([_[1] for _ in text_alpha_num], dtype=dtype) 

547 X2 = CountVectorizer().fit_transform(X).astype(dtype) 

548 cltype = FloatTensorType if dtype == numpy.float32 else DoubleTensorType 

549 itt = [("X", cltype([None, X2.shape[1]]))] 

550 return (X2, y, itt, 'transform', 0, X2) 

551 

552 

553def _problem_for_feature_hasher(dtype=numpy.float32, n_features=None): 

554 """ 

555 Returns a problem for the :epkg:`sklearn:feature_extraction:DictVectorizer`. 

556 """ 

557 from skl2onnx.common.data_types import ( # delayed 

558 FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType) 

559 data = load_iris() 

560 # X = data.data 

561 y = data.target 

562 y2 = [{("cl%d" % _): dtype(1000 + i)} for i, _ in enumerate(y)] 

563 y2[0]["cl2"] = -2 

564 cltype = FloatTensorType if dtype == numpy.float32 else DoubleTensorType 

565 itt = [("X", DictionaryType(StringTensorType([1]), cltype([1])))] 

566 y2 = numpy.array(y2) 

567 return (y2, y, itt, 'transform', 0, y2) 

568 

569 

570def _problem_for_one_hot_encoder(dtype=numpy.float32, n_features=None): 

571 """ 

572 Returns a problem for the :epkg:`sklearn:preprocessing:OneHotEncoder`. 

573 """ 

574 data = load_iris() 

575 state = numpy.random.RandomState(seed=34) # pylint: disable=E1101 

576 rnd = state.randn(*data.data.shape) / 3 

577 X = _modify_dimension(data.data + rnd, n_features) 

578 X = X.astype(numpy.int32).astype(dtype) 

579 y = data.target 

580 X, y = shuffle(X, y, random_state=1) 

581 itt = [('X', X[:1].astype(dtype))] 

582 return (X[:, :1], y, itt, 'transform', 0, X[:, :1].astype(dtype)) 

583 

584 

585def find_suitable_problem(model): 

586 """ 

587 Determines problems suitable for a given 

588 :epkg:`scikit-learn` operator. It may be 

589 

590 * `b-cl`: binary classification 

591 * `m-cl`: m-cl classification 

592 * `m-label`: classification m-label 

593 (multiple labels possible at the same time) 

594 * `reg`: regression 

595 * `m-reg`: regression multi-output 

596 * `num-tr`: transform numerical features 

597 * `num-tr-pos`: transform numerical positive features 

598 * `scoring`: transform numerical features, target is usually needed 

599 * `outlier`: outlier prediction 

600 * `linearsvc`: classifier without *predict_proba* 

601 * `cluster`: similar to transform 

602 * `num+y-tr`: similar to transform with targets 

603 * `num+y-tr-cl`: similar to transform with classes 

604 * `num-tr-clu`: similar to cluster, but returns 

605 scores or distances instead of cluster 

606 * `key-col`: list of dictionaries 

607 * `text-col`: one column of text 

608 

609 Suffix `nofit` indicates the predictions happens 

610 without the model being fitted. This is the case 

611 for :epkg:`sklearn:gaussian_process:GaussianProcessRegressor`. 

612 The suffix `-cov` indicates the method `predict` was called 

613 with parameter ``return_cov=True``, `-std` tells 

614 method `predict` was called with parameter ``return_std=True``. 

615 The suffix ``-NSV`` creates an input variable 

616 like the following ``[('X', FloatTensorType([None, None]))]``. 

617 That's a way to bypass :epkg:`onnxruntime` shape checking 

618 as one part of the graph is designed to handle any 

619 kind of dimensions but apparently, if the input shape is 

620 precise, every part of the graph has to be precise. The strings 

621 used variables which means it is at the same time precise 

622 and unprecise. Suffix ``'-64'`` means the model will 

623 do double computations. Suffix ``-nop`` means the classifier 

624 does not implement method *predict_proba*. Suffix ``-1d`` 

625 means a one dimension problem (one feature). Suffix ``-dec`` 

626 checks method `decision_function`. 

627 

628 The following script gives the list of :epkg:`scikit-learn` 

629 models and the problem they can be fitted on. 

630 

631 .. runpython:: 

632 :showcode: 

633 :warningout: DeprecationWarning 

634 :rst: 

635 

636 from mlprodict.onnxrt.validate.validate import ( 

637 sklearn_operators, find_suitable_problem) 

638 from pyquickhelper.pandashelper import df2rst 

639 from pandas import DataFrame 

640 res = sklearn_operators() 

641 rows = [] 

642 for model in res[:20]: 

643 name = model['name'] 

644 row = dict(name=name) 

645 try: 

646 prob = find_suitable_problem(model['cl']) 

647 if prob is None: 

648 continue 

649 for p in prob: 

650 row[p] = 'X' 

651 except RuntimeError: 

652 pass 

653 rows.append(row) 

654 df = DataFrame(rows).set_index('name') 

655 df = df.sort_index() 

656 print(df2rst(df, index=True)) 

657 

658 The list is truncated. The full list can be found at 

659 :ref:`l-model-problem-list`. 

660 """ 

661 from ...onnx_conv.validate_scenarios import find_suitable_problem as ext_find_suitable_problem 

662 

663 def _internal(model): # pylint: disable=R0911 

664 

665 # checks that this model is not overwritten by this module 

666 ext = ext_find_suitable_problem(model) 

667 if ext is not None: 

668 return ext 

669 

670 # Exceptions 

671 if model in {GaussianProcessRegressor}: 

672 # m-reg causes MemoryError on some machine. 

673 return ['~b-reg-NF-64', # '~m-reg-NF-64', 

674 '~b-reg-NF-cov-64', # '~m-reg-NF-cov-64', 

675 '~b-reg-NF-std-64', # '~m-reg-NF-std-64', 

676 '~b-reg-NSV-64', # '~m-reg-NSV-64', 

677 '~b-reg-cov-64', # '~m-reg-cov-64', 

678 '~b-reg-std-NSV-64', # '~m-reg-std-NSV-64', 

679 'b-reg', '~b-reg-64', # 'm-reg' 

680 ] 

681 

682 if model in {DictVectorizer}: 

683 return ['key-int-col'] 

684 

685 if model in {TfidfVectorizer, CountVectorizer}: 

686 return ['text-col'] 

687 

688 if model in {TfidfTransformer}: 

689 return ['bow'] 

690 

691 if model in {FeatureHasher}: 

692 return ['key-str-col'] 

693 

694 if model in {OneHotEncoder}: 

695 return ['one-hot'] 

696 

697 if model in {LabelBinarizer, LabelEncoder}: 

698 return ['int-col'] 

699 

700 if model in {NuSVC, SVC, SGDClassifier, 

701 HistGradientBoostingClassifier}: 

702 return ['b-cl', 'm-cl', '~b-cl-64', '~b-cl-nan'] 

703 

704 if model in {GaussianProcessClassifier}: 

705 return ['b-cl', 'm-cl', '~b-cl-64'] 

706 

707 if model in {BaggingClassifier, BernoulliNB, CalibratedClassifierCV, 

708 ComplementNB, GaussianNB, 

709 GradientBoostingClassifier, LabelPropagation, LabelSpreading, 

710 LinearDiscriminantAnalysis, LogisticRegressionCV, 

711 MultinomialNB, QuadraticDiscriminantAnalysis, 

712 RandomizedSearchCV}: 

713 return ['b-cl', 'm-cl'] 

714 

715 if model in {Perceptron}: 

716 return ['~b-cl-nop', '~m-cl-nop', '~b-cl-dec', '~m-cl-dec'] 

717 

718 if model in {AdaBoostRegressor}: 

719 return ['b-reg', '~b-reg-64'] 

720 

721 if model in {HistGradientBoostingRegressor}: 

722 return ['b-reg', '~b-reg-64', '~b-reg-nan', '~b-reg-nan-64'] 

723 

724 if model in {LinearSVC, NearestCentroid}: 

725 return ['~b-cl-nop', '~b-cl-nop-64'] 

726 

727 if model in {RFE, RFECV}: 

728 return ['num+y-tr'] 

729 

730 if model in {GridSearchCV}: 

731 return ['b-cl', 'm-cl', 

732 'b-reg', 'm-reg', 

733 '~b-reg-64', '~b-cl-64', 

734 'cluster', 'outlier', '~m-label'] 

735 

736 if model in {VotingClassifier}: 

737 return ['b-cl', 'm-cl'] 

738 

739 if StackingClassifier is not None and model in {StackingClassifier}: 

740 return ['b-cl'] 

741 

742 if StackingRegressor is not None and model in {StackingRegressor}: 

743 return ['b-reg'] 

744 

745 # specific scenarios 

746 if model in {IsotonicRegression}: 

747 return ['~num+y-tr-1d', '~b-reg-1d'] 

748 

749 if model in {ARDRegression, BayesianRidge, ElasticNetCV, 

750 GradientBoostingRegressor, 

751 LarsCV, LassoCV, LassoLarsCV, LassoLarsIC, 

752 LinearSVR, NuSVR, OrthogonalMatchingPursuitCV, 

753 PassiveAggressiveRegressor, SGDRegressor, 

754 TheilSenRegressor, HuberRegressor, SVR}: 

755 return ['b-reg', '~b-reg-64'] 

756 

757 if model in {MultiOutputClassifier}: 

758 return ['m-cl', '~m-label'] 

759 

760 if model in {MultiOutputRegressor, MultiTaskElasticNet, 

761 MultiTaskElasticNetCV, MultiTaskLassoCV, 

762 MultiTaskLasso}: 

763 return ['m-reg'] 

764 

765 if model in {OneVsOneClassifier, OutputCodeClassifier, 

766 PassiveAggressiveClassifier, RadiusNeighborsClassifier}: 

767 return ['~b-cl-nop', '~m-cl-nop'] 

768 

769 if model in {RidgeClassifier, RidgeClassifierCV}: 

770 return ['~b-cl-nop', '~m-cl-nop', '~m-label'] 

771 

772 # trainable transform 

773 if model in {GenericUnivariateSelect, 

774 NeighborhoodComponentsAnalysis, 

775 PLSSVD, SelectKBest, 

776 SelectPercentile, SelectFromModel}: 

777 return ["num+y-tr"] 

778 

779 if model in {SelectFwe, SelectFdr, SelectFpr}: 

780 return ["num+y-tr-cl"] 

781 

782 # no m-label 

783 if model in {AdaBoostClassifier}: 

784 return ['b-cl', '~b-cl-64', 'm-cl'] 

785 

786 if model in {LogisticRegression}: 

787 return ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-dec', '~m-cl-dec'] 

788 

789 if model in {RandomForestClassifier}: 

790 return ['b-cl', '~b-cl-64', 'm-cl', '~m-label'] 

791 

792 if model in {DecisionTreeClassifier, ExtraTreeClassifier}: 

793 return ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-f100', '~m-label'] 

794 

795 if model in {DecisionTreeRegressor}: 

796 return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64', '~b-reg-f100'] 

797 

798 if model in {LatentDirichletAllocation, NMF, PowerTransformer}: 

799 return ['num-tr-pos'] 

800 

801 if hasattr(model, 'predict'): 

802 if "Classifier" in str(model): 

803 return ['b-cl', '~b-cl-64', 'm-cl', '~m-label'] 

804 elif "Regressor" in str(model): 

805 return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64'] 

806 

807 # Generic case. 

808 res = [] 

809 if hasattr(model, 'transform'): 

810 if issubclass(model, (RegressorMixin, ClassifierMixin)): 

811 res.extend(['num+y-tr']) 

812 elif issubclass(model, (ClusterMixin, BiclusterMixin)): 

813 res.extend(['~num-tr-clu', '~num-tr-clu-64']) 

814 else: 

815 res.extend(['num-tr']) 

816 

817 if hasattr(model, 'predict') and issubclass(model, (ClusterMixin, BiclusterMixin)): 

818 res.extend(['cluster', '~b-clu-64']) 

819 

820 if issubclass(model, (OutlierMixin)): 

821 res.extend(['outlier']) 

822 

823 if issubclass(model, ClassifierMixin): 

824 if model is OneVsRestClassifier: 

825 return ['m-cl', '~m-label'] 

826 res.extend(['b-cl', '~b-cl-64', 'm-cl', '~m-label']) 

827 if issubclass(model, RegressorMixin): 

828 res.extend(['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64']) 

829 if issubclass(model, BaseMixture): 

830 res.extend(['mix', '~mix-64']) 

831 

832 if len(res) > 0: 

833 return res 

834 

835 raise RuntimeError("Unable to find problem for model '{}' - {}." 

836 "".format(model.__name__, model.__bases__)) 

837 

838 res = _internal(model) 

839 for r in res: 

840 if r not in _problems: 

841 raise ValueError( # pragma: no cover 

842 "Unrecognized problem '{}' in\n{}".format( 

843 r, "\n".join(sorted(_problems)))) 

844 return res 

845 

846 

847_problems = { 

848 # standard 

849 "b-cl": _problem_for_predictor_binary_classification, 

850 "m-cl": _problem_for_predictor_multi_classification, 

851 "b-reg": _problem_for_predictor_regression, 

852 "m-reg": _problem_for_predictor_multi_regression, 

853 "num-tr": _problem_for_numerical_transform, 

854 "num-tr-pos": _problem_for_numerical_transform_positive, 

855 'outlier': _problem_for_outlier, 

856 'cluster': _problem_for_clustering, 

857 'num+y-tr': _problem_for_numerical_trainable_transform, 

858 'num+y-tr-cl': _problem_for_numerical_trainable_transform_cl, 

859 'mix': _problem_for_mixture, 

860 # others 

861 '~num-tr-clu': _problem_for_clustering_scores, 

862 "~m-label": _problem_for_predictor_multi_classification_label, 

863 "~scoring": _problem_for_numerical_scoring, 

864 '~b-cl-nop': _problem_for_clnoproba_binary, 

865 '~m-cl-nop': _problem_for_clnoproba, 

866 '~b-cl-dec': _problem_for_cl_decision_function_binary, 

867 '~m-cl-dec': _problem_for_cl_decision_function, 

868 # nan 

869 "~b-reg-nan": lambda n_features=None: _problem_for_predictor_regression( 

870 n_features=n_features, add_nan=True), 

871 "~b-reg-nan-64": lambda n_features=None: _problem_for_predictor_regression( 

872 dtype=numpy.float64, n_features=n_features, add_nan=True), 

873 "~b-cl-nan": lambda dtype=numpy.float32, n_features=None: _problem_for_predictor_binary_classification( 

874 dtype=dtype, n_features=n_features, add_nan=True), 

875 # 100 features 

876 "~b-reg-f100": lambda n_features=100: _problem_for_predictor_regression( 

877 n_features=n_features or 100), 

878 "~b-cl-f100": lambda n_features=100: _problem_for_predictor_binary_classification( 

879 n_features=n_features or 100), 

880 # 64 

881 "~b-cl-64": lambda n_features=None: _problem_for_predictor_binary_classification( 

882 dtype=numpy.float64, n_features=n_features), 

883 "~b-reg-64": lambda n_features=None: _problem_for_predictor_regression( 

884 dtype=numpy.float64, n_features=n_features), 

885 '~b-cl-nop-64': lambda n_features=None: _problem_for_clnoproba( 

886 dtype=numpy.float64, n_features=n_features), 

887 '~b-clu-64': lambda n_features=None: _problem_for_clustering( 

888 dtype=numpy.float64, n_features=n_features), 

889 '~b-cl-dec-64': lambda n_features=None: _problem_for_cl_decision_function_binary( 

890 dtype=numpy.float64, n_features=n_features), 

891 '~num-tr-clu-64': lambda n_features=None: _problem_for_clustering_scores( 

892 dtype=numpy.float64, n_features=n_features), 

893 "~m-reg-64": lambda n_features=None: _problem_for_predictor_multi_regression( 

894 dtype=numpy.float64, n_features=n_features), 

895 "~num-tr-64": lambda n_features=None: _problem_for_numerical_transform( 

896 dtype=numpy.float64, n_features=n_features), 

897 '~mix-64': lambda n_features=None: _problem_for_mixture( 

898 dtype=numpy.float64, n_features=n_features), 

899 # 

900 "~b-cl-NF": (lambda n_features=None: _problem_for_predictor_binary_classification( 

901 n_features=n_features) + (False, )), 

902 "~m-cl-NF": (lambda n_features=None: _problem_for_predictor_multi_classification( 

903 n_features=n_features) + (False, )), 

904 "~b-reg-NF": (lambda n_features=None: _problem_for_predictor_regression( 

905 n_features=n_features) + (False, )), 

906 "~m-reg-NF": (lambda n_features=None: _problem_for_predictor_multi_regression( 

907 n_features=n_features) + (False, )), 

908 # 

909 "~b-cl-NF-64": (lambda n_features=None: _problem_for_predictor_binary_classification( 

910 dtype=numpy.float64, n_features=n_features) + (False, )), 

911 "~m-cl-NF-64": (lambda n_features=None: _problem_for_predictor_multi_classification( 

912 dtype=numpy.float64, n_features=n_features) + (False, )), 

913 "~b-reg-NF-64": (lambda n_features=None: _problem_for_predictor_regression( 

914 dtype=numpy.float64, n_features=n_features) + (False, )), 

915 "~m-reg-NF-64": (lambda n_features=None: _problem_for_predictor_multi_regression( 

916 dtype=numpy.float64, n_features=n_features) + (False, )), 

917 # GaussianProcess 

918 "~b-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_regression( 

919 True, options={GaussianProcessRegressor: {"return_cov": True}}, 

920 return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )), 

921 "~m-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression( 

922 True, options={GaussianProcessRegressor: {"return_cov": True}}, 

923 return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )), 

924 # 

925 "~b-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_regression( 

926 True, options={GaussianProcessRegressor: {"return_std": True}}, 

927 return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )), 

928 "~m-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression( 

929 True, options={GaussianProcessRegressor: {"return_std": True}}, 

930 return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )), 

931 # 

932 "~b-reg-cov-64": (lambda n_features=None: _problem_for_predictor_regression( 

933 True, options={GaussianProcessRegressor: {"return_cov": True}}, 

934 return_cov=True, dtype=numpy.float64, n_features=n_features)), 

935 "~m-reg-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression( 

936 True, options={GaussianProcessRegressor: {"return_cov": True}}, 

937 return_cov=True, dtype=numpy.float64, n_features=n_features)), 

938 # 

939 "~reg-std-64": (lambda n_features=None: _problem_for_predictor_regression( 

940 True, options={GaussianProcessRegressor: {"return_std": True}}, 

941 return_std=True, dtype=numpy.float64, n_features=n_features)), 

942 "~m-reg-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression( 

943 True, options={GaussianProcessRegressor: {"return_std": True}}, 

944 return_std=True, dtype=numpy.float64, n_features=n_features)), 

945 # 

946 '~b-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_regression( 

947 dtype=numpy.float64, n_features=n_features)), 

948 '~m-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression( 

949 dtype=numpy.float64, n_features=n_features)), 

950 "~b-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_regression( 

951 True, options={GaussianProcessRegressor: {"return_std": True}}, 

952 return_std=True, dtype=numpy.float64, n_features=n_features))), 

953 "~m-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression( 

954 True, options={GaussianProcessRegressor: {"return_std": True}}, 

955 return_std=True, dtype=numpy.float64, n_features=n_features))), 

956 # isotonic 

957 "~b-reg-1d": _1d_problem(_problem_for_predictor_regression), 

958 '~num+y-tr-1d': _1d_problem(_problem_for_numerical_trainable_transform), 

959 # text 

960 "key-int-col": _problem_for_dict_vectorizer, 

961 "key-str-col": _problem_for_feature_hasher, 

962 "int-col": _problem_for_label_encoder, 

963 "one-hot": _problem_for_one_hot_encoder, 

964 'text-col': _problem_for_tfidf_vectorizer, 

965 'bow': _problem_for_tfidf_transformer, 

966}