Coverage for mlprodict/onnxrt/validate/validate

1# pylint: disable=E1101

2"""

3@file

4@brief Validates runtime for many :scikit-learn: operators.

5The submodule relies on :epkg:`onnxconverter_common`,

6:epkg:`sklearn-onnx`.

7"""

8import numpy

9from sklearn.base import (

10 ClusterMixin, BiclusterMixin, OutlierMixin,

11 RegressorMixin, ClassifierMixin)

12from sklearn.calibration import CalibratedClassifierCV

13from sklearn.cross_decomposition import PLSSVD

14from sklearn.datasets import load_iris

15from sklearn.decomposition import LatentDirichletAllocation, NMF

16from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

17from sklearn.ensemble import (

18 AdaBoostRegressor, GradientBoostingRegressor, AdaBoostClassifier,

19 BaggingClassifier, VotingClassifier, GradientBoostingClassifier,

20 RandomForestClassifier)

21try:

22 from sklearn.ensemble import StackingClassifier, StackingRegressor

23except ImportError: # pragma: no cover

24 # new in 0.22

25 StackingClassifier, StackingRegressor = None, None

26from sklearn.feature_extraction import DictVectorizer, FeatureHasher

27from sklearn.feature_extraction.text import (

28 CountVectorizer, TfidfVectorizer, TfidfTransformer)

29from sklearn.ensemble import (

30 HistGradientBoostingRegressor,

31 HistGradientBoostingClassifier)

32from sklearn.feature_selection import (

33 RFE, RFECV, GenericUnivariateSelect,

34 SelectPercentile, SelectFwe, SelectKBest,

35 SelectFdr, SelectFpr, SelectFromModel)

36from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor

37from sklearn.isotonic import IsotonicRegression

38from sklearn.linear_model import (

39 ARDRegression, ElasticNetCV,

40 LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,

41 SGDRegressor, OrthogonalMatchingPursuitCV,

42 TheilSenRegressor, BayesianRidge, MultiTaskElasticNet,

43 MultiTaskElasticNetCV, MultiTaskLassoCV, MultiTaskLasso,

44 PassiveAggressiveClassifier, RidgeClassifier,

45 RidgeClassifierCV, PassiveAggressiveRegressor,

46 HuberRegressor, LogisticRegression, SGDClassifier,

47 LogisticRegressionCV, Perceptron)

48from sklearn.mixture._base import BaseMixture

49from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

50from sklearn.multiclass import (

51 OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier)

52from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier

53from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB

54from sklearn.neighbors import (

55 NearestCentroid, RadiusNeighborsClassifier,

56 NeighborhoodComponentsAnalysis)

57from sklearn.preprocessing import (

58 LabelBinarizer, LabelEncoder,

59 OneHotEncoder, PowerTransformer)

60from sklearn.semi_supervised import LabelPropagation, LabelSpreading

61from sklearn.svm import LinearSVC, LinearSVR, NuSVR, SVR, SVC, NuSVC

62from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, ExtraTreeClassifier

63from sklearn.utils import shuffle

64from ._validate_problems_helper import (

65 _noshapevar, _1d_problem, text_alpha_num)

def _modify_dimension(X, n_features, seed=19):
    """
    Builds a feature matrix with exactly *n_features* columns from *X*.

    Columns are dropped when fewer features are requested. When more
    are requested, every additional column is derived from an existing
    one (taken in a cyclic order) and perturbed with a seeded random
    generator.

    @param      X           features matrix
    @param      n_features  requested number of features, *None* or
                            the current number returns *X* itself
    @param      seed        random seed (to get the same
                            dataset at each call)
    @return                 new features matrix
    """
    n_rows, n_cols = X.shape[0], X.shape[1]
    if n_features is None or n_features == n_cols:
        return X
    if n_features < n_cols:
        return X[:, :n_features]

    rng = numpy.random.RandomState(seed)  # pylint: disable=E1101
    out = numpy.empty((n_rows, n_features), dtype=X.dtype)
    out[:, :n_cols] = X
    div = max((n_features // n_cols) + 1, 2)
    is_float = X.dtype in (numpy.float32, numpy.float64)
    is_int = X.dtype in (numpy.int32, numpy.int64)

    for new_idx in range(n_cols, n_features):
        if not (is_float or is_int):  # pragma: no cover
            raise NotImplementedError(  # pragma: no cover
                f"Unable to add noise to a feature for this type {X.dtype}")
        src_idx = new_idx % n_cols
        source = X[:, src_idx]
        if is_float:
            # New column = source column + noise scaled by its standard
            # deviation; the source column of the output is shifted too.
            std = numpy.var(source) ** 0.5
            noise = rng.randn(len(source)) * std / div
            noisy = source + noise
            out[:, src_idx] -= noisy / div
            out[:, new_idx] = noisy
        else:
            # Integers: replace one value out of *div* with the value
            # found at the same position in a permutation of the column.
            shuffled = rng.permutation(source)
            start = rng.randint(0, div) % n_rows
            mixed = source.copy()
            mixed[start::div] = shuffled[start::div]  # pylint: disable=E1136
            out[:, new_idx] = mixed
            start = (start + 1) % n_rows
            out[start, src_idx] = shuffled[start]  # pylint: disable=E1136
    return out

108

109

110###########

111# datasets

112###########

113

114

def _problem_for_predictor_binary_classification(
        dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    binary classification problem.
    It is based on Iris dataset: class 2 is merged into class 1.

    @param      dtype       type of the features
    @param      n_features  number of features (see *_modify_dimension*),
                            *None* to keep the original ones
    @param      add_nan     replaces some values by NaN, positions are
                            drawn from the global :epkg:`numpy` random
                            generator (not seeded here)
    @return                 tuple *(X, y, initial_types, 'predict_proba',
                            1, X runtime)*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan:
        n_nan = X.shape[0] // 3
        if n_nan > 0:
            # The upper bound of randint is exclusive: using the full
            # shape lets the last row / column be selected and keeps
            # single-column matrices from raising ValueError.
            rows = numpy.random.randint(0, X.shape[0], n_nan)
            cols = numpy.random.randint(0, X.shape[1], n_nan)
            X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))

138

139

def _problem_for_predictor_multi_classification(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    multi-class classification problem.
    It is based on Iris dataset, the features receive a seeded
    gaussian noise and the labels are returned as *int64*.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    feats = iris.data
    feats += rng.randn(*feats.shape) / 3
    feats = _modify_dimension(feats, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    inputs = [('X', feats[:1].astype(dtype))]
    return (feats, labels, inputs, 'predict_proba', 1, feats.astype(dtype))

157

158

def _problem_for_mixture(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    mixture problem, *y* is always *None* (no target).
    It is based on Iris dataset.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    feats = iris.data
    feats += rng.randn(*feats.shape) / 3
    feats = _modify_dimension(feats, n_features).astype(dtype)
    inputs = [('X', feats[:1].astype(dtype))]
    return (feats, None, inputs, 'predict_proba', 1, feats.astype(dtype))

176

177

def _problem_for_predictor_multi_classification_label(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    multi-label classification problem.
    It is based on Iris dataset. *y* is an indicator matrix with three
    columns: every sample is flagged with its own class and one sample
    out of five also receives the next class (modulo 3).
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    feats = iris.data
    feats += rng.randn(*feats.shape) / 3
    feats = _modify_dimension(feats, n_features)

    target = iris.target
    n_samples = target.shape[0]
    indicator = numpy.zeros((n_samples, 3), dtype=numpy.int64)
    indicator[numpy.arange(n_samples), target] = 1
    doubled = numpy.arange(0, n_samples, 5)
    indicator[doubled, (target[doubled] + 1) % 3] = 1

    feats = feats.astype(dtype)
    indicator = indicator.astype(numpy.int64)
    inputs = [('X', feats[:1].astype(dtype))]
    return (feats, indicator, inputs, 'predict_proba', 1, feats.astype(dtype))

200

201

def _problem_for_predictor_regression(many_output=False, options=None,
                                      n_features=None, nbrows=None,
                                      dtype=numpy.float32, add_nan=False,
                                      **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    regression problem.
    It is based on Iris dataset.

    @param      many_output returned name is ``'all'`` if True, 0 otherwise
    @param      options     if not None, *initial_types* becomes the
                            pair *(initial_types, options)*
    @param      n_features  number of features (see *_modify_dimension*),
                            *None* to keep the original ones
    @param      nbrows      keeps only the first *nbrows* rows if not None
    @param      dtype       type of the features and of the target
    @param      add_nan     replaces some values by NaN, positions are
                            drawn from the global :epkg:`numpy` random
                            generator (not seeded here)
    @param      kwargs      returned along with the method name
    @return                 tuple *(X, y, initial_types,
                            ('predict', kwargs), name, X runtime)*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target + numpy.arange(len(data.target)) / 100
    # **kwargs is always a dictionary (never None): the method is
    # always returned as a pair (name, keyword arguments).
    meth = ('predict', kwargs)
    if n_features is not None:
        X = X[:, :n_features]
    if nbrows is not None:
        X = X[:nbrows, :]
        y = y[:nbrows]
    # The sample row is taken before any NaN is inserted.
    itt = [('X', X[:1].astype(dtype))]
    if options is not None:
        itt = itt, options
    if add_nan:
        n_nan = X.shape[0] // 3
        if n_nan > 0:
            # The upper bound of randint is exclusive: using the full
            # shape lets the last row / column be selected and keeps
            # single-column matrices from raising ValueError.
            rows = numpy.random.randint(0, X.shape[0], n_nan)
            cols = numpy.random.randint(0, X.shape[1], n_nan)
            X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, itt,
            meth, 'all' if many_output else 0, X.astype(dtype))

237

238

def _problem_for_predictor_multi_regression(many_output=False, options=None,
                                            n_features=None, nbrows=None,
                                            dtype=numpy.float32, **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-output regression problem.
    It is based on Iris dataset. The target has two columns,
    the second one is the first one shifted by 0.5.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    feats = iris.data
    feats += rng.randn(*feats.shape) / 3
    feats = _modify_dimension(feats, n_features)
    target = iris.target.astype(float) + numpy.arange(len(iris.target)) / 100

    # **kwargs is always a dictionary, the method is always a pair.
    method = ('predict', kwargs)

    if n_features is not None:
        feats = feats[:, :n_features]
    if nbrows is not None:
        feats, target = feats[:nbrows, :], target[:nbrows]
    inputs = [('X', feats[:1].astype(dtype))]
    if options is not None:
        inputs = inputs, options

    targets = numpy.empty((target.shape[0], 2))
    targets[:, 0] = target
    targets[:, 1] = target + 0.5

    feats = feats.astype(dtype)
    targets = targets.astype(dtype)
    name = 'all' if many_output else 0
    return (feats, targets, inputs, method, name, feats.astype(dtype))

272

273

def _problem_for_numerical_transform(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem (*y* is *None*).
    It is based on Iris dataset.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    feats = iris.data
    feats += rng.randn(*feats.shape) / 3
    feats = _modify_dimension(feats, n_features).astype(dtype)
    inputs = [('X', feats[:1].astype(dtype))]
    # NOTE(review): the runtime input is always float32, whatever
    # *dtype* is -- confirm this is intended.
    runtime_input = feats.astype(dtype=numpy.float32)
    return (feats, None, inputs, 'transform', 0, runtime_input)

289

290

def _problem_for_numerical_transform_positive(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem (*y* is *None*).
    It is based on Iris dataset, the absolute value of the noisy
    features is taken before the number of features is adjusted.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(numpy.abs(iris.data + noise), n_features)
    feats = feats.astype(dtype)
    inputs = [('X', feats[:1].astype(dtype))]
    # NOTE(review): the runtime input is always float32, whatever
    # *dtype* is -- confirm this is intended.
    runtime_input = feats.astype(dtype=numpy.float32)
    return (feats, None, inputs, 'transform', 0, runtime_input)

305

306

def _problem_for_numerical_trainable_transform(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem which needs a continuous target.
    It is based on Iris dataset.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    feats = iris.data
    feats += rng.randn(*feats.shape) / 3
    feats = _modify_dimension(feats, n_features).astype(dtype)
    # class index + a small increasing offset
    target = iris.target + numpy.arange(len(iris.target)) / 100
    target = target.astype(dtype)
    inputs = [('X', feats[:1].astype(dtype))]
    return (feats, target, inputs, 'transform', 0, feats.astype(dtype))

324

325

def _problem_for_numerical_trainable_transform_cl(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem which needs classes (*int64*) as a target.
    It is based on Iris dataset.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    feats = iris.data
    feats += rng.randn(*feats.shape) / 3
    feats = _modify_dimension(feats, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    inputs = [('X', feats[:1].astype(dtype))]
    return (feats, labels, inputs, 'transform', 0, feats.astype(dtype))

343

344

def _problem_for_clustering(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    clustering problem (*y* is *None*, method is *predict*).
    It is based on Iris dataset.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    feats = iris.data
    feats += rng.randn(*feats.shape) / 3
    feats = _modify_dimension(feats, n_features).astype(dtype)
    inputs = [('X', feats[:1].astype(dtype))]
    return (feats, None, inputs, 'predict', 0, feats.astype(dtype))

360

361

def _problem_for_clustering_scores(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    clustering problem, the score part (method *transform*),
    not the cluster.
    It is based on Iris dataset.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    feats = iris.data
    feats += rng.randn(*feats.shape) / 3
    feats = _modify_dimension(feats, n_features).astype(dtype)
    inputs = [('X', feats[:1].astype(dtype))]
    return (feats, None, inputs, 'transform', 1, feats.astype(dtype))

377

378

def _problem_for_outlier(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for an
    outlier detection problem (*y* is *None*, method is *predict*).
    It is based on Iris dataset.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    feats = iris.data
    feats += rng.randn(*feats.shape) / 3
    feats = _modify_dimension(feats, n_features).astype(dtype)
    inputs = [('X', feats[:1].astype(dtype))]
    return (feats, None, inputs, 'predict', 0, feats.astype(dtype))

394

395

def _problem_for_numerical_scoring(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    scoring problem.
    It is based on Iris dataset.

    @param      dtype       type of the features and of the target
    @param      n_features  number of features (see *_modify_dimension*),
                            *None* to keep the original ones
    @return                 tuple *(X, y, initial_types, 'score',
                            0, X runtime)*, *y* is scaled by its maximum
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    # Honours n_features like every other problem based on Iris,
    # the default (None) leaves the features unchanged.
    X = _modify_dimension(X, n_features)
    y = data.target.astype(dtype) + numpy.arange(len(data.target)) / 100
    y /= numpy.max(y)
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, [('X', X[:1].astype(dtype))],
            'score', 0, X.astype(dtype))

413

414

def _problem_for_clnoproba(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-class classification problem checked through method
    *predict* (no probabilities).
    It is based on Iris dataset.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    feats = iris.data
    feats += rng.randn(*feats.shape) / 3
    feats = _modify_dimension(feats, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    inputs = [('X', feats[:1].astype(dtype))]
    return (feats, labels, inputs, 'predict', 0, feats.astype(dtype))

432

433

def _problem_for_clnoproba_binary(dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    binary classification problem checked through method *predict*.
    It is based on Iris dataset: class 2 is merged into class 1.

    @param      dtype       type of the features
    @param      n_features  number of features (see *_modify_dimension*),
                            *None* to keep the original ones
    @param      add_nan     replaces some values by NaN, positions are
                            drawn from the global :epkg:`numpy` random
                            generator (not seeded here)
    @return                 tuple *(X, y, initial_types, 'predict',
                            0, X runtime)*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    rnd = state.randn(*X.shape) / 3
    X += rnd
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan:
        n_nan = X.shape[0] // 3
        if n_nan > 0:
            # The upper bound of randint is exclusive: using the full
            # shape lets the last row / column be selected and keeps
            # single-column matrices from raising ValueError.
            rows = numpy.random.randint(0, X.shape[0], n_nan)
            cols = numpy.random.randint(0, X.shape[1], n_nan)
            X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict', 0, X.astype(dtype))

456

457

def _problem_for_cl_decision_function(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-class classification problem checked through method
    *decision_function*.
    It is based on Iris dataset.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    feats = iris.data
    feats += rng.randn(*feats.shape) / 3
    feats = _modify_dimension(feats, n_features).astype(dtype)
    labels = iris.target.astype(numpy.int64)
    inputs = [('X', feats[:1].astype(dtype))]
    return (feats, labels, inputs, 'decision_function', 1, feats.astype(dtype))

475

476

def _problem_for_cl_decision_function_binary(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    binary classification problem checked through method
    *decision_function*.
    It is based on Iris dataset: class 2 is merged into class 1.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    feats = iris.data
    feats += rng.randn(*feats.shape) / 3
    feats = _modify_dimension(feats, n_features).astype(dtype)
    target = iris.target
    labels = numpy.where(target == 2, 1, target).astype(numpy.int64)
    inputs = [('X', feats[:1].astype(dtype))]
    return (feats, labels, inputs, 'decision_function', 1, feats.astype(dtype))

495

496

def _problem_for_label_encoder(dtype=numpy.int64, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:LabelEncoder`.
    The Iris labels cast to *dtype* are both the training data and
    the runtime input, *n_features* is not used.
    """
    labels = load_iris().target.astype(dtype)
    inputs = [('X', labels[:1].astype(dtype))]
    return (labels, None, inputs, 'transform', 0, labels)

507

508

def _problem_for_dict_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:DictVectorizer`.
    Every Iris label becomes a one-entry dictionary
    ``{label: 1000 + index}``, *n_features* is not used.
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType)
    target = load_iris().target
    rows = [{label: dtype(1000 + pos)} for pos, label in enumerate(target)]
    rows[0][2] = -2
    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    inputs = [("X", DictionaryType(StringTensorType([1]), value_type([1])))]
    rows = numpy.array(rows)
    labels = target.astype(numpy.int64)
    return (rows, labels, inputs, 'transform', 0, rows)

525

526

def _problem_for_tfidf_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfVectorizer`.
    Inputs and targets are the first and second items of every
    entry of *text_alpha_num*, *n_features* is not used.
    """
    from skl2onnx.common.data_types import (  # delayed
        StringTensorType)
    texts = numpy.array([entry[0] for entry in text_alpha_num])
    target = numpy.array([entry[1] for entry in text_alpha_num], dtype=dtype)
    inputs = [("X", StringTensorType([None]))]
    return (texts, target, inputs, 'transform', 0, texts)

537

538

def _problem_for_tfidf_transformer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfTransformer`.
    The texts of *text_alpha_num* are first converted into counts
    with a *CountVectorizer*, *n_features* is not used.
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType)
    texts = numpy.array([entry[0] for entry in text_alpha_num])
    target = numpy.array([entry[1] for entry in text_alpha_num], dtype=dtype)
    counts = CountVectorizer().fit_transform(texts).astype(dtype)
    if dtype == numpy.float32:
        tensor_type = FloatTensorType
    else:
        tensor_type = DoubleTensorType
    inputs = [("X", tensor_type([None, counts.shape[1]]))]
    return (counts, target, inputs, 'transform', 0, counts)

551

552

def _problem_for_feature_hasher(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:FeatureHasher`.
    Every Iris label becomes a one-entry dictionary
    ``{'cl<label>': 1000 + index}``, *n_features* is not used.
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType)
    target = load_iris().target
    rows = [{("cl%d" % label): dtype(1000 + pos)}
            for pos, label in enumerate(target)]
    rows[0]["cl2"] = -2
    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    inputs = [("X", DictionaryType(StringTensorType([1]), value_type([1])))]
    rows = numpy.array(rows)
    return (rows, target, inputs, 'transform', 0, rows)

568

569

def _problem_for_one_hot_encoder(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:OneHotEncoder`.
    The noisy Iris features are truncated to integers, shuffled
    and only the first column is returned.
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    noise = rng.randn(*iris.data.shape) / 3
    feats = _modify_dimension(iris.data + noise, n_features)
    # int32 drops the decimals, the values are then stored as *dtype*
    feats = feats.astype(numpy.int32).astype(dtype)
    feats, labels = shuffle(feats, iris.target, random_state=1)
    # NOTE(review): the sample row keeps every column whereas the
    # returned data only holds the first one -- confirm this is intended.
    inputs = [('X', feats[:1].astype(dtype))]
    first_column = feats[:, :1]
    return (first_column, labels, inputs, 'transform', 0,
            feats[:, :1].astype(dtype))

583

584

def find_suitable_problem(model):
    """
    Determines problems suitable for a given
    :epkg:`scikit-learn` operator. It may be

    * `b-cl`: binary classification
    * `m-cl`: multi-class classification
    * `m-label`: classification multi-label
      (multiple labels possible at the same time)
    * `reg`: regression
    * `m-reg`: regression multi-output
    * `num-tr`: transform numerical features
    * `num-tr-pos`: transform numerical positive features
    * `scoring`: transform numerical features, target is usually needed
    * `outlier`: outlier prediction
    * `linearsvc`: classifier without *predict_proba*
    * `cluster`: similar to transform
    * `num+y-tr`: similar to transform with targets
    * `num+y-tr-cl`: similar to transform with classes
    * `num-tr-clu`: similar to cluster, but returns
      scores or distances instead of cluster
    * `key-col`: list of dictionaries
    * `text-col`: one column of text

    Suffix `nofit` indicates the predictions happens
    without the model being fitted. This is the case
    for :epkg:`sklearn:gaussian_process:GaussianProcessRegressor`.
    The suffix `-cov` indicates the method `predict` was called
    with parameter ``return_cov=True``, `-std` tells
    method `predict` was called with parameter ``return_std=True``.
    The suffix ``-NSV`` creates an input variable
    like the following ``[('X', FloatTensorType([None, None]))]``.
    That's a way to bypass :epkg:`onnxruntime` shape checking
    as one part of the graph is designed to handle any
    kind of dimensions but apparently, if the input shape is
    precise, every part of the graph has to be precise. The strings
    used variables which means it is at the same time precise
    and unprecise. Suffix ``'-64'`` means the model will
    do double computations. Suffix ``-nop`` means the classifier
    does not implement method *predict_proba*. Suffix ``-1d``
    means a one dimension problem (one feature). Suffix ``-dec``
    checks method `decision_function`.

    The following script gives the list of :epkg:`scikit-learn`
    models and the problem they can be fitted on.

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning
        :rst:

        from mlprodict.onnxrt.validate.validate import (
            sklearn_operators, find_suitable_problem)
        from pyquickhelper.pandashelper import df2rst
        from pandas import DataFrame
        res = sklearn_operators()
        rows = []
        for model in res[:20]:
            name = model['name']
            row = dict(name=name)
            try:
                prob = find_suitable_problem(model['cl'])
                if prob is None:
                    continue
                for p in prob:
                    row[p] = 'X'
            except RuntimeError:
                pass
            rows.append(row)
        df = DataFrame(rows).set_index('name')
        df = df.sort_index()
        print(df2rst(df, index=True))

    The list is truncated. The full list can be found at
    :ref:`l-model-problem-list`.
    """
    from ...onnx_conv.validate_scenarios import find_suitable_problem as ext_find_suitable_problem

    def _explicit_rules():
        # Ordered pairs (set of models, problems), the first set
        # containing the model wins.
        rules = [
            # Exceptions
            # m-reg causes MemoryError on some machine.
            ({GaussianProcessRegressor},
             ['~b-reg-NF-64', '~b-reg-NF-cov-64', '~b-reg-NF-std-64',
              '~b-reg-NSV-64', '~b-reg-cov-64', '~b-reg-std-NSV-64',
              'b-reg', '~b-reg-64']),
            ({DictVectorizer}, ['key-int-col']),
            ({TfidfVectorizer, CountVectorizer}, ['text-col']),
            ({TfidfTransformer}, ['bow']),
            ({FeatureHasher}, ['key-str-col']),
            ({OneHotEncoder}, ['one-hot']),
            ({LabelBinarizer, LabelEncoder}, ['int-col']),
            ({NuSVC, SVC, SGDClassifier, HistGradientBoostingClassifier},
             ['b-cl', 'm-cl', '~b-cl-64', '~b-cl-nan']),
            ({GaussianProcessClassifier}, ['b-cl', 'm-cl', '~b-cl-64']),
            ({BaggingClassifier, BernoulliNB, CalibratedClassifierCV,
              ComplementNB, GaussianNB,
              GradientBoostingClassifier, LabelPropagation, LabelSpreading,
              LinearDiscriminantAnalysis, LogisticRegressionCV,
              MultinomialNB, QuadraticDiscriminantAnalysis,
              RandomizedSearchCV},
             ['b-cl', 'm-cl']),
            ({Perceptron},
             ['~b-cl-nop', '~m-cl-nop', '~b-cl-dec', '~m-cl-dec']),
            ({AdaBoostRegressor}, ['b-reg', '~b-reg-64']),
            ({HistGradientBoostingRegressor},
             ['b-reg', '~b-reg-64', '~b-reg-nan', '~b-reg-nan-64']),
            ({LinearSVC, NearestCentroid}, ['~b-cl-nop', '~b-cl-nop-64']),
            ({RFE, RFECV}, ['num+y-tr']),
            ({GridSearchCV},
             ['b-cl', 'm-cl',
              'b-reg', 'm-reg',
              '~b-reg-64', '~b-cl-64',
              'cluster', 'outlier', '~m-label']),
            ({VotingClassifier}, ['b-cl', 'm-cl']),
        ]
        # Stacking estimators are None if scikit-learn is too old.
        if StackingClassifier is not None:
            rules.append(({StackingClassifier}, ['b-cl']))
        if StackingRegressor is not None:
            rules.append(({StackingRegressor}, ['b-reg']))
        rules.extend([
            # specific scenarios
            ({IsotonicRegression}, ['~num+y-tr-1d', '~b-reg-1d']),
            ({ARDRegression, BayesianRidge, ElasticNetCV,
              GradientBoostingRegressor,
              LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,
              LinearSVR, NuSVR, OrthogonalMatchingPursuitCV,
              PassiveAggressiveRegressor, SGDRegressor,
              TheilSenRegressor, HuberRegressor, SVR},
             ['b-reg', '~b-reg-64']),
            ({MultiOutputClassifier}, ['m-cl', '~m-label']),
            ({MultiOutputRegressor, MultiTaskElasticNet,
              MultiTaskElasticNetCV, MultiTaskLassoCV,
              MultiTaskLasso},
             ['m-reg']),
            ({OneVsOneClassifier, OutputCodeClassifier,
              PassiveAggressiveClassifier, RadiusNeighborsClassifier},
             ['~b-cl-nop', '~m-cl-nop']),
            ({RidgeClassifier, RidgeClassifierCV},
             ['~b-cl-nop', '~m-cl-nop', '~m-label']),
            # trainable transform
            ({GenericUnivariateSelect,
              NeighborhoodComponentsAnalysis,
              PLSSVD, SelectKBest,
              SelectPercentile, SelectFromModel},
             ["num+y-tr"]),
            ({SelectFwe, SelectFdr, SelectFpr}, ["num+y-tr-cl"]),
            # no m-label
            ({AdaBoostClassifier}, ['b-cl', '~b-cl-64', 'm-cl']),
            ({LogisticRegression},
             ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-dec', '~m-cl-dec']),
            ({RandomForestClassifier},
             ['b-cl', '~b-cl-64', 'm-cl', '~m-label']),
            ({DecisionTreeClassifier, ExtraTreeClassifier},
             ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-f100', '~m-label']),
            ({DecisionTreeRegressor},
             ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64', '~b-reg-f100']),
            ({LatentDirichletAllocation, NMF, PowerTransformer},
             ['num-tr-pos']),
        ])
        return rules

    def _internal(model):  # pylint: disable=R0911

        # checks that this model is not overwritten by this module
        overridden = ext_find_suitable_problem(model)
        if overridden is not None:
            return overridden

        for candidates, problems in _explicit_rules():
            if model in candidates:
                return problems

        # Predictors recognized by their name.
        can_predict = hasattr(model, 'predict')
        if can_predict:
            text = str(model)
            if "Classifier" in text:
                return ['b-cl', '~b-cl-64', 'm-cl', '~m-label']
            if "Regressor" in text:
                return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64']

        # Generic case.
        is_cluster = issubclass(model, (ClusterMixin, BiclusterMixin))
        found = []
        if hasattr(model, 'transform'):
            if issubclass(model, (RegressorMixin, ClassifierMixin)):
                found.append('num+y-tr')
            elif is_cluster:
                found += ['~num-tr-clu', '~num-tr-clu-64']
            else:
                found.append('num-tr')

        if can_predict and is_cluster:
            found += ['cluster', '~b-clu-64']

        if issubclass(model, OutlierMixin):
            found.append('outlier')

        if issubclass(model, ClassifierMixin):
            if model is OneVsRestClassifier:
                # overrides everything collected so far
                return ['m-cl', '~m-label']
            found += ['b-cl', '~b-cl-64', 'm-cl', '~m-label']
        if issubclass(model, RegressorMixin):
            found += ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64']
        if issubclass(model, BaseMixture):
            found += ['mix', '~mix-64']

        if not found:
            raise RuntimeError(
                f"Unable to find problem for model '{model.__name__}' "
                f"- {model.__bases__}.")
        return found

    problems = _internal(model)
    for name in problems:
        if name in _problems:
            continue
        raise ValueError(  # pragma: no cover
            "Unrecognized problem '{}' in\n{}".format(
                name, "\n".join(sorted(_problems))))
    return problems

845

846

847_problems = {

848 # standard

849 "b-cl": _problem_for_predictor_binary_classification,

850 "m-cl": _problem_for_predictor_multi_classification,

851 "b-reg": _problem_for_predictor_regression,

852 "m-reg": _problem_for_predictor_multi_regression,

853 "num-tr": _problem_for_numerical_transform,

854 "num-tr-pos": _problem_for_numerical_transform_positive,

855 'outlier': _problem_for_outlier,

856 'cluster': _problem_for_clustering,

857 'num+y-tr': _problem_for_numerical_trainable_transform,

858 'num+y-tr-cl': _problem_for_numerical_trainable_transform_cl,

859 'mix': _problem_for_mixture,

860 # others

861 '~num-tr-clu': _problem_for_clustering_scores,

862 "~m-label": _problem_for_predictor_multi_classification_label,

863 "~scoring": _problem_for_numerical_scoring,

864 '~b-cl-nop': _problem_for_clnoproba_binary,

865 '~m-cl-nop': _problem_for_clnoproba,

866 '~b-cl-dec': _problem_for_cl_decision_function_binary,

867 '~m-cl-dec': _problem_for_cl_decision_function,

868 # nan

869 "~b-reg-nan": lambda n_features=None: _problem_for_predictor_regression(

870 n_features=n_features, add_nan=True),

871 "~b-reg-nan-64": lambda n_features=None: _problem_for_predictor_regression(

872 dtype=numpy.float64, n_features=n_features, add_nan=True),

873 "~b-cl-nan": lambda dtype=numpy.float32, n_features=None: _problem_for_predictor_binary_classification(

874 dtype=dtype, n_features=n_features, add_nan=True),

875 # 100 features

876 "~b-reg-f100": lambda n_features=100: _problem_for_predictor_regression(

877 n_features=n_features or 100),

878 "~b-cl-f100": lambda n_features=100: _problem_for_predictor_binary_classification(

879 n_features=n_features or 100),

880 # 64

881 "~b-cl-64": lambda n_features=None: _problem_for_predictor_binary_classification(

882 dtype=numpy.float64, n_features=n_features),

883 "~b-reg-64": lambda n_features=None: _problem_for_predictor_regression(

884 dtype=numpy.float64, n_features=n_features),

885 '~b-cl-nop-64': lambda n_features=None: _problem_for_clnoproba(

886 dtype=numpy.float64, n_features=n_features),

887 '~b-clu-64': lambda n_features=None: _problem_for_clustering(

888 dtype=numpy.float64, n_features=n_features),

889 '~b-cl-dec-64': lambda n_features=None: _problem_for_cl_decision_function_binary(

890 dtype=numpy.float64, n_features=n_features),

891 '~num-tr-clu-64': lambda n_features=None: _problem_for_clustering_scores(

892 dtype=numpy.float64, n_features=n_features),

893 "~m-reg-64": lambda n_features=None: _problem_for_predictor_multi_regression(

894 dtype=numpy.float64, n_features=n_features),

895 "~num-tr-64": lambda n_features=None: _problem_for_numerical_transform(

896 dtype=numpy.float64, n_features=n_features),

897 '~mix-64': lambda n_features=None: _problem_for_mixture(

898 dtype=numpy.float64, n_features=n_features),

899 #

900 "~b-cl-NF": (lambda n_features=None: _problem_for_predictor_binary_classification(

901 n_features=n_features) + (False, )),

902 "~m-cl-NF": (lambda n_features=None: _problem_for_predictor_multi_classification(

903 n_features=n_features) + (False, )),

904 "~b-reg-NF": (lambda n_features=None: _problem_for_predictor_regression(

905 n_features=n_features) + (False, )),

906 "~m-reg-NF": (lambda n_features=None: _problem_for_predictor_multi_regression(

907 n_features=n_features) + (False, )),

908 #

909 "~b-cl-NF-64": (lambda n_features=None: _problem_for_predictor_binary_classification(

910 dtype=numpy.float64, n_features=n_features) + (False, )),

911 "~m-cl-NF-64": (lambda n_features=None: _problem_for_predictor_multi_classification(

912 dtype=numpy.float64, n_features=n_features) + (False, )),

913 "~b-reg-NF-64": (lambda n_features=None: _problem_for_predictor_regression(

914 dtype=numpy.float64, n_features=n_features) + (False, )),

915 "~m-reg-NF-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

916 dtype=numpy.float64, n_features=n_features) + (False, )),

917 # GaussianProcess

918 "~b-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_regression(

919 True, options={GaussianProcessRegressor: {"return_cov": True}},

920 return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )),

921 "~m-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

922 True, options={GaussianProcessRegressor: {"return_cov": True}},

923 return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )),

924 #

925 "~b-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_regression(

926 True, options={GaussianProcessRegressor: {"return_std": True}},

927 return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )),

928 "~m-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

929 True, options={GaussianProcessRegressor: {"return_std": True}},

930 return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )),

931 #

932 "~b-reg-cov-64": (lambda n_features=None: _problem_for_predictor_regression(

933 True, options={GaussianProcessRegressor: {"return_cov": True}},

934 return_cov=True, dtype=numpy.float64, n_features=n_features)),

935 "~m-reg-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

936 True, options={GaussianProcessRegressor: {"return_cov": True}},

937 return_cov=True, dtype=numpy.float64, n_features=n_features)),

938 #

939 "~reg-std-64": (lambda n_features=None: _problem_for_predictor_regression(

940 True, options={GaussianProcessRegressor: {"return_std": True}},

941 return_std=True, dtype=numpy.float64, n_features=n_features)),

942 "~m-reg-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression(

943 True, options={GaussianProcessRegressor: {"return_std": True}},

944 return_std=True, dtype=numpy.float64, n_features=n_features)),

945 #

946 '~b-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_regression(

947 dtype=numpy.float64, n_features=n_features)),

948 '~m-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression(

949 dtype=numpy.float64, n_features=n_features)),

950 "~b-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_regression(

951 True, options={GaussianProcessRegressor: {"return_std": True}},

952 return_std=True, dtype=numpy.float64, n_features=n_features))),

953 "~m-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression(

954 True, options={GaussianProcessRegressor: {"return_std": True}},

955 return_std=True, dtype=numpy.float64, n_features=n_features))),

956 # isotonic

957 "~b-reg-1d": _1d_problem(_problem_for_predictor_regression),

958 '~num+y-tr-1d': _1d_problem(_problem_for_numerical_trainable_transform),

959 # text

960 "key-int-col": _problem_for_dict_vectorizer,

961 "key-str-col": _problem_for_feature_hasher,

962 "int-col": _problem_for_label_encoder,

963 "one-hot": _problem_for_one_hot_encoder,

964 'text-col': _problem_for_tfidf_vectorizer,

965 'bow': _problem_for_tfidf_transformer,

966}

Coverage for mlprodict/onnxrt/validate/validate_problems.py: 99%

444 statements