Coverage for mlprodict/onnxrt/validate/validate_problems.py: 99%
444 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-04 02:28 +0100
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-04 02:28 +0100
1# pylint: disable=E1101
2"""
3@file
4@brief Validates runtime for many :scikit-learn: operators.
5The submodule relies on :epkg:`onnxconverter_common`,
6:epkg:`sklearn-onnx`.
7"""
8import numpy
9from sklearn.base import (
10 ClusterMixin, BiclusterMixin, OutlierMixin,
11 RegressorMixin, ClassifierMixin)
12from sklearn.calibration import CalibratedClassifierCV
13from sklearn.cross_decomposition import PLSSVD
14from sklearn.datasets import load_iris
15from sklearn.decomposition import LatentDirichletAllocation, NMF
16from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
17from sklearn.ensemble import (
18 AdaBoostRegressor, GradientBoostingRegressor, AdaBoostClassifier,
19 BaggingClassifier, VotingClassifier, GradientBoostingClassifier,
20 RandomForestClassifier)
21try:
22 from sklearn.ensemble import StackingClassifier, StackingRegressor
23except ImportError: # pragma: no cover
24 # new in 0.22
25 StackingClassifier, StackingRegressor = None, None
26from sklearn.feature_extraction import DictVectorizer, FeatureHasher
27from sklearn.feature_extraction.text import (
28 CountVectorizer, TfidfVectorizer, TfidfTransformer)
29from sklearn.ensemble import (
30 HistGradientBoostingRegressor,
31 HistGradientBoostingClassifier)
32from sklearn.feature_selection import (
33 RFE, RFECV, GenericUnivariateSelect,
34 SelectPercentile, SelectFwe, SelectKBest,
35 SelectFdr, SelectFpr, SelectFromModel)
36from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor
37from sklearn.isotonic import IsotonicRegression
38from sklearn.linear_model import (
39 ARDRegression, ElasticNetCV,
40 LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,
41 SGDRegressor, OrthogonalMatchingPursuitCV,
42 TheilSenRegressor, BayesianRidge, MultiTaskElasticNet,
43 MultiTaskElasticNetCV, MultiTaskLassoCV, MultiTaskLasso,
44 PassiveAggressiveClassifier, RidgeClassifier,
45 RidgeClassifierCV, PassiveAggressiveRegressor,
46 HuberRegressor, LogisticRegression, SGDClassifier,
47 LogisticRegressionCV, Perceptron)
48from sklearn.mixture._base import BaseMixture
49from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
50from sklearn.multiclass import (
51 OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier)
52from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
53from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB
54from sklearn.neighbors import (
55 NearestCentroid, RadiusNeighborsClassifier,
56 NeighborhoodComponentsAnalysis)
57from sklearn.preprocessing import (
58 LabelBinarizer, LabelEncoder,
59 OneHotEncoder, PowerTransformer)
60from sklearn.semi_supervised import LabelPropagation, LabelSpreading
61from sklearn.svm import LinearSVC, LinearSVR, NuSVR, SVR, SVC, NuSVC
62from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, ExtraTreeClassifier
63from sklearn.utils import shuffle
64from ._validate_problems_helper import (
65 _noshapevar, _1d_problem, text_alpha_num)
68def _modify_dimension(X, n_features, seed=19):
69 """
70 Modifies the number of features to increase
71 or reduce the number of features.
73 @param X features matrix
74 @param n_features number of features
75 @param seed random seed (to get the same
76 dataset at each call)
77 @return new featurs matrix
78 """
79 if n_features is None or n_features == X.shape[1]:
80 return X
81 if n_features < X.shape[1]:
82 return X[:, :n_features]
83 rstate = numpy.random.RandomState(seed) # pylint: disable=E1101
84 res = numpy.empty((X.shape[0], n_features), dtype=X.dtype)
85 res[:, :X.shape[1]] = X[:, :]
86 div = max((n_features // X.shape[1]) + 1, 2)
87 for i in range(X.shape[1], res.shape[1]):
88 j = i % X.shape[1]
89 col = X[:, j]
90 if X.dtype in (numpy.float32, numpy.float64):
91 sigma = numpy.var(col) ** 0.5
92 rnd = rstate.randn(len(col)) * sigma / div
93 col2 = col + rnd
94 res[:, j] -= col2 / div
95 res[:, i] = col2
96 elif X.dtype in (numpy.int32, numpy.int64):
97 perm = rstate.permutation(col)
98 h = rstate.randint(0, div) % X.shape[0]
99 col2 = col.copy()
100 col2[h::div] = perm[h::div] # pylint: disable=E1136
101 res[:, i] = col2
102 h = (h + 1) % X.shape[0]
103 res[h, j] = perm[h] # pylint: disable=E1136
104 else: # pragma: no cover
105 raise NotImplementedError( # pragma: no cover
106 f"Unable to add noise to a feature for this type {X.dtype}")
107 return res
110###########
111# datasets
112###########
def _problem_for_predictor_binary_classification(
        dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    binary classification problem.
    It is based on Iris dataset: gaussian noise is added to the
    features and classes 1 and 2 are merged into a single class.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @param      add_nan     if True, ``X.shape[0] // 3`` cells of *X*
                            (drawn with replacement) are replaced by NaN
    @return                 *X, y, initial_types, 'predict_proba', 1, X runtime*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X += state.randn(*X.shape) / 3
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan:
        # The cells are drawn from the seeded generator so that the
        # dataset is the same at each call. The upper bound of randint
        # is exclusive: the full dimensions are used so that the last
        # row and the last column can be selected and a matrix with a
        # single feature does not raise an exception.
        nb_nan = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], nb_nan)
        cols = state.randint(0, X.shape[1], nb_nan)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict_proba', 1, X.astype(dtype))
def _problem_for_predictor_multi_classification(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    multi-class classification problem.
    It is based on Iris dataset with gaussian noise added to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, y, initial_types, 'predict_proba', 1, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)
    features = features.astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    x_runtime = features.astype(dtype)
    return (features, labels, initial_types, 'predict_proba', 1, x_runtime)
def _problem_for_mixture(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    problem without any target (*y* is None), the checked
    method is *predict_proba*.
    It is based on Iris dataset with gaussian noise added to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, None, initial_types, 'predict_proba', 1, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)
    features = features.astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    x_runtime = features.astype(dtype)
    return (features, None, initial_types, 'predict_proba', 1, x_runtime)
def _problem_for_predictor_multi_classification_label(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, node name, X runtime* for a
    multi-label classification problem.
    It is based on Iris dataset with gaussian noise added to the features.
    The target is an indicator matrix with three columns: every row
    gets its own class, one row out of five also gets the next class.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, y, initial_types, 'predict_proba', 1, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)

    classes = iris.target
    positions = numpy.arange(classes.shape[0])
    indicators = numpy.zeros((classes.shape[0], 3), dtype=numpy.int64)
    indicators[positions, classes] = 1
    # one row out of five receives a second label
    doubled = positions[::5]
    indicators[doubled, (classes[doubled] + 1) % 3] = 1

    features = features.astype(dtype)
    indicators = indicators.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, indicators, initial_types,
            'predict_proba', 1, features.astype(dtype))
def _problem_for_predictor_regression(many_output=False, options=None,
                                      n_features=None, nbrows=None,
                                      dtype=numpy.float32, add_nan=False,
                                      **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    regression problem.
    It is based on Iris dataset with gaussian noise added to the features.

    @param      many_output if True, the returned name is ``'all'``,
                            0 otherwise
    @param      options     if not None, the initial types are returned
                            as a pair *(initial_types, options)*
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @param      nbrows      number of rows to keep, None for all
    @param      dtype       type of the features and of the target
    @param      add_nan     if True, ``X.shape[0] // 3`` cells of *X*
                            (drawn with replacement) are replaced by NaN
    @param      kwargs      additional parameters returned along
                            with the method name
    @return                 *X, y, initial_types, ('predict', kwargs),
                            name, X runtime*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X += state.randn(*X.shape) / 3
    X = _modify_dimension(X, n_features)
    y = data.target + numpy.arange(len(data.target)) / 100
    # ``**kwargs`` is always a dictionary (possibly empty), never None:
    # the method is always returned as a pair (name, parameters).
    meth = ('predict', kwargs)
    if n_features is not None:
        X = X[:, :n_features]
    if nbrows is not None:
        X = X[:nbrows, :]
        y = y[:nbrows]
    # the initial types are computed once the final rows
    # and columns are known, and before any NaN is inserted
    itt = [('X', X[:1].astype(dtype))]
    if options is not None:
        itt = itt, options
    if add_nan:
        # The cells are drawn from the seeded generator so that the
        # dataset is the same at each call. The upper bound of randint
        # is exclusive: the full dimensions are used so that the last
        # row and the last column can be selected and a matrix with a
        # single feature does not raise an exception.
        nb_nan = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], nb_nan)
        cols = state.randint(0, X.shape[1], nb_nan)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, itt,
            meth, 'all' if many_output else 0, X.astype(dtype))
def _problem_for_predictor_multi_regression(many_output=False, options=None,
                                            n_features=None, nbrows=None,
                                            dtype=numpy.float32, **kwargs):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-output regression problem (two targets).
    It is based on Iris dataset with gaussian noise added to the features.

    @param      many_output if True, the returned name is ``'all'``,
                            0 otherwise
    @param      options     if not None, the initial types are returned
                            as a pair *(initial_types, options)*
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @param      nbrows      number of rows to keep, None for all
    @param      dtype       type of the features and of the targets
    @param      kwargs      additional parameters returned along
                            with the method name
    @return                 *X, y, initial_types, ('predict', kwargs),
                            name, X runtime*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X += state.randn(*X.shape) / 3
    X = _modify_dimension(X, n_features)
    y = data.target.astype(float) + numpy.arange(len(data.target)) / 100
    # ``**kwargs`` is always a dictionary (possibly empty), never None:
    # the method is always returned as a pair (name, parameters).
    meth = ('predict', kwargs)
    if n_features is not None:
        X = X[:, :n_features]
    if nbrows is not None:
        X = X[:nbrows, :]
        y = y[:nbrows]
    # the initial types are computed once the final rows
    # and columns are known
    itt = [('X', X[:1].astype(dtype))]
    if options is not None:
        itt = itt, options
    # the second target is the first one shifted by 0.5
    y2 = numpy.empty((y.shape[0], 2))
    y2[:, 0] = y
    y2[:, 1] = y + 0.5
    X = X.astype(dtype)
    y2 = y2.astype(dtype)
    return (X, y2, itt,
            meth, 'all' if many_output else 0, X.astype(dtype))
def _problem_for_numerical_transform(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem (*y* is None).
    It is based on Iris dataset with gaussian noise added to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, None, initial_types, 'transform', 0, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)
    features = features.astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    # NOTE(review): the runtime matrix is always float32 whatever *dtype*
    # is, unlike most of the other problems - confirm this is intended.
    x_runtime = features.astype(dtype=numpy.float32)
    return (features, None, initial_types, 'transform', 0, x_runtime)
def _problem_for_numerical_transform_positive(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem (*y* is None) on non negative features.
    It is based on Iris dataset: the absolute value of the noisy
    features is used.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, None, initial_types, 'transform', 0, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    noise = rng.randn(*iris.data.shape) / 3
    features = numpy.abs(iris.data + noise)
    features = _modify_dimension(features, n_features)
    features = features.astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    # NOTE(review): the runtime matrix is always float32 whatever *dtype*
    # is, unlike most of the other problems - confirm this is intended.
    x_runtime = features.astype(dtype=numpy.float32)
    return (features, None, initial_types, 'transform', 0, x_runtime)
def _problem_for_numerical_trainable_transform(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem which needs a continuous target to be trained.
    It is based on Iris dataset with gaussian noise added to the features.

    @param      dtype       type of the features and of the target
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, y, initial_types, 'transform', 0, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)
    target = iris.target + numpy.arange(len(iris.target)) / 100
    features = features.astype(dtype)
    target = target.astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, target, initial_types,
            'transform', 0, features.astype(dtype))
def _problem_for_numerical_trainable_transform_cl(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    transformation problem which needs class labels to be trained.
    It is based on Iris dataset with gaussian noise added to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, y, initial_types, 'transform', 0, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)
    features = features.astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'transform', 0, features.astype(dtype))
def _problem_for_clustering(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    clustering problem (*y* is None), the checked method is *predict*.
    It is based on Iris dataset with gaussian noise added to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, None, initial_types, 'predict', 0, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)
    features = features.astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, None, initial_types,
            'predict', 0, features.astype(dtype))
def _problem_for_clustering_scores(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    clustering problem, the score part, not the cluster (*y* is None),
    the checked method is *transform*.
    It is based on Iris dataset with gaussian noise added to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, None, initial_types, 'transform', 1, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)
    features = features.astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, None, initial_types,
            'transform', 1, features.astype(dtype))
def _problem_for_outlier(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for an
    outlier detection problem (*y* is None), the checked
    method is *predict*.
    It is based on Iris dataset with gaussian noise added to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, None, initial_types, 'predict', 0, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)
    features = features.astype(dtype)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, None, initial_types,
            'predict', 0, features.astype(dtype))
def _problem_for_numerical_scoring(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    scoring problem.
    It is based on Iris dataset with gaussian noise added to the
    features. The target is scaled so that its maximum is 1.

    @param      dtype       type of the features and of the target
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, y, initial_types, 'score', 0, X runtime*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X += state.randn(*X.shape) / 3
    # n_features used to be silently ignored by this problem,
    # it is now honoured like in the other numerical problems
    X = _modify_dimension(X, n_features)
    y = data.target.astype(dtype) + numpy.arange(len(data.target)) / 100
    y /= numpy.max(y)
    X = X.astype(dtype)
    y = y.astype(dtype)
    return (X, y, [('X', X[:1].astype(dtype))],
            'score', 0, X.astype(dtype))
def _problem_for_clnoproba(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-class classification problem checked through method *predict*
    (no probabilities).
    It is based on Iris dataset with gaussian noise added to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, y, initial_types, 'predict', 0, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)
    features = features.astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'predict', 0, features.astype(dtype))
def _problem_for_clnoproba_binary(dtype=numpy.float32, n_features=None, add_nan=False):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    binary classification problem checked through method *predict*
    (no probabilities).
    It is based on Iris dataset: gaussian noise is added to the
    features and classes 1 and 2 are merged into a single class.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @param      add_nan     if True, ``X.shape[0] // 3`` cells of *X*
                            (drawn with replacement) are replaced by NaN
    @return                 *X, y, initial_types, 'predict', 0, X runtime*
    """
    data = load_iris()
    X = data.data
    state = numpy.random.RandomState(seed=34)  # pylint: disable=E1101
    X += state.randn(*X.shape) / 3
    X = _modify_dimension(X, n_features)
    y = data.target
    y[y == 2] = 1
    if add_nan:
        # The cells are drawn from the seeded generator so that the
        # dataset is the same at each call. The upper bound of randint
        # is exclusive: the full dimensions are used so that the last
        # row and the last column can be selected and a matrix with a
        # single feature does not raise an exception.
        nb_nan = X.shape[0] // 3
        rows = state.randint(0, X.shape[0], nb_nan)
        cols = state.randint(0, X.shape[1], nb_nan)
        X[rows, cols] = numpy.nan
    X = X.astype(dtype)
    y = y.astype(numpy.int64)
    return (X, y, [('X', X[:1].astype(dtype))],
            'predict', 0, X.astype(dtype))
def _problem_for_cl_decision_function(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    multi-class classification problem checked through
    method *decision_function*.
    It is based on Iris dataset with gaussian noise added to the features.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, y, initial_types, 'decision_function',
                            1, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)
    features = features.astype(dtype)
    labels = iris.target.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'decision_function', 1, features.astype(dtype))
def _problem_for_cl_decision_function_binary(dtype=numpy.float32, n_features=None):
    """
    Returns *X, y, initial_types, method, name, X runtime* for a
    binary classification problem checked through
    method *decision_function*.
    It is based on Iris dataset: gaussian noise is added to the
    features and classes 1 and 2 are merged into a single class.

    @param      dtype       type of the features
    @param      n_features  number of features, None to keep
                            the ones of the dataset
    @return                 *X, y, initial_types, 'decision_function',
                            1, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    features = iris.data
    features += rng.randn(*features.shape) / 3
    features = _modify_dimension(features, n_features)
    labels = iris.target
    # merges classes 1 and 2
    labels[labels == 2] = 1
    features = features.astype(dtype)
    labels = labels.astype(numpy.int64)
    initial_types = [('X', features[:1].astype(dtype))]
    return (features, labels, initial_types,
            'decision_function', 1, features.astype(dtype))
def _problem_for_label_encoder(dtype=numpy.int64, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:LabelEncoder`.
    The labels of Iris dataset are used as the only column,
    there is no target.

    @param      dtype       type of the labels
    @param      n_features  unused
    @return                 *labels, None, initial_types, 'transform',
                            0, labels*
    """
    labels = load_iris().target.astype(dtype)
    initial_types = [('X', labels[:1].astype(dtype))]
    labels = labels.astype(dtype)
    # the same array is used for training and for the runtime
    return (labels, None, initial_types, 'transform', 0, labels)
def _problem_for_dict_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:DictVectorizer`.
    Every observation is a dictionary whose key is the class
    of the observation in Iris dataset.

    @param      dtype       type of the values stored in the dictionaries
    @param      n_features  unused
    @return                 *dictionaries, y, initial_types, 'transform',
                            0, dictionaries*
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType)
    labels = load_iris().target
    rows = [{label: dtype(1000 + index)}
            for index, label in enumerate(labels)]
    # the first observation gets a second key
    rows[0][2] = -2
    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    initial_types = [
        ("X", DictionaryType(StringTensorType([1]), value_type([1])))]
    rows = numpy.array(rows)
    labels = labels.astype(numpy.int64)
    return (rows, labels, initial_types, 'transform', 0, rows)
def _problem_for_tfidf_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfVectorizer`.
    Texts and targets come from *text_alpha_num*.

    @param      dtype       type of the target
    @param      n_features  unused
    @return                 *texts, y, initial_types, 'transform', 0, texts*
    """
    from skl2onnx.common.data_types import (  # delayed
        StringTensorType)
    texts = numpy.array([pair[0] for pair in text_alpha_num])
    target = numpy.array([pair[1] for pair in text_alpha_num], dtype=dtype)
    initial_types = [("X", StringTensorType([None]))]
    return (texts, target, initial_types, 'transform', 0, texts)
def _problem_for_tfidf_transformer(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:text:TfidfTransformer`.
    The features are the counts produced by a *CountVectorizer*
    fitted on the texts of *text_alpha_num*.

    @param      dtype       type of the counts and of the target
    @param      n_features  unused
    @return                 *counts, y, initial_types, 'transform', 0, counts*
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType)
    texts = numpy.array([pair[0] for pair in text_alpha_num])
    target = numpy.array([pair[1] for pair in text_alpha_num], dtype=dtype)
    counts = CountVectorizer().fit_transform(texts).astype(dtype)
    if dtype == numpy.float32:
        tensor_type = FloatTensorType
    else:
        tensor_type = DoubleTensorType
    initial_types = [("X", tensor_type([None, counts.shape[1]]))]
    return (counts, target, initial_types, 'transform', 0, counts)
def _problem_for_feature_hasher(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:feature_extraction:FeatureHasher`.
    Every observation is a dictionary whose key is a string built
    from the class of the observation in Iris dataset.

    @param      dtype       type of the values stored in the dictionaries
    @param      n_features  unused
    @return                 *dictionaries, y, initial_types, 'transform',
                            0, dictionaries*
    """
    from skl2onnx.common.data_types import (  # delayed
        FloatTensorType, DoubleTensorType, StringTensorType, DictionaryType)
    labels = load_iris().target
    rows = [{("cl%d" % label): dtype(1000 + index)}
            for index, label in enumerate(labels)]
    # the first observation gets a second key
    rows[0]["cl2"] = -2
    if dtype == numpy.float32:
        value_type = FloatTensorType
    else:
        value_type = DoubleTensorType
    initial_types = [
        ("X", DictionaryType(StringTensorType([1]), value_type([1])))]
    rows = numpy.array(rows)
    return (rows, labels, initial_types, 'transform', 0, rows)
def _problem_for_one_hot_encoder(dtype=numpy.float32, n_features=None):
    """
    Returns a problem for the :epkg:`sklearn:preprocessing:OneHotEncoder`.
    The noisy features of Iris dataset are truncated to integers,
    shuffled, and only the first column is returned.

    @param      dtype       type of the features
    @param      n_features  number of features built before
                            the first column is extracted
    @return                 *X, y, initial_types, 'transform', 0, X runtime*
    """
    iris = load_iris()
    rng = numpy.random.RandomState(seed=34)
    noise = rng.randn(*iris.data.shape) / 3
    features = _modify_dimension(iris.data + noise, n_features)
    # truncation to integers gives a small number of categories
    features = features.astype(numpy.int32).astype(dtype)
    features, labels = shuffle(features, iris.target, random_state=1)
    # NOTE(review): the initial types are built from all the columns
    # whereas only the first one is returned - confirm this is intended.
    initial_types = [('X', features[:1].astype(dtype))]
    first_column = features[:, :1]
    return (first_column, labels, initial_types,
            'transform', 0, features[:, :1].astype(dtype))
def find_suitable_problem(model):
    """
    Determines problems suitable for a given
    :epkg:`scikit-learn` operator. It may be

    * `b-cl`: binary classification
    * `m-cl`: multi-class classification
    * `m-label`: classification multi-label
      (multiple labels possible at the same time)
    * `reg`: regression
    * `m-reg`: regression multi-output
    * `num-tr`: transform numerical features
    * `num-tr-pos`: transform numerical positive features
    * `scoring`: transform numerical features, target is usually needed
    * `outlier`: outlier prediction
    * `linearsvc`: classifier without *predict_proba*
    * `cluster`: similar to transform
    * `num+y-tr`: similar to transform with targets
    * `num+y-tr-cl`: similar to transform with classes
    * `num-tr-clu`: similar to cluster, but returns
      scores or distances instead of cluster
    * `key-col`: list of dictionaries
    * `text-col`: one column of text

    Suffix `nofit` indicates the predictions happens
    without the model being fitted. This is the case
    for :epkg:`sklearn:gaussian_process:GaussianProcessRegressor`.
    The suffix `-cov` indicates the method `predict` was called
    with parameter ``return_cov=True``, `-std` tells
    method `predict` was called with parameter ``return_std=True``.
    The suffix ``-NSV`` creates an input variable
    like the following ``[('X', FloatTensorType([None, None]))]``.
    That's a way to bypass :epkg:`onnxruntime` shape checking
    as one part of the graph is designed to handle any
    kind of dimensions but apparently, if the input shape is
    precise, every part of the graph has to be precise. The strings
    used variables which means it is at the same time precise
    and unprecise. Suffix ``'-64'`` means the model will
    do double computations. Suffix ``-nop`` means the classifier
    does not implement method *predict_proba*. Suffix ``-1d``
    means a one dimension problem (one feature). Suffix ``-dec``
    checks method `decision_function`.

    The following script gives the list of :epkg:`scikit-learn`
    models and the problem they can be fitted on.

    .. runpython::
        :showcode:
        :warningout: DeprecationWarning
        :rst:

        from mlprodict.onnxrt.validate.validate import (
            sklearn_operators, find_suitable_problem)
        from pyquickhelper.pandashelper import df2rst
        from pandas import DataFrame
        res = sklearn_operators()
        rows = []
        for model in res[:20]:
            name = model['name']
            row = dict(name=name)
            try:
                prob = find_suitable_problem(model['cl'])
                if prob is None:
                    continue
                for p in prob:
                    row[p] = 'X'
            except RuntimeError:
                pass
            rows.append(row)
        df = DataFrame(rows).set_index('name')
        df = df.sort_index()
        print(df2rst(df, index=True))

    The list is truncated. The full list can be found at
    :ref:`l-model-problem-list`.

    @param      model       :epkg:`scikit-learn` operator (a class)
    @return                 list of problem names, every one of them
                            is a key of *_problems*
    """
    from ...onnx_conv.validate_scenarios import (
        find_suitable_problem as ext_find_suitable_problem)

    def _internal(model):  # pylint: disable=R0911
        # checks that this model is not overwritten by this module
        overridden = ext_find_suitable_problem(model)
        if overridden is not None:
            return overridden

        # Explicit scenarios: pairs (classes, problems) checked in
        # this order. A class belongs to one group at most.
        explicit = [
            # Exceptions
            # m-reg causes MemoryError on some machine.
            ({GaussianProcessRegressor},
             ['~b-reg-NF-64',  # '~m-reg-NF-64',
              '~b-reg-NF-cov-64',  # '~m-reg-NF-cov-64',
              '~b-reg-NF-std-64',  # '~m-reg-NF-std-64',
              '~b-reg-NSV-64',  # '~m-reg-NSV-64',
              '~b-reg-cov-64',  # '~m-reg-cov-64',
              '~b-reg-std-NSV-64',  # '~m-reg-std-NSV-64',
              'b-reg', '~b-reg-64',  # 'm-reg'
              ]),
            ({DictVectorizer}, ['key-int-col']),
            ({TfidfVectorizer, CountVectorizer}, ['text-col']),
            ({TfidfTransformer}, ['bow']),
            ({FeatureHasher}, ['key-str-col']),
            ({OneHotEncoder}, ['one-hot']),
            ({LabelBinarizer, LabelEncoder}, ['int-col']),
            ({NuSVC, SVC, SGDClassifier,
              HistGradientBoostingClassifier},
             ['b-cl', 'm-cl', '~b-cl-64', '~b-cl-nan']),
            ({GaussianProcessClassifier},
             ['b-cl', 'm-cl', '~b-cl-64']),
            ({BaggingClassifier, BernoulliNB, CalibratedClassifierCV,
              ComplementNB, GaussianNB,
              GradientBoostingClassifier, LabelPropagation, LabelSpreading,
              LinearDiscriminantAnalysis, LogisticRegressionCV,
              MultinomialNB, QuadraticDiscriminantAnalysis,
              RandomizedSearchCV},
             ['b-cl', 'm-cl']),
            ({Perceptron},
             ['~b-cl-nop', '~m-cl-nop', '~b-cl-dec', '~m-cl-dec']),
            ({AdaBoostRegressor}, ['b-reg', '~b-reg-64']),
            ({HistGradientBoostingRegressor},
             ['b-reg', '~b-reg-64', '~b-reg-nan', '~b-reg-nan-64']),
            ({LinearSVC, NearestCentroid},
             ['~b-cl-nop', '~b-cl-nop-64']),
            ({RFE, RFECV}, ['num+y-tr']),
            ({GridSearchCV},
             ['b-cl', 'm-cl',
              'b-reg', 'm-reg',
              '~b-reg-64', '~b-cl-64',
              'cluster', 'outlier', '~m-label']),
            ({VotingClassifier}, ['b-cl', 'm-cl']),
            # the stacking estimators may be missing (new in 0.22),
            # an empty set never matches
            ({StackingClassifier} if StackingClassifier is not None
             else set(), ['b-cl']),
            ({StackingRegressor} if StackingRegressor is not None
             else set(), ['b-reg']),
            # specific scenarios
            ({IsotonicRegression}, ['~num+y-tr-1d', '~b-reg-1d']),
            ({ARDRegression, BayesianRidge, ElasticNetCV,
              GradientBoostingRegressor,
              LarsCV, LassoCV, LassoLarsCV, LassoLarsIC,
              LinearSVR, NuSVR, OrthogonalMatchingPursuitCV,
              PassiveAggressiveRegressor, SGDRegressor,
              TheilSenRegressor, HuberRegressor, SVR},
             ['b-reg', '~b-reg-64']),
            ({MultiOutputClassifier}, ['m-cl', '~m-label']),
            ({MultiOutputRegressor, MultiTaskElasticNet,
              MultiTaskElasticNetCV, MultiTaskLassoCV,
              MultiTaskLasso},
             ['m-reg']),
            ({OneVsOneClassifier, OutputCodeClassifier,
              PassiveAggressiveClassifier, RadiusNeighborsClassifier},
             ['~b-cl-nop', '~m-cl-nop']),
            ({RidgeClassifier, RidgeClassifierCV},
             ['~b-cl-nop', '~m-cl-nop', '~m-label']),
            # trainable transform
            ({GenericUnivariateSelect,
              NeighborhoodComponentsAnalysis,
              PLSSVD, SelectKBest,
              SelectPercentile, SelectFromModel},
             ["num+y-tr"]),
            ({SelectFwe, SelectFdr, SelectFpr}, ["num+y-tr-cl"]),
            # no m-label
            ({AdaBoostClassifier}, ['b-cl', '~b-cl-64', 'm-cl']),
            ({LogisticRegression},
             ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-dec', '~m-cl-dec']),
            ({RandomForestClassifier},
             ['b-cl', '~b-cl-64', 'm-cl', '~m-label']),
            ({DecisionTreeClassifier, ExtraTreeClassifier},
             ['b-cl', '~b-cl-64', 'm-cl', '~b-cl-f100', '~m-label']),
            ({DecisionTreeRegressor},
             ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64', '~b-reg-f100']),
            ({LatentDirichletAllocation, NMF, PowerTransformer},
             ['num-tr-pos']),
        ]
        for classes, problems in explicit:
            if model in classes:
                return problems

        # Predictors recognized from their name.
        if hasattr(model, 'predict'):
            model_repr = str(model)
            if "Classifier" in model_repr:
                return ['b-cl', '~b-cl-64', 'm-cl', '~m-label']
            if "Regressor" in model_repr:
                return ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64']

        # Generic case.
        is_cluster = issubclass(model, (ClusterMixin, BiclusterMixin))
        found = []
        if hasattr(model, 'transform'):
            if issubclass(model, (RegressorMixin, ClassifierMixin)):
                found.append('num+y-tr')
            elif is_cluster:
                found += ['~num-tr-clu', '~num-tr-clu-64']
            else:
                found.append('num-tr')

        if hasattr(model, 'predict') and is_cluster:
            found += ['cluster', '~b-clu-64']

        if issubclass(model, OutlierMixin):
            found.append('outlier')

        if issubclass(model, ClassifierMixin):
            if model is OneVsRestClassifier:
                return ['m-cl', '~m-label']
            found += ['b-cl', '~b-cl-64', 'm-cl', '~m-label']
        if issubclass(model, RegressorMixin):
            found += ['b-reg', 'm-reg', '~b-reg-64', '~m-reg-64']
        if issubclass(model, BaseMixture):
            found += ['mix', '~mix-64']

        if found:
            return found

        raise RuntimeError(
            f"Unable to find problem for model '{model.__name__}' "
            f"- {model.__bases__}.")

    selected = _internal(model)
    # every returned name must be a registered problem
    for name in selected:
        if name not in _problems:
            known = "\n".join(sorted(_problems))
            raise ValueError(  # pragma: no cover
                f"Unrecognized problem '{name}' in\n{known}")
    return selected
# Registry of the problems: maps a problem name to a function which
# builds the dataset and returns
# *X, y, initial_types, method, name, X runtime*, some entries
# append one more value (False) to that tuple.
# Prefix ``~`` marks the secondary scenarios, the suffixes are
# described in the documentation of function *find_suitable_problem*.
_problems = {
    # standard
    "b-cl": _problem_for_predictor_binary_classification,
    "m-cl": _problem_for_predictor_multi_classification,
    "b-reg": _problem_for_predictor_regression,
    "m-reg": _problem_for_predictor_multi_regression,
    "num-tr": _problem_for_numerical_transform,
    "num-tr-pos": _problem_for_numerical_transform_positive,
    'outlier': _problem_for_outlier,
    'cluster': _problem_for_clustering,
    'num+y-tr': _problem_for_numerical_trainable_transform,
    'num+y-tr-cl': _problem_for_numerical_trainable_transform_cl,
    'mix': _problem_for_mixture,
    # others
    '~num-tr-clu': _problem_for_clustering_scores,
    "~m-label": _problem_for_predictor_multi_classification_label,
    "~scoring": _problem_for_numerical_scoring,
    '~b-cl-nop': _problem_for_clnoproba_binary,
    '~m-cl-nop': _problem_for_clnoproba,
    '~b-cl-dec': _problem_for_cl_decision_function_binary,
    '~m-cl-dec': _problem_for_cl_decision_function,
    # nan
    "~b-reg-nan": lambda n_features=None: _problem_for_predictor_regression(
        n_features=n_features, add_nan=True),
    "~b-reg-nan-64": lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features, add_nan=True),
    "~b-cl-nan": lambda dtype=numpy.float32, n_features=None: _problem_for_predictor_binary_classification(
        dtype=dtype, n_features=n_features, add_nan=True),
    # 100 features
    "~b-reg-f100": lambda n_features=100: _problem_for_predictor_regression(
        n_features=n_features or 100),
    "~b-cl-f100": lambda n_features=100: _problem_for_predictor_binary_classification(
        n_features=n_features or 100),
    # 64
    "~b-cl-64": lambda n_features=None: _problem_for_predictor_binary_classification(
        dtype=numpy.float64, n_features=n_features),
    "~b-reg-64": lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features),
    # double version of '~b-cl-nop': it relies on the binary problem
    # (it used to be bound to the multi-class one)
    '~b-cl-nop-64': lambda n_features=None: _problem_for_clnoproba_binary(
        dtype=numpy.float64, n_features=n_features),
    '~b-clu-64': lambda n_features=None: _problem_for_clustering(
        dtype=numpy.float64, n_features=n_features),
    '~b-cl-dec-64': lambda n_features=None: _problem_for_cl_decision_function_binary(
        dtype=numpy.float64, n_features=n_features),
    '~num-tr-clu-64': lambda n_features=None: _problem_for_clustering_scores(
        dtype=numpy.float64, n_features=n_features),
    "~m-reg-64": lambda n_features=None: _problem_for_predictor_multi_regression(
        dtype=numpy.float64, n_features=n_features),
    "~num-tr-64": lambda n_features=None: _problem_for_numerical_transform(
        dtype=numpy.float64, n_features=n_features),
    '~mix-64': lambda n_features=None: _problem_for_mixture(
        dtype=numpy.float64, n_features=n_features),
    # NF: one more value (False) is appended to the returned tuple
    "~b-cl-NF": (lambda n_features=None: _problem_for_predictor_binary_classification(
        n_features=n_features) + (False, )),
    "~m-cl-NF": (lambda n_features=None: _problem_for_predictor_multi_classification(
        n_features=n_features) + (False, )),
    "~b-reg-NF": (lambda n_features=None: _problem_for_predictor_regression(
        n_features=n_features) + (False, )),
    "~m-reg-NF": (lambda n_features=None: _problem_for_predictor_multi_regression(
        n_features=n_features) + (False, )),
    # NF, double
    "~b-cl-NF-64": (lambda n_features=None: _problem_for_predictor_binary_classification(
        dtype=numpy.float64, n_features=n_features) + (False, )),
    "~m-cl-NF-64": (lambda n_features=None: _problem_for_predictor_multi_classification(
        dtype=numpy.float64, n_features=n_features) + (False, )),
    "~b-reg-NF-64": (lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features) + (False, )),
    "~m-reg-NF-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        dtype=numpy.float64, n_features=n_features) + (False, )),
    # GaussianProcess
    "~b-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_cov": True}},
        return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )),
    "~m-reg-NF-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_cov": True}},
        return_cov=True, dtype=numpy.float64, n_features=n_features) + (False, )),
    #
    "~b-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )),
    "~m-reg-NF-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features) + (False, )),
    #
    "~b-reg-cov-64": (lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_cov": True}},
        return_cov=True, dtype=numpy.float64, n_features=n_features)),
    "~m-reg-cov-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_cov": True}},
        return_cov=True, dtype=numpy.float64, n_features=n_features)),
    #
    # NOTE(review): this key has no ``b-`` prefix unlike '~b-reg-cov-64',
    # it is kept as it is because callers may rely on the name.
    "~reg-std-64": (lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features)),
    "~m-reg-std-64": (lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features)),
    # NSV: the problem is wrapped with _noshapevar
    '~b-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_regression(
        dtype=numpy.float64, n_features=n_features)),
    '~m-reg-NSV-64': _noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression(
        dtype=numpy.float64, n_features=n_features)),
    "~b-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features))),
    "~m-reg-std-NSV-64": (_noshapevar(lambda n_features=None: _problem_for_predictor_multi_regression(
        True, options={GaussianProcessRegressor: {"return_std": True}},
        return_std=True, dtype=numpy.float64, n_features=n_features))),
    # isotonic
    "~b-reg-1d": _1d_problem(_problem_for_predictor_regression),
    '~num+y-tr-1d': _1d_problem(_problem_for_numerical_trainable_transform),
    # text
    "key-int-col": _problem_for_dict_vectorizer,
    "key-str-col": _problem_for_feature_hasher,
    "int-col": _problem_for_label_encoder,
    "one-hot": _problem_for_one_hot_encoder,
    'text-col': _problem_for_tfidf_vectorizer,
    'bow': _problem_for_tfidf_transformer,
}