Coverage for mlprodict/onnx_conv/convert.py: 88%
384 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-04 02:28 +0100
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-04 02:28 +0100
1# -*- encoding: utf-8 -*-
2# pylint: disable=C0302,R0914
3"""
4@file
5@brief Overloads a conversion function.
6"""
7import json
8import pprint
9from collections import OrderedDict
10import logging
11import numpy
12from onnx import ValueInfoProto
13import pandas
14try:
15 from sklearn.metrics._scorer import _PredictScorer
16except ImportError: # pragma: no cover
17 # scikit-learn < 0.22
18 from sklearn.metrics.scorer import _PredictScorer
19from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version
20from sklearn.pipeline import Pipeline, FeatureUnion
21from sklearn.compose import ColumnTransformer
22from sklearn.utils.metaestimators import _BaseComposition
23from skl2onnx.common.data_types import (
24 FloatTensorType, DoubleTensorType, DataType, guess_numpy_type,
25 StringTensorType, Int64TensorType, _guess_type_proto)
26from skl2onnx import convert_sklearn
27from skl2onnx.algebra.onnx_operator_mixin import OnnxOperatorMixin
28from skl2onnx.algebra.type_helper import _guess_type
29from ..onnx_tools.onnx_manipulations import onnx_rename_names
30from ..onnx_tools.onnx2py_helper import (
31 guess_dtype, get_tensor_shape, get_tensor_elem_type)
32from .register_rewritten_converters import (
33 register_rewritten_operators, register_new_operators)
34from .register import register_converters
35from .scorers import CustomScorerTransform
38logger = logging.getLogger('mlprodict')
def _fix_opset_skl2onnx():
    """
    Makes ``skl2onnx.__max_supported_opset__`` equal to the
    ``__max_supported_opset__`` value defined by the parent package.
    Nothing is written when both values already agree.
    """
    import skl2onnx
    from .. import __max_supported_opset__ as expected_opset
    if skl2onnx.__max_supported_opset__ == expected_opset:
        return
    skl2onnx.__max_supported_opset__ = expected_opset  # pragma: no cover
def convert_scorer(fct, initial_types, name=None,
                   target_opset=None, options=None,
                   custom_conversion_functions=None,
                   custom_shape_calculators=None,
                   custom_parsers=None, white_op=None,
                   black_op=None, final_types=None,
                   verbose=0):
    """
    Converts a scorer into :epkg:`ONNX` assuming
    there exists a converter associated to it.
    The function wraps the function into a custom
    transformer, then calls function *convert_sklearn*
    from :epkg:`sklearn-onnx`.

    :param fct: function to convert (or a scorer from :epkg:`scikit-learn`)
    :param initial_types: types information
    :param name: name of the produced model, if None, a name is built
        from the name of the scoring function
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param custom_conversion_functions: a dictionary for specifying the user
        customized conversion function, it takes precedence over
        registered converters
    :param custom_shape_calculators: a dictionary for specifying the user
        customized shape calculator it takes precedence over registered
        shape calculators.
    :param custom_parsers: parsers determine which outputs is expected
        for which particular task, default parsers are
        defined for classifiers, regressors, pipeline but
        they can be rewritten, *custom_parsers* is a dictionary
        ``{ type: fct_parser(scope, model, inputs,
        custom_parsers=None) }``
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes allowed
        while converting a pipeline, if empty, none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param verbose: displays information while converting
    :return: :epkg:`ONNX` graph
    """
    if hasattr(fct, '_score_func'):
        # A scikit-learn scorer: unwrap the scoring function
        # and its keyword arguments.
        kwargs = fct._kwargs
        fct = fct._score_func
    else:
        kwargs = None  # pragma: no cover
    if name is None:
        name = f"mlprodict_fct_ONNX({fct.__name__})"
    tr = CustomScorerTransform(fct.__name__, fct, kwargs)
    _fix_opset_skl2onnx()
    # The name used to be computed and then dropped: it is now
    # forwarded so that parameter *name* has an effect.
    return convert_sklearn(
        tr, name=name, initial_types=initial_types,
        target_opset=target_opset, options=options,
        custom_conversion_functions=custom_conversion_functions,
        custom_shape_calculators=custom_shape_calculators,
        custom_parsers=custom_parsers, white_op=white_op,
        black_op=black_op, final_types=final_types,
        verbose=verbose)
def guess_initial_types(X, initial_types):
    """
    Guesses initial types from an array or a dataframe.

    :param X: array or dataframe
    :param initial_types: hints about X, returned unchanged when not None
    :return: data types

    For a dataframe, one ``(column name, type)`` pair is produced per
    column with shape ``[None, 1]``; any other input gives a single
    pair named ``'X'``. Only the first row is used to guess the types.
    """
    if initial_types is not None:
        return initial_types
    if X is None:
        raise NotImplementedError(  # pragma: no cover
            "Initial types must be specified.")

    if isinstance(X, (numpy.ndarray, pandas.DataFrame)):
        # One row is enough to guess the types.
        X = X[:1]
    if not isinstance(X, pandas.DataFrame):
        return [('X', _guess_type(X))]

    guessed = []
    for col in X.columns:
        if isinstance(X[col].values[0], (str, numpy.str_)):
            col_type = StringTensorType()
        else:
            col_type = _guess_type(X[col].values)
        col_type.shape = [None, 1]
        guessed.append((col, col_type))
    return guessed
138def _replace_tensor_type(schema, tensor_type):
139 res = []
140 for name, ty in schema:
141 cl = ty.__class__
142 if cl in (FloatTensorType, DoubleTensorType) and cl != tensor_type:
143 ty = tensor_type(ty.shape)
144 res.append((name, ty))
145 return res
def guess_schema_from_data(X, tensor_type=None, schema=None):
    """
    Guesses initial types from a dataset.

    @param      X               dataset (dataframe, array)
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns)

    When every column is two-dimensional, has an unknown first
    dimension and shares the same type class, the columns are grouped
    into a single input ``'X'`` whose second dimension is the sum of
    the column widths. Otherwise the per-column schema is returned.
    """
    init = guess_initial_types(X, schema)
    if tensor_type is not None:
        init = _replace_tensor_type(init, tensor_type)

    # Grouping column
    kinds = []
    for _, col in init:
        shape = col.shape
        if len(shape) != 2:
            return init  # pragma: no cover
        if shape[0] is not None:
            return init  # pragma: no cover
        kind = col.__class__
        if kinds and kind not in kinds:
            return init  # pragma: no cover
        if kind not in kinds:
            kinds.append(kind)
    return [('X', kinds[0]([None, sum(item[1].shape[1] for item in init)]))]
def get_inputs_from_data(X, schema=None):
    """
    Produces input data for *onnx* runtime.

    @param      X           data
    @param      schema      schema if None, schema is guessed with
                            @see fn guess_schema_from_data
    @return                 input data

    The result is a dictionary ``{input name: array}``. An array gives
    one entry, a dataframe gives one entry per column reshaped into
    ``(-1, 1)``.
    """
    def _cast_data(data, col_type):
        # The table is built lazily and scanned in order so that the
        # first matching tensor type decides the numpy type.
        conversions = (
            (FloatTensorType, numpy.float32),
            (DoubleTensorType, numpy.float64),
            (StringTensorType, numpy.str_),
            (Int64TensorType, numpy.int64))
        for tensor_cls, np_type in conversions:
            if isinstance(col_type, tensor_cls):
                return data.astype(np_type)
        raise RuntimeError(  # pragma: no cover
            f"Unexpected column type {col_type} for type {type(data)}.")

    if schema is None:
        schema = guess_schema_from_data(X)

    if isinstance(X, numpy.ndarray):
        if len(schema) != 1:
            raise RuntimeError(  # pragma: no cover
                "More than one column but input is an array.")
        only = schema[0]
        return {only[0]: _cast_data(X, only[1])}

    if isinstance(X, pandas.DataFrame):
        n_columns = X.shape[1]
        if len(schema) != n_columns:
            raise RuntimeError(  # pragma: no cover
                "Mismatch between onnx columns {} and DataFrame columns {}"
                "".format(len(schema), n_columns))
        feeds = {}
        for sch, col in zip(schema, X.columns):
            feeds[sch[0]] = _cast_data(X[col].values, sch[1]).reshape((-1, 1))
        return feeds

    raise TypeError(  # pragma: no cover
        f"Unexpected type {type(X)}, expecting an array or a dataframe.")
def guess_schema_from_model(model, tensor_type=None, schema=None):
    """
    Guesses initial types from a model.

    @param      model           model
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns)

    The schema is guessed from attribute ``coef_`` (one input ``'X'``
    whose width is the last dimension of the coefficients) or from the
    key ``'feature_names'`` of the dictionary returned by method
    ``dump_model`` (one input per feature). If *schema* is given, it is
    returned (after type replacement) provided its length agrees with
    the guessed one. *NotImplementedError* is raised when nothing can
    be guessed and no schema is given, *RuntimeError* when the given
    and the guessed schemas have different lengths.
    """
    def _finalize(sch):
        # tensor_type=None means "no replacement": the helper must not
        # be asked to instantiate None.
        if tensor_type is None:
            return list(sch)
        return _replace_tensor_type(sch, tensor_type)

    if schema is not None:
        try:
            guessed = guess_schema_from_model(model)
        except NotImplementedError:  # pragma: no cover
            return _finalize(schema)
        if len(guessed) != len(schema):
            raise RuntimeError(  # pragma: no cover
                "Given schema and guessed schema are not the same:"
                "\nGIVEN: {}\n-----\nGUESSED:\n{}".format(schema, guessed))
        return _finalize(schema)

    if hasattr(model, 'coef_'):
        # linear model, shape[-1] also covers one-dimensional coefficients
        init = [('X', FloatTensorType([None, model.coef_.shape[-1]]))]
        return _finalize(init)

    has_dump = hasattr(model, 'dump_model')
    dumped = None
    if has_dump:
        dumped = model.dump_model()
        if isinstance(dumped, dict) and 'feature_names' in dumped:
            names = dumped['feature_names']
            init = [(name, FloatTensorType([None, 1])) for name in names]
            return _finalize(init)

    # Nothing worked: build a detailed error message.
    data = pprint.pformat(model.__dict__)
    dirs = pprint.pformat(dir(model))
    if has_dump:  # pragma: no cover
        # The dump computed above is reused instead of dumping twice.
        keys = list(sorted(dumped))
        last = pprint.pformat([keys, dumped])
        if len(last) >= 200000:
            last = last[:200000] + "\n..."
    else:
        last = ""
    raise NotImplementedError(  # pragma: no cover
        "Unable to guess schema for model {}\n{}\n----\n{}\n------\n{}".format(
            model.__class__, data, dirs, last))
def _guess_type_(X, itype, dtype):
    """
    Guesses the initial types and the numerical type of the data.

    :param X: data, may be None if *itype* is specified
    :param itype: initial types or None to guess them from *X*
    :param dtype: known numerical type or None to infer it
    :return: tuple *(initial_types, dtype, new_dtype)*

    An inferred type which is not *float64* is replaced by *float32*.
    *NotImplementedError* is raised if the final type is not one of
    float32, float64, int64, int32, float16.
    """
    initial_types = guess_initial_types(X, itype)

    def _infer():
        # Same priority as the attributes are tested: dataframe,
        # array, typed variable, then the initial types.
        if hasattr(X, 'dtypes'):  # DataFrame
            return numpy.float32
        if hasattr(X, 'dtype'):
            return X.dtype
        if hasattr(X, 'type'):
            return guess_numpy_type(X.type)
        if isinstance(initial_types[0], ValueInfoProto):
            return guess_dtype(initial_types[0].type.tensor_type.elem_type)
        if initial_types is not None:
            return guess_numpy_type(initial_types[0][1])
        raise RuntimeError(  # pragma: no cover
            f"dtype cannot be guessed: {type(X)}")

    if dtype is None:
        inferred = _infer()
        dtype = numpy.float32 if inferred != numpy.float64 else inferred

    if dtype is None:
        raise RuntimeError("dtype cannot be None")  # pragma: no cover

    if isinstance(dtype, FloatTensorType):
        dtype = numpy.float32  # pragma: no cover
    elif isinstance(dtype, DoubleTensorType):
        dtype = numpy.float64  # pragma: no cover

    if isinstance(dtype, numpy.ndarray):
        new_dtype = dtype.dtype  # pragma: no cover
    elif isinstance(dtype, DataType):
        new_dtype = numpy.float32  # pragma: no cover
    else:
        new_dtype = dtype

    supported = (numpy.float32, numpy.float64, numpy.int64,
                 numpy.int32, numpy.float16)
    if new_dtype not in supported:
        raise NotImplementedError(  # pragma: no cover
            f"dtype should be real not {new_dtype} ({dtype})")
    return initial_types, dtype, new_dtype
def to_onnx(model, X=None, name=None, initial_types=None,
            target_opset=None, options=None, rewrite_ops=False,
            white_op=None, black_op=None, final_types=None,
            rename_strategy=None, verbose=0,
            as_function=False, prefix_name=None,
            run_shape=False, single_function=True):
    """
    Converts a model using :epkg:`sklearn-onnx`.

    :param model: model to convert or a function
        wrapped into :epkg:`_PredictScorer` with
        function :epkg:`make_scorer`
    :param X: training set (at least one row),
        can be None, it is used to infer the
        input types (*initial_types*)
    :param initial_types: if *X* is None, then *initial_types*
        must be defined
    :param name: name of the produced model
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param rewrite_ops: rewrites some existing converters,
        the changes are permanent
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes allowed
        while converting a pipeline, if empty,
        none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param rename_strategy: rename any name in the graph, select shorter
        names, see @see fn onnx_rename_names
    :param verbose: display information while converting the model
    :param as_function: exposes every model in a pipeline as a function,
        the main graph contains the pipeline structure,
        see :ref:`onnxsklearnfunctionsrst` for an example
    :param prefix_name: used if *as_function* is True, to give
        a prefix to variable in a pipeline
    :param run_shape: run shape inference
    :param single_function: if *as_function* is True, the function returns one graph
        with one call to the main function if *single_function* is True or
        a list of node corresponding to the graph structure
    :return: converted model

    The function rewrites function *to_onnx* from :epkg:`sklearn-onnx`
    but may change a few converters if *rewrite_ops* is True.
    For example, :epkg:`ONNX` only supports *TreeEnsembleRegressor*
    for float but not for double. It becomes available
    if ``rewrite_ops=True``.

    For a scorer, *X* must be an *OrderedDict* (or None if
    *initial_types* is specified), *ValueError* is raised otherwise.

    .. faqref::
        :title: How to deal with a dataframe as input?

        Each column of the dataframe is considered as a named input.
        The first step is to make sure that every column type is correct.
        :epkg:`pandas` tends to select the least generic type to
        hold the content of one column. :epkg:`ONNX` does not automatically
        cast the data it receives. The data must have the same type when
        the model is converted and when the converted model receives
        the data to predict.

        .. runpython::
            :showcode:
            :warningout: DeprecationWarning

            from io import StringIO
            from textwrap import dedent
            import numpy
            import pandas
            from pyquickhelper.pycode import ExtTestCase
            from sklearn.preprocessing import OneHotEncoder
            from sklearn.pipeline import Pipeline
            from sklearn.compose import ColumnTransformer
            from mlprodict.onnx_conv import to_onnx
            from mlprodict.onnxrt import OnnxInference

            text = dedent('''
                __SCHEMA__
                7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
                7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
                7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
                11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
                ''')
            text = text.replace(
                "__SCHEMA__",
                "fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,"
                "free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,"
                "alcohol,quality,color")

            X_train = pandas.read_csv(StringIO(text))
            for c in X_train.columns:
                if c != 'color':
                    X_train[c] = X_train[c].astype(numpy.float32)
            numeric_features = [c for c in X_train if c != 'color']

            pipe = Pipeline([
                ("prep", ColumnTransformer([
                    ("color", Pipeline([
                        ('one', OneHotEncoder()),
                        ('select', ColumnTransformer(
                            [('sel1', 'passthrough', [0])]))
                    ]), ['color']),
                    ("others", "passthrough", numeric_features)
                ])),
            ])

            pipe.fit(X_train)
            pred = pipe.transform(X_train)
            print(pred)

            model_onnx = to_onnx(pipe, X_train, target_opset=12)
            oinf = OnnxInference(model_onnx)

            # The dataframe is converted into a dictionary,
            # each key is a column name, each value is a numpy array.
            inputs = {c: X_train[c].values for c in X_train.columns}
            inputs = {c: v.reshape((v.shape[0], 1)) for c, v in inputs.items()}

            onxp = oinf.run(inputs)
            print(onxp)

    .. versionchanged:: 0.9
        Parameter *as_function* was added.
    """
    logger.debug("to_onnx(%s, X=%r, initial_types=%r, target_opset=%r, "
                 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "
                 "final_types=%r)",
                 model.__class__.__name__, type(X), initial_types,
                 target_opset, options, rewrite_ops, white_op, black_op,
                 final_types)

    if isinstance(model, OnnxOperatorMixin):
        # The model knows how to convert itself, this path returns
        # before rewrite_ops and rename_strategy are considered.
        if not hasattr(model, 'op_version'):
            raise RuntimeError(  # pragma: no cover
                f"Missing attribute 'op_version' for type '{type(model)}'.")
        _fix_opset_skl2onnx()
        # Parameter verbose is not forwarded to model.to_onnx.
        return model.to_onnx(
            X=X, name=name, options=options, black_op=black_op,
            white_op=white_op, final_types=final_types,
            target_opset=target_opset)

    if rewrite_ops:
        old_values, old_shapes = register_rewritten_operators()
        register_new_operators()
        register_converters()
    else:
        old_values, old_shapes = {}, {}

    if as_function and isinstance(
            model, (ColumnTransformer, Pipeline, FeatureUnion)):
        res = to_onnx_function(
            model, X=X, name=name, initial_types=initial_types,
            target_opset=target_opset, options=options,
            rewrite_ops=False,  # already handled
            white_op=white_op, black_op=black_op, final_types=final_types,
            rename_strategy=None,  # already handled
            verbose=verbose, prefix_name=prefix_name,
            run_shape=run_shape, single_function=single_function)

    elif isinstance(model, _PredictScorer):
        if X is not None and not isinstance(X, OrderedDict):
            raise ValueError("For a scorer, parameter X should be a OrderedDict not {}."
                             "".format(type(X)))
        if initial_types is None:
            if X is None:
                # Fail with an explicit message instead of an
                # AttributeError on ``None.items()``.
                raise ValueError(
                    "For a scorer, X or initial_types must be specified.")
            dts = []
            initial_types = []
            for k, v in X.items():
                if hasattr(v, 'dtype'):
                    dtype = guess_numpy_type(v.dtype)
                else:
                    dtype = v  # pragma: no cover
                it, _, ndt = _guess_type_(v, None, dtype)
                # Every guessed type takes the name of the dictionary key.
                it = [(k, t[1]) for t in it]
                initial_types.extend(it)
                dts.append(ndt)
            ndt = set(dts)
            if len(ndt) != 1:
                raise RuntimeError(  # pragma: no cover
                    f"Multiple dtype is not efficient {ndt}.")
        res = convert_scorer(model, initial_types, name=name,
                             target_opset=target_opset, options=options,
                             black_op=black_op, white_op=white_op,
                             final_types=final_types, verbose=verbose)
    else:
        if name is None:
            name = f"mlprodict_ONNX({model.__class__.__name__})"

        initial_types, _, _ = _guess_type_(X, initial_types, None)

        _fix_opset_skl2onnx()
        res = convert_sklearn(model, initial_types=initial_types, name=name,
                              target_opset=target_opset, options=options,
                              black_op=black_op, white_op=white_op,
                              final_types=final_types, verbose=verbose)

    # NOTE(review): presumably restores the converters saved above,
    # this is skipped if the conversion raises - confirm it is expected.
    register_rewritten_operators(old_values, old_shapes)

    # optimisation
    if rename_strategy is not None:
        res = onnx_rename_names(res, strategy=rename_strategy)
    return res
def _guess_s2o_type(vtype: ValueInfoProto):
    """
    Builds a type with ``_guess_type_proto`` from the element type
    and the shape read from a *ValueInfoProto*.
    """
    elem_type = get_tensor_elem_type(vtype)
    shape = get_tensor_shape(vtype)
    return _guess_type_proto(elem_type, shape)
510def _new_options(options, prefix, sklop):
511 if sklop is None:
512 raise RuntimeError( # pragma: no cover
513 "sklop cannot be None.")
514 if isinstance(sklop, str):
515 return None # pragma: no cover
516 if options is None:
517 step_options = None
518 else:
519 step_options = {}
520 for k, v in options.items():
521 if k.startswith(prefix):
522 step_options[k[len(prefix):]] = v
523 elif '__' in k:
524 step_options[k.split('__', maxsplit=1)[1]] = v
525 if isinstance(sklop, _BaseComposition):
526 step_options[k] = v
527 else:
528 from skl2onnx._supported_operators import _get_sklearn_operator_name
529 from skl2onnx.common._registration import get_converter
530 alias = _get_sklearn_operator_name(type(sklop))
531 if alias is None:
532 step_options[k] = v
533 else:
534 conv = get_converter(alias)
535 allowed = conv.get_allowed_options()
536 if allowed is not None and k in allowed:
537 step_options[k] = v
538 return step_options
541class _ParamEncoder(json.JSONEncoder):
542 def default(self, obj): # pylint: disable=W0237
543 try:
544 return json.JSONEncoder.default(self, obj)
545 except TypeError as e:
546 # Unable to serialize
547 return '{"classname": "%s", "EXC": "%s"}' % (
548 obj.__class__.__name__, str(e))
def get_sklearn_json_params(model):
    """
    Retrieves all the parameters of a :epkg:`scikit-learn` model
    (``get_params(deep=False)``) and returns them as a JSON string.
    *RuntimeError* is raised if the serialization fails.
    """
    params = model.get_params(deep=False)
    try:
        serialized = json.dumps(params, cls=_ParamEncoder)
    except TypeError as exc:  # pragma: no cover
        raise RuntimeError(
            f"Unable to serialize parameters {pprint.pformat(params)}.") from exc
    return serialized
def _to_onnx_function_pipeline(
        model, X=None, name=None, initial_types=None,
        target_opset=None, options=None, rewrite_ops=False,
        white_op=None, black_op=None, final_types=None,
        rename_strategy=None, verbose=0,
        prefix_name=None, run_shape=False,
        single_function=True):
    """
    Converts a pipeline into ONNX, every step is converted with
    @see fn to_onnx, turned into a function of domain ``'sklearn'``
    and chained to the previous step.
    Parameters have the same meaning as in @see fn to_onnx.
    *RuntimeError* is raised if the pipeline is empty or
    if the element type of an output is unknown.
    """
    from ..npy.xop_variable import Variable
    from ..npy.xop import OnnxOperatorFunction, loadop
    from ..onnx_tools.onnx_manipulations import onnx_model_to_function

    OnnxIdentity = loadop('Identity')

    if len(model.steps) == 0:
        raise RuntimeError(  # pragma: no cover
            "The pipeline to be converted cannot be empty.")

    # Opset of the main domain.
    if isinstance(target_opset, int):
        op_version = target_opset
    else:
        from .. import __max_supported_opset__
        if target_opset is None:
            op_version = __max_supported_opset__
        else:  # pragma: no cover
            op_version = target_opset.get('', __max_supported_opset__)

    i_types = guess_initial_types(X, initial_types)
    input_nodes = [OnnxIdentity(typed[0], op_version=op_version)
                   for typed in i_types]

    step_inputs = i_types
    last_op = None
    for i_step, step in enumerate(model.steps):
        step_name = step[0]
        estimator = step[1]
        estimator_cls = estimator.__class__.__name__

        prefix = step_name + "__"
        step_options = _new_options(options, prefix, estimator)
        if prefix_name is not None:
            prefix = prefix_name + prefix

        protom = to_onnx(
            estimator, name=name, initial_types=step_inputs,
            target_opset=target_opset,
            options=step_options, rewrite_ops=rewrite_ops,
            white_op=white_op, black_op=black_op, verbose=verbose,
            as_function=True, prefix_name=prefix, run_shape=run_shape,
            single_function=False)
        for out in protom.graph.output:
            if get_tensor_elem_type(out) == 0:
                raise RuntimeError(  # pragma: no cover
                    "Unabble to guess output type of output %r "
                    "from model step %d: %r, output=%r." % (
                        protom.graph.output, i_step, estimator, out))

        # The hyperparameters of the step are stored in the doc string.
        jspar = 'HYPER:{"%s":%s}' % (
            estimator_cls, get_sklearn_json_params(estimator))
        protof, subf = onnx_model_to_function(
            protom, domain='sklearn',
            name=f"{prefix}_{estimator_cls}_{i_step}",
            doc_string=jspar)

        input_names = [f"{step_name}_{o}" for o in protof.input]
        if last_op is not None:
            # The outputs of the previous step become the inputs.
            if len(input_names) == 1:
                input_nodes = [OnnxIdentity(
                    last_op, output_names=input_names[0],
                    op_version=op_version)]
            else:
                input_nodes = [  # pragma: no cover
                    OnnxIdentity(last_op[pos], output_names=[n],  # pylint: disable=E1136
                                 op_version=op_version)
                    for pos, n in enumerate(input_names)]
        output_names = [f"{step_name}_{o}" for o in protof.output]

        logger.debug("_to_onnx_function_pipeline:%s:%r->%r:%r:%s",
                     estimator_cls,
                     input_names, output_names,
                     len(protof.node), jspar)

        last_op = OnnxOperatorFunction(
            protof, *input_nodes, output_names=output_names,
            sub_functions=subf)
        step_inputs = [
            ('X%d' % pos, _guess_s2o_type(o))
            for pos, o in enumerate(protom.graph.output)]

    logger.debug("_to_onnx_function_pipeline:end:(%s-%d, X=%r, "
                 "initial_types=%r, target_opset=%r, "
                 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "
                 "final_types=%r, outputs=%r)",
                 model.__class__.__name__, id(model),
                 type(X), initial_types,
                 target_opset, options, rewrite_ops, white_op, black_op,
                 final_types, step_inputs)

    i_vars = [Variable.from_skl2onnx_tuple(typed) for typed in i_types]
    if final_types is not None:
        outputs = final_types
    else:
        # Types are guessed from the outputs of the last step.
        outputs_tuple = [
            (n, _guess_s2o_type(o))
            for n, o in zip(output_names, protom.graph.output)]
        outputs = [Variable.from_skl2onnx_tuple(typed)
                   for typed in outputs_tuple]

    onx = last_op.to_onnx(inputs=i_vars, target_opset=target_opset,
                          verbose=verbose, run_shape=run_shape,
                          outputs=outputs)

    for out in onx.graph.output:
        if get_tensor_elem_type(out) == 0:
            raise RuntimeError(  # pragma: no cover
                "Unable to guess output type of output %r "
                "from model %r." % (onx.graph.output, model))
    return onx
def get_column_index(i, inputs):
    """
    Returns a tuples (variable index, column index in that variable).
    The function has two different behaviours, one when *i* (column index)
    is an integer, another one when *i* is a string (column name).
    If *i* is a string, the function looks for input name with
    this name and returns `(index, 0)`.
    If *i* is an integer (python or numpy integer),
    let's assume first we have two inputs
    `I0 = FloatTensorType([None, 2])` and `I1 = FloatTensorType([None, 3])`,
    in this case, here are the results:

    ::

        get_column_index(0, inputs) -> (0, 0)
        get_column_index(1, inputs) -> (0, 1)
        get_column_index(2, inputs) -> (1, 0)
        get_column_index(3, inputs) -> (1, 1)
        get_column_index(4, inputs) -> (1, 2)

    :param i: column index or column name
    :param inputs: list of tuples ``(name, type)``, the second
        dimension of ``type.shape`` is the number of columns
    :return: tuple *(variable index, column index)*

    *RuntimeError* is raised if the column cannot be found or
    if the number of columns of a visited input is unknown.
    """
    if isinstance(i, (int, numpy.integer)):
        # numpy integers are column positions as well, not names.
        i = int(i)
        if i == 0:
            # Useful shortcut, skips the case when end is None
            # (unknown dimension)
            return 0, 0
        pos = 0
        for vi, inp in enumerate(inputs):
            width = inp[1].shape[1]
            if width is None:
                raise RuntimeError(  # pragma: no cover
                    "Cannot extract a specific column %r when "
                    "one input (%r) has unknown "
                    "dimension." % (i, inp))
            end = pos + width
            if pos <= i < end:
                return vi, i - pos
            pos = end
        # Every input was visited (or there is none).
        raise RuntimeError(  # pragma: no cover
            "Input %r (i=%r, end=%r) is not available in\n%r" % (
                len(inputs), i, pos, pprint.pformat(inputs)))

    for ind, inp in enumerate(inputs):
        if inp[0] == i:
            return ind, 0
    raise RuntimeError(  # pragma: no cover
        "Unable to find column name %r among names %r. "
        "Make sure the input names specified with parameter "
        "initial_types fits the column names specified in the "
        "pipeline to convert. This may happen because a "
        "ColumnTransformer follows a transformer without "
        "any mapped converter in a pipeline." % (
            i, [n[0] for n in inputs]))
def get_column_indices(indices, inputs, multiple):
    """
    Returns the requested graph inputs based on their
    indices or names. See :func:`get_column_index`.

    :param indices: variables indices or names
    :param inputs: graph inputs
    :param multiple: allows column to come from multiple variables
    :return: a tuple *(variable index, list of requested indices)* if
        *multiple* is False, an ordered dictionary *{ var_index: [ list of
        requested indices ] }* if *multiple* is True

    *NotImplementedError* is raised if *multiple* is False
    and the columns belong to more than one variable.
    """
    if multiple:
        grouped = OrderedDict()
        for p in indices:
            var_index, col_index = get_column_index(p, inputs)
            grouped.setdefault(var_index, []).append(col_index)
        return grouped

    selected_var = None
    columns = []
    for p in indices:
        var_index, col_index = get_column_index(p, inputs)
        columns.append(col_index)
        if selected_var is None:
            selected_var = var_index
            continue
        if selected_var != var_index:
            raise NotImplementedError(  # pragma: no cover
                "sklearn-onnx is not able to merge multiple columns from "
                "multiple variables ({0}). You should think about merging "
                "initial types.".format([selected_var, var_index]))
    return selected_var, columns
776def _merge_initial_types(i_types, transform_inputs, merge):
777 if len(i_types) == len(transform_inputs):
778 new_types = []
779 for it, sli in zip(i_types, transform_inputs):
780 name, ty = it
781 begin, end = sli.inputs[1], sli.inputs[2]
782 delta = end - begin
783 shape = [ty.shape[0], int(delta[0])]
784 new_types.append((name, ty.__class__(shape)))
785 else:
786 raise NotImplementedError( # pragma: no cover
787 "Not implemented when i_types=%r, transform_inputs=%r."
788 "" % (i_types, transform_inputs))
789 if merge and len(new_types) > 1:
790 raise NotImplementedError( # pragma: no cover
791 "Cannot merge %r built from i_types=%r, transform_inputs=%r."
792 "" % (new_types, i_types, transform_inputs))
793 return new_types
def _to_onnx_function_column_transformer(
        model, X=None, name=None, initial_types=None,
        target_opset=None, options=None, rewrite_ops=False,
        white_op=None, black_op=None, final_types=None,
        rename_strategy=None, verbose=0,
        prefix_name=None, run_shape=False,
        single_function=True):
    """
    Converts a fitted *ColumnTransformer* into ONNX, every transformer
    is converted with @see fn to_onnx, turned into a function of domain
    ``'sklearn'`` and applied on a slice of the inputs, the results are
    concatenated along the second axis.
    Parameters have the same meaning as in @see fn to_onnx.
    Transformers equal to ``'drop'`` are skipped, ``'passthrough'``
    forwards the sliced columns. *RuntimeError* is raised if there is
    no transformer, if the columns selected from one input are not
    contiguous or if the element type of an output is unknown.
    """
    from sklearn.preprocessing import OneHotEncoder
    from ..npy.xop_variable import Variable
    from ..npy.xop import OnnxOperatorFunction, loadop
    from ..onnx_tools.onnx_manipulations import onnx_model_to_function

    OnnxConcat, OnnxSlice, OnnxIdentity = loadop('Concat', 'Slice', 'Identity')

    transformers = model.transformers_
    if len(transformers) == 0:
        raise RuntimeError(  # pragma: no cover
            "The ColumnTransformer to be converted cannot be empty.")

    if target_opset is None:
        from .. import __max_supported_opset__
        op_version = __max_supported_opset__
    elif isinstance(target_opset, int):
        op_version = target_opset
    else:  # pragma: no cover
        from .. import __max_supported_opset__
        op_version = target_opset.get('', __max_supported_opset__)

    i_types = guess_initial_types(X, initial_types)
    ops = []
    protoms = []
    output_namess = []
    for i_step, (name_step, op, column_indices) in enumerate(transformers):
        if op == 'drop':
            continue
        # The identity nodes are built from the resolved types:
        # initial_types may be None when only X is given and
        # it is overwritten further down in this loop.
        input_nodes = [OnnxIdentity(i[0], op_version=op_version)
                       for i in i_types]
        if isinstance(column_indices, slice):
            column_indices = list(range(
                column_indices.start
                if column_indices.start is not None else 0,
                column_indices.stop, column_indices.step
                if column_indices.step is not None else 1))
        elif isinstance(column_indices, (int, str)):
            column_indices = [column_indices]
        names = get_column_indices(column_indices, i_types, multiple=True)
        transform_inputs = []
        for onnx_var, onnx_is in names.items():
            if max(onnx_is) - min(onnx_is) != len(onnx_is) - 1:
                raise RuntimeError(  # pragma: no cover
                    "The converter only with contiguous columns indices not %r "
                    "for step %r." % (column_indices, name_step))
            # Slices the requested columns on axis 1.
            tr_inputs = OnnxSlice(input_nodes[onnx_var],
                                  numpy.array([onnx_is[0]], dtype=numpy.int64),
                                  numpy.array([onnx_is[-1] + 1],
                                              dtype=numpy.int64),
                                  numpy.array([1], dtype=numpy.int64),
                                  op_version=op_version)
            transform_inputs.append(tr_inputs)

        # Columns coming from several inputs are merged unless the
        # transformer (or the first step of a pipeline) is a
        # OneHotEncoder or a ColumnTransformer.
        merged_cols = False
        if len(transform_inputs) > 1:
            if isinstance(op, Pipeline):
                if not isinstance(op.steps[0][1],
                                  (OneHotEncoder, ColumnTransformer)):
                    merged_cols = True
            elif not isinstance(op, (OneHotEncoder, ColumnTransformer)):
                merged_cols = True

        if merged_cols:
            concatenated = OnnxConcat(
                *transform_inputs, op_version=op_version, axis=1)
        else:
            concatenated = transform_inputs
        initial_types = _merge_initial_types(
            i_types, transform_inputs, merged_cols)

        prefix = name_step + "__"
        step_options = _new_options(options, prefix, op)
        if prefix_name is not None:
            prefix = prefix_name + prefix

        if op == 'passthrough':
            ops.extend(concatenated)
            continue

        protom = to_onnx(
            op, name=name_step, X=X, initial_types=initial_types,
            target_opset=target_opset,
            options=step_options, rewrite_ops=rewrite_ops,
            white_op=white_op, black_op=black_op, verbose=verbose,
            as_function=True, prefix_name=prefix, run_shape=run_shape,
            single_function=False)
        protoms.append(protom)

        for o in protom.graph.output:
            if get_tensor_elem_type(o) == 0:
                raise RuntimeError(  # pragma: no cover
                    "Unabble to guess output type of output %r "
                    "from model step %d: %r." % (
                        protom.graph.output, i_step, op))
        # The hyperparameters of the transformer are stored
        # in the doc string of the function.
        jspar = 'HYPER:{"%s":%s}' % (
            op.__class__.__name__, get_sklearn_json_params(op))
        protof, fcts = onnx_model_to_function(
            protom, domain='sklearn',
            name=f"{prefix}_{op.__class__.__name__}_{id(op)}",
            doc_string=jspar)
        output_names = [f"{name_step}_{o}" for o in protof.output]
        output_namess.append(output_names)

        logger.debug("_to_onnx_function_column_transformer:%s:->%r:%r:%s",
                     op.__class__.__name__, output_names, len(protof.node), jspar)

        op = OnnxOperatorFunction(
            protof, *concatenated, output_names=output_names,
            sub_functions=list(fcts))
        ops.append(op)

    logger.debug("_to_onnx_function_column_transformer:end:(%s-%d, X=%r, "
                 "initial_types=%r, target_opset=%r, "
                 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "
                 "final_types=%r, outputs=%r)",
                 model.__class__.__name__, id(model),
                 type(X), initial_types, target_opset,
                 options, rewrite_ops, white_op, black_op,
                 final_types, i_types)

    i_vars = [Variable.from_skl2onnx_tuple(i) for i in i_types]
    if final_types is None:
        outputs_tuple = []
        for protom, output_names in zip(protoms, output_namess):
            outputs_tuple.extend([
                (n, _guess_s2o_type(o))
                for n, o in zip(output_names, protom.graph.output)])
        outputs = [Variable.from_skl2onnx_tuple(i) for i in outputs_tuple]
    else:
        outputs = final_types

    last_op = OnnxConcat(*ops, op_version=op_version, axis=1)

    onx = last_op.to_onnx(inputs=i_vars, target_opset=target_opset,
                          verbose=verbose, run_shape=run_shape,
                          outputs=outputs)

    for o in onx.graph.output:
        if get_tensor_elem_type(o) == 0:
            raise RuntimeError(  # pragma: no cover
                "Unable to guess output type of output %r "
                "from model %r." % (onx.graph.output, model))
    return onx
def to_onnx_function(model, X=None, name=None, initial_types=None,
                     target_opset=None, options=None, rewrite_ops=False,
                     white_op=None, black_op=None, final_types=None,
                     rename_strategy=None, verbose=0,
                     prefix_name=None, run_shape=False,
                     single_function=True):
    """
    Converts a model with :epkg:`sklearn-onnx`.
    The function works the same way as function @see fn to_onnx
    but every model is exported as a single function and the main
    graph represents the pipeline structure.

    :param model: model to convert or a function
        wrapped into :epkg:`_PredictScorer` with
        function :epkg:`make_scorer`
    :param X: training set (at least one row),
        can be None, it is used to infer the
        input types (*initial_types*)
    :param name: name of the produced model
    :param initial_types: if *X* is None, then *initial_types*
        must be defined
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param rewrite_ops: rewrites some existing converters,
        the changes are permanent; when set, the conversion
        is delegated to @see fn to_onnx
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes to reject
        while converting a pipeline, if empty,
        none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrite the type (if type is not None)
        and the name of every output. Any value other than
        None raises an exception unless the conversion is
        delegated to @see fn to_onnx
    :param rename_strategy: rename any name in the graph, select shorter
        names, see @see fn onnx_rename_names; when not None,
        the conversion is delegated to @see fn to_onnx
    :param verbose: display information while converting the model
    :param prefix_name: prefix for variable names
    :param run_shape: run shape inference on the final onnx model
    :param single_function: if True, the main graph only includes one node
        calling the main function
    :return: converted model
    :raises NotImplementedError: if *final_types* is not None
    :raises TypeError: if the model is neither a *Pipeline*
        nor a *ColumnTransformer* once the optional wrapping is done
    """
    # Keyword arguments every converter called below receives unchanged.
    shared = dict(
        X=X, name=name, initial_types=initial_types,
        target_opset=target_opset, options=options,
        rewrite_ops=rewrite_ops, white_op=white_op, black_op=black_op,
        final_types=final_types, rename_strategy=rename_strategy,
        verbose=verbose, run_shape=run_shape)

    # Renaming or rewritten converters are handled by the regular
    # conversion, which does not take prefix_name or single_function.
    if rename_strategy is not None or rewrite_ops:
        return to_onnx(model, **shared)

    logger.debug("to_onnx_function:begin:(%s-%d, X=%r, initial_types=%r, target_opset=%r, "
                 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "
                 "final_types=%r)",
                 model.__class__.__name__, id(model), type(X), initial_types,
                 target_opset, options, rewrite_ops, white_op, black_op,
                 final_types)

    if final_types is not None:
        raise NotImplementedError(  # pragma: no cover
            "final_types != None, not implemented yet.")

    one_step_pipeline = (
        isinstance(model, Pipeline) and len(model.steps) == 1)
    if single_function and not one_step_pipeline:
        # Wraps the model into a one-step pipeline and converts that
        # pipeline instead, wrapping is disabled for the nested call.
        wrapped = Pipeline(steps=[('main', model)])
        return to_onnx_function(
            wrapped, prefix_name=prefix_name, single_function=False,
            **shared)

    # Picks the converter dedicated to the container type.
    if isinstance(model, Pipeline):
        converter = _to_onnx_function_pipeline
    elif isinstance(model, ColumnTransformer):
        converter = _to_onnx_function_column_transformer
    else:
        raise TypeError(  # pragma: no cover
            f"Unexpected type {type(model)!r} for model to convert.")

    return converter(
        model, prefix_name=prefix_name, single_function=single_function,
        **shared)