Coverage for mlprodict/onnx_conv/convert.py: 88%

1# -*- encoding: utf-8 -*-

2# pylint: disable=C0302,R0914

3"""

4@file

5@brief Overloads a conversion function.

6"""

7import json

8import pprint

9from collections import OrderedDict

10import logging

11import numpy

12from onnx import ValueInfoProto

13import pandas

14try:

15 from sklearn.metrics._scorer import _PredictScorer

16except ImportError: # pragma: no cover

17 # scikit-learn < 0.22

18 from sklearn.metrics.scorer import _PredictScorer

19from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version

20from sklearn.pipeline import Pipeline, FeatureUnion

21from sklearn.compose import ColumnTransformer

22from sklearn.utils.metaestimators import _BaseComposition

23from skl2onnx.common.data_types import (

24 FloatTensorType, DoubleTensorType, DataType, guess_numpy_type,

25 StringTensorType, Int64TensorType, _guess_type_proto)

26from skl2onnx import convert_sklearn

27from skl2onnx.algebra.onnx_operator_mixin import OnnxOperatorMixin

28from skl2onnx.algebra.type_helper import _guess_type

29from ..onnx_tools.onnx_manipulations import onnx_rename_names

30from ..onnx_tools.onnx2py_helper import (

31 guess_dtype, get_tensor_shape, get_tensor_elem_type)

32from .register_rewritten_converters import (

33 register_rewritten_operators, register_new_operators)

34from .register import register_converters

35from .scorers import CustomScorerTransform

38logger = logging.getLogger('mlprodict')

def _fix_opset_skl2onnx():
    """
    Makes ``skl2onnx.__max_supported_opset__`` equal to the value
    ``__max_supported_opset__`` exposed by the parent package.
    Nothing is written when both values already agree.
    """
    import skl2onnx
    from .. import __max_supported_opset__ as max_opset
    if skl2onnx.__max_supported_opset__ == max_opset:
        return
    skl2onnx.__max_supported_opset__ = max_opset  # pragma: no cover

def convert_scorer(fct, initial_types, name=None,
                   target_opset=None, options=None,
                   custom_conversion_functions=None,
                   custom_shape_calculators=None,
                   custom_parsers=None, white_op=None,
                   black_op=None, final_types=None,
                   verbose=0):
    """
    Converts a scorer into :epkg:`ONNX` assuming
    there exists a converter associated to it.
    The function wraps the function into a custom
    transformer, then calls function *convert_sklearn*
    from :epkg:`sklearn-onnx`.

    :param fct: function to convert (or a scorer from :epkg:`scikit-learn`),
        if it has an attribute *_score_func*, the wrapped function and
        its keyword arguments (*_kwargs*) are extracted from it
    :param initial_types: types information
    :param name: name of the produced model, if None, a name is
        built from the name of the function
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param custom_conversion_functions: a dictionary for specifying the user
        customized conversion function, it takes precedence over
        registered converters
    :param custom_shape_calculators: a dictionary for specifying the user
        customized shape calculator it takes precedence over registered
        shape calculators.
    :param custom_parsers: parsers determine which outputs is expected
        for which particular task, default parsers are
        defined for classifiers, regressors, pipeline but
        they can be rewritten, *custom_parsers* is a dictionary
        ``{ type: fct_parser(scope, model, inputs,
        custom_parsers=None) }``
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes allowed
        while converting a pipeline, if empty, none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param verbose: displays information while converting
    :return: :epkg:`ONNX` graph
    """
    if hasattr(fct, '_score_func'):
        kwargs = fct._kwargs
        fct = fct._score_func
    else:
        kwargs = None  # pragma: no cover
    if name is None:
        name = f"mlprodict_fct_ONNX({fct.__name__})"
    tr = CustomScorerTransform(fct.__name__, fct, kwargs)
    _fix_opset_skl2onnx()
    # The name is forwarded so that the parameter actually names the
    # produced model, as it does for regular models in to_onnx.
    return convert_sklearn(
        tr, initial_types=initial_types, name=name,
        target_opset=target_opset, options=options,
        custom_conversion_functions=custom_conversion_functions,
        custom_shape_calculators=custom_shape_calculators,
        custom_parsers=custom_parsers, white_op=white_op,
        black_op=black_op, final_types=final_types,
        verbose=verbose)

107

108

def guess_initial_types(X, initial_types):
    """
    Guesses initial types from an array or a dataframe.

    :param X: array or dataframe, only the first row is looked at
        when it is a numpy array or a dataframe
    :param initial_types: hints about X, returned unchanged
        when not None
    :return: data types, a list of tuples *(name, type)*;
        a dataframe gives one entry per column with shape
        ``[None, 1]``, anything else gives a single entry named ``'X'``
    """
    if initial_types is not None:
        # Explicit hints always win over guessing.
        return initial_types
    if X is None:
        raise NotImplementedError(  # pragma: no cover
            "Initial types must be specified.")

    if isinstance(X, (numpy.ndarray, pandas.DataFrame)):
        X = X[:1]
    if not isinstance(X, pandas.DataFrame):
        return [('X', _guess_type(X))]

    guessed = []
    for col in X.columns:
        values = X[col].values
        if isinstance(values[0], (str, numpy.str_)):
            col_type = StringTensorType()
        else:
            col_type = _guess_type(values)
        # every dataframe column becomes a single-column input
        col_type.shape = [None, 1]
        guessed.append((col, col_type))
    return guessed

136

137

138def _replace_tensor_type(schema, tensor_type):

139 res = []

140 for name, ty in schema:

141 cl = ty.__class__

142 if cl in (FloatTensorType, DoubleTensorType) and cl != tensor_type:

143 ty = tensor_type(ty.shape)

144 res.append((name, ty))

145 return res

146

147

def guess_schema_from_data(X, tensor_type=None, schema=None):
    """
    Guesses initial types from a dataset.

    @param      X               dataset (dataframe, array)
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns)

    When every column is two-dimensional, has an unknown first
    dimension and shares the same type class, the columns are
    grouped into a single input ``'X'`` whose width is the sum
    of the widths of all columns. Otherwise the schema is
    returned as guessed.
    """
    columns = guess_initial_types(X, schema)
    if tensor_type is not None:
        columns = _replace_tensor_type(columns, tensor_type)

    # Grouping column
    seen = []
    for _, col in columns:
        if len(col.shape) != 2 or col.shape[0] is not None:
            return columns  # pragma: no cover
        if seen and col.__class__ not in seen:
            return columns  # pragma: no cover
        if not seen:
            seen.append(col.__class__)

    merged_cls = seen[0]
    total_width = sum(col.shape[1] for _, col in columns)
    return [('X', merged_cls([None, total_width]))]

174

175

def get_inputs_from_data(X, schema=None):
    """
    Produces input data for *onnx* runtime.

    @param      X       data, a numpy array or a dataframe
    @param      schema  schema if None, schema is guessed with
                        @see fn guess_schema_from_data
    @return             input data, a dictionary ``{ name: array }``,
                        every dataframe column is reshaped
                        into ``(-1, 1)``
    """
    def _cast_data(X, ct):
        # first matching tensor type decides the numpy type
        casts = (
            (FloatTensorType, numpy.float32),
            (DoubleTensorType, numpy.float64),
            (StringTensorType, numpy.str_),
            (Int64TensorType, numpy.int64),
        )
        for tensor_cls, np_type in casts:
            if isinstance(ct, tensor_cls):
                return X.astype(np_type)
        raise RuntimeError(  # pragma: no cover
            f"Unexpected column type {ct} for type {type(X)}.")

    if schema is None:
        schema = guess_schema_from_data(X)

    if isinstance(X, numpy.ndarray):
        if len(schema) != 1:
            raise RuntimeError(  # pragma: no cover
                "More than one column but input is an array.")
        input_name, input_type = schema[0][0], schema[0][1]
        return {input_name: _cast_data(X, input_type)}

    if isinstance(X, pandas.DataFrame):
        n_columns = X.shape[1]
        if len(schema) != n_columns:
            raise RuntimeError(  # pragma: no cover
                "Mismatch between onnx columns {} and DataFrame columns {}"
                "".format(len(schema), n_columns))
        feeds = {}
        for sch, c in zip(schema, X.columns):
            feeds[sch[0]] = _cast_data(X[c].values, sch[1]).reshape((-1, 1))
        return feeds

    raise TypeError(  # pragma: no cover
        f"Unexpected type {type(X)}, expecting an array or a dataframe.")

213

214

def guess_schema_from_model(model, tensor_type=None, schema=None):
    """
    Guesses initial types from a model.

    @param      model           model
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one, if None, types are left
                                unchanged
    @param      schema          known schema
    @return                     schema (list of typed and named columns)

    The schema is guessed from attribute *coef_* (one input ``'X'``
    whose width is the last dimension of *coef_*) or from the key
    ``'feature_names'`` of the dictionary returned by method
    *dump_model* (one single-column input per feature).
    If *schema* is given, it is returned after checking it has as many
    columns as the guessed one, whenever guessing is possible.
    The function raises *NotImplementedError* if nothing can be guessed
    and *RuntimeError* if both schemas have a different length.
    """
    def _retype(types):
        # tensor_type=None means no replacement: handled here because
        # instantiating None as a tensor type raises a TypeError.
        if tensor_type is None:
            return [(name, ty) for name, ty in types]
        return _replace_tensor_type(types, tensor_type)

    if schema is not None:
        try:
            guessed = guess_schema_from_model(model)
        except NotImplementedError:  # pragma: no cover
            return _retype(schema)
        if len(guessed) != len(schema):
            raise RuntimeError(  # pragma: no cover
                "Given schema and guessed schema are not the same:"
                "\nGIVEN: {}\n-----\nGUESSED:\n{}".format(
                    schema, guessed))
        return _retype(schema)

    has_dump = hasattr(model, 'dump_model')
    dumped = None
    if hasattr(model, 'coef_'):
        # linear model, coef_ is (n_features,) for a single target and
        # (n_targets, n_features) otherwise: the last dimension is
        # always the number of features.
        init = [('X', FloatTensorType([None, model.coef_.shape[-1]]))]
        return _retype(init)
    elif has_dump:
        dumped = model.dump_model()
        if isinstance(dumped, dict) and 'feature_names' in dumped:
            names = dumped['feature_names']
            init = [(name, FloatTensorType([None, 1])) for name in names]
            return _retype(init)

    # Nothing worked, the exception gives as much information
    # as possible about the model.
    data = pprint.pformat(getattr(model, '__dict__', {}))
    dirs = pprint.pformat(dir(model))
    if has_dump:  # pragma: no cover
        if dumped is None:
            # a model exposing both coef_ and dump_model never gets here,
            # this only covers a dump_model returning None
            dumped = model.dump_model()
        keys = list(sorted(dumped))
        last = pprint.pformat([keys, dumped])
        if len(last) >= 200000:
            last = last[:200000] + "\n..."
    else:
        last = ""
    raise NotImplementedError(  # pragma: no cover
        "Unable to guess schema for model {}\n{}\n----\n{}\n------\n{}".format(
            model.__class__, data, dirs, last))

261

262

def _guess_type_(X, itype, dtype):
    """
    Guesses the initial types and the numerical type used
    by the conversion.

    :param X: data used to guess the types, see
        :func:`guess_initial_types`
    :param itype: hints about the initial types
    :param dtype: requested type, if None, it is inferred from *X*
        or the initial types and forced to *float32* unless it is
        equal to *float64*
    :return: tuple *(initial_types, dtype, new_dtype)*, *new_dtype*
        is the numpy type derived from *dtype*, a *NotImplementedError*
        is raised when it is not one of float32, float64, int64,
        int32, float16
    """
    initial_types = guess_initial_types(X, itype)

    if dtype is None:
        if hasattr(X, 'dtypes'):  # DataFrame
            dtype = numpy.float32
        elif hasattr(X, 'dtype'):
            dtype = X.dtype
        elif hasattr(X, 'type'):
            dtype = guess_numpy_type(X.type)
        elif isinstance(initial_types[0], ValueInfoProto):
            elem_type = initial_types[0].type.tensor_type.elem_type
            dtype = guess_dtype(elem_type)
        elif initial_types is not None:
            dtype = guess_numpy_type(initial_types[0][1])
        else:
            raise RuntimeError(  # pragma: no cover
                f"dtype cannot be guessed: {type(X)}")
        # an inferred type is either double or single precision
        if dtype != numpy.float64:
            dtype = numpy.float32

    if dtype is None:
        raise RuntimeError("dtype cannot be None")  # pragma: no cover

    if isinstance(dtype, FloatTensorType):
        dtype = numpy.float32  # pragma: no cover
    elif isinstance(dtype, DoubleTensorType):
        dtype = numpy.float64  # pragma: no cover

    if isinstance(dtype, numpy.ndarray):
        new_dtype = dtype.dtype  # pragma: no cover
    elif isinstance(dtype, DataType):
        new_dtype = numpy.float32  # pragma: no cover
    else:
        new_dtype = dtype

    supported = (numpy.float32, numpy.float64, numpy.int64,
                 numpy.int32, numpy.float16)
    if new_dtype not in supported:
        raise NotImplementedError(  # pragma: no cover
            f"dtype should be real not {new_dtype} ({dtype})")
    return initial_types, dtype, new_dtype

297

298

def to_onnx(model, X=None, name=None, initial_types=None,
            target_opset=None, options=None, rewrite_ops=False,
            white_op=None, black_op=None, final_types=None,
            rename_strategy=None, verbose=0,
            as_function=False, prefix_name=None,
            run_shape=False, single_function=True):
    """
    Converts a model using :epkg:`sklearn-onnx`.

    :param model: model to convert or a function
        wrapped into :epkg:`_PredictScorer` with
        function :epkg:`make_scorer`
    :param X: training set (at least one row),
        can be None, it is used to infer the
        input types (*initial_types*), it must be an *OrderedDict*
        if *model* is a scorer
    :param initial_types: if *X* is None, then *initial_types*
        must be defined
    :param name: name of the produced model
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param rewrite_ops: rewrites some existing converters,
        the rewritten converters are restored once the conversion
        is done, even if it fails
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes allowed
        while converting a pipeline, if empty,
        none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param rename_strategy: rename any name in the graph, select shorter
        names, see @see fn onnx_rename_names
    :param verbose: display information while converting the model
    :param as_function: exposes every model in a pipeline as a function,
        the main graph contains the pipeline structure,
        see :ref:`onnxsklearnfunctionsrst` for an example
    :param prefix_name: used if *as_function* is True, to give
        a prefix to variable in a pipeline
    :param run_shape: run shape inference
    :param single_function: if *as_function* is True, the function returns one graph
        with one call to the main function if *single_function* is True or
        a list of node corresponding to the graph structure
    :return: converted model

    The function rewrites function *to_onnx* from :epkg:`sklearn-onnx`
    but may change a few converters if *rewrite_ops* is True.
    For example, :epkg:`ONNX` only supports *TreeEnsembleRegressor*
    for float but not for double. It becomes available
    if ``rewrite_ops=True``.

    .. faqref::
        :title: How to deal with a dataframe as input?

        Each column of the dataframe is considered as an named input.
        The first step is to make sure that every column type is correct.
        :epkg:`pandas` tends to select the least generic type to
        hold the content of one column. :epkg:`ONNX` does not automatically
        cast the data it receives. The data must have the same type when
        the model is converted and when the converted model receives
        the data to predict.

        .. runpython::
            :showcode:
            :warningout: DeprecationWarning

            from io import StringIO
            from textwrap import dedent
            import numpy
            import pandas
            from pyquickhelper.pycode import ExtTestCase
            from sklearn.preprocessing import OneHotEncoder
            from sklearn.pipeline import Pipeline
            from sklearn.compose import ColumnTransformer
            from mlprodict.onnx_conv import to_onnx
            from mlprodict.onnxrt import OnnxInference

            text = dedent('''
                __SCHEMA__
                7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
                7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
                7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
                11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
                ''')
            text = text.replace(
                "__SCHEMA__",
                "fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,"
                "free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,"
                "alcohol,quality,color")

            X_train = pandas.read_csv(StringIO(text))
            for c in X_train.columns:
                if c != 'color':
                    X_train[c] = X_train[c].astype(numpy.float32)
            numeric_features = [c for c in X_train if c != 'color']

            pipe = Pipeline([
                ("prep", ColumnTransformer([
                    ("color", Pipeline([
                        ('one', OneHotEncoder()),
                        ('select', ColumnTransformer(
                            [('sel1', 'passthrough', [0])]))
                    ]), ['color']),
                    ("others", "passthrough", numeric_features)
                ])),
            ])

            pipe.fit(X_train)
            pred = pipe.transform(X_train)
            print(pred)

            model_onnx = to_onnx(pipe, X_train, target_opset=12)
            oinf = OnnxInference(model_onnx)

            # The dataframe is converted into a dictionary,
            # each key is a column name, each value is a numpy array.
            inputs = {c: X_train[c].values for c in X_train.columns}
            inputs = {c: v.reshape((v.shape[0], 1)) for c, v in inputs.items()}

            onxp = oinf.run(inputs)
            print(onxp)

    .. versionchanged:: 0.9
        Parameter *as_function* was added.
    """
    logger.debug("to_onnx(%s, X=%r, initial_types=%r, target_opset=%r, "
                 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "
                 "final_types=%r)",
                 model.__class__.__name__, type(X), initial_types,
                 target_opset, options, rewrite_ops, white_op, black_op,
                 final_types)

    if isinstance(model, OnnxOperatorMixin):
        if not hasattr(model, 'op_version'):
            raise RuntimeError(  # pragma: no cover
                f"Missing attribute 'op_version' for type '{type(model)}'.")
        _fix_opset_skl2onnx()
        # NOTE: verbose is not forwarded to model.to_onnx.
        return model.to_onnx(
            X=X, name=name, options=options, black_op=black_op,
            white_op=white_op, final_types=final_types,
            target_opset=target_opset)

    if rewrite_ops:
        old_values, old_shapes = register_rewritten_operators()
        register_new_operators()
        register_converters()
    else:
        old_values, old_shapes = {}, {}

    try:
        if as_function and isinstance(
                model, (ColumnTransformer, Pipeline, FeatureUnion)):
            res = to_onnx_function(
                model, X=X, name=name, initial_types=initial_types,
                target_opset=target_opset, options=options,
                rewrite_ops=False,  # already handled
                white_op=white_op, black_op=black_op, final_types=final_types,
                rename_strategy=None,  # already handled
                verbose=verbose, prefix_name=prefix_name,
                run_shape=run_shape, single_function=single_function)

        elif isinstance(model, _PredictScorer):
            if X is not None and not isinstance(X, OrderedDict):
                raise ValueError("For a scorer, parameter X should be a OrderedDict not {}."
                                 "".format(type(X)))
            if initial_types is None:
                if X is None:
                    # Nothing to guess the types from.
                    raise ValueError(
                        "For a scorer, X or initial_types must be specified.")
                dts = []
                initial_types = []
                for k, v in X.items():
                    if hasattr(v, 'dtype'):
                        dtype = guess_numpy_type(v.dtype)
                    else:
                        dtype = v  # pragma: no cover
                    it, _, ndt = _guess_type_(v, None, dtype)
                    # every guessed type takes the name of the input
                    initial_types.extend((k, t[1]) for t in it)
                    dts.append(ndt)
                ndt = set(dts)
                if len(ndt) != 1:
                    raise RuntimeError(  # pragma: no cover
                        f"Multiple dtype is not efficient {ndt}.")
            res = convert_scorer(model, initial_types, name=name,
                                 target_opset=target_opset, options=options,
                                 black_op=black_op, white_op=white_op,
                                 final_types=final_types, verbose=verbose)
        else:
            if name is None:
                name = f"mlprodict_ONNX({model.__class__.__name__})"

            initial_types, _, _ = _guess_type_(X, initial_types, None)

            _fix_opset_skl2onnx()
            res = convert_sklearn(model, initial_types=initial_types, name=name,
                                  target_opset=target_opset, options=options,
                                  black_op=black_op, white_op=white_op,
                                  final_types=final_types, verbose=verbose)
    finally:
        # The rewritten converters are global: they must be restored
        # even if the conversion fails, otherwise the next conversions
        # silently use them.
        register_rewritten_operators(old_values, old_shapes)

    # optimisation
    if rename_strategy is not None:
        res = onnx_rename_names(res, strategy=rename_strategy)
    return res

503

504

def _guess_s2o_type(vtype: ValueInfoProto):
    """
    Builds the :epkg:`sklearn-onnx` type of an ONNX value from
    its element type and its shape.
    """
    elem_type = get_tensor_elem_type(vtype)
    shape = get_tensor_shape(vtype)
    return _guess_type_proto(elem_type, shape)

508

509

510def _new_options(options, prefix, sklop):

511 if sklop is None:

512 raise RuntimeError( # pragma: no cover

513 "sklop cannot be None.")

514 if isinstance(sklop, str):

515 return None # pragma: no cover

516 if options is None:

517 step_options = None

518 else:

519 step_options = {}

520 for k, v in options.items():

521 if k.startswith(prefix):

522 step_options[k[len(prefix):]] = v

523 elif '__' in k:

524 step_options[k.split('__', maxsplit=1)[1]] = v

525 if isinstance(sklop, _BaseComposition):

526 step_options[k] = v

527 else:

528 from skl2onnx._supported_operators import _get_sklearn_operator_name

529 from skl2onnx.common._registration import get_converter

530 alias = _get_sklearn_operator_name(type(sklop))

531 if alias is None:

532 step_options[k] = v

533 else:

534 conv = get_converter(alias)

535 allowed = conv.get_allowed_options()

536 if allowed is not None and k in allowed:

537 step_options[k] = v

538 return step_options

539

540

541class _ParamEncoder(json.JSONEncoder):

542 def default(self, obj): # pylint: disable=W0237

543 try:

544 return json.JSONEncoder.default(self, obj)

545 except TypeError as e:

546 # Unable to serialize

547 return '{"classname": "%s", "EXC": "%s"}' % (

548 obj.__class__.__name__, str(e))

549

550

def get_sklearn_json_params(model):
    """
    Retrieves all the parameters of a :epkg:`scikit-learn` model.

    :param model: model, its method ``get_params(deep=False)`` is called
    :return: the parameters serialized as a JSON string,
        a *RuntimeError* is raised if the serialization fails
    """
    pars = model.get_params(deep=False)
    try:
        serialized = json.dumps(pars, cls=_ParamEncoder)
    except TypeError as e:  # pragma: no cover
        raise RuntimeError(
            f"Unable to serialize parameters {pprint.pformat(pars)}.") from e
    else:
        return serialized

561

562

def _to_onnx_function_pipeline(
        model, X=None, name=None, initial_types=None,
        target_opset=None, options=None, rewrite_ops=False,
        white_op=None, black_op=None, final_types=None,
        rename_strategy=None, verbose=0,
        prefix_name=None, run_shape=False,
        single_function=True):
    """
    Converts a pipeline into ONNX, every step is converted with
    @see fn to_onnx, turned into a function of domain ``'sklearn'``
    and the functions are chained, the outputs of a step being
    the inputs of the next one.
    Parameters *rename_strategy* and *single_function* are not used
    by this function.

    :return: ONNX model built from the last chained operator
    """
    from ..npy.xop_variable import Variable
    from ..npy.xop import OnnxOperatorFunction, loadop
    from ..onnx_tools.onnx_manipulations import onnx_model_to_function

    OnnxIdentity = loadop('Identity')

    if len(model.steps) == 0:
        raise RuntimeError(  # pragma: no cover
            "The pipeline to be converted cannot be empty.")

    # opset used for the Identity nodes linking the steps
    if isinstance(target_opset, int):
        op_version = target_opset
    else:
        from .. import __max_supported_opset__ as max_opset
        if target_opset is None:
            op_version = max_opset
        else:  # pragma: no cover
            op_version = target_opset.get('', max_opset)

    i_types = guess_initial_types(X, initial_types)
    input_nodes = [OnnxIdentity(i[0], op_version=op_version)
                   for i in i_types]

    inputs = i_types
    last_op = None
    for i_step, step in enumerate(model.steps):
        step_name, step_model = step[0], step[1]
        step_class = step_model.__class__.__name__

        prefix = step_name + "__"
        step_options = _new_options(options, prefix, step_model)
        if prefix_name is not None:
            prefix = prefix_name + prefix

        protom = to_onnx(
            step_model, name=name, initial_types=inputs,
            target_opset=target_opset,
            options=step_options, rewrite_ops=rewrite_ops,
            white_op=white_op, black_op=black_op, verbose=verbose,
            as_function=True, prefix_name=prefix, run_shape=run_shape,
            single_function=False)
        for o in protom.graph.output:
            if get_tensor_elem_type(o) == 0:
                raise RuntimeError(  # pragma: no cover
                    "Unabble to guess output type of output %r "
                    "from model step %d: %r, output=%r." % (
                        protom.graph.output, i_step, step_model, o))

        # hyperparameters are stored in the documentation of the function
        jspar = 'HYPER:{"%s":%s}' % (
            step_class, get_sklearn_json_params(step_model))
        protof, subf = onnx_model_to_function(
            protom, domain='sklearn',
            name=f"{prefix}_{step_class}_{i_step}",
            doc_string=jspar)

        input_names = [f"{step_name}_{o}" for o in protof.input]
        if last_op is not None:
            # the previous step feeds this one
            if len(input_names) == 1:
                input_nodes = [OnnxIdentity(
                    last_op, output_names=input_names[0],
                    op_version=op_version)]
            else:
                input_nodes = [  # pragma: no cover
                    OnnxIdentity(last_op[i], output_names=[n],  # pylint: disable=E1136
                                 op_version=op_version)
                    for i, n in enumerate(input_names)]
        output_names = [f"{step_name}_{o}" for o in protof.output]

        logger.debug("_to_onnx_function_pipeline:%s:%r->%r:%r:%s",
                     step_class,
                     input_names, output_names,
                     len(protof.node), jspar)

        last_op = OnnxOperatorFunction(
            protof, *input_nodes, output_names=output_names,
            sub_functions=subf)
        # types of the outputs become the initial types of the next step
        inputs = [
            ('X%d' % i, _guess_s2o_type(o))
            for i, o in enumerate(protom.graph.output)]

    logger.debug("_to_onnx_function_pipeline:end:(%s-%d, X=%r, "
                 "initial_types=%r, target_opset=%r, "
                 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "
                 "final_types=%r, outputs=%r)",
                 model.__class__.__name__, id(model),
                 type(X), initial_types,
                 target_opset, options, rewrite_ops, white_op, black_op,
                 final_types, inputs)

    i_vars = [Variable.from_skl2onnx_tuple(i) for i in i_types]
    if final_types is not None:
        outputs = final_types
    else:
        # outputs are named and typed after the last converted step
        outputs_tuple = [
            (n, _guess_s2o_type(o))
            for n, o in zip(output_names, protom.graph.output)]
        outputs = [Variable.from_skl2onnx_tuple(i) for i in outputs_tuple]

    onx = last_op.to_onnx(inputs=i_vars, target_opset=target_opset,
                          verbose=verbose, run_shape=run_shape,
                          outputs=outputs)

    for o in onx.graph.output:
        if get_tensor_elem_type(o) == 0:
            raise RuntimeError(  # pragma: no cover
                "Unable to guess output type of output %r "
                "from model %r." % (onx.graph.output, model))
    return onx

674

675

def get_column_index(i, inputs):
    """
    Returns a tuples (variable index, column index in that variable).
    The function has two different behaviours, one when *i* (column index)
    is an integer (python or numpy integer), another one when *i*
    is a string (column name).
    If *i* is a string, the function looks for input name with
    this name and returns `(index, 0)`.
    If *i* is an integer, let's assume first we have two inputs
    `I0 = FloatTensorType([None, 2])` and `I1 = FloatTensorType([None, 3])`,
    in this case, here are the results:

    ::

        get_column_index(0, inputs) -> (0, 0)
        get_column_index(1, inputs) -> (0, 1)
        get_column_index(2, inputs) -> (1, 0)
        get_column_index(3, inputs) -> (1, 1)
        get_column_index(4, inputs) -> (1, 2)

    :param i: column index or column name
    :param inputs: list of tuples *(name, type)*, the number of columns
        of an input is ``type.shape[1]``
    :return: tuple *(variable index, column index in that variable)*

    The function raises *RuntimeError* if the column cannot be found
    or if an input with an unknown number of columns is met before
    the column is found.
    """
    if isinstance(i, (int, numpy.integer)):
        # numpy integers are accepted as well: indices often come
        # from numpy arrays and must not be mistaken for names.
        i = int(i)
        if i == 0:
            # Useful shortcut, skips the case when end is None
            # (unknown dimension)
            return 0, 0
        pos = 0
        for vi, inp in enumerate(inputs):
            width = inp[1].shape[1]
            if width is None:
                raise RuntimeError(  # pragma: no cover
                    "Cannot extract a specific column %r when "
                    "one input (%r) has unknown "
                    "dimension." % (i, inp))
            end = pos + width
            if pos <= i < end:
                return vi, i - pos
            pos = end
        # every input was visited, pos is the total number of columns
        raise RuntimeError(  # pragma: no cover
            "Input %r (i=%r, end=%r) is not available in\n%r" % (
                len(inputs), i, pos, pprint.pformat(inputs)))

    for ind, inp in enumerate(inputs):
        if inp[0] == i:
            return ind, 0
    raise RuntimeError(  # pragma: no cover
        "Unable to find column name %r among names %r. "
        "Make sure the input names specified with parameter "
        "initial_types fits the column names specified in the "
        "pipeline to convert. This may happen because a "
        "ColumnTransformer follows a transformer without "
        "any mapped converter in a pipeline." % (
            i, [n[0] for n in inputs]))

736

737

def get_column_indices(indices, inputs, multiple):
    """
    Returns the requested graph inputs based on their
    indices or names. See :func:`get_column_index`.

    :param indices: variables indices or names
    :param inputs: graph inputs
    :param multiple: allows column to come from multiple variables
    :return: a tuple *(variable index, list of requested indices)* if
        *multiple* is False, an ordered dictionary *{ var_index: [ list of
        requested indices ] }* if *multiple* is True

    If *multiple* is False, all columns must come from the same
    variable, otherwise a *NotImplementedError* is raised.
    """
    if multiple:
        grouped = OrderedDict()
        for p in indices:
            var_index, col_index = get_column_index(p, inputs)
            grouped.setdefault(var_index, []).append(col_index)
        return grouped

    onnx_var = None
    onnx_is = []
    for p in indices:
        var_index, col_index = get_column_index(p, inputs)
        onnx_is.append(col_index)
        if onnx_var is None:
            onnx_var = var_index
            continue
        if onnx_var != var_index:
            cols = [onnx_var, var_index]
            raise NotImplementedError(  # pragma: no cover
                "sklearn-onnx is not able to merge multiple columns from "
                "multiple variables ({0}). You should think about merging "
                "initial types.".format(cols))
    return onnx_var, onnx_is

774

775

776def _merge_initial_types(i_types, transform_inputs, merge):

777 if len(i_types) == len(transform_inputs):

778 new_types = []

779 for it, sli in zip(i_types, transform_inputs):

780 name, ty = it

781 begin, end = sli.inputs[1], sli.inputs[2]

782 delta = end - begin

783 shape = [ty.shape[0], int(delta[0])]

784 new_types.append((name, ty.__class__(shape)))

785 else:

786 raise NotImplementedError( # pragma: no cover

787 "Not implemented when i_types=%r, transform_inputs=%r."

788 "" % (i_types, transform_inputs))

789 if merge and len(new_types) > 1:

790 raise NotImplementedError( # pragma: no cover

791 "Cannot merge %r built from i_types=%r, transform_inputs=%r."

792 "" % (new_types, i_types, transform_inputs))

793 return new_types

794

795

796def _to_onnx_function_column_transformer(

797 model, X=None, name=None, initial_types=None,

798 target_opset=None, options=None, rewrite_ops=False,

799 white_op=None, black_op=None, final_types=None,

800 rename_strategy=None, verbose=0,

801 prefix_name=None, run_shape=False,

802 single_function=True):

803

804 from sklearn.preprocessing import OneHotEncoder

805 from ..npy.xop_variable import Variable

806 from ..npy.xop import OnnxOperatorFunction, loadop

807 from ..onnx_tools.onnx_manipulations import onnx_model_to_function

808

809 OnnxConcat, OnnxSlice, OnnxIdentity = loadop('Concat', 'Slice', 'Identity')

810

811 transformers = model.transformers_

812 if len(transformers) == 0:

813 raise RuntimeError( # pragma: no cover

814 "The ColumnTransformer to be converted cannot be empty.")

815

816 if target_opset is None:

817 from .. import __max_supported_opset__

818 op_version = __max_supported_opset__

819 elif isinstance(target_opset, int):

820 op_version = target_opset

821 else: # pragma: no cover

822 from .. import __max_supported_opset__

823 op_version = target_opset.get('', __max_supported_opset__)

824

825 i_types = guess_initial_types(X, initial_types)

826 ops = []

827 protoms = []

828 output_namess = []

829 for i_step, (name_step, op, column_indices) in enumerate(transformers):

830 if op == 'drop':

831 continue

832 input_nodes = [OnnxIdentity(i[0], op_version=op_version)

833 for i in initial_types]

834 if isinstance(column_indices, slice):

835 column_indices = list(range(

836 column_indices.start

837 if column_indices.start is not None else 0,

838 column_indices.stop, column_indices.step

839 if column_indices.step is not None else 1))

840 elif isinstance(column_indices, (int, str)):

841 column_indices = [column_indices]

842 names = get_column_indices(column_indices, i_types, multiple=True)

843 transform_inputs = []

844 for onnx_var, onnx_is in names.items():

845 if max(onnx_is) - min(onnx_is) != len(onnx_is) - 1:

846 raise RuntimeError( # pragma: no cover

847 "The converter only with contiguous columns indices not %r "

848 "for step %r." % (column_indices, name_step))

849 tr_inputs = OnnxSlice(input_nodes[onnx_var],

850 numpy.array([onnx_is[0]], dtype=numpy.int64),

851 numpy.array([onnx_is[-1] + 1],

852 dtype=numpy.int64),

853 numpy.array([1], dtype=numpy.int64),

854 op_version=op_version)

855 transform_inputs.append(tr_inputs)

856

857 merged_cols = False

858 if len(transform_inputs) > 1:

859 if isinstance(op, Pipeline):

860 if not isinstance(op.steps[0][1],

861 (OneHotEncoder, ColumnTransformer)):

862 merged_cols = True

863 elif not isinstance(op, (OneHotEncoder, ColumnTransformer)):

864 merged_cols = True

865

866 if merged_cols:

867 concatenated = OnnxConcat(

868 *transform_inputs, op_version=op_version, axis=1)

869 else:

870 concatenated = transform_inputs

871 initial_types = _merge_initial_types(

872 i_types, transform_inputs, merged_cols)

873

874 prefix = name_step + "__"

875 step_options = _new_options(options, prefix, op)

876 if prefix_name is not None:

877 prefix = prefix_name + prefix

878

879 if op == 'passthrough':

880 ops.extend(concatenated)

881 continue

882

883 protom = to_onnx(

884 op, name=name_step, X=X, initial_types=initial_types,

885 target_opset=target_opset,

886 options=step_options, rewrite_ops=rewrite_ops,

887 white_op=white_op, black_op=black_op, verbose=verbose,

888 as_function=True, prefix_name=prefix, run_shape=run_shape,

889 single_function=False)

890 protoms.append(protom)

891

892 for o in protom.graph.output:

893 if get_tensor_elem_type(o) == 0:

894 raise RuntimeError( # pragma: no cover

895 "Unabble to guess output type of output %r "

896 "from model step %d: %r." % (

897 protom.graph.output, i_step, op))

898 jspar = 'HYPER:{"%s":%s}' % (

899 op.__class__.__name__, get_sklearn_json_params(op))

900 protof, fcts = onnx_model_to_function(

901 protom, domain='sklearn',

902 name=f"{prefix}_{op.__class__.__name__}_{id(op)}",

903 doc_string=jspar)

904 output_names = [f"{name_step}_{o}" for o in protof.output]

905 output_namess.append(output_names)

906

907 logger.debug("_to_onnx_function_column_transformer:%s:->%r:%r:%s",

908 op.__class__.__name__, output_names, len(protof.node), jspar)

909

910 op = OnnxOperatorFunction(

911 protof, *concatenated, output_names=output_names,

912 sub_functions=list(fcts))

913 ops.append(op)

914

915 logger.debug("_to_onnx_function_column_transformer:end:(%s-%d, X=%r, "

916 "initial_types=%r, target_opset=%r, "

917 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "

918 "final_types=%r, outputs=%r)",

919 model.__class__.__name__, id(model),

920 type(X), initial_types, target_opset,

921 options, rewrite_ops, white_op, black_op,

922 final_types, i_types)

923

924 i_vars = [Variable.from_skl2onnx_tuple(i) for i in i_types]

925 if final_types is None:

926 outputs_tuple = []

927 for protom, output_names in zip(protoms, output_namess):

928 outputs_tuple.extend([

929 (n, _guess_s2o_type(o))

930 for i, (n, o) in enumerate(zip(output_names, protom.graph.output))])

931 outputs = [Variable.from_skl2onnx_tuple(i) for i in outputs_tuple]

932 else:

933 outputs = final_types

934

935 last_op = OnnxConcat(*ops, op_version=op_version, axis=1)

936

937 onx = last_op.to_onnx(inputs=i_vars, target_opset=target_opset,

938 verbose=verbose, run_shape=run_shape,

939 outputs=outputs)

940

941 for o in onx.graph.output:

942 if get_tensor_elem_type(o) == 0:

943 raise RuntimeError( # pragma: no cover

944 "Unable to guess output type of output %r "

945 "from model %r." % (onx.graph.output, model))

946 return onx

947

948

def to_onnx_function(model, X=None, name=None, initial_types=None,
                     target_opset=None, options=None, rewrite_ops=False,
                     white_op=None, black_op=None, final_types=None,
                     rename_strategy=None, verbose=0,
                     prefix_name=None, run_shape=False,
                     single_function=True):
    """
    Converts a model with :epkg:`sklearn-onnx`.
    The function works the same way as function @see fn to_onnx
    but every model is exported as a single function and the main
    graph represents the pipeline structure.

    The conversion falls back to @see fn to_onnx as soon as
    *rename_strategy* is specified or *rewrite_ops* is true.
    Otherwise, when *single_function* is true and *model* is not
    a pipeline with exactly one step, the model is first wrapped
    into a pipeline holding a single step called ``'main'``.

    :param model: model to convert or a function
        wrapped into :epkg:`_PredictScorer` with
        function :epkg:`make_scorer`
    :param X: training set (at least one row),
        can be None, it is used to infer the
        input types (*initial_types*)
    :param initial_types: if *X* is None, then *initial_types*
        must be defined
    :param name: name of the produced model
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param rewrite_ops: rewrites some existing converters,
        the changes are permanent
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes forbidden
        while converting a pipeline, if empty,
        none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrite the type (if type is not None)
        and the name of every output, any value different
        from None is only supported by the fallback
        to @see fn to_onnx
    :param rename_strategy: rename any name in the graph, select shorter
        names, see @see fn onnx_rename_names
    :param verbose: display information while converting the model
    :param prefix_name: prefix for variable names
    :param run_shape: run shape inference on the final onnx model
    :param single_function: if True, the main graph only includes one node
        calling the main function
    :return: converted model
    :raises NotImplementedError: if *final_types* is not None
        and the conversion does not fall back to @see fn to_onnx
    :raises TypeError: if *model* is neither a pipeline
        nor a column transformer when it is dispatched
    """
    # Keyword arguments every converter called below receives unchanged.
    shared = dict(
        X=X, name=name, initial_types=initial_types,
        target_opset=target_opset, options=options,
        rewrite_ops=rewrite_ops, white_op=white_op, black_op=black_op,
        final_types=final_types, rename_strategy=rename_strategy,
        verbose=verbose, run_shape=run_shape)

    if rename_strategy is not None or rewrite_ops:
        # Fallback on the regular conversion, prefix_name and
        # single_function are not forwarded in that case.
        return to_onnx(model, **shared)

    logger.debug(
        "to_onnx_function:begin:(%s-%d, X=%r, initial_types=%r, "
        "target_opset=%r, options=%r, rewrite_ops=%r, "
        "white_op=%r, black_op=%r, final_types=%r)",
        model.__class__.__name__, id(model), type(X), initial_types,
        target_opset, options, rewrite_ops, white_op, black_op,
        final_types)

    if final_types is not None:
        raise NotImplementedError(  # pragma: no cover
            "final_types != None, not implemented yet.")

    one_step_pipeline = (
        isinstance(model, Pipeline) and len(model.steps) == 1)
    if single_function and not one_step_pipeline:
        # Wraps the model into a pipeline with a unique step,
        # the recursive call disables a second wrapping.
        wrapped = Pipeline(steps=[('main', model)])
        return to_onnx_function(
            wrapped, prefix_name=prefix_name, single_function=False,
            **shared)

    # Selects the converter dedicated to the container type.
    if isinstance(model, Pipeline):
        converter = _to_onnx_function_pipeline
    elif isinstance(model, ColumnTransformer):
        converter = _to_onnx_function_column_transformer
    else:
        raise TypeError(  # pragma: no cover
            f"Unexpected type {type(model)!r} for model to convert.")

    return converter(
        model, prefix_name=prefix_name, single_function=single_function,
        **shared)