Coverage for mlprodict/onnx_conv/convert.py: 88%

384 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-04 02:28 +0100

1# -*- encoding: utf-8 -*- 

2# pylint: disable=C0302,R0914 

3""" 

4@file 

5@brief Overloads a conversion function. 

6""" 

7import json 

8import pprint 

9from collections import OrderedDict 

10import logging 

11import numpy 

12from onnx import ValueInfoProto 

13import pandas 

14try: 

15 from sklearn.metrics._scorer import _PredictScorer 

16except ImportError: # pragma: no cover 

17 # scikit-learn < 0.22 

18 from sklearn.metrics.scorer import _PredictScorer 

19from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version 

20from sklearn.pipeline import Pipeline, FeatureUnion 

21from sklearn.compose import ColumnTransformer 

22from sklearn.utils.metaestimators import _BaseComposition 

23from skl2onnx.common.data_types import ( 

24 FloatTensorType, DoubleTensorType, DataType, guess_numpy_type, 

25 StringTensorType, Int64TensorType, _guess_type_proto) 

26from skl2onnx import convert_sklearn 

27from skl2onnx.algebra.onnx_operator_mixin import OnnxOperatorMixin 

28from skl2onnx.algebra.type_helper import _guess_type 

29from ..onnx_tools.onnx_manipulations import onnx_rename_names 

30from ..onnx_tools.onnx2py_helper import ( 

31 guess_dtype, get_tensor_shape, get_tensor_elem_type) 

32from .register_rewritten_converters import ( 

33 register_rewritten_operators, register_new_operators) 

34from .register import register_converters 

35from .scorers import CustomScorerTransform 

36 

37 

38logger = logging.getLogger('mlprodict') 

39 

40 

def _fix_opset_skl2onnx():
    """
    Aligns ``skl2onnx.__max_supported_opset__`` with the value
    ``__max_supported_opset__`` exposed by this package.
    Nothing happens if both values are already equal.
    """
    import skl2onnx
    from .. import __max_supported_opset__ as expected_opset
    if skl2onnx.__max_supported_opset__ == expected_opset:
        return
    skl2onnx.__max_supported_opset__ = expected_opset  # pragma: no cover

46 

47 

def convert_scorer(fct, initial_types, name=None,
                   target_opset=None, options=None,
                   custom_conversion_functions=None,
                   custom_shape_calculators=None,
                   custom_parsers=None, white_op=None,
                   black_op=None, final_types=None,
                   verbose=0):
    """
    Converts a scorer into :epkg:`ONNX` assuming
    there exists a converter associated to it.
    The function wraps the function into a custom
    transformer, then calls function *convert_sklearn*
    from :epkg:`sklearn-onnx`.

    :param fct: function to convert (or a scorer from :epkg:`scikit-learn`),
        if it has an attribute `_score_func`, this function and
        the attribute `_kwargs` are converted
    :param initial_types: types information
    :param name: name of the produced model, if None, the name is
        built from the name of the function
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param custom_conversion_functions: a dictionary for specifying the user
        customized conversion function, it takes precedence over
        registered converters
    :param custom_shape_calculators: a dictionary for specifying the user
        customized shape calculator it takes precedence over registered
        shape calculators.
    :param custom_parsers: parsers determine which outputs is expected
        for which particular task, default parsers are
        defined for classifiers, regressors, pipeline but
        they can be rewritten, *custom_parsers* is a dictionary
        ``{ type: fct_parser(scope, model, inputs,
        custom_parsers=None) }``
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes allowed
        while converting a pipeline, if empty, none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param verbose: displays information while converting
    :return: :epkg:`ONNX` graph
    """
    if hasattr(fct, '_score_func'):
        # scorer object: unwraps the function and its parameters
        kwargs = fct._kwargs
        fct = fct._score_func
    else:
        kwargs = None  # pragma: no cover
    if name is None:
        name = f"mlprodict_fct_ONNX({fct.__name__})"
    tr = CustomScorerTransform(fct.__name__, fct, kwargs)
    _fix_opset_skl2onnx()
    # The name was computed but never forwarded before,
    # the produced model ignored parameter *name*.
    return convert_sklearn(
        tr, name=name, initial_types=initial_types,
        target_opset=target_opset, options=options,
        custom_conversion_functions=custom_conversion_functions,
        custom_shape_calculators=custom_shape_calculators,
        custom_parsers=custom_parsers, white_op=white_op,
        black_op=black_op, final_types=final_types,
        verbose=verbose)

107 

108 

def guess_initial_types(X, initial_types):
    """
    Guesses initial types from an array or a dataframe.

    :param X: array or dataframe, only used if *initial_types* is None
    :param initial_types: hints about X, returned as is if not None
    :return: data types, a list of tuples *(name, type)*,
        one per column for a dataframe, a single one named ``'X'``
        otherwise
    """
    if initial_types is not None:
        return initial_types
    if X is None:
        raise NotImplementedError(  # pragma: no cover
            "Initial types must be specified.")

    is_frame = isinstance(X, pandas.DataFrame)
    if is_frame or isinstance(X, numpy.ndarray):
        # one row is enough to guess the types
        X = X[:1]
    if not is_frame:
        return [('X', _guess_type(X))]

    guessed = []
    for col in X.columns:
        values = X[col].values
        if isinstance(values[0], (str, numpy.str_)):
            col_type = StringTensorType()
        else:
            col_type = _guess_type(values)
        # every column becomes a two dimensional input with one column
        col_type.shape = [None, 1]
        guessed.append((col, col_type))
    return guessed

136 

137 

138def _replace_tensor_type(schema, tensor_type): 

139 res = [] 

140 for name, ty in schema: 

141 cl = ty.__class__ 

142 if cl in (FloatTensorType, DoubleTensorType) and cl != tensor_type: 

143 ty = tensor_type(ty.shape) 

144 res.append((name, ty)) 

145 return res 

146 

147 

def guess_schema_from_data(X, tensor_type=None, schema=None):
    """
    Guesses initial types from a dataset.

    @param      X               dataset (dataframe, array)
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema
    @return                     schema (list of typed and named columns),
                                columns are merged into a single input
                                ``'X'`` if they all share the same type
                                and are two dimensional with an unknown
                                first dimension
    """
    init = guess_initial_types(X, schema)
    if tensor_type is not None:
        init = _replace_tensor_type(init, tensor_type)

    # Grouping column
    seen = set()
    for _, col in init:
        shape = col.shape
        mergeable = (len(shape) == 2 and shape[0] is None and
                     (not seen or col.__class__ in seen))
        if not mergeable:
            return init  # pragma: no cover
        seen.add(col.__class__)

    merged_class = list(seen)[0]
    n_columns = sum(item[1].shape[1] for item in init)
    return [('X', merged_class([None, n_columns]))]

174 

175 

def get_inputs_from_data(X, schema=None):
    """
    Produces input data for *onnx* runtime.

    @param      X       data, an array or a dataframe
    @param      schema  schema if None, schema is guessed with
                        @see fn guess_schema_from_data
    @return             input data, a dictionary ``{ name: array }``
    """
    def _cast_data(X, ct):
        # first matching type wins, same order as the checks it replaces
        conversions = (
            (FloatTensorType, numpy.float32),
            (DoubleTensorType, numpy.float64),
            (StringTensorType, numpy.str_),
            (Int64TensorType, numpy.int64))
        for tensor_class, numpy_type in conversions:
            if isinstance(ct, tensor_class):
                return X.astype(numpy_type)
        raise RuntimeError(  # pragma: no cover
            f"Unexpected column type {ct} for type {type(X)}.")

    if schema is None:
        schema = guess_schema_from_data(X)

    if isinstance(X, numpy.ndarray):
        if len(schema) != 1:
            raise RuntimeError(  # pragma: no cover
                "More than one column but input is an array.")
        only = schema[0]
        return {only[0]: _cast_data(X, only[1])}

    if isinstance(X, pandas.DataFrame):
        if len(schema) != X.shape[1]:
            raise RuntimeError(  # pragma: no cover
                "Mismatch between onnx columns {} and DataFrame columns {}"
                "".format(len(schema), X.shape[1]))
        feeds = {}
        for sch, c in zip(schema, X.columns):
            # every column becomes a matrix with one column
            feeds[sch[0]] = _cast_data(X[c].values, sch[1]).reshape((-1, 1))
        return feeds

    raise TypeError(  # pragma: no cover
        f"Unexpected type {type(X)}, expecting an array or a dataframe.")

213 

214 

def guess_schema_from_model(model, tensor_type=None, schema=None):
    """
    Guesses initial types from a model.

    @param      model           model, the schema is guessed from attribute
                                `coef_` or from the dictionary returned by
                                method `dump_model` if it contains
                                `feature_names`
    @param      tensor_type     if not None, replaces every
                                *FloatTensorType* or *DoubleTensorType*
                                by this one
    @param      schema          known schema, if specified, it is only
                                compared to the guessed one (same length)
    @return                     schema (list of typed and named columns)

    The function raises *NotImplementedError* if the schema cannot
    be guessed and *RuntimeError* if the given schema and the guessed one
    do not have the same number of columns.
    """
    def _finalize(sch):
        # No replacement is requested when tensor_type is None.
        # Calling the replacement with None used to raise a TypeError.
        if tensor_type is None:
            return list(sch)
        return _replace_tensor_type(sch, tensor_type)

    if schema is not None:
        try:
            guessed = guess_schema_from_model(model)
        except NotImplementedError:  # pragma: no cover
            return _finalize(schema)
        if len(guessed) != len(schema):
            raise RuntimeError(  # pragma: no cover
                "Given schema and guessed schema are not the same:\n"
                "GOT: {}\n-----\nGUESSED:\n{}".format(schema, guessed))
        return _finalize(schema)

    if hasattr(model, 'coef_'):
        # linear model, the last dimension is used so that
        # a one dimensional coef_ is supported as well
        init = [('X', FloatTensorType([None, model.coef_.shape[-1]]))]
        return _finalize(init)

    has_dump = hasattr(model, 'dump_model')
    dumped = None
    if has_dump:
        # called once and reused for the error message below
        dumped = model.dump_model()
        if isinstance(dumped, dict) and 'feature_names' in dumped:
            names = dumped['feature_names']
            init = [(name, FloatTensorType([None, 1])) for name in names]
            return _finalize(init)

    data = pprint.pformat(model.__dict__)
    dirs = pprint.pformat(dir(model))
    if has_dump:  # pragma: no cover
        keys = sorted(dumped)
        last = pprint.pformat([keys, dumped])
        if len(last) >= 200000:
            last = last[:200000] + "\n..."
    else:
        last = ""
    raise NotImplementedError(  # pragma: no cover
        "Unable to guess schema for model {}\n{}\n----\n{}\n------\n{}".format(
            model.__class__, data, dirs, last))

261 

262 

def _guess_type_(X, itype, dtype):
    """
    Guesses the initial types and the numerical type
    used by a conversion.

    :param X: data, given to @see fn guess_initial_types
    :param itype: known initial types, given to
        @see fn guess_initial_types
    :param dtype: known type, if None, it is guessed from *X*
        or from the first initial type
    :return: tuple *(initial_types, dtype, new_dtype)*,
        *new_dtype* is *dtype* unless *dtype* is an array
        (its dtype is used) or an instance of *DataType*
        (*numpy.float32* is used)
    """
    initial_types = guess_initial_types(X, itype)
    if dtype is None:
        if hasattr(X, 'dtypes'):  # DataFrame
            dtype = numpy.float32
        elif hasattr(X, 'dtype'):
            dtype = X.dtype
        elif hasattr(X, 'type'):
            dtype = guess_numpy_type(X.type)
        elif isinstance(initial_types[0], ValueInfoProto):
            dtype = guess_dtype(initial_types[0].type.tensor_type.elem_type)
        elif initial_types is not None:
            dtype = guess_numpy_type(initial_types[0][1])
        else:
            raise RuntimeError(  # pragma: no cover
                f"dtype cannot be guessed: {type(X)}")
        # a guessed type is either float64 or falls back to float32
        if dtype != numpy.float64:
            dtype = numpy.float32
    if dtype is None:
        raise RuntimeError("dtype cannot be None")  # pragma: no cover
    # a tensor type given as dtype is converted into a numpy type
    if isinstance(dtype, FloatTensorType):
        dtype = numpy.float32  # pragma: no cover
    elif isinstance(dtype, DoubleTensorType):
        dtype = numpy.float64  # pragma: no cover
    new_dtype = dtype
    if isinstance(dtype, numpy.ndarray):
        new_dtype = dtype.dtype  # pragma: no cover
    elif isinstance(dtype, DataType):
        new_dtype = numpy.float32  # pragma: no cover
    if new_dtype not in (numpy.float32, numpy.float64, numpy.int64,
                         numpy.int32, numpy.float16):
        raise NotImplementedError(  # pragma: no cover
            f"dtype should be real not {new_dtype} ({dtype})")
    return initial_types, dtype, new_dtype

297 

298 

def to_onnx(model, X=None, name=None, initial_types=None,
            target_opset=None, options=None, rewrite_ops=False,
            white_op=None, black_op=None, final_types=None,
            rename_strategy=None, verbose=0,
            as_function=False, prefix_name=None,
            run_shape=False, single_function=True):
    """
    Converts a model using :epkg:`sklearn-onnx`.

    :param model: model to convert or a function
        wrapped into :epkg:`_PredictScorer` with
        function :epkg:`make_scorer`
    :param X: training set (at least one row),
        can be None, it is used to infer the
        input types (*initial_types*), it must be
        an *OrderedDict* if *model* is a scorer
    :param initial_types: if *X* is None, then *initial_types*
        must be defined
    :param name: name of the produced model
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param rewrite_ops: rewrites some existing converters,
        the changes are permanent
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes allowed
        while converting a pipeline, if empty,
        none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrites the type (if type is not None)
        and the name of every output.
    :param rename_strategy: rename any name in the graph, select shorter
        names, see @see fn onnx_rename_names
    :param verbose: display information while converting the model
    :param as_function: exposes every model in a pipeline as a function,
        the main graph contains the pipeline structure,
        see :ref:`onnxsklearnfunctionsrst` for an example
    :param prefix_name: used if *as_function* is True, to give
        a prefix to variable in a pipeline
    :param run_shape: run shape inference
    :param single_function: if *as_function* is True, the function returns one graph
        with one call to the main function if *single_function* is True or
        a list of node corresponding to the graph structure
    :return: converted model

    The function rewrites function *to_onnx* from :epkg:`sklearn-onnx`
    but may changes a few converters if *rewrite_ops* is True.
    For example, :epkg:`ONNX` only supports *TreeEnsembleRegressor*
    for float but not for double. It becomes available
    if ``rewrite_ops=True``.

    .. faqref::
        :title: How to deal with a dataframe as input?

        Each column of the dataframe is considered as an named input.
        The first step is to make sure that every column type is correct.
        :epkg:`pandas` tends to select the least generic type to
        hold the content of one column. :epkg:`ONNX` does not automatically
        cast the data it receives. The data must have the same type with
        the model is converted and when the converted model receives
        the data to predict.

        .. runpython::
            :showcode:
            :warningout: DeprecationWarning

            from io import StringIO
            from textwrap import dedent
            import numpy
            import pandas
            from pyquickhelper.pycode import ExtTestCase
            from sklearn.preprocessing import OneHotEncoder
            from sklearn.pipeline import Pipeline
            from sklearn.compose import ColumnTransformer
            from mlprodict.onnx_conv import to_onnx
            from mlprodict.onnxrt import OnnxInference

            text = dedent('''
                __SCHEMA__
                7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
                7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
                7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
                11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
                ''')
            text = text.replace(
                "__SCHEMA__",
                "fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,"
                "free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,"
                "alcohol,quality,color")

            X_train = pandas.read_csv(StringIO(text))
            for c in X_train.columns:
                if c != 'color':
                    X_train[c] = X_train[c].astype(numpy.float32)
            numeric_features = [c for c in X_train if c != 'color']

            pipe = Pipeline([
                ("prep", ColumnTransformer([
                    ("color", Pipeline([
                        ('one', OneHotEncoder()),
                        ('select', ColumnTransformer(
                            [('sel1', 'passthrough', [0])]))
                    ]), ['color']),
                    ("others", "passthrough", numeric_features)
                ])),
            ])

            pipe.fit(X_train)
            pred = pipe.transform(X_train)
            print(pred)

            model_onnx = to_onnx(pipe, X_train, target_opset=12)
            oinf = OnnxInference(model_onnx)

            # The dataframe is converted into a dictionary,
            # each key is a column name, each value is a numpy array.
            inputs = {c: X_train[c].values for c in X_train.columns}
            inputs = {c: v.reshape((v.shape[0], 1)) for c, v in inputs.items()}

            onxp = oinf.run(inputs)
            print(onxp)

    .. versionchanged:: 0.9
        Parameter *as_function* was added.
    """
    logger.debug("to_onnx(%s, X=%r, initial_types=%r, target_opset=%r, "
                 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "
                 "final_types=%r)",
                 model.__class__.__name__, type(X), initial_types,
                 target_opset, options, rewrite_ops, white_op, black_op,
                 final_types)

    # A model implementing OnnxOperatorMixin converts itself,
    # the function returns before any converter is rewritten
    # and rename_strategy is not applied in that case.
    if isinstance(model, OnnxOperatorMixin):
        if not hasattr(model, 'op_version'):
            raise RuntimeError(  # pragma: no cover
                f"Missing attribute 'op_version' for type '{type(model)}'.")
        _fix_opset_skl2onnx()
        return model.to_onnx(
            X=X, name=name, options=options, black_op=black_op,
            white_op=white_op, final_types=final_types,
            target_opset=target_opset)
        # verbose=verbose)

    # The values returned by register_rewritten_operators are given
    # back to the same function once the conversion is done.
    if rewrite_ops:
        old_values, old_shapes = register_rewritten_operators()
        register_new_operators()
        register_converters()
    else:
        old_values, old_shapes = {}, {}

    if as_function and isinstance(
            model, (ColumnTransformer, Pipeline, FeatureUnion)):
        res = to_onnx_function(
            model, X=X, name=name, initial_types=initial_types,
            target_opset=target_opset, options=options,
            rewrite_ops=False,  # already handled
            white_op=white_op, black_op=black_op, final_types=final_types,
            rename_strategy=None,  # already handled
            verbose=verbose, prefix_name=prefix_name,
            run_shape=run_shape, single_function=single_function)

    elif isinstance(model, _PredictScorer):
        if X is not None and not isinstance(X, OrderedDict):
            raise ValueError("For a scorer, parameter X should be a OrderedDict not {}."
                             "".format(type(X)))
        if initial_types is None:
            # NOTE(review): X.items() fails if X is None as well,
            # X or initial_types must be specified for a scorer.
            dts = []
            initial_types = []
            for k, v in X.items():
                if hasattr(v, 'dtype'):
                    dtype = guess_numpy_type(v.dtype)
                else:
                    dtype = v  # pragma: no cover
                it, _, ndt = _guess_type_(v, None, dtype)
                # every guessed type is renamed after the key in X
                for i in range(len(it)):  # pylint: disable=C0200
                    it[i] = (k, it[i][1])  # pylint: disable=C0200
                initial_types.extend(it)
                dts.append(ndt)
            # all inputs must share the same type
            ndt = set(dts)
            if len(ndt) != 1:
                raise RuntimeError(  # pragma: no cover
                    f"Multiple dtype is not efficient {ndt}.")
        res = convert_scorer(model, initial_types, name=name,
                             target_opset=target_opset, options=options,
                             black_op=black_op, white_op=white_op,
                             final_types=final_types, verbose=verbose)
    else:
        if name is None:
            name = f"mlprodict_ONNX({model.__class__.__name__})"

        initial_types, dtype, _ = _guess_type_(X, initial_types, None)

        _fix_opset_skl2onnx()
        res = convert_sklearn(model, initial_types=initial_types, name=name,
                              target_opset=target_opset, options=options,
                              black_op=black_op, white_op=white_op,
                              final_types=final_types, verbose=verbose)

    # not reached if the conversion raised an exception
    register_rewritten_operators(old_values, old_shapes)

    # optimisation
    if rename_strategy is not None:
        res = onnx_rename_names(res, strategy=rename_strategy)
    return res

503 

504 

def _guess_s2o_type(vtype: ValueInfoProto):
    """
    Calls *_guess_type_proto* with the element type
    and the shape extracted from *vtype*.
    """
    elem_type = get_tensor_elem_type(vtype)
    shape = get_tensor_shape(vtype)
    return _guess_type_proto(elem_type, shape)

508 

509 

def _new_options(options, prefix, sklop):
    """
    Builds the options given to the conversion of one step
    of a composition (pipeline, column transformer).

    :param options: options given to the conversion of the whole
        composition, a dictionary or None
    :param prefix: prefix of the step, options starting with
        it are specific to this step
    :param sklop: operator of the step, it cannot be None
    :return: options for the step, None if *sklop* is a string
        or if *options* is None
    """
    if sklop is None:
        raise RuntimeError(  # pragma: no cover
            "sklop cannot be None.")
    if isinstance(sklop, str):
        return None  # pragma: no cover
    if options is None:
        step_options = None
    else:
        step_options = {}
        for k, v in options.items():
            if k.startswith(prefix):
                # option specific to this step, the prefix is removed
                step_options[k[len(prefix):]] = v
            elif '__' in k:
                # only what follows the first '__' is kept
                step_options[k.split('__', maxsplit=1)[1]] = v
            # The original key is kept for a composition,
            # for an operator without any alias or
            # if the converter allows this option.
            if isinstance(sklop, _BaseComposition):
                step_options[k] = v
            else:
                from skl2onnx._supported_operators import _get_sklearn_operator_name
                from skl2onnx.common._registration import get_converter
                alias = _get_sklearn_operator_name(type(sklop))
                if alias is None:
                    step_options[k] = v
                else:
                    conv = get_converter(alias)
                    allowed = conv.get_allowed_options()
                    if allowed is not None and k in allowed:
                        step_options[k] = v
    return step_options

539 

540 

541class _ParamEncoder(json.JSONEncoder): 

542 def default(self, obj): # pylint: disable=W0237 

543 try: 

544 return json.JSONEncoder.default(self, obj) 

545 except TypeError as e: 

546 # Unable to serialize 

547 return '{"classname": "%s", "EXC": "%s"}' % ( 

548 obj.__class__.__name__, str(e)) 

549 

550 

def get_sklearn_json_params(model):
    """
    Retrieves all the parameters of a :epkg:`scikit-learn` model
    (``get_params(deep=False)``) and serializes them
    into a json string.

    :param model: model
    :return: json string
    """
    params = model.get_params(deep=False)
    try:
        serialized = json.dumps(params, cls=_ParamEncoder)
    except TypeError as exc:  # pragma: no cover
        raise RuntimeError(
            f"Unable to serialize parameters {pprint.pformat(params)}.") from exc
    return serialized

561 

562 

def _to_onnx_function_pipeline(
        model, X=None, name=None, initial_types=None,
        target_opset=None, options=None, rewrite_ops=False,
        white_op=None, black_op=None, final_types=None,
        rename_strategy=None, verbose=0,
        prefix_name=None, run_shape=False,
        single_function=True):
    """
    Converts a pipeline into :epkg:`ONNX`, every step is converted
    with @see fn to_onnx, turned into a function of domain
    ``'sklearn'`` and the functions are chained.

    :param model: pipeline, it must have at least one step
    :param X: data used to guess the initial types
    :param name: name given to the conversion of every step
    :param initial_types: initial types, guessed from *X* if None
    :param target_opset: an integer, a dictionary or None
        for the maximum supported opset
    :param options: options, see @see fn _new_options
    :param rewrite_ops: see @see fn to_onnx
    :param white_op: see @see fn to_onnx
    :param black_op: see @see fn to_onnx
    :param final_types: outputs of the final graph,
        guessed from the last step if None
    :param rename_strategy: not used by this function
    :param verbose: verbosity
    :param prefix_name: prefix added to the name of every function
    :param run_shape: run shape inference
    :param single_function: not used by this function
    :return: converted model
    """

    from ..npy.xop_variable import Variable
    from ..npy.xop import OnnxOperatorFunction, loadop
    from ..onnx_tools.onnx_manipulations import onnx_model_to_function

    OnnxIdentity = loadop('Identity')

    if len(model.steps) == 0:
        raise RuntimeError(  # pragma: no cover
            "The pipeline to be converted cannot be empty.")

    # opset used for the nodes created by this function
    if target_opset is None:
        from .. import __max_supported_opset__
        op_version = __max_supported_opset__
    elif isinstance(target_opset, int):
        op_version = target_opset
    else:  # pragma: no cover
        from .. import __max_supported_opset__
        op_version = target_opset.get('', __max_supported_opset__)

    i_types = guess_initial_types(X, initial_types)
    input_nodes = [OnnxIdentity(i[0], op_version=op_version)
                   for i in i_types]

    # inputs holds the types of the inputs of the next step
    inputs = i_types
    last_op = None
    for i_step, step in enumerate(model.steps):
        prefix = step[0] + "__"
        step_options = _new_options(options, prefix, step[1])
        if prefix_name is not None:
            prefix = prefix_name + prefix
        protom = to_onnx(
            step[1], name=name, initial_types=inputs,
            target_opset=target_opset,
            options=step_options, rewrite_ops=rewrite_ops,
            white_op=white_op, black_op=black_op, verbose=verbose,
            as_function=True, prefix_name=prefix, run_shape=run_shape,
            single_function=False)
        for o in protom.graph.output:
            if get_tensor_elem_type(o) == 0:
                raise RuntimeError(  # pragma: no cover
                    "Unabble to guess output type of output %r "
                    "from model step %d: %r, output=%r." % (
                        protom.graph.output, i_step, step[1], o))
        # the parameters of the step are stored in the doc_string
        jspar = 'HYPER:{"%s":%s}' % (
            step[1].__class__.__name__, get_sklearn_json_params(step[1]))
        protof, subf = onnx_model_to_function(
            protom, domain='sklearn',
            name=f"{prefix}_{step[1].__class__.__name__}_{i_step}",
            doc_string=jspar)
        input_names = [f"{step[0]}_{o}" for o in protof.input]
        if last_op is not None:
            # the outputs of the previous step feed this one
            if len(input_names) == 1:
                input_nodes = [OnnxIdentity(
                    last_op, output_names=input_names[0],
                    op_version=op_version)]
            else:
                input_nodes = [  # pragma: no cover
                    OnnxIdentity(last_op[i], output_names=[n],  # pylint: disable=E1136
                                 op_version=op_version)
                    for i, n in enumerate(input_names)]
        output_names = [f"{step[0]}_{o}" for o in protof.output]

        logger.debug("_to_onnx_function_pipeline:%s:%r->%r:%r:%s",
                     step[1].__class__.__name__,
                     input_names, output_names,
                     len(protof.node), jspar)

        op = OnnxOperatorFunction(
            protof, *input_nodes, output_names=output_names,
            sub_functions=subf)
        last_op = op
        inputs = [
            ('X%d' % i, _guess_s2o_type(o))
            for i, o in enumerate(protom.graph.output)]

    logger.debug("_to_onnx_function_pipeline:end:(%s-%d, X=%r, "
                 "initial_types=%r, target_opset=%r, "
                 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "
                 "final_types=%r, outputs=%r)",
                 model.__class__.__name__, id(model),
                 type(X), initial_types,
                 target_opset, options, rewrite_ops, white_op, black_op,
                 final_types, inputs)

    i_vars = [Variable.from_skl2onnx_tuple(i) for i in i_types]
    if final_types is None:
        # outputs are named and typed after the outputs of the last step
        outputs_tuple = [
            (n, _guess_s2o_type(o))
            for i, (n, o) in enumerate(zip(output_names, protom.graph.output))]
        outputs = [Variable.from_skl2onnx_tuple(i) for i in outputs_tuple]
    else:
        outputs = final_types

    onx = last_op.to_onnx(inputs=i_vars, target_opset=target_opset,
                          verbose=verbose, run_shape=run_shape,
                          outputs=outputs)

    for o in onx.graph.output:
        if get_tensor_elem_type(o) == 0:
            raise RuntimeError(  # pragma: no cover
                "Unable to guess output type of output %r "
                "from model %r." % (onx.graph.output, model))
    return onx

674 

675 

def get_column_index(i, inputs):
    """
    Returns a tuples (variable index, column index in that variable).
    The function has two different behaviours, one when *i* (column index)
    is an integer, another one when *i* is a string (column name).
    If *i* is a string, the function looks for input name with
    this name and returns `(index, 0)`.
    If *i* is an integer, let's assume first we have two inputs
    `I0 = FloatTensorType([None, 2])` and `I1 = FloatTensorType([None, 3])`,
    in this case, here are the results:

    ::

        get_column_index(0, inputs) -> (0, 0)
        get_column_index(1, inputs) -> (0, 1)
        get_column_index(2, inputs) -> (1, 0)
        get_column_index(3, inputs) -> (1, 1)
        get_column_index(4, inputs) -> (1, 2)

    :param i: column index (python or numpy integer) or column name
    :param inputs: list of tuples *(name, type)*, the second dimension
        of every type must be known unless *i* is 0 or a name
    :return: tuple *(variable index, column index)*

    The function raises *RuntimeError* if the column cannot be found
    or if a dimension is unknown.
    """
    if isinstance(i, (int, numpy.integer)):
        # numpy integers (indices coming from an array) are positions
        # as well, they used to be searched among the input names
        i = int(i)
        if i == 0:
            # Useful shortcut, skips the case when end is None
            # (unknown dimension)
            return 0, 0
        vi = 0
        pos = 0
        end = inputs[0][1].shape[1]
        if end is None:
            raise RuntimeError(  # pragma: no cover
                "Cannot extract a specific column %r when "
                "one input (%r) has unknown "
                "dimension." % (i, inputs[0]))
        # [pos, end[ is the range of columns covered by input vi
        while True:
            if pos <= i < end:
                return vi, i - pos
            vi += 1
            pos = end
            if vi >= len(inputs):
                raise RuntimeError(  # pragma: no cover
                    "Input %r (i=%r, end=%r) is not available in\n%r" % (
                        vi, i, end, pprint.pformat(inputs)))
            rel_end = inputs[vi][1].shape[1]
            if rel_end is None:
                raise RuntimeError(  # pragma: no cover
                    "Cannot extract a specific column %r when "
                    "one input (%r) has unknown "
                    "dimension." % (i, inputs[vi]))
            end += rel_end
    else:
        for ind, inp in enumerate(inputs):
            if inp[0] == i:
                return ind, 0
        raise RuntimeError(  # pragma: no cover
            "Unable to find column name %r among names %r. "
            "Make sure the input names specified with parameter "
            "initial_types fits the column names specified in the "
            "pipeline to convert. This may happen because a "
            "ColumnTransformer follows a transformer without "
            "any mapped converter in a pipeline." % (
                i, [n[0] for n in inputs]))

736 

737 

def get_column_indices(indices, inputs, multiple):
    """
    Returns the requested graph inputs based on their
    indices or names. See :func:`get_column_index`.

    :param indices: variables indices or names
    :param inputs: graph inputs
    :param multiple: allows column to come from multiple variables
    :return: a tuple *(variable index, list of requested indices)* if
        *multiple* is False, a dictionary *{ var_index: [ list of
        requested indices ] }*
        if *multiple* is True
    """
    if multiple:
        grouped = OrderedDict()
        for p in indices:
            var_index, col_index = get_column_index(p, inputs)
            grouped.setdefault(var_index, []).append(col_index)
        return grouped

    # all columns must come from the same variable
    single_var = None
    positions = []
    for p in indices:
        var_index, col_index = get_column_index(p, inputs)
        positions.append(col_index)
        if single_var is None:
            single_var = var_index
            continue
        if single_var != var_index:
            cols = [single_var, var_index]
            raise NotImplementedError(  # pragma: no cover
                "sklearn-onnx is not able to merge multiple columns from "
                "multiple variables ({0}). You should think about merging "
                "initial types.".format(cols))
    return single_var, positions

774 

775 

776def _merge_initial_types(i_types, transform_inputs, merge): 

777 if len(i_types) == len(transform_inputs): 

778 new_types = [] 

779 for it, sli in zip(i_types, transform_inputs): 

780 name, ty = it 

781 begin, end = sli.inputs[1], sli.inputs[2] 

782 delta = end - begin 

783 shape = [ty.shape[0], int(delta[0])] 

784 new_types.append((name, ty.__class__(shape))) 

785 else: 

786 raise NotImplementedError( # pragma: no cover 

787 "Not implemented when i_types=%r, transform_inputs=%r." 

788 "" % (i_types, transform_inputs)) 

789 if merge and len(new_types) > 1: 

790 raise NotImplementedError( # pragma: no cover 

791 "Cannot merge %r built from i_types=%r, transform_inputs=%r." 

792 "" % (new_types, i_types, transform_inputs)) 

793 return new_types 

794 

795 

def _to_onnx_function_column_transformer(
        model, X=None, name=None, initial_types=None,
        target_opset=None, options=None, rewrite_ops=False,
        white_op=None, black_op=None, final_types=None,
        rename_strategy=None, verbose=0,
        prefix_name=None, run_shape=False,
        single_function=True):
    """
    Converts a fitted column transformer into ONNX. Every step which is
    neither ``'drop'`` nor ``'passthrough'`` is converted with
    @see fn to_onnx (``as_function=True``), turned into a function of
    domain ``'sklearn'`` and called from the main graph. The results of
    all steps are concatenated along axis 1.

    :param model: fitted model exposing attribute ``transformers_``
    :param X: training set, used to guess the input types
        if *initial_types* is None
    :param name: unused by this function
    :param initial_types: input types, guessed from *X* if None
    :param target_opset: None, an integer or a dictionary
        ``{domain: opset}`` (the opset of domain ``''`` is then used)
    :param options: conversion options, narrowed for every step
        with ``_new_options``
    :param rewrite_ops: forwarded to @see fn to_onnx
    :param white_op: forwarded to @see fn to_onnx
    :param black_op: forwarded to @see fn to_onnx
    :param final_types: if not None, used as the outputs of the
        final graph instead of the guessed ones
    :param rename_strategy: unused by this function
    :param verbose: verbosity
    :param prefix_name: prefix added to the name of every function
    :param run_shape: run shape inference
    :param single_function: unused by this function, every step is
        converted with ``single_function=False``
    :return: converted model
    :raises RuntimeError: if the transformer has no step, if the
        columns selected by a step are not contiguous or if an output
        type cannot be guessed
    """
    from sklearn.preprocessing import OneHotEncoder
    from .. import __max_supported_opset__
    from ..npy.xop_variable import Variable
    from ..npy.xop import OnnxOperatorFunction, loadop
    from ..onnx_tools.onnx_manipulations import onnx_model_to_function

    OnnxConcat, OnnxSlice, OnnxIdentity = loadop('Concat', 'Slice', 'Identity')

    transformers = model.transformers_
    if len(transformers) == 0:
        raise RuntimeError(  # pragma: no cover
            "The ColumnTransformer to be converted cannot be empty.")

    if target_opset is None:
        op_version = __max_supported_opset__
    elif isinstance(target_opset, int):
        op_version = target_opset
    else:  # pragma: no cover
        op_version = target_opset.get('', __max_supported_opset__)

    i_types = guess_initial_types(X, initial_types)
    ops = []
    protoms = []
    output_namess = []
    for i_step, (name_step, op, column_indices) in enumerate(transformers):
        if op == 'drop':
            continue
        # The column map `names` below is computed from i_types, so the
        # Identity nodes must be built from i_types as well. The argument
        # initial_types may be None (only X given) and must not be replaced
        # by the types of a previous step.
        input_nodes = [OnnxIdentity(i[0], op_version=op_version)
                       for i in i_types]
        if isinstance(column_indices, slice):
            column_indices = list(range(
                column_indices.start
                if column_indices.start is not None else 0,
                column_indices.stop, column_indices.step
                if column_indices.step is not None else 1))
        elif isinstance(column_indices, (int, str)):
            column_indices = [column_indices]
        names = get_column_indices(column_indices, i_types, multiple=True)
        transform_inputs = []
        for onnx_var, onnx_is in names.items():
            # A single Slice can only extract a contiguous range of columns.
            if max(onnx_is) - min(onnx_is) != len(onnx_is) - 1:
                raise RuntimeError(  # pragma: no cover
                    "The converter only with contiguous columns indices not %r "
                    "for step %r." % (column_indices, name_step))
            # ONNX Slice inputs: data, starts, ends, axes (columns: axis 1).
            tr_inputs = OnnxSlice(input_nodes[onnx_var],
                                  numpy.array([onnx_is[0]], dtype=numpy.int64),
                                  numpy.array([onnx_is[-1] + 1],
                                              dtype=numpy.int64),
                                  numpy.array([1], dtype=numpy.int64),
                                  op_version=op_version)
            transform_inputs.append(tr_inputs)

        # Several sliced inputs are merged into one matrix unless the step
        # (or the first step of a pipeline) is a OneHotEncoder or a
        # ColumnTransformer.
        merged_cols = False
        if len(transform_inputs) > 1:
            if isinstance(op, Pipeline):
                if not isinstance(op.steps[0][1],
                                  (OneHotEncoder, ColumnTransformer)):
                    merged_cols = True
            elif not isinstance(op, (OneHotEncoder, ColumnTransformer)):
                merged_cols = True

        if merged_cols:
            concatenated = OnnxConcat(
                *transform_inputs, op_version=op_version, axis=1)
        else:
            concatenated = transform_inputs
        # Types seen by this step only, kept in a dedicated variable so that
        # the next step still starts from the original inputs.
        step_types = _merge_initial_types(
            i_types, transform_inputs, merged_cols)

        prefix = name_step + "__"
        step_options = _new_options(options, prefix, op)
        if prefix_name is not None:
            prefix = prefix_name + prefix

        if op == 'passthrough':
            ops.extend(concatenated)
            continue

        protom = to_onnx(
            op, name=name_step, X=X, initial_types=step_types,
            target_opset=target_opset,
            options=step_options, rewrite_ops=rewrite_ops,
            white_op=white_op, black_op=black_op, verbose=verbose,
            as_function=True, prefix_name=prefix, run_shape=run_shape,
            single_function=False)
        protoms.append(protom)

        for o in protom.graph.output:
            if get_tensor_elem_type(o) == 0:
                raise RuntimeError(  # pragma: no cover
                    "Unabble to guess output type of output %r "
                    "from model step %d: %r." % (
                        protom.graph.output, i_step, op))
        # The hyperparameters of the step are stored in the doc_string
        # of the function.
        jspar = 'HYPER:{"%s":%s}' % (
            op.__class__.__name__, get_sklearn_json_params(op))
        protof, fcts = onnx_model_to_function(
            protom, domain='sklearn',
            name=f"{prefix}_{op.__class__.__name__}_{id(op)}",
            doc_string=jspar)
        output_names = [f"{name_step}_{o}" for o in protof.output]
        output_namess.append(output_names)

        logger.debug("_to_onnx_function_column_transformer:%s:->%r:%r:%s",
                     op.__class__.__name__, output_names, len(protof.node), jspar)

        fct_op = OnnxOperatorFunction(
            protof, *concatenated, output_names=output_names,
            sub_functions=list(fcts))
        ops.append(fct_op)

    logger.debug("_to_onnx_function_column_transformer:end:(%s-%d, X=%r, "
                 "initial_types=%r, target_opset=%r, "
                 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "
                 "final_types=%r, outputs=%r)",
                 model.__class__.__name__, id(model),
                 type(X), initial_types, target_opset,
                 options, rewrite_ops, white_op, black_op,
                 final_types, i_types)

    i_vars = [Variable.from_skl2onnx_tuple(i) for i in i_types]
    if final_types is None:
        outputs_tuple = []
        for protom, output_names in zip(protoms, output_namess):
            outputs_tuple.extend(
                (n, _guess_s2o_type(o))
                for n, o in zip(output_names, protom.graph.output))
        outputs = [Variable.from_skl2onnx_tuple(i) for i in outputs_tuple]
    else:
        outputs = final_types

    last_op = OnnxConcat(*ops, op_version=op_version, axis=1)

    onx = last_op.to_onnx(inputs=i_vars, target_opset=target_opset,
                          verbose=verbose, run_shape=run_shape,
                          outputs=outputs)

    for o in onx.graph.output:
        if get_tensor_elem_type(o) == 0:
            raise RuntimeError(  # pragma: no cover
                "Unable to guess output type of output %r "
                "from model %r." % (onx.graph.output, model))
    return onx

947 

948 

def to_onnx_function(model, X=None, name=None, initial_types=None,
                     target_opset=None, options=None, rewrite_ops=False,
                     white_op=None, black_op=None, final_types=None,
                     rename_strategy=None, verbose=0,
                     prefix_name=None, run_shape=False,
                     single_function=True):
    """
    Converts a model with :epkg:`sklearn-onnx`.
    The function works the same way as function @see fn to_onnx
    but every model is exported as a single function and the main
    graph represents the pipeline structure.

    :param model: model to convert or a function
        wrapped into :epkg:`_PredictScorer` with
        function :epkg:`make_scorer`
    :param X: training set (at least one row),
        can be None, it is used to infer the
        input types (*initial_types*)
    :param initial_types: if *X* is None, then *initial_types*
        must be defined
    :param name: name of the produced model
    :param target_opset: to do it with a different target opset
    :param options: additional parameters for the conversion
    :param rewrite_ops: rewrites some existing converters,
        the changes are permanent
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes disallowed
        while converting a pipeline, if empty,
        none are blacklisted
    :param final_types: a python list. Works the same way as
        initial_types but not mandatory, it is used
        to overwrite the type (if type is not None)
        and the name of every output.
    :param rename_strategy: rename any name in the graph, select shorter
        names, see @see fn onnx_rename_names
    :param verbose: display information while converting the model
    :param prefix_name: prefix for variable names
    :param run_shape: run shape inference on the final onnx model
    :param single_function: if True, the main graph only includes one node
        calling the main function
    :return: converted model

    If *rename_strategy* is not None or *rewrite_ops* is True,
    the conversion is entirely delegated to @see fn to_onnx.
    """
    # Keyword arguments every delegate receives unchanged.
    shared = dict(
        X=X, name=name, initial_types=initial_types,
        target_opset=target_opset, options=options, rewrite_ops=rewrite_ops,
        white_op=white_op, black_op=black_op, final_types=final_types,
        rename_strategy=rename_strategy, verbose=verbose,
        run_shape=run_shape)

    if rename_strategy is not None or rewrite_ops:
        return to_onnx(model, **shared)

    logger.debug("to_onnx_function:begin:(%s-%d, X=%r, initial_types=%r, target_opset=%r, "
                 "options=%r, rewrite_ops=%r, white_op=%r, black_op=%r, "
                 "final_types=%r)",
                 model.__class__.__name__, id(model), type(X), initial_types,
                 target_opset, options, rewrite_ops, white_op, black_op,
                 final_types)

    if final_types is not None:
        raise NotImplementedError(  # pragma: no cover
            "final_types != None, not implemented yet.")

    is_pipeline = isinstance(model, Pipeline)
    if single_function and not (is_pipeline and len(model.steps) == 1):
        # Wraps the model into a pipeline with a single step, the wrapped
        # model is then converted without asking for a single function.
        wrapped = Pipeline(steps=[('main', model)])
        return to_onnx_function(
            wrapped, prefix_name=prefix_name, single_function=False,
            **shared)

    # Picks the converter dedicated to the kind of container.
    if is_pipeline:
        converter = _to_onnx_function_pipeline
    elif isinstance(model, ColumnTransformer):
        converter = _to_onnx_function_column_transformer
    else:
        raise TypeError(  # pragma: no cover
            f"Unexpected type {type(model)!r} for model to convert.")

    return converter(
        model, prefix_name=prefix_name, single_function=single_function,
        **shared)