Coverage for mlprodict/onnx_conv/operator_converters/conv_xgboost.py: 91%
203 statements
coverage.py v7.1.0, created at 2023-02-04 02:28 +0100
1"""
2@file
3@brief Modified converter from
4`XGBoost.py <https://github.com/onnx/onnxmltools/blob/master/onnxmltools/convert/
5xgboost/operator_converters/XGBoost.py>`_.
6"""
7import json
8from pprint import pformat
9import numpy
10from xgboost import XGBClassifier
11from skl2onnx.common.data_types import guess_numpy_type # pylint: disable=C0411
12from ..sklconv.tree_converters import _fix_tree_ensemble


class XGBConverter:
    "common methods for converters"

    @staticmethod
    def get_xgb_params(xgb_node):
        """
        Retrieves parameters of a model.
        """
        pars = xgb_node.get_xgb_params()
        # xgboost >= 1.0
        if 'n_estimators' not in pars:
            pars['n_estimators'] = xgb_node.n_estimators
        return pars

    @staticmethod
    def validate(xgb_node):
        "validates the model"
        params = XGBConverter.get_xgb_params(xgb_node)
        try:
            if "objective" not in params:
                raise AttributeError('objective')
        except AttributeError as e:  # pragma: no cover
            raise RuntimeError('Missing attribute in XGBoost model.') from e

    @staticmethod
    def common_members(xgb_node, inputs):
        "common to regressor and classifier"
        params = XGBConverter.get_xgb_params(xgb_node)
        objective = params["objective"]
        base_score = params["base_score"]
        booster = xgb_node.get_booster()
        # The json format was available in October 2017.
        # XGBoost 0.7 was the first version released with it.
        js_tree_list = booster.get_dump(with_stats=True, dump_format='json')
        js_trees = [json.loads(s) for s in js_tree_list]
        return objective, base_score, js_trees
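    # Sketch of the dumped tree structure parsed below, inferred from the
    # keys this converter reads ('nodeid', 'split', 'split_condition', 'yes',
    # 'no', 'missing', 'children', 'leaf', 'cover'); the actual dump may
    # carry more fields depending on the xgboost version.
    # A split node looks like:
    #   {"nodeid": 0, "split": "f2", "split_condition": 1.5,
    #    "yes": 1, "no": 2, "missing": 1, "cover": 100.0,
    #    "children": [{...}, {...}]}
    # and a leaf like:
    #   {"nodeid": 1, "leaf": 0.42, "cover": 60.0}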
    @staticmethod
    def _get_default_tree_attribute_pairs(is_classifier):
        attrs = {}
        for k in {'nodes_treeids', 'nodes_nodeids',
                  'nodes_featureids', 'nodes_modes', 'nodes_values',
                  'nodes_truenodeids', 'nodes_falsenodeids', 'nodes_missing_value_tracks_true'}:
            attrs[k] = []
        if is_classifier:
            for k in {'class_treeids', 'class_nodeids', 'class_ids', 'class_weights'}:
                attrs[k] = []
        else:
            for k in {'target_treeids', 'target_nodeids', 'target_ids', 'target_weights'}:
                attrs[k] = []
        return attrs
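    # The keys above are the attribute names of the ai.onnx.ml
    # TreeEnsembleClassifier / TreeEnsembleRegressor operators; each list is
    # filled with one entry per node (nodes_*) or per leaf weight
    # (class_* / target_*) and later forwarded to container.add_node as
    # **attr_pairs.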
    @staticmethod
    def _add_node(attr_pairs, is_classifier, tree_id, tree_weight, node_id,
                  feature_id, mode, value, true_child_id, false_child_id, weights, weight_id_bias,
                  missing, hitrate):
        if isinstance(feature_id, str):
            # Something like f0, f1...
            if feature_id[0] == "f":
                try:
                    feature_id = int(feature_id[1:])
                except ValueError as e:  # pragma: no cover
                    raise RuntimeError(
                        f"Unable to interpret '{feature_id}'") from e
            else:  # pragma: no cover
                try:
                    feature_id = int(feature_id)
                except ValueError as e:
                    raise RuntimeError(
                        f"Unable to interpret '{feature_id}'") from e

        # Split condition for sklearn
        # * if X_ptr[X_sample_stride * i + X_fx_stride * node.feature] <= node.threshold:
        # * https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L946
        # Split condition for xgboost
        # * if (fvalue < split_value)
        # * https://github.com/dmlc/xgboost/blob/master/include/xgboost/tree_model.h#L804
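        # Because xgboost compares with a strict '<' while scikit-learn uses
        # '<=', _fill_node_attributes passes mode='BRANCH_LT' for split nodes
        # instead of the 'BRANCH_LEQ' mode used for scikit-learn trees.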
        attr_pairs['nodes_treeids'].append(tree_id)
        attr_pairs['nodes_nodeids'].append(node_id)
        attr_pairs['nodes_featureids'].append(feature_id)
        attr_pairs['nodes_modes'].append(mode)
        attr_pairs['nodes_values'].append(float(value))
        attr_pairs['nodes_truenodeids'].append(true_child_id)
        attr_pairs['nodes_falsenodeids'].append(false_child_id)
        attr_pairs['nodes_missing_value_tracks_true'].append(missing)
        if 'nodes_hitrates' in attr_pairs:
            attr_pairs['nodes_hitrates'].append(hitrate)  # pragma: no cover
        if mode == 'LEAF':
            if is_classifier:
                for i, w in enumerate(weights):
                    attr_pairs['class_treeids'].append(tree_id)
                    attr_pairs['class_nodeids'].append(node_id)
                    attr_pairs['class_ids'].append(i + weight_id_bias)
                    attr_pairs['class_weights'].append(float(tree_weight * w))
            else:
                for i, w in enumerate(weights):
                    attr_pairs['target_treeids'].append(tree_id)
                    attr_pairs['target_nodeids'].append(node_id)
                    attr_pairs['target_ids'].append(i + weight_id_bias)
                    attr_pairs['target_weights'].append(float(tree_weight * w))
    @staticmethod
    def _fill_node_attributes(treeid, tree_weight, jsnode, attr_pairs, is_classifier, remap):
        if 'children' in jsnode:
            XGBConverter._add_node(
                attr_pairs=attr_pairs, is_classifier=is_classifier,
                tree_id=treeid, tree_weight=tree_weight,
                value=jsnode['split_condition'],
                node_id=remap[jsnode['nodeid']],
                feature_id=jsnode['split'],
                mode='BRANCH_LT',  # 'BRANCH_LEQ' --> is for sklearn
                # ['children'][0]['nodeid'],
                true_child_id=remap[jsnode['yes']],
                # ['children'][1]['nodeid'],
                false_child_id=remap[jsnode['no']],
                weights=None, weight_id_bias=None,
                # ['children'][0]['nodeid'],
                missing=jsnode.get('missing', -1) == jsnode['yes'],
                hitrate=jsnode.get('cover', 0))

            for ch in jsnode['children']:
                if 'children' in ch or 'leaf' in ch:
                    XGBConverter._fill_node_attributes(
                        treeid, tree_weight, ch, attr_pairs, is_classifier, remap)
                else:
                    raise RuntimeError(  # pragma: no cover
                        f"Unable to convert this node {ch}")
        else:
            weights = [jsnode['leaf']]
            weights_id_bias = 0
            XGBConverter._add_node(
                attr_pairs=attr_pairs, is_classifier=is_classifier,
                tree_id=treeid, tree_weight=tree_weight,
                value=0., node_id=remap[jsnode['nodeid']],
                feature_id=0, mode='LEAF',
                true_child_id=0, false_child_id=0,
                weights=weights, weight_id_bias=weights_id_bias,
                missing=False, hitrate=jsnode.get('cover', 0))
    @staticmethod
    def _remap_nodeid(jsnode, remap=None):
        if remap is None:
            remap = {}
        nid = jsnode['nodeid']
        remap[nid] = len(remap)
        if 'children' in jsnode:
            for ch in jsnode['children']:
                XGBConverter._remap_nodeid(ch, remap)
        return remap
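    # _remap_nodeid renumbers the 'nodeid' values of a dumped tree to
    # consecutive integers 0..n-1 in pre-order; the ids in the xgboost dump
    # are not guaranteed to be contiguous, and the remapped ids are what end
    # up in nodes_nodeids / nodes_truenodeids / nodes_falsenodeids.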
    @staticmethod
    def fill_tree_attributes(js_xgb_node, attr_pairs, tree_weights, is_classifier):
        "fills tree attributes"
        if not isinstance(js_xgb_node, list):
            raise TypeError(  # pragma: no cover
                "js_xgb_node must be a list")
        for treeid, (jstree, w) in enumerate(zip(js_xgb_node, tree_weights)):
            remap = XGBConverter._remap_nodeid(jstree)
            XGBConverter._fill_node_attributes(
                treeid, w, jstree, attr_pairs, is_classifier, remap)


class XGBRegressorConverter(XGBConverter):
    "converter class"

    @staticmethod
    def validate(xgb_node):
        return XGBConverter.validate(xgb_node)

    @staticmethod
    def _get_default_tree_attribute_pairs():  # pylint: disable=W0221
        attrs = XGBConverter._get_default_tree_attribute_pairs(False)
        attrs['post_transform'] = 'NONE'
        attrs['n_targets'] = 1
        return attrs

    @staticmethod
    def convert(scope, operator, container):
        "converter method"
        dtype = guess_numpy_type(operator.inputs[0].type)
        if dtype != numpy.float64:
            dtype = numpy.float32
        opsetml = container.target_opset_all.get('ai.onnx.ml', None)
        if opsetml is None:
            opsetml = 3 if container.target_opset >= 16 else 1
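        # The converter targets ai.onnx.ml opset 3 when the main target opset
        # is at least 16, otherwise opset 1; as used below, opset 3 is the
        # version for which _fix_tree_ensemble can rewrite the tree ensemble
        # so that double inputs no longer need the custom
        # 'TreeEnsembleRegressorDouble' operator from the mlprodict domain.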
        xgb_node = operator.raw_operator
        inputs = operator.inputs
        objective, base_score, js_trees = XGBConverter.common_members(
            xgb_node, inputs)

        if objective in ["reg:gamma", "reg:tweedie"]:
            raise RuntimeError(  # pragma: no cover
                f"Objective '{objective}' not supported.")

        booster = xgb_node.get_booster()
        if booster is None:
            raise RuntimeError(  # pragma: no cover
                "The model was probably not trained.")

        best_ntree_limit = getattr(booster, 'best_ntree_limit', len(js_trees))
        if best_ntree_limit < len(js_trees):
            js_trees = js_trees[:best_ntree_limit]
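        # When early stopping was used, the booster exposes best_ntree_limit
        # and only the first best_ntree_limit dumped trees are kept above;
        # otherwise the getattr default leaves js_trees untouched.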
        attr_pairs = XGBRegressorConverter._get_default_tree_attribute_pairs()
        if base_score is None:
            attr_pairs['base_values'] = [0.5]
        else:
            attr_pairs['base_values'] = [base_score]
        XGBConverter.fill_tree_attributes(
            js_trees, attr_pairs, [1 for _ in js_trees], False)

        # add nodes
        if dtype == numpy.float64 and opsetml < 3:
            container.add_node(
                'TreeEnsembleRegressorDouble', operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name(
                    'TreeEnsembleRegressorDouble'),
                op_domain='mlprodict', op_version=1, **attr_pairs)
        else:
            container.add_node(
                'TreeEnsembleRegressor', operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name('TreeEnsembleRegressor'),
                op_domain='ai.onnx.ml', op_version=1, **attr_pairs)
        if opsetml >= 3:
            _fix_tree_ensemble(scope, container, opsetml, dtype)


class XGBClassifierConverter(XGBConverter):
    "converter for XGBClassifier"

    @staticmethod
    def validate(xgb_node):
        return XGBConverter.validate(xgb_node)

    @staticmethod
    def _get_default_tree_attribute_pairs():  # pylint: disable=W0221
        attrs = XGBConverter._get_default_tree_attribute_pairs(True)
        # attrs['nodes_hitrates'] = []
        return attrs

    @staticmethod
    def convert(scope, operator, container):
        "convert method"
        opsetml = container.target_opset_all.get('ai.onnx.ml', None)
        if opsetml is None:
            opsetml = 3 if container.target_opset >= 16 else 1
        dtype = guess_numpy_type(operator.inputs[0].type)
        if dtype != numpy.float64:
            dtype = numpy.float32
        xgb_node = operator.raw_operator
        inputs = operator.inputs

        objective, base_score, js_trees = XGBConverter.common_members(
            xgb_node, inputs)
        params = XGBConverter.get_xgb_params(xgb_node)

        attr_pairs = XGBClassifierConverter._get_default_tree_attribute_pairs()
        XGBConverter.fill_tree_attributes(
            js_trees, attr_pairs, [1 for _ in js_trees], True)
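        # xgboost dumps one tree per class and per boosting round for
        # multi-class objectives, so the division below recovers the number
        # of classes from the dumped trees (it gives 1 for binary models).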
        ncl = (max(attr_pairs['class_treeids']) + 1) // params['n_estimators']

        bst = xgb_node.get_booster()
        best_ntree_limit = getattr(
            bst, 'best_ntree_limit', len(js_trees)) * ncl
        if best_ntree_limit < len(js_trees):
            js_trees = js_trees[:best_ntree_limit]
            attr_pairs = XGBClassifierConverter._get_default_tree_attribute_pairs()
            XGBConverter.fill_tree_attributes(
                js_trees, attr_pairs, [1 for _ in js_trees], True)

        if len(attr_pairs['class_treeids']) == 0:
            raise RuntimeError(  # pragma: no cover
                "XGBoost model is empty.")
        if 'n_estimators' not in params:
            raise RuntimeError(  # pragma: no cover
                f"Parameters not found, existing:\n{pformat(params)}")
        if base_score is None:
            base_score = 0.5
        if ncl <= 1:
            ncl = 2
            # See https://github.com/dmlc/xgboost/blob/master/src/common/math.h#L23.
            attr_pairs['post_transform'] = "LOGISTIC"
            if js_trees[0].get('leaf', None) == 0:
                attr_pairs['base_values'] = [0.5]
            elif base_score != 0.5:
                cst = - numpy.log(1 / numpy.float32(base_score) - 1.)
                attr_pairs['base_values'] = [cst]
            attr_pairs['class_ids'] = [0 for v in attr_pairs['class_treeids']]
        else:
            # See https://github.com/dmlc/xgboost/blob/master/src/common/math.h#L35.
            attr_pairs['post_transform'] = "SOFTMAX"
            # attr_pairs['base_values'] = [base_score for n in range(ncl)]
            attr_pairs['class_ids'] = [v % ncl
                                       for v in attr_pairs['class_treeids']]
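        # Binary models: the single score per example goes through the
        # LOGISTIC post transform, and the base_values entry
        # -log(1 / base_score - 1) == log(base_score / (1 - base_score))
        # is the logit of base_score, so sigmoid(base_values + leaf sum)
        # reproduces xgboost's use of base_score as the initial prediction.
        # Multi-class models: trees are assigned round-robin to classes via
        # class_ids = tree_id % ncl and the scores go through SOFTMAX.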
        classes = xgb_node.classes_
        if (numpy.issubdtype(classes.dtype, numpy.floating) or
                numpy.issubdtype(classes.dtype, numpy.signedinteger)):
            attr_pairs['classlabels_int64s'] = classes.astype('int')
        else:
            classes = numpy.array([s.encode('utf-8') for s in classes])
            attr_pairs['classlabels_strings'] = classes

        if dtype == numpy.float64 and opsetml < 3:
            op_name = "TreeEnsembleClassifierDouble"
        else:
            op_name = "TreeEnsembleClassifier"

        # add nodes
        if objective == "binary:logistic":
            ncl = 2
            container.add_node(
                op_name, operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name(op_name),
                op_domain='ai.onnx.ml', **attr_pairs)
        elif objective == "multi:softprob":
            ncl = len(js_trees) // params['n_estimators']
            container.add_node(
                op_name, operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name(op_name),
                op_domain='ai.onnx.ml', op_version=1, **attr_pairs)
        elif objective == "multi:softmax":
            ncl = len(js_trees) // params['n_estimators']
            container.add_node(
                op_name, operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name(op_name),
                op_domain='ai.onnx.ml', op_version=1, **attr_pairs)
        elif objective == "reg:logistic":
            ncl = len(js_trees) // params['n_estimators']
            if ncl == 1:
                ncl = 2
            container.add_node(
                op_name, operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name(op_name),
                op_domain='ai.onnx.ml', op_version=1, **attr_pairs)
        else:
            raise RuntimeError(  # pragma: no cover
                f"Unexpected objective: {objective}")

        if opsetml >= 3:
            _fix_tree_ensemble(scope, container, opsetml, dtype)


def convert_xgboost(scope, operator, container):
    """
    This converter reuses the code from
    `XGBoost.py <https://github.com/onnx/onnxmltools/blob/master/onnxmltools/convert/
    xgboost/operator_converters/XGBoost.py>`_ and makes
    some modifications. It implements converters
    for models in :epkg:`xgboost`.
    """
    xgb_node = operator.raw_operator
    if isinstance(xgb_node, XGBClassifier):
        cls = XGBClassifierConverter
    else:
        cls = XGBRegressorConverter
    cls.convert(scope, operator, container)
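

# Minimal usage sketch (not part of the module): convert_xgboost is meant to
# be registered as a skl2onnx converter for the xgboost estimators. The lines
# below rely on skl2onnx's update_registered_converter and its generic shape
# calculator; the alias string and the training data X, y are illustrative
# placeholders, not taken from this file.
#
#   from xgboost import XGBRegressor
#   from skl2onnx import update_registered_converter, to_onnx
#   from skl2onnx.common.shape_calculator import (
#       calculate_linear_regressor_output_shapes)
#
#   update_registered_converter(
#       XGBRegressor, 'XGBoostXGBRegressor',
#       calculate_linear_regressor_output_shapes, convert_xgboost)
#
#   model = XGBRegressor(n_estimators=3).fit(X, y)
#   onx = to_onnx(model, X.astype(numpy.float32))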