Coverage for mlprodict/onnx_conv/operator_converters/conv_xgboost.py: 91%

203 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-04 02:28 +0100

1""" 

2@file 

3@brief Modified converter from 

4`XGBoost.py <https://github.com/onnx/onnxmltools/blob/master/onnxmltools/convert/ 

5xgboost/operator_converters/XGBoost.py>`_. 

6""" 

7import json 

8from pprint import pformat 

9import numpy 

10from xgboost import XGBClassifier 

11from skl2onnx.common.data_types import guess_numpy_type # pylint: disable=C0411 

12from ..sklconv.tree_converters import _fix_tree_ensemble 

13 

14 

class XGBConverter:
    "common methods for converters"

    @staticmethod
    def get_xgb_params(xgb_node):
        """
        Retrieves parameters of a model.

        @param      xgb_node    trained XGBoost model
        @return     dictionary of booster parameters, always including
                    ``n_estimators``
        """
        pars = xgb_node.get_xgb_params()
        # xgboost >= 1.0 no longer reports n_estimators through
        # get_xgb_params, read it from the estimator attribute instead.
        if 'n_estimators' not in pars:
            pars['n_estimators'] = xgb_node.n_estimators
        return pars

    @staticmethod
    def validate(xgb_node):
        """
        Validates the model, raises an exception if the model
        misses information needed for the conversion.

        @param      xgb_node    trained XGBoost model
        @raises     RuntimeError when parameter ``objective`` is missing
        """
        params = XGBConverter.get_xgb_params(xgb_node)
        try:
            if "objective" not in params:
                # fixed typo: the missing attribute is 'objective'
                raise AttributeError('objective')
        except AttributeError as e:  # pragma: no cover
            raise RuntimeError('Missing attribute in XGBoost model.') from e

    @staticmethod
    def common_members(xgb_node, inputs):
        """
        Part of the conversion common to regressor and classifier.

        @param      xgb_node    trained XGBoost model
        @param      inputs      operator inputs (kept for API compatibility,
                                not used here)
        @return     tuple *(objective, base_score, js_trees)* where
                    *js_trees* is the list of trees decoded from JSON
        """
        params = XGBConverter.get_xgb_params(xgb_node)
        objective = params["objective"]
        base_score = params["base_score"]
        booster = xgb_node.get_booster()
        # The json format was available in October 2017.
        # XGBoost 0.7 was the first version released with it.
        js_tree_list = booster.get_dump(with_stats=True, dump_format='json')
        js_trees = [json.loads(s) for s in js_tree_list]
        return objective, base_score, js_trees

    @staticmethod
    def _get_default_tree_attribute_pairs(is_classifier):
        """
        Creates the dictionary of attributes expected by ONNX operators
        *TreeEnsembleClassifier* or *TreeEnsembleRegressor*,
        every value is an empty list filled later on.
        """
        attrs = {}
        # node attributes are shared by both tree ensemble operators
        # (tuples instead of set literals: deterministic insertion order)
        for k in ('nodes_treeids', 'nodes_nodeids',
                  'nodes_featureids', 'nodes_modes', 'nodes_values',
                  'nodes_truenodeids', 'nodes_falsenodeids',
                  'nodes_missing_value_tracks_true'):
            attrs[k] = []
        # leaves are described as classes or targets depending on the operator
        if is_classifier:
            for k in ('class_treeids', 'class_nodeids',
                      'class_ids', 'class_weights'):
                attrs[k] = []
        else:
            for k in ('target_treeids', 'target_nodeids',
                      'target_ids', 'target_weights'):
                attrs[k] = []
        return attrs

    @staticmethod
    def _add_node(attr_pairs, is_classifier, tree_id, tree_weight, node_id,
                  feature_id, mode, value, true_child_id, false_child_id,
                  weights, weight_id_bias, missing, hitrate):
        """
        Appends the attributes of one tree node to *attr_pairs*
        (modified inplace). Leaf nodes (``mode == 'LEAF'``) additionally
        append their weights as class or target entries.
        """
        if isinstance(feature_id, str):
            # Something like f0, f1...
            if feature_id[0] == "f":
                try:
                    feature_id = int(feature_id[1:])
                except ValueError as e:  # pragma: no cover
                    raise RuntimeError(
                        f"Unable to interpret '{feature_id}'") from e
            else:  # pragma: no cover
                try:
                    feature_id = int(feature_id)
                except ValueError as e:
                    # bug fix: 'e' was previously unbound in this branch,
                    # raising NameError instead of the intended RuntimeError
                    raise RuntimeError(
                        f"Unable to interpret '{feature_id}'") from e

        # Split condition for sklearn
        # * if X_ptr[X_sample_stride * i + X_fx_stride * node.feature] <= node.threshold:
        # * https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L946
        # Split condition for xgboost
        # * if (fvalue < split_value)
        # * https://github.com/dmlc/xgboost/blob/master/include/xgboost/tree_model.h#L804

        attr_pairs['nodes_treeids'].append(tree_id)
        attr_pairs['nodes_nodeids'].append(node_id)
        attr_pairs['nodes_featureids'].append(feature_id)
        attr_pairs['nodes_modes'].append(mode)
        attr_pairs['nodes_values'].append(float(value))
        attr_pairs['nodes_truenodeids'].append(true_child_id)
        attr_pairs['nodes_falsenodeids'].append(false_child_id)
        attr_pairs['nodes_missing_value_tracks_true'].append(missing)
        if 'nodes_hitrates' in attr_pairs:
            attr_pairs['nodes_hitrates'].append(hitrate)  # pragma: no cover
        if mode == 'LEAF':
            if is_classifier:
                for i, w in enumerate(weights):
                    attr_pairs['class_treeids'].append(tree_id)
                    attr_pairs['class_nodeids'].append(node_id)
                    attr_pairs['class_ids'].append(i + weight_id_bias)
                    attr_pairs['class_weights'].append(float(tree_weight * w))
            else:
                for i, w in enumerate(weights):
                    attr_pairs['target_treeids'].append(tree_id)
                    attr_pairs['target_nodeids'].append(node_id)
                    attr_pairs['target_ids'].append(i + weight_id_bias)
                    attr_pairs['target_weights'].append(float(tree_weight * w))

    @staticmethod
    def _fill_node_attributes(treeid, tree_weight, jsnode, attr_pairs,
                              is_classifier, remap):
        """
        Walks one JSON tree recursively and fills *attr_pairs*
        (modified inplace) with every branch and leaf node.

        @param      remap   mapping from xgboost node ids to
                            contiguous ONNX node ids (see `_remap_nodeid`)
        """
        if 'children' in jsnode:
            XGBConverter._add_node(
                attr_pairs=attr_pairs, is_classifier=is_classifier,
                tree_id=treeid, tree_weight=tree_weight,
                value=jsnode['split_condition'],
                node_id=remap[jsnode['nodeid']],
                feature_id=jsnode['split'],
                mode='BRANCH_LT',  # 'BRANCH_LEQ' --> is for sklearn
                # ['children'][0]['nodeid'],
                true_child_id=remap[jsnode['yes']],
                # ['children'][1]['nodeid'],
                false_child_id=remap[jsnode['no']],
                weights=None, weight_id_bias=None,
                # ['children'][0]['nodeid'],
                missing=jsnode.get('missing', -1) == jsnode['yes'],
                hitrate=jsnode.get('cover', 0))

            for ch in jsnode['children']:
                if 'children' in ch or 'leaf' in ch:
                    XGBConverter._fill_node_attributes(
                        treeid, tree_weight, ch, attr_pairs, is_classifier,
                        remap)
                else:
                    raise RuntimeError(  # pragma: no cover
                        f"Unable to convert this node {ch}")

        else:
            weights = [jsnode['leaf']]
            weights_id_bias = 0
            XGBConverter._add_node(
                attr_pairs=attr_pairs, is_classifier=is_classifier,
                tree_id=treeid, tree_weight=tree_weight,
                value=0., node_id=remap[jsnode['nodeid']],
                feature_id=0, mode='LEAF',
                true_child_id=0, false_child_id=0,
                weights=weights, weight_id_bias=weights_id_bias,
                missing=False, hitrate=jsnode.get('cover', 0))

    @staticmethod
    def _remap_nodeid(jsnode, remap=None):
        """
        Maps xgboost node ids to ids contiguous in depth-first order,
        as expected by the ONNX tree ensemble operators.

        @return     dictionary ``{xgboost nodeid: new nodeid}``
        """
        if remap is None:
            remap = {}
        nid = jsnode['nodeid']
        remap[nid] = len(remap)
        if 'children' in jsnode:
            for ch in jsnode['children']:
                XGBConverter._remap_nodeid(ch, remap)
        return remap

    @staticmethod
    def fill_tree_attributes(js_xgb_node, attr_pairs, tree_weights,
                             is_classifier):
        """
        Fills tree attributes for every tree of the ensemble.

        @param      js_xgb_node     list of trees decoded from JSON
        @param      attr_pairs      attribute dictionary, modified inplace
        @param      tree_weights    one weight per tree
        @param      is_classifier   True for a classifier ensemble
        @raises     TypeError if *js_xgb_node* is not a list
        """
        if not isinstance(js_xgb_node, list):
            raise TypeError(  # pragma: no cover
                "js_xgb_node must be a list")
        for treeid, (jstree, w) in enumerate(zip(js_xgb_node, tree_weights)):
            remap = XGBConverter._remap_nodeid(jstree)
            XGBConverter._fill_node_attributes(
                treeid, w, jstree, attr_pairs, is_classifier, remap)

177 

178 

class XGBRegressorConverter(XGBConverter):
    "converter class"

    @staticmethod
    def validate(xgb_node):
        return XGBConverter.validate(xgb_node)

    @staticmethod
    def _get_default_tree_attribute_pairs():  # pylint: disable=W0221
        # regressor flavour: one target, raw scores (no post transform)
        attrs = XGBConverter._get_default_tree_attribute_pairs(False)
        attrs['post_transform'] = 'NONE'
        attrs['n_targets'] = 1
        return attrs

    @staticmethod
    def convert(scope, operator, container):
        "converter method"
        # Resolve the ai.onnx.ml opset; default depends on the main opset.
        opsetml = container.target_opset_all.get('ai.onnx.ml', None)
        if opsetml is None:
            opsetml = 3 if container.target_opset >= 16 else 1
        dtype = guess_numpy_type(operator.inputs[0].type)
        if dtype != numpy.float64:
            dtype = numpy.float32
        xgb_node = operator.raw_operator
        objective, base_score, js_trees = XGBConverter.common_members(
            xgb_node, operator.inputs)

        if objective in ["reg:gamma", "reg:tweedie"]:
            raise RuntimeError(  # pragma: no cover
                f"Objective '{objective}' not supported.")

        booster = xgb_node.get_booster()
        if booster is None:
            raise RuntimeError(  # pragma: no cover
                "The model was probably not trained.")

        # Early stopping may have selected fewer trees than were dumped.
        ntree_limit = getattr(booster, 'best_ntree_limit', len(js_trees))
        if ntree_limit < len(js_trees):
            js_trees = js_trees[:ntree_limit]

        attr_pairs = XGBRegressorConverter._get_default_tree_attribute_pairs()
        attr_pairs['base_values'] = (
            [0.5] if base_score is None else [base_score])
        XGBConverter.fill_tree_attributes(
            js_trees, attr_pairs, [1] * len(js_trees), False)

        # add nodes
        if dtype == numpy.float64 and opsetml < 3:
            # double precision uses a runtime-specific operator
            container.add_node(
                'TreeEnsembleRegressorDouble', operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name(
                    'TreeEnsembleRegressorDouble'),
                op_domain='mlprodict', op_version=1, **attr_pairs)
        else:
            container.add_node(
                'TreeEnsembleRegressor', operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name('TreeEnsembleRegressor'),
                op_domain='ai.onnx.ml', op_version=1, **attr_pairs)
        if opsetml >= 3:
            _fix_tree_ensemble(scope, container, opsetml, dtype)

244 

245 

class XGBClassifierConverter(XGBConverter):
    "converter for XGBClassifier"

    @staticmethod
    def validate(xgb_node):
        return XGBConverter.validate(xgb_node)

    @staticmethod
    def _get_default_tree_attribute_pairs():  # pylint: disable=W0221
        attrs = XGBConverter._get_default_tree_attribute_pairs(True)
        # attrs['nodes_hitrates'] = []
        return attrs

    @staticmethod
    def convert(scope, operator, container):
        "convert method"
        # Resolve the ai.onnx.ml opset; default depends on the main opset.
        opsetml = container.target_opset_all.get('ai.onnx.ml', None)
        if opsetml is None:
            opsetml = 3 if container.target_opset >= 16 else 1
        dtype = guess_numpy_type(operator.inputs[0].type)
        if dtype != numpy.float64:
            dtype = numpy.float32
        xgb_node = operator.raw_operator

        objective, base_score, js_trees = XGBConverter.common_members(
            xgb_node, operator.inputs)
        params = XGBConverter.get_xgb_params(xgb_node)

        # A first pass fills the attributes; the number of classes is
        # deduced from the number of tree ids per estimator.
        attr_pairs = XGBClassifierConverter._get_default_tree_attribute_pairs()
        XGBConverter.fill_tree_attributes(
            js_trees, attr_pairs, [1] * len(js_trees), True)
        ncl = (max(attr_pairs['class_treeids']) + 1) // params['n_estimators']

        # Early stopping may have selected fewer trees than were dumped;
        # in that case the attributes are rebuilt from the truncated list.
        booster = xgb_node.get_booster()
        ntree_limit = getattr(booster, 'best_ntree_limit', len(js_trees)) * ncl
        if ntree_limit < len(js_trees):
            js_trees = js_trees[:ntree_limit]
            attr_pairs = \
                XGBClassifierConverter._get_default_tree_attribute_pairs()
            XGBConverter.fill_tree_attributes(
                js_trees, attr_pairs, [1] * len(js_trees), True)

        if len(attr_pairs['class_treeids']) == 0:
            raise RuntimeError(  # pragma: no cover
                "XGBoost model is empty.")
        if 'n_estimators' not in params:
            raise RuntimeError(  # pragma: no cover
                f"Parameters not found, existing:\n{pformat(params)}")
        if base_score is None:
            base_score = 0.5

        if ncl <= 1:
            ncl = 2
            # See https://github.com/dmlc/xgboost/blob/master/src/common/math.h#L23.
            attr_pairs['post_transform'] = "LOGISTIC"
            if js_trees[0].get('leaf', None) == 0:
                attr_pairs['base_values'] = [0.5]
            elif base_score != 0.5:
                cst = - numpy.log(1 / numpy.float32(base_score) - 1.)
                attr_pairs['base_values'] = [cst]
            attr_pairs['class_ids'] = [0 for v in attr_pairs['class_treeids']]
        else:
            # See https://github.com/dmlc/xgboost/blob/master/src/common/math.h#L35.
            attr_pairs['post_transform'] = "SOFTMAX"
            # attr_pairs['base_values'] = [base_score for n in range(ncl)]
            attr_pairs['class_ids'] = [v % ncl
                                       for v in attr_pairs['class_treeids']]

        classes = xgb_node.classes_
        if (numpy.issubdtype(classes.dtype, numpy.floating) or
                numpy.issubdtype(classes.dtype, numpy.signedinteger)):
            attr_pairs['classlabels_int64s'] = classes.astype('int')
        else:
            classes = numpy.array([s.encode('utf-8') for s in classes])
            attr_pairs['classlabels_strings'] = classes

        # double precision uses a runtime-specific operator before opset 3
        op_name = ("TreeEnsembleClassifierDouble"
                   if dtype == numpy.float64 and opsetml < 3
                   else "TreeEnsembleClassifier")

        # add nodes
        if objective == "binary:logistic":
            ncl = 2
            container.add_node(op_name, operator.input_full_names,
                               operator.output_full_names,
                               name=scope.get_unique_operator_name(
                                   op_name),
                               op_domain='ai.onnx.ml', **attr_pairs)
        elif objective in ("multi:softprob", "multi:softmax"):
            ncl = len(js_trees) // params['n_estimators']
            container.add_node(
                op_name, operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name(op_name),
                op_domain='ai.onnx.ml', op_version=1, **attr_pairs)
        elif objective == "reg:logistic":
            ncl = len(js_trees) // params['n_estimators']
            if ncl == 1:
                ncl = 2
            container.add_node(
                op_name, operator.input_full_names,
                operator.output_full_names,
                name=scope.get_unique_operator_name(op_name),
                op_domain='ai.onnx.ml', op_version=1, **attr_pairs)
        else:
            raise RuntimeError(  # pragma: no cover
                f"Unexpected objective: {objective}")

        if opsetml >= 3:
            _fix_tree_ensemble(scope, container, opsetml, dtype)

365 

366 

def convert_xgboost(scope, operator, container):
    """
    This converters reuses the code from
    `XGBoost.py <https://github.com/onnx/onnxmltools/blob/master/onnxmltools/convert/
    xgboost/operator_converters/XGBoost.py>`_ and makes
    some modifications. It implements converters
    for models in :epkg:`xgboost`.
    """
    model = operator.raw_operator
    # dispatch on the estimator type: classifier or regressor
    converter = (XGBClassifierConverter
                 if isinstance(model, XGBClassifier)
                 else XGBRegressorConverter)
    converter.convert(scope, operator, container)