Coverage for mlinsights/mlmodel/decision_tree_logreg.py: 99%

189 statements  

coverage.py v7.1.0, created at 2023-02-28 08:46 +0100

1""" 

2@file 

3@brief Builds a tree of logistic regressions. 

4""" 

5import numpy 

6import scipy.sparse as sparse # pylint: disable=R0402 

7from sklearn.linear_model import LogisticRegression 

8from sklearn.base import BaseEstimator, ClassifierMixin, clone 

9from sklearn.linear_model._base import LinearClassifierMixin 

10 

11 

def logistic(x):
    """
    Computes :math:`\\frac{1}{1 + e^{-x}}`.
    """
    return 1. / (1. + numpy.exp(-x))


def likelihood(x, y, theta=1., th=0.):
    """
    Computes :math:`y_i f(\\theta (x_i - x_0)) + (1 - y_i) (1 - f(\\theta (x_i - x_0)))`
    for every sample, where :math:`f(x)` is :math:`\\frac{1}{1 + e^{-x}}`,
    *th* plays the role of :math:`x_0` and *theta* the role of :math:`\\theta`.
    The result is elementwise; callers sum it to obtain the full likelihood.
    """
    lr = logistic((x - th) * theta)
    return y * lr + (1. - y) * (1 - lr)
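
# Hedged illustration, not part of the original module: the two helpers above are
# vectorized over numpy arrays; the input values below are made up for the example.
def _example_logistic_likelihood():  # pragma: no cover
    x = numpy.array([0., 2.])
    y = numpy.array([0., 1.])
    p = logistic(x)  # array([0.5, 0.8808...])
    # per-sample terms around the threshold th=1.; summing them gives the
    # likelihood quantity used by fit_improve further below
    terms = likelihood(x, y, theta=1., th=1.)  # array([0.7311..., 0.7311...])
    return p, terms.sum()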

class _DecisionTreeLogisticRegressionNode:
    """
    Describes the tree structure held by class
    @see cl DecisionTreeLogisticRegression.
    See also notebook :ref:`decisiontreelogregrst`.
    """

    def __init__(self, estimator, threshold=0.5, depth=1, index=0):
        """
        constructor

        @param estimator binary estimator
        @param threshold probability threshold used to split the samples
        @param depth depth of this node in the tree
        @param index index of this node in the tree
        """
        self.index = index
        self.estimator = estimator
        self.above = None
        self.below = None
        self.threshold = threshold
        self.depth = depth

    def predict(self, X):
        """
        Predicts the binary label from the probabilities.
        """
        prob = self.predict_proba(X)
        return (prob[:, 1] >= 0.5).astype(numpy.int32)

    def predict_proba(self, X):
        """
        Returns the classification probabilities.

        @param X features
        @return probabilities
        """
        prob = self.estimator.predict_proba(X)
        above = prob[:, 1] > self.threshold
        below = ~ above
        n_above = above.sum()
        n_below = below.sum()
        if self.above is not None and n_above > 0:
            prob_above = self.above.predict_proba(X[above])
            prob[above] = prob_above
        if self.below is not None and n_below > 0:
            prob_below = self.below.predict_proba(X[below])
            prob[below] = prob_below
        return prob

    def decision_path(self, X, mat, indices):
        """
        Fills the decision path followed by the samples in *X*.

        @param X features
        @param mat decision path (allocated sparse matrix)
        @param indices indices of the rows of *X* in *mat*
        """
        mat[indices, self.index] = 1
        prob = self.estimator.predict_proba(X)
        above = prob[:, 1] > self.threshold
        below = ~ above
        n_above = above.sum()
        n_below = below.sum()
        indices_above = indices[above]
        indices_below = indices[below]
        if self.above is not None and n_above > 0:
            self.above.decision_path(X[above], mat, indices_above)
        if self.below is not None and n_below > 0:
            self.below.decision_path(X[below], mat, indices_below)

    def fit(self, X, y, sample_weight, dtlr, total_N):
        """
        Fits a logistic regression, then splits the sample into
        positive and negative examples, finally tries to fit
        logistic regressions on both subsamples. This method only
        works on a linear classifier.

        @param X features
        @param y binary labels
        @param sample_weight weights of every sample
        @param dtlr @see cl DecisionTreeLogisticRegression
        @param total_N total number of observations
        @return last index
        """
        self.estimator.fit(X, y, sample_weight=sample_weight)
        if dtlr.verbose >= 1:
            print("[DTLR ] %s trained acc %1.2f N=%d" % (  # pragma: no cover
                " " * self.depth, self.estimator.score(X, y), X.shape[0]))
        prob = self.fit_improve(dtlr, total_N, X, y,
                                sample_weight=sample_weight)

        if self.depth + 1 > dtlr.max_depth:
            return self.index
        if X.shape[0] < dtlr.min_samples_split:
            return self.index

        above = prob[:, 1] > self.threshold
        below = ~ above
        n_above = above.sum()
        n_below = below.sum()
        y_above = set(y[above])
        y_below = set(y[below])

        def _fit_side(index, y_above_below, above_below, n_above_below, side):
            if dtlr.verbose >= 1:
                print("[DTLR*] %s%s: n_class=%d N=%d - %d/%d" % (  # pragma: no cover
                    " " * self.depth, side,
                    len(y_above_below), above_below.shape[0],
                    n_above_below, total_N))
            if (len(y_above_below) > 1 and
                    above_below.shape[0] > dtlr.min_samples_leaf * 2 and
                    (float(n_above_below) / total_N >=
                        dtlr.min_weight_fraction_leaf * 2) and
                    n_above_below < total_N):
                estimator = clone(dtlr.estimator)
                sw = sample_weight[above_below] if sample_weight is not None else None
                node = _DecisionTreeLogisticRegressionNode(
                    estimator, self.threshold, depth=self.depth + 1, index=index)
                last_index = node.fit(
                    X[above_below], y[above_below], sw, dtlr, total_N)
                return node, last_index
            return None, index

        self.above, last = _fit_side(
            self.index + 1, y_above, above, n_above, "above")
        self.below, last = _fit_side(
            last + 1, y_below, below, n_below, "below")
        return last

    @property
    def tree_depth_(self):
        """
        Returns the maximum depth of the tree.
        """
        dt = self.depth
        if self.above is not None:
            dt = max(dt, self.above.tree_depth_)
        if self.below is not None:
            dt = max(dt, self.below.tree_depth_)
        return dt

    def fit_improve(self, dtlr, total_N, X, y, sample_weight):
        """
        The method only works on a linear classifier, it changes
        the intercept in order to be within the constraints
        imposed by *min_samples_leaf* and *min_weight_fraction_leaf*.
        The algorithm has a significant cost as it sorts every observation
        and chooses the best intercept.

        @param dtlr @see cl DecisionTreeLogisticRegression
        @param total_N total number of observations
        @param X features
        @param y labels
        @param sample_weight sample weight
        @return probabilities
        """
        if self.estimator is None:
            raise RuntimeError(
                "Estimator was not trained.")  # pragma: no cover
        prob = self.estimator.predict_proba(X)
        if dtlr.fit_improve_algo in (None, 'none'):
            return prob

        if not isinstance(self.estimator, LinearClassifierMixin):
            # The classifier is not linear and cannot be improved.
            if dtlr.fit_improve_algo == 'intercept_sort_always':  # pragma: no cover
                raise RuntimeError(
                    f"The model is not linear "
                    f"({self.estimator.__class__.__name__!r}), "
                    f"intercept cannot be improved.")
            return prob

        above = prob[:, 1] > self.threshold
        below = ~ above
        n_above = above.sum()
        n_below = below.sum()
        n_min = min(n_above, n_below)
        p1p2 = float(n_above * n_below) / X.shape[0] ** 2
        if dtlr.verbose >= 2:
            print("[DTLRI] %s imp %d <> %d, p1p2=%1.3f <> %1.3f" % (  # pragma: no cover
                " " * self.depth, n_min, dtlr.min_samples_leaf,
                p1p2, dtlr.p1p2))
        if (n_min >= dtlr.min_samples_leaf and
                float(n_min) / total_N >= dtlr.min_weight_fraction_leaf and
                p1p2 > dtlr.p1p2 and
                dtlr.fit_improve_algo != 'intercept_sort_always'):
            return prob

        coef = self.estimator.coef_
        decision_function = (X @ coef.T).ravel()
        order = numpy.argsort(decision_function, axis=0)
        begin = dtlr.min_samples_leaf

        sorted_df = decision_function[order]
        # sort the labels consistently with the decision function
        sorted_y = y[order]
        N = sorted_y.shape[0]

        best = None
        besti = None
        beta_best = None
        for i in range(begin, N - begin):
            beta = - sorted_df[i]
            like = numpy.sum(likelihood(decision_function + beta, y)) / N
            w = float(i * (N - i)) / N**2
            like += w * dtlr.gamma
            if besti is None or like > best:
                best = like
                besti = i
                beta_best = beta

        if beta_best is not None:
            if dtlr.verbose >= 1:
                print("[DTLRI] %s change intercept %f --> %f in [%f, %f]" % (  # pragma: no cover
                    " " * self.depth, self.estimator.intercept_, beta_best,
                    - sorted_df[-1], - sorted_df[0]))
            self.estimator.intercept_ = beta_best
            prob = self.estimator.predict_proba(X)
        return prob

    def enumerate_leaves_index(self):
        """
        Enumerates the indices of the leaves.
        """
        if self.above is None or self.below is None:
            yield self.index
        if self.above is not None:
            for index in self.above.enumerate_leaves_index():
                yield index
        if self.below is not None:
            for index in self.below.enumerate_leaves_index():
                yield index
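
# Hedged sketch, not part of the original module: the intercept search done by
# fit_improve above, restated on plain arrays. Every admissible cut of the sorted
# decision function is tried as a new intercept and the one maximizing
# likelihood + gamma * p * (1 - p) is kept. All names below are illustrative.
def _example_intercept_search(decision_function, y, min_samples_leaf=2, gamma=1.):  # pragma: no cover
    order = numpy.argsort(decision_function)
    sorted_df = decision_function[order]
    N = sorted_df.shape[0]
    best = None
    beta_best = None
    for i in range(min_samples_leaf, N - min_samples_leaf):
        beta = -sorted_df[i]  # candidate intercept putting i samples on one side
        like = numpy.sum(likelihood(decision_function + beta, y)) / N
        like += gamma * float(i * (N - i)) / N ** 2  # favours balanced splits
        if best is None or like > best:
            best = like
            beta_best = beta
    return beta_best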

class DecisionTreeLogisticRegression(BaseEstimator, ClassifierMixin):
    """
    Fits a logistic regression, then fits two other
    logistic regressions on the observations on each side
    of the border. It goes on until a tree is built.
    It only handles binary classification.
    The built tree cannot be deeper than the maximum recursion depth.

    :param estimator: binary classification estimator,
        if None, a logistic regression is used; the theoretical
        model is defined with a logistic regression but any
        binary classifier works
    :param max_depth: int, default=20
        The maximum depth of the tree. Nodes are expanded until
        all leaves are pure, until all leaves contain less than
        min_samples_split samples, or until this depth is reached.
        It cannot be None and must stay below the maximum
        recursion depth allowed by Python.
    :param min_samples_split: int or float, default=2
        The minimum number of samples required to split an internal node:
        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.
    :param min_samples_leaf: int or float, default=2
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches. This may have the effect of smoothing the model,
        especially in regression.
        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.
    :param min_weight_fraction_leaf: float, default=0.0
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.
    :param fit_improve_algo: string, one of the following values:
        - `'auto'`: chooses the best option below, `'none'` for
          every non-linear model, `'intercept_sort'` for linear models
        - `'none'`: does nothing once the binary classifier is fit
        - `'intercept_sort'`: if one side of the classifier is too small,
          the method chooses the best intercept verifying
          the constraints
        - `'intercept_sort_always'`: always chooses the best intercept
          possible
    :param p1p2: threshold in [0, 1]
        for every split, we can define probabilities :math:`p_1 p_2`
        which define the ratio of samples in both splits,
        if :math:`p_1 p_2` is below the threshold,
        method *fit_improve* is called
    :param gamma: weight before the coefficient :math:`p (1-p)`.
        When the model tries to improve the linear classifier,
        it looks for a better intercept which maximizes the
        likelihood and verifies the constraints.
        In order to force the classifier to choose a value
        which splits the dataset into 2 almost equal folds,
        the function maximizes :math:`likelihood + \\gamma p (1 - p)`
        where *p* is the proportion of samples falling in the first
        fold.
    :param verbose: prints out information about the training
    :param strategy: `'parallel'` or `'perpendicular'`,
        see below

    Fitted attributes:

    * `classes_`: ndarray of shape (n_classes,) or list of ndarray
        The classes labels (single output problem),
        or a list of arrays of class labels (multi-output problem).
    * `tree_`: Tree
        The underlying Tree object.

    The class implements two strategies to build the tree.
    The first one `'parallel'` splits the feature space using
    the hyperplane defined by a logistic regression, the second
    strategy `'perpendicular'` splits the feature space based on
    a hyperplane perpendicular to a logistic regression. By doing
    this, two logistic regressions fitted on both subparts
    necessarily decrease the training error.
    """

    _fit_improve_algo_values = (
        None, 'none', 'auto', 'intercept_sort', 'intercept_sort_always')

    def __init__(self, estimator=None,
                 max_depth=20, min_samples_split=2,
                 min_samples_leaf=2, min_weight_fraction_leaf=0.0,
                 fit_improve_algo='auto', p1p2=0.09,
                 gamma=1., verbose=0, strategy='parallel'):
        "constructor"
        ClassifierMixin.__init__(self)
        BaseEstimator.__init__(self)
        # logistic regression
        if estimator is None:
            self.estimator = LogisticRegression()
        else:
            self.estimator = estimator
        if max_depth is None:
            raise ValueError("'max_depth' cannot be None.")  # pragma: no cover
        if max_depth > 1024:
            raise ValueError(
                "'max_depth' must be <= 1024.")  # pragma: no cover
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.fit_improve_algo = fit_improve_algo
        self.p1p2 = p1p2
        self.gamma = gamma
        self.verbose = verbose
        self.strategy = strategy

        if self.fit_improve_algo not in DecisionTreeLogisticRegression._fit_improve_algo_values:
            raise ValueError(
                f"fit_improve_algo={self.fit_improve_algo!r} "
                f"not in {DecisionTreeLogisticRegression._fit_improve_algo_values}.")

    def fit(self, X, y, sample_weight=None):
        """
        Builds the tree model.

        :param X: numpy array or sparse matrix of shape [n_samples, n_features]
            Training data
        :param y: numpy array of shape [n_samples]
            Target values. Will be cast to X's dtype if necessary
        :param sample_weight: numpy array of shape [n_samples]
            Individual weights for each sample
        :return: self : returns an instance of self.

        Fitted attributes:

        * `classes_`: classes
        * `tree_`: tree structure, see @see cl _DecisionTreeLogisticRegressionNode
        * `n_nodes_`: number of nodes
        """
        if not isinstance(X, numpy.ndarray):
            if hasattr(X, 'values'):
                X = X.values
            if not isinstance(X, numpy.ndarray):
                raise TypeError("'X' must be an array.")
        if (sample_weight is not None and
                not isinstance(sample_weight, numpy.ndarray)):
            raise TypeError(
                "'sample_weight' must be an array.")  # pragma: no cover
        self.classes_ = numpy.array(sorted(set(y)))
        if len(self.classes_) != 2:
            raise RuntimeError(
                f"The model only supports binary classification but labels are "
                f"{self.classes_}.")

        if self.strategy == 'parallel':
            return self._fit_parallel(X, y, sample_weight)
        if self.strategy == 'perpendicular':
            return self._fit_perpendicular(X, y, sample_weight)
        raise ValueError(
            f"Unknown strategy '{self.strategy}'.")

    def _fit_parallel(self, X, y, sample_weight):
        "Implements the parallel strategy."
        cls = (y == self.classes_[1]).astype(numpy.int32)
        estimator = clone(self.estimator)
        self.tree_ = _DecisionTreeLogisticRegressionNode(estimator, 0.5)
        self.n_nodes_ = self.tree_.fit(
            X, cls, sample_weight, self, X.shape[0]) + 1
        return self

    def _fit_perpendicular(self, X, y, sample_weight):
        "Implements the perpendicular strategy."
        raise NotImplementedError()  # pragma: no cover

    def predict(self, X):
        """
        Runs the predictions.
        """
        labels = self.tree_.predict(X)
        return numpy.take(self.classes_, labels)

    def predict_proba(self, X):
        """
        Returns the classification probabilities.
        """
        return self.tree_.predict_proba(X)

    def decision_function(self, X):
        """
        Not implemented for this model, raises an exception.
        """
        raise NotImplementedError(  # pragma: no cover
            "Decision function is not available for this model.")

    @property
    def tree_depth_(self):
        """
        Returns the maximum depth of the tree.
        """
        return self.tree_.tree_depth_

    def decision_path(self, X, check_input=True):
        """
        Returns the decision path.

        @param X inputs
        @param check_input unused
        @return sparse matrix of shape (n_samples, n_nodes_)
        """
        mat = sparse.lil_matrix((X.shape[0], self.n_nodes_), dtype=numpy.int32)
        self.tree_.decision_path(X, mat, numpy.arange(X.shape[0]))
        return sparse.csr_matrix(mat)

    def get_leaves_index(self):
        """
        Returns the index of every leaf.
        """
        indices = self.tree_.enumerate_leaves_index()
        return numpy.array(sorted(indices))
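
# Minimal usage sketch, not part of the original module; the synthetic dataset and
# parameter values below are made up for illustration only.
if __name__ == "__main__":  # pragma: no cover
    from sklearn.datasets import make_classification

    X_demo, y_demo = make_classification(
        n_samples=200, n_features=4, n_informative=2, n_redundant=0,
        random_state=0)
    model = DecisionTreeLogisticRegression(max_depth=3)
    model.fit(X_demo, y_demo)
    proba = model.predict_proba(X_demo)   # shape (200, 2)
    labels = model.predict(X_demo)        # values taken from model.classes_
    path = model.decision_path(X_demo)    # sparse matrix (200, model.n_nodes_)
    leaves = model.get_leaves_index()     # sorted indices of the leaves
    print("depth=%d nodes=%d accuracy=%1.3f" % (
        model.tree_depth_, model.n_nodes_, (labels == y_demo).mean()))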