Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Implements a kind of piecewise linear regression by modifying 

5the criterion used by the algorithm which builds a decision tree. 

6""" 

7import numpy 

8from sklearn.tree import DecisionTreeRegressor 

9 

10 

11class PiecewiseTreeRegressor(DecisionTreeRegressor): 

12 """ 

13 Implements a kind of piecewise linear regression by modifying 

14 the criterion used by the algorithm which builds a decision tree. 

15 See :epkg:`sklearn:tree:DecisionTreeRegressor` to get the meaning 

16 of the parameters except criterion: 

17 

18 * ``mselin``: optimizes for a piecewise linear regression 

19 * ``simple``: optimizes for a stepwise regression (equivalent to *mse*) 

20 """ 

21 

22 def __init__(self, criterion='mselin', splitter='best', max_depth=None, 

23 min_samples_split=2, min_samples_leaf=1, 

24 min_weight_fraction_leaf=0.0, max_features=None, 

25 random_state=None, max_leaf_nodes=None, 

26 min_impurity_decrease=0.0): 

27 DecisionTreeRegressor.__init__( 

28 self, criterion=criterion, 

29 splitter=splitter, max_depth=max_depth, 

30 min_samples_split=min_samples_split, 

31 min_samples_leaf=min_samples_leaf, 

32 min_weight_fraction_leaf=min_weight_fraction_leaf, 

33 max_features=max_features, random_state=random_state, 

34 max_leaf_nodes=max_leaf_nodes, 

35 min_impurity_decrease=min_impurity_decrease) 

36 

37 def fit(self, X, y, sample_weight=None, check_input=True, 

38 X_idx_sorted=None): 

39 """ 

40 Replaces the string stored in criterion by an instance of a class. 

41 """ 

42 replace = None 

43 if isinstance(self.criterion, str): 

44 if self.criterion == 'mselin': 

45 from .piecewise_tree_regression_criterion_linear import ( # pylint: disable=E0611,C0415 

46 LinearRegressorCriterion) 

47 replace = self.criterion 

48 self.criterion = LinearRegressorCriterion(X) 

49 elif self.criterion == "simple": 

50 from .piecewise_tree_regression_criterion_fast import ( # pylint: disable=E0611,C0415 

51 SimpleRegressorCriterionFast) 

52 replace = self.criterion 

53 self.criterion = SimpleRegressorCriterionFast(X) 

54 else: 

55 replace = None 

56 

57 DecisionTreeRegressor.fit(self, X, y, sample_weight=sample_weight, check_input=check_input, 

58 X_idx_sorted=X_idx_sorted) 

59 

60 if replace: 

61 self.criterion = replace 

62 

63 if self.criterion == "mselin": 

64 self._fit_reglin(X, y, sample_weight) 

65 return self 

66 

67 def _mapping_train(self, X): 

68 tree = self.tree_ 

69 leaves = [i for i in range(len(tree.children_left)) 

70 if tree.children_left[i] <= i and tree.children_right[i] <= i] # pylint: disable=E1136 

71 dec_path = self.decision_path(X) 

72 association = numpy.zeros((X.shape[0],)) 

73 association[:] = -1 

74 mapping = {} 

75 ntree = 0 

76 for j in leaves: 

77 ind = dec_path[:, j] == 1 

78 ind = numpy.asarray(ind.todense()).flatten() 

79 if not numpy.any(ind): 

80 # No training example for this bucket. 

81 continue 

82 mapping[j] = ntree 

83 association[ind] = ntree 

84 ntree += 1 

85 return mapping 

86 

87 def predict_leaves(self, X): 

88 """ 

89 Returns the leave index for each observation of *X*. 

90 

91 :param X: array 

92 :return: array 

93 leaves index in ``self.leaves_index_`` 

94 """ 

95 # The creation of the sparse matrix could be avoided. 

96 leaves = self.decision_path(X) 

97 leaves = leaves[:, self.leaves_index_] 

98 mat = numpy.argmax(leaves, 1) 

99 res = numpy.asarray(mat).ravel() 

100 return res 

101 

102 def _fit_reglin(self, X, y, sample_weight): 

103 """ 

104 Fits linear regressions for all leaves. 

105 Sets attributes ``leaves_mapping_``, ``betas_``, ``leaves_index_``. 

106 The first attribute is a dictionary ``{leave: row}`` 

107 which maps a leave of the tree to the coefficients 

108 ``betas_[row, :]`` of a regression trained on all training 

109 points mapped a specific leave. ``leaves_index_`` keeps 

110 in memory a set of leaves. 

111 """ 

112 from .piecewise_tree_regression_criterion_linear import ( # pylint: disable=E0611,C0415 

113 LinearRegressorCriterion) 

114 

115 tree = self.tree_ 

116 self.leaves_index_ = [i for i in range(len(tree.children_left)) 

117 if tree.children_left[i] <= i and tree.children_right[i] <= i] # pylint: disable=E1136 

118 if tree.n_leaves != len(self.leaves_index_): 

119 raise RuntimeError( # pragma: no cover 

120 "Unexpected number of leaves {} != {}".format( 

121 tree.n_leaves, len(self.leaves_index_))) 

122 pred_leaves = self.predict_leaves(X) 

123 self.leaves_mapping_ = {k: i for i, k in enumerate(pred_leaves)} 

124 self.betas_ = numpy.empty((len(self.leaves_index_), X.shape[1] + 1)) 

125 for i, _ in enumerate(self.leaves_index_): 

126 ind = pred_leaves == i 

127 xs = X[ind, :].copy() 

128 ys = y[ind].astype(numpy.float64) 

129 if len(ys.shape) == 1: 

130 ys = ys[:, numpy.newaxis] 

131 ys = ys.copy() 

132 ws = sample_weight[ind].copy() if sample_weight else None 

133 dec = LinearRegressorCriterion.create(xs, ys, ws) 

134 dec.node_beta(self.betas_[i, :]) 

135 

136 def predict(self, X, check_input=True): 

137 """ 

138 Overloads method *predict*. Falls back into 

139 the predict from a decision tree is criterion is 

140 *mse*, *mae*, *simple*. Computes the predictions 

141 from linear regression if the criterion is *mselin*. 

142 """ 

143 if self.criterion == 'mselin': 

144 return self._predict_reglin(X, check_input=check_input) 

145 return DecisionTreeRegressor.predict(self, X, check_input=check_input) 

146 

147 def _predict_reglin(self, X, check_input=True): 

148 """ 

149 Computes the predictions with a linear regression 

150 fitted with the observations mapped to each leave 

151 of the tree. 

152 

153 :param X: array-like or sparse matrix of shape = [n_samples, n_features] 

154 The input samples. Internally, it will be converted to 

155 ``dtype=np.float32`` and if a sparse matrix is provided 

156 to a sparse ``csr_matrix``. 

157 :param check_input: boolean, (default=True) 

158 Allow to bypass several input checking. 

159 Don't use this parameter unless you know what you do. 

160 :return: y, array of shape = [n_samples] or [n_samples, n_outputs] 

161 The predicted classes, or the predict values. 

162 """ 

163 leaves = self.predict_leaves(X) 

164 pred = numpy.ones((X.shape[0], 1)) 

165 Xone = numpy.hstack([X, pred]) 

166 for i in range(0, X.shape[0]): 

167 li = leaves[i] 

168 pred[i] = numpy.dot(Xone[i, :], self.betas_[li, :]) 

169 return pred.ravel()