Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2@file 

3@brief Helpers on logistic regression. 

4""" 

5import numpy 

6from pandas import DataFrame 

7 

8 

def random_set_1d(n, kind):
    """
    Builds a random dataset as described in example
    :ref:`l-example-logistic-decision`.

    @param      n       number of observations
    @param      kind    2, 3, 4 (see example)
    @return             tuple *(X, y)*: *X* is a 2D array with two
                        features, *y* the binary labels (``int32``)
    @raise              ValueError if *kind* is not in (2, 3, 4)
    """
    # First feature, drawn uniformly in [-1, 2); it alone determines the label.
    x = numpy.random.rand(n) * 3 - 1
    y = numpy.empty(x.shape, dtype=numpy.int32)
    if kind == 2:
        # single threshold at 0
        y[x < 0] = 0
        y[x >= 0] = 1
    elif kind == 3:
        # class 1 inside [0, 1], class 0 outside
        y[x < 0] = 0
        y[(x >= 0) & (x <= 1)] = 1
        y[x > 1] = 0
    elif kind == 4:
        # alternating classes over four intervals
        y[x < 0] = 0
        y[(x >= 0) & (x <= 0.8)] = 1
        y[(x >= 0.8) & (x <= 1.5)] = 0
        y[x > 1.5] = 1
    else:
        raise ValueError('kind must be in (2, 3, 4).')
    # Second feature is pure noise and does not affect the label.
    x2 = numpy.random.rand(n)
    return numpy.vstack([x, x2]).T, y

38 

39 

def plot_ds(X, y, ax=None, title=None):
    """
    Draws a scatter plot of a two-feature dataset *X*,
    coloring each point by its binary label in *y*.
    Returns the matplotlib axis used.
    """
    if ax is None:
        import matplotlib.pyplot as plt  # pragma: no cover
        ax = plt.gca()  # pragma: no cover
    label_colors = {0: '#88CCCC', 1: '#CCCC88'}
    point_colors = [label_colors[label] for label in y]
    ax.scatter(X[:, 0], X[:, 1], c=point_colors, s=20, edgecolor='k', lw=0.1)
    if title is not None:
        ax.set_title(title)
    return ax

54 

55 

def plog2(p):
    """
    Computes :math:`x \\ln_2 x`, returning 0 when *p* is 0
    (the limit value of the expression).
    """
    return 0 if p == 0 else numpy.log(p) * p / numpy.log(2)

63 

64 

def logistic(x):
    """
    Sigmoid function :math:`\\frac{1}{1 + e^{-x}}`,
    works on scalars and numpy arrays alike.
    """
    denominator = numpy.exp(-x) + 1.
    return 1. / denominator

70 

71 

def likelihood(x, y, theta=1., th=0.):
    """
    Computes the per-observation terms
    :math:`y_i f(\\theta (x_i - x_0)) + (1 - y_i) (1 - f(\\theta (x_i - x_0)))`
    where :math:`f(x_i)` is :math:`\\frac{1}{1 + e^{-x}}`
    (callers sum the result to obtain the likelihood).
    """
    # sigmoid of theta * (x - th), written inline
    p = 1. / (1. + numpy.exp(-(x - th) * theta))
    return y * p + (1. - y) * (1 - p)

79 

80 

def criteria(X, y):
    """
    Computes Gini, information gain, likelihood on a dataset
    with two features assuming the first coordinates is used to classify.

    @param      X       2D matrix
    @param      y       binary labels
    @return             dataframe
    """
    n = X.shape[0]
    res = numpy.empty((n, 8))
    res[:, 0] = X[:, 0]
    res[:, 1] = y
    # sort the observations by the first feature
    res = res[numpy.argsort(res[:, 0]), :].copy()
    x = res[:, 0].copy()
    y = res[:, 1].copy()

    # every interior position is a candidate split
    for split in range(1, n - 1):
        # proportion of positives on each side of the split
        left = numpy.sum(y[:split]) / split
        right = numpy.sum(y[split:]) / (y.shape[0] - split)
        res[split, 2] = left
        res[split, 3] = right
        # Gini impurity summed over both sides
        res[split, 4] = (1 - left ** 2 - (1 - left) ** 2
                         + 1 - right ** 2 - (1 - right) ** 2)
        # entropy of both sides
        res[split, 5] = (- plog2(left) - plog2(1 - left)
                         - plog2(right) - plog2(1 - right))
        threshold = x[split]
        res[split, 6] = logistic(threshold)
        # average logistic likelihood with unit slope at this threshold
        res[split, 7] = numpy.sum(likelihood(x, y, 1., threshold)) / n
    return DataFrame(res[1:-1], columns=['X', 'y', 'p1', 'p2',
                                         'Gini', 'Gain', 'lr', 'LL'])

111 

112 

def criteria2(X, y):
    """
    Computes Gini, information gain, likelihood on a dataset
    with two features assuming the first coordinates is used to classify.

    @param      X       2D matrix
    @param      y       binary labels
    @return             dataframe
    """
    n = X.shape[0]
    res = numpy.empty((n, 5))
    res[:, 0] = X[:, 0]
    res[:, 1] = y
    # sort the observations by the first feature
    res = res[numpy.argsort(res[:, 0]), :].copy()
    x = res[:, 0].copy()
    y = res[:, 1].copy()

    # every interior position is a candidate split
    for split in range(1, n - 1):
        threshold = x[split]
        # average likelihood at this threshold, best of both slope
        # signs, for increasingly steep slopes
        for col, slope in ((2, 1.), (3, 10.), (4, 100.)):
            res[split, col] = max(
                numpy.sum(likelihood(x, y, slope, threshold)),
                numpy.sum(likelihood(x, y, -slope, threshold))) / n
    return DataFrame(res[1:-1], columns=['X', 'y', 'LL', 'LL-10', 'LL-100'])