Coverage for src/mlstatpy/ml/logreg.py: 100%

1"""

2@file

3@brief Helpers for logistic regression.

4"""

5import numpy

6from pandas import DataFrame

def random_set_1d(n, kind):
    """
    Draws a random binary dataset with two features as described in example
    :ref:`l-example-logistic-decision`. The first feature is uniform
    on *[-1, 2)* and fully determines the label, the second feature
    is uniform on *[0, 1)* and plays no role in the label.

    @param      n       number of observations
    @param      kind    number of label intervals along the first feature:
                        2 (label 1 when the feature is positive or null),
                        3 (label 1 only inside *[0, 1]*),
                        4 (label 1 on *[0, 0.8)* and above 1.5)
    @return             tuple *(X, y)*, *X* is a 2D array with *n* rows
                        and two columns, *y* holds the *n* int32 labels
    @raises     ValueError if *kind* is not 2, 3 or 4
    """
    first = numpy.random.rand(n) * 3 - 1
    if kind == 2:
        is_one = first >= 0
    elif kind == 3:
        is_one = (first >= 0) & (first <= 1)
    elif kind == 4:
        # The bound 0.8 belongs to the interval labelled 0.
        is_one = ((first >= 0) & (first < 0.8)) | (first > 1.5)
    else:
        raise ValueError('kind must be in (2, 3, 4).')
    labels = is_one.astype(numpy.int32)
    second = numpy.random.rand(n)
    return numpy.vstack([first, second]).T, labels

def plot_ds(X, y, ax=None, title=None):
    """
    Draws a scatter plot of a dataset with two features,
    every point is coloured according to its binary label.

    @param      X       2D array, the first two columns are the coordinates
    @param      y       iterable of labels, every value must be 0 or 1
    @param      ax      axis to draw on, the current *matplotlib* axis
                        is fetched if None
    @param      title   title given to the axis, skipped if None
    @return             the axis the points were drawn on
    """
    if ax is None:
        import matplotlib.pyplot as plt  # pragma: no cover
        ax = plt.gca()  # pragma: no cover
    palette = {0: '#88CCCC', 1: '#CCCC88'}
    # A label outside the palette raises a KeyError.
    point_colors = [palette[label] for label in y]
    abscissa, ordinate = X[:, 0], X[:, 1]
    ax.scatter(abscissa, ordinate, c=point_colors,
               s=20, edgecolor='k', lw=0.1)
    if title is not None:
        ax.set_title(title)
    return ax

def plog2(p):
    """
    Computes :math:`p \\log_2 p`, the value is 0 when *p* is null.

    @param      p   a scalar, the comparison to zero must give a single boolean
    @return         0 if *p* is 0, ``p * log(p) / log(2)`` otherwise
    """
    if p != 0:
        return p * numpy.log(p) / numpy.log(2)
    # log(0) is not defined, the limit of p log(p) is returned instead.
    return 0

def logistic(x):
    """
    Evaluates the logistic function :math:`\\frac{1}{1 + e^{-x}}`.

    @param      x   a scalar or a numpy array, the function is applied
                    to every element
    @return         value(s) of the logistic function at *x*
    """
    denominator = 1. + numpy.exp(-x)
    return 1. / denominator

def likelihood(x, y, theta=1., th=0.):
    """
    Computes the likelihood of every observation for a logistic model
    with one feature,
    :math:`y_i f(\\theta (x_i - x_0)) + (1 - y_i) (1 - f(\\theta (x_i - x_0)))`
    where :math:`f(z)` is :math:`\\frac{1}{1 + e^{-z}}` and :math:`x_0`
    is the threshold *th*. The terms are returned as they are,
    the function does not sum them.

    @param      x       feature value(s)
    @param      y       binary label(s) associated to *x*
    @param      theta   slope applied to the shifted feature
    @param      th      threshold subtracted from *x*
    @return             likelihood of every couple *(x, y)*
    """
    proba_one = logistic((x - th) * theta)
    when_one = y * proba_one
    when_zero = (1. - y) * (1 - proba_one)
    return when_one + when_zero

def criteria(X, y):
    """
    Evaluates several split criteria on a dataset, only the first
    column of *X* is used. The observations are sorted along that
    feature and every sorted position *i*, except the first and the
    last one, is considered as a split which separates the first *i*
    observations from the other ones.

    The returned dataframe has one row per split and the following columns:

    * *X*: feature value at the split, also used as a threshold
    * *y*: label of that observation
    * *p1*, *p2*: ratio of labels 1 before the split, from the split on
    * *Gini*: sum of the Gini impurities of both sides
    * *Gain*: sum of the entropies (base 2) of both sides
    * *lr*: logistic function applied to the threshold
    * *LL*: sum of the likelihoods obtained with a slope equal to 1
      and this threshold, divided by the number of observations

    @param      X       2D matrix
    @param      y       binary labels
    @return             dataframe
    """
    n_obs = X.shape[0]
    table = numpy.empty((n_obs, 8))
    table[:, 0] = X[:, 0]
    table[:, 1] = y
    # Sorts the observations along the first feature.
    table = table[numpy.argsort(table[:, 0]), :].copy()
    feature = table[:, 0].copy()
    labels = table[:, 1].copy()

    for split in range(1, n_obs - 1):
        # ratio of labels 1 on each side of the split
        left = numpy.sum(labels[:split]) / split
        right = numpy.sum(labels[split:]) / (labels.shape[0] - split)
        threshold = feature[split]
        table[split, 2] = left
        table[split, 3] = right
        table[split, 4] = (1 - left**2 - (1 - left)**2 +
                           1 - right**2 - (1 - right)**2)
        table[split, 5] = (- plog2(left) - plog2(1 - left) -
                           plog2(right) - plog2(1 - right))
        table[split, 6] = logistic(threshold)
        table[split, 7] = numpy.sum(
            likelihood(feature, labels, 1., threshold)) / n_obs

    # The first and last rows were never filled, they are left out.
    names = ['X', 'y', 'p1', 'p2', 'Gini', 'Gain', 'lr', 'LL']
    return DataFrame(table[1:-1], columns=names)

111

112

def criteria2(X, y):
    """
    Evaluates the likelihood of a logistic model with a fixed slope
    for many thresholds, only the first column of *X* is used.
    The observations are sorted along that feature and every sorted
    value, except the first and the last one, is used as a threshold.
    For each slope magnitude 1, 10, 100, the best of the two signs is kept.

    The returned dataframe has one row per threshold and the following columns:

    * *X*: feature value used as a threshold
    * *y*: label of that observation
    * *LL*, *LL-10*, *LL-100*: highest sum of likelihoods over
      the slopes *+s* and *-s* for *s* equal to 1, 10, 100, divided by
      the number of observations

    @param      X       2D matrix
    @param      y       binary labels
    @return             dataframe
    """
    n_obs = X.shape[0]
    table = numpy.empty((n_obs, 5))
    table[:, 0] = X[:, 0]
    table[:, 1] = y
    # Sorts the observations along the first feature.
    table = table[numpy.argsort(table[:, 0]), :].copy()
    feature = table[:, 0].copy()
    labels = table[:, 1].copy()

    for row in range(1, n_obs - 1):
        threshold = feature[row]
        # Columns 2, 3, 4 receive the slopes 1, 10, 100.
        for col, slope in enumerate((1., 10., 100.), start=2):
            increasing = numpy.sum(
                likelihood(feature, labels, slope, threshold))
            decreasing = numpy.sum(
                likelihood(feature, labels, -slope, threshold))
            table[row, col] = max(increasing, decreasing) / n_obs

    # The first and last rows were never filled, they are left out.
    names = ['X', 'y', 'LL', 'LL-10', 'LL-100']
    return DataFrame(table[1:-1], columns=names)