Coverage for src/mlstatpy/ml/logreg.py: 100%
74 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-27 05:59 +0100
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-27 05:59 +0100
1"""
2@file
3@brief Helpers on logistic regression.
4"""
5import numpy
6from pandas import DataFrame
def random_set_1d(n, kind):
    """
    Draws a random dataset with two features as described in example
    :ref:`l-example-logistic-decision`.

    The first feature is drawn uniformly in :math:`[-1, 2)` and fully
    determines the label, the second one is drawn uniformly in
    :math:`[0, 1)` and is never used to build the label.

    @param      n       number of observations
    @param      kind    2, 3 or 4 (see example), it selects the labelling
                        rule applied to the first feature *x*:
                        2: label is 1 if :math:`x \\geqslant 0`,
                        3: label is 1 if :math:`0 \\leqslant x \\leqslant 1`,
                        4: label is 1 if :math:`0 \\leqslant x < 0.8`
                        or :math:`x > 1.5`
    @return             tuple *(X, y)*, *X* is a 2D array of shape *(n, 2)*,
                        *y* is a vector of *n* labels (int32, 0 or 1)

    A *ValueError* is raised for any other value of *kind*.
    """
    # The first feature is drawn before *kind* is checked and the second
    # one after the labels are built: this keeps the sequence of calls
    # to the random generator stable.
    feature = numpy.random.rand(n) * 3 - 1

    if kind == 2:
        positive = feature >= 0
    elif kind == 3:
        positive = (feature >= 0) & (feature <= 1)
    elif kind == 4:
        # The boundary 0.8 itself belongs to the negative class.
        positive = ((feature >= 0) & (feature < 0.8)) | (feature > 1.5)
    else:
        raise ValueError('kind must be in (2, 3, 4).')

    labels = positive.astype(numpy.int32)
    noise = numpy.random.rand(n)
    return numpy.vstack((feature, noise)).transpose(), labels
def plot_ds(X, y, ax=None, title=None):
    """
    Draws a scatter plot of a dataset with two features,
    every point is colored according to its binary label.

    @param      X       2D array, first column is used as abscissa,
                        second column as ordinate
    @param      y       labels, every value must be 0 or 1
    @param      ax      axis to draw on, if None, the current
                        matplotlib axis is used
    @param      title   title of the graph, ignored if None
    @return             the axis the points were drawn on
    """
    if ax is None:
        import matplotlib.pyplot as plt  # pragma: no cover
        ax = plt.gca()  # pragma: no cover

    # One color per class, any other label raises a KeyError.
    palette = {0: '#88CCCC', 1: '#CCCC88'}
    point_colors = list(map(palette.__getitem__, y))

    ax.scatter(X[:, 0], X[:, 1], c=point_colors, s=20,
               edgecolor='k', lw=0.1)
    if title is not None:
        ax.set_title(title)
    return ax
def plog2(p):
    """
    Computes :math:`p \\log_2 p`, extended by continuity
    with value 0 when :math:`p = 0`.

    @param      p       a scalar or an array-like of values
    @return             for a scalar, a scalar (exactly 0 if *p* is 0),
                        for an array-like, an array of floats with
                        the same shape computed element-wise
    """
    if numpy.ndim(p) == 0:
        # Scalar path.
        if p == 0:
            return 0
        return p * numpy.log(p) / numpy.log(2)

    # Array path: ``p == 0`` cannot be used as a condition on arrays,
    # the logarithm is only evaluated where the value is not null
    # to avoid producing ``0 * -inf = nan``.
    values = numpy.asarray(p, dtype=numpy.float64)
    result = numpy.zeros(values.shape, dtype=numpy.float64)
    nonzero = values != 0
    selected = values[nonzero]
    result[nonzero] = selected * numpy.log(selected) / numpy.log(2)
    return result
def logistic(x):
    """
    Computes the logistic function :math:`\\frac{1}{1 + e^{-x}}`.

    @param      x       a scalar or a numpy array
    @return             the logistic function applied to *x*
                        (element-wise for an array)
    """
    denominator = 1. + numpy.exp(-x)
    return 1. / denominator
def likelihood(x, y, theta=1., th=0.):
    """
    Computes the terms
    :math:`y_i f(\\theta (x_i - x_0)) + (1 - y_i) (1 - f(\\theta (x_i - x_0)))`
    where :math:`f(x)` is :math:`\\frac{1}{1 + e^{-x}}`.

    The terms are returned individually, one per observation,
    they are not summed by this function.

    @param      x       feature values :math:`x_i`
    @param      y       binary labels :math:`y_i`
    @param      theta   slope :math:`\\theta` of the logistic function
    @param      th      threshold :math:`x_0` the logistic function
                        is centered on
    @return             value of the expression above for every observation
    """
    shifted = (x - th) * theta
    proba_one = logistic(shifted)
    proba_zero = 1 - proba_one
    return y * proba_one + (1. - y) * proba_zero
def criteria(X, y):
    """
    Computes several split criteria on a dataset with two features
    assuming only the first coordinate is used to classify.

    The observations are sorted by increasing first coordinate.
    Every observation but the first and the last one is used as a
    threshold splitting the sorted sample into a left part (before it)
    and a right part (itself and after). The output has one row per
    threshold and the following columns:

    * ``X``: threshold (first coordinate of the observation)
    * ``y``: label of the observation
    * ``p1``, ``p2``: average label on the left part, on the right part
    * ``Gini``: sum of the Gini impurities of both parts
    * ``Gain``: sum of the entropies (base 2) of both parts
    * ``lr``: logistic function applied to the threshold
    * ``LL``: average over the whole sample of the terms returned
      by *likelihood* for a slope 1 centered on the threshold

    @param      X       2D matrix
    @param      y       binary labels
    @return             dataframe
    """
    n_obs = X.shape[0]
    table = numpy.empty((n_obs, 8))
    table[:, 0] = X[:, 0]
    table[:, 1] = y
    table = table[numpy.argsort(table[:, 0]), :].copy()
    x = table[:, 0].copy()
    y = table[:, 1].copy()

    # Columns 2 to 7 stay uninitialized on the first and the last rows,
    # both rows are removed from the final result.
    for i, th in enumerate(x[1:-1], start=1):
        p1 = numpy.sum(y[:i]) / i
        p2 = numpy.sum(y[i:]) / (n_obs - i)
        gini = 1 - p1**2 - (1 - p1)**2 + 1 - p2**2 - (1 - p2)**2
        entropy = - plog2(p1) - plog2(1 - p1) - plog2(p2) - plog2(1 - p2)
        mean_lik = numpy.sum(likelihood(x, y, 1., th)) / n_obs
        table[i, 2:] = (p1, p2, gini, entropy, logistic(th), mean_lik)

    return DataFrame(
        table[1:-1],
        columns=['X', 'y', 'p1', 'p2', 'Gini', 'Gain', 'lr', 'LL'])
def criteria2(X, y):
    """
    Computes likelihood based criteria on a dataset with two features
    assuming only the first coordinate is used to classify.

    The observations are sorted by increasing first coordinate.
    Every observation but the first and the last one is used as a
    threshold the logistic function is centered on. For a slope of
    magnitude 1, 10 and 100, the terms returned by *likelihood* are
    summed over the whole sample for the positive and the negative
    slope, the highest of both sums is kept and divided by the number
    of observations. The output has one row per threshold and
    the following columns:

    * ``X``: threshold (first coordinate of the observation)
    * ``y``: label of the observation
    * ``LL``: criterion for a slope of magnitude 1
    * ``LL-10``: criterion for a slope of magnitude 10
    * ``LL-100``: criterion for a slope of magnitude 100

    @param      X       2D matrix
    @param      y       binary labels
    @return             dataframe
    """
    n_obs = X.shape[0]
    table = numpy.empty((n_obs, 5))
    table[:, 0] = X[:, 0]
    table[:, 1] = y
    table = table[numpy.argsort(table[:, 0]), :].copy()
    x = table[:, 0].copy()
    y = table[:, 1].copy()

    # Columns 2 to 4 stay uninitialized on the first and the last rows,
    # both rows are removed from the final result.
    for i, th in enumerate(x[1:-1], start=1):
        for col, slope in enumerate((1., 10., 100.), start=2):
            increasing = numpy.sum(likelihood(x, y, slope, th))
            decreasing = numpy.sum(likelihood(x, y, -slope, th))
            table[i, col] = max(increasing, decreasing) / n_obs

    return DataFrame(
        table[1:-1], columns=['X', 'y', 'LL', 'LL-10', 'LL-100'])