# Coverage for src/ensae_teaching_cs/homeblog/table_formula_stat.py: 56%

## 63 statements

Report created at 2023-01-27 05:44 +0100

1# -*- coding: utf8 -*-

2"""

3@file

4@brief Contains TableFormulaStat.

5"""

8class _TableFormulaStat:

9 """

10 Contains various statistical functions.

12 ::

14 table = TableFormula ("sum_y#1#1#1#1#1#1#1#1#1#1#1".replace(" ", "\\t").replace("#","\\n"))

15 gini = table.Gini (lambda v : v["sum_y"])

16 print (gini) # expects 1

18 table = TableFormula ("sum_y#1#1#1#1#1#1#1#1#1#1#1#5#10".replace(" ", "\\t").replace("#","\\n"))

19 gini = table.Gini (lambda v : v["sum_y"])

20 print (gini) # expects much more less than 1

22 """

24 def GiniCurve(self, functionY, functionX=None, isXdx=False):

25 """

26 Computes the Gini curve, takes the following parameters.

28 @param functionY revenues

29 @param functionX sum of persons having an income below Y

30 (or having Y is isXdx is True)

31 @param isXdx number of persons equal to Y (True) or inferior (False),

32 if True, X,Y couples are sorted

33 @return a curve (x, Gini(x))

34 """

35 couples = [(0., 0.)]

36 for i, row in enumerate(self.values):

37 v = self._interpret_row(row)

38 x = functionX(v) if functionX is not None else float(i + 1)

39 y = functionY(v)

40 couples.append((x, y))

41 if y < 0:

42 raise ValueError(

43 "a value should not be negative for y: " + str(y))

44 if x < 0:

45 raise ValueError(

46 "a value should not be negative for x: " + str(x))

48 if not isXdx:

49 couples.sort()

51 sumx = sum(_[0] for _ in couples) if isXdx else max(_[0]

52 for _ in couples)

53 sumy = sum(_[1] for _ in couples)

54 couples = [[_[0] / sumx, _[1] / sumy] for _ in couples]

56 for i in range(1, len(couples)):

57 couples[i][1] += couples[i - 1][1]

58 if isXdx:

59 couples[i][0] += couples[i - 1][0]

60 for _ in (0, 1):

61 couples[i][_] = min(couples[i][_], 1.)

63 return self._private_getclass()(["x", "Gini(x)"], couples)

65 def Gini(self, functionY, functionX=None, isXdx=False):

66 """

67 computes the Gini, it calls GiniCurve (@see me GiniCurve),

68 it takes the following parameters:

69 @param functionY revenues

70 @param functionX sum of persons having an income below Y

71 (or having Y is isXdx is True)

72 @param isXdx number of persons equal to Y (True) or inferior (False),

73 if True, X,Y couples are sorted

74 @return a curve (x, Gini(x))

75 """

76 giniC = self.GiniCurve(functionY, functionX, isXdx)

77 gini = 0.

78 row_ = giniC.values[0]

80 for i in range(1, len(giniC)):

81 row = giniC.values[i]

82 dx = row[0] - row_[0]

83 y = row[1] + row_[1]

84 gini += dx * y

85 row_ = row

87 return 1. - gini

89 def summary(self):

90 """

91 produces a summary on each columns

92 @return TableFormulaStat

93 """

95 row = []

97 res = self.summary_column(col)

98 row.append(res)

100 return self._private_getclass()(row)

102 def summary_column(self, column_name):

103 """

104 produces a summary of a column, it the column is numerical, it

105 computes, the min, max, quantile, mean, med, std. If it is not,

106 count the number of distinct values.

107 The function considers an empty column as a non-numerical column.

108 The fonction do not consider None values.

110 @param column_name column name

111 @return dictionary

112 """

113 vals = self.select(lambda v: v[column_name])

114 vals = [_ for _ in vals if _ is not None]

115 missing = len(self) - len(vals)

117 if len(vals) > 0:

118 try:

119 s = sum(vals)

120 s2 = sum([v**2 for v in vals])

121 m = s / len(vals)

122 vals.sort()

123 res = {"ave": m,

124 "std": (s2 / len(vals) - m**2) ** 0.5,

125 "med": vals[len(vals) // 2],

126 "min": vals[0],

127 "max": vals[-1],

128 "1qua": vals[len(vals) * 1 // 4],

129 "3qua": vals[len(vals) * 3 // 4],

130 "02.5%": vals[len(vals) * 25 // 1000],

131 "97.5%": vals[len(vals) * 975 // 1000],

132 }

133 except TypeError:

134 count = {}

135 for v in vals:

136 count[v] = count.get(v, 0) + 1

137 res = {"count": len(count)}

138 else:

139 res = {"count": 0}

141 if missing > 0:

142 res["missing"] = missing

143 res["var"] = column_name

145 return res