Coverage for src/ensae_teaching_cs/homeblog/table_formula_stat.py: 56%

63 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-01-27 05:44 +0100

1# -*- coding: utf8 -*- 

2""" 

3@file 

4@brief Contains TableFormulaStat. 

5""" 

6 

7 

8class _TableFormulaStat: 

9 """ 

10 Contains various statistical functions. 

11 

12 :: 

13 

14 table = TableFormula ("sum_y#1#1#1#1#1#1#1#1#1#1#1".replace(" ", "\\t").replace("#","\\n")) 

15 gini = table.Gini (lambda v : v["sum_y"]) 

16 print (gini) # expects 1 

17 

18 table = TableFormula ("sum_y#1#1#1#1#1#1#1#1#1#1#1#5#10".replace(" ", "\\t").replace("#","\\n")) 

19 gini = table.Gini (lambda v : v["sum_y"]) 

20 print (gini) # expects much more less than 1 

21 

22 """ 

23 

24 def GiniCurve(self, functionY, functionX=None, isXdx=False): 

25 """ 

26 Computes the Gini curve, takes the following parameters. 

27 

28 @param functionY revenues 

29 @param functionX sum of persons having an income below Y 

30 (or having Y is isXdx is True) 

31 @param isXdx number of persons equal to Y (True) or inferior (False), 

32 if True, X,Y couples are sorted 

33 @return a curve (x, Gini(x)) 

34 """ 

35 couples = [(0., 0.)] 

36 for i, row in enumerate(self.values): 

37 v = self._interpret_row(row) 

38 x = functionX(v) if functionX is not None else float(i + 1) 

39 y = functionY(v) 

40 couples.append((x, y)) 

41 if y < 0: 

42 raise ValueError( 

43 "a value should not be negative for y: " + str(y)) 

44 if x < 0: 

45 raise ValueError( 

46 "a value should not be negative for x: " + str(x)) 

47 

48 if not isXdx: 

49 couples.sort() 

50 

51 sumx = sum(_[0] for _ in couples) if isXdx else max(_[0] 

52 for _ in couples) 

53 sumy = sum(_[1] for _ in couples) 

54 couples = [[_[0] / sumx, _[1] / sumy] for _ in couples] 

55 

56 for i in range(1, len(couples)): 

57 couples[i][1] += couples[i - 1][1] 

58 if isXdx: 

59 couples[i][0] += couples[i - 1][0] 

60 for _ in (0, 1): 

61 couples[i][_] = min(couples[i][_], 1.) 

62 

63 return self._private_getclass()(["x", "Gini(x)"], couples) 

64 

65 def Gini(self, functionY, functionX=None, isXdx=False): 

66 """ 

67 computes the Gini, it calls GiniCurve (@see me GiniCurve), 

68 it takes the following parameters: 

69 @param functionY revenues 

70 @param functionX sum of persons having an income below Y 

71 (or having Y is isXdx is True) 

72 @param isXdx number of persons equal to Y (True) or inferior (False), 

73 if True, X,Y couples are sorted 

74 @return a curve (x, Gini(x)) 

75 """ 

76 giniC = self.GiniCurve(functionY, functionX, isXdx) 

77 gini = 0. 

78 row_ = giniC.values[0] 

79 

80 for i in range(1, len(giniC)): 

81 row = giniC.values[i] 

82 dx = row[0] - row_[0] 

83 y = row[1] + row_[1] 

84 gini += dx * y 

85 row_ = row 

86 

87 return 1. - gini 

88 

89 def summary(self): 

90 """ 

91 produces a summary on each columns 

92 @return TableFormulaStat 

93 """ 

94 

95 row = [] 

96 for col in self.header: 

97 res = self.summary_column(col) 

98 row.append(res) 

99 

100 return self._private_getclass()(row) 

101 

102 def summary_column(self, column_name): 

103 """ 

104 produces a summary of a column, it the column is numerical, it 

105 computes, the min, max, quantile, mean, med, std. If it is not, 

106 count the number of distinct values. 

107 The function considers an empty column as a non-numerical column. 

108 The fonction do not consider None values. 

109 

110 @param column_name column name 

111 @return dictionary 

112 """ 

113 vals = self.select(lambda v: v[column_name]) 

114 vals = [_ for _ in vals if _ is not None] 

115 missing = len(self) - len(vals) 

116 

117 if len(vals) > 0: 

118 try: 

119 s = sum(vals) 

120 s2 = sum([v**2 for v in vals]) 

121 m = s / len(vals) 

122 vals.sort() 

123 res = {"ave": m, 

124 "std": (s2 / len(vals) - m**2) ** 0.5, 

125 "med": vals[len(vals) // 2], 

126 "min": vals[0], 

127 "max": vals[-1], 

128 "1qua": vals[len(vals) * 1 // 4], 

129 "3qua": vals[len(vals) * 3 // 4], 

130 "02.5%": vals[len(vals) * 25 // 1000], 

131 "97.5%": vals[len(vals) * 975 // 1000], 

132 } 

133 except TypeError: 

134 count = {} 

135 for v in vals: 

136 count[v] = count.get(v, 0) + 1 

137 res = {"count": len(count)} 

138 else: 

139 res = {"count": 0} 

140 

141 if missing > 0: 

142 res["missing"] = missing 

143 res["var"] = column_name 

144 

145 return res