Coverage for mlinsights/metrics/correlations.py: 100%

52 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-08-09 08:45 +0200

1""" 

2@file 

3@brief Correlations. 

4""" 

5import numpy 

6from sklearn.model_selection import train_test_split 

7from sklearn.preprocessing import scale 

8from sklearn import clone 

9 

10 

11def non_linear_correlations(df, model, draws=5, minmax=False): 

12 """ 

13 Computes non linear correlations. 

14 

15 @param df :epkg:`pandas:DataFrame` or 

16 :epkg:`numpy:array` 

17 @param model machine learned model used to compute 

18 the correlations 

19 @param draws number of tries for :epkg:`bootstrap`, 

20 the correlation is the average of the results 

21 obtained at each draw 

22 @param minmax if True, returns three matrices correlations, min, max, 

23 only the correlation matrix if False 

24 @return see parameter minmax 

25 

26 `Pearson Correlations <https://fr.wikipedia.org/wiki/Corr%C3%A9lation_(statistiques)>`_ 

27 is: 

28 

29 .. math:: 

30 

31 cor(X_i, X_j) = \\frac{cov(X_i, Y_i)}{\\sigma(X_i)\\sigma(X_j)} 

32 

33 If variables are centered, :math:`\\mathbb{E}X_i=\\mathbb{E}X_j=0`, 

34 it becomes: 

35 

36 .. math:: 

37 

38 cor(X_i, X_j) = \\frac{\\mathbb{E}(X_i X_j)}{\\sqrt{\\mathbb{E}X_i^2 \\mathbb{E}X_j^2}} 

39 

40 If rescaled, :math:`\\mathbb{E}X_i^2=\\mathbb{E}X_j^2=1`, 

41 then it becomes :math:`cor(X_i, X_j) = \\mathbb{E}(X_i X_j)`. 

42 Let's assume we try to find a coefficient such as 

43 :math:`\\alpha_{ij}` minimizes the standard deviation 

44 of noise :math:`\\epsilon_{ij}`: 

45 

46 .. math:: 

47 

48 X_j = \\alpha_{ij}X_i + \\epsilon_{ij} 

49 

50 It is like if coefficient :math:`\\alpha_{ij}` comes from a 

51 a linear regression which minimizes 

52 :math:`\\mathbb{E}(X_j - \\alpha_{ij}X_i)^2`. 

53 If variable :math:`X_i`, :math:`X_j` are centered 

54 and rescaled: :math:`\\alpha_{ij}^* = \\mathbb{E}(X_i X_j) = cor(X_i, X_j)`. 

55 We extend that definition to function :math:`f` of parameter :math:`\\omega` 

56 defined as: :math:`f(\\omega, X) \\rightarrow \\mathbb{R}`. 

57 :math:`f` is not linear anymore. 

58 Let's assume parameter :math:`\\omega^*` minimizes 

59 quantity :math:`\\min_\\omega (X_j - f(\\omega, X_i))^2`. 

60 Then :math:`X_j = \\alpha_{ij} \\frac{f(\\omega^*, X_i)}{\\alpha_{ij}} + \\epsilon_{ij}` 

61 and we choose :math:`\\alpha_{ij}` such as 

62 :math:`\\mathbb{E}\\left(\\frac{f(\\omega^*, X_i)^2}{\\alpha_{ij}^2}\\right) = 1`. 

63 Let's define a non linear correlation bounded by :math:`f` as: 

64 

65 .. math:: 

66 

67 cor^f(X_i, X_j) = \\sqrt{ \\mathbb{E} (f(\\omega, X_i)^2 )} 

68 

69 We can verify that this value is in interval`:math:`[0,1]``. 

70 That also means that there is no negative correlation. 

71 :math:`f` is a machine learned model and most of them 

72 usually overfit the data. The database is split into 

73 two parts, one is used to train the model, the other 

74 one to compute the correlation. The same split are used 

75 for every coefficient. The returned matrix is not 

76 necessarily symmetric. 

77 

78 .. exref:: 

79 :title: Compute non linear correlations 

80 

81 The following example compute non linear correlations 

82 on :epkg:`Iris` datasets based on a 

83 :epkg:`RandomForestRegressor` model. 

84 

85 .. runpython:: 

86 :showcode: 

87 :warningout: FutureWarning 

88 

89 import pandas 

90 from sklearn import datasets 

91 from sklearn.ensemble import RandomForestRegressor 

92 from mlinsights.metrics import non_linear_correlations 

93 

94 iris = datasets.load_iris() 

95 X = iris.data[:, :4] 

96 df = pandas.DataFrame(X) 

97 df.columns = ["X1", "X2", "X3", "X4"] 

98 cor = non_linear_correlations(df, RandomForestRegressor()) 

99 print(cor) 

100 

101 """ 

102 

103 if hasattr(df, 'iloc'): 

104 cor = df.corr() 

105 cor.iloc[:, :] = 0. 

106 iloc = True 

107 if minmax: 

108 mini = cor.copy() 

109 maxi = cor.copy() 

110 else: 

111 cor = numpy.corrcoef(df, rowvar=False) 

112 cor[:, :] = 0. 

113 iloc = False 

114 if minmax: 

115 mini = cor.copy() 

116 maxi = cor.copy() 

117 df = scale(df) 

118 

119 for k in range(0, draws): 

120 df_train, df_test = train_test_split(df, test_size=0.5) 

121 for i in range(cor.shape[0]): 

122 xi_train = df_train[:, i:i + 1] 

123 xi_test = df_test[:, i:i + 1] 

124 for j in range(cor.shape[1]): 

125 xj_train = df_train[:, j:j + 1] 

126 xj_test = df_test[:, j:j + 1] 

127 if len(xj_test) == 0 or len(xi_test) == 0: 

128 raise ValueError( # pragma: no cover 

129 f"One column is empty i={i} j={j}.") 

130 mod = clone(model) 

131 try: 

132 mod.fit(xi_train, xj_train.ravel()) 

133 except Exception as e: # pragma: no cover 

134 raise ValueError( 

135 f"Unable to compute correlation for i={i} j={j}.") from e 

136 v = mod.predict(xi_test) 

137 c = (1 - numpy.var(v - xj_test.ravel())) 

138 co = max(c, 0) ** 0.5 

139 if iloc: 

140 cor.iloc[i, j] += co 

141 if minmax: 

142 if k == 0: 

143 mini.iloc[i, j] = co 

144 maxi.iloc[i, j] = co 

145 else: 

146 mini.iloc[i, j] = min(mini.iloc[i, j], co) 

147 maxi.iloc[i, j] = max(maxi.iloc[i, j], co) 

148 else: 

149 cor[i, j] += co 

150 if minmax: 

151 if k == 0: 

152 mini[i, j] = co 

153 maxi[i, j] = co 

154 else: 

155 mini[i, j] = min(mini[i, j], co) 

156 maxi[i, j] = max(maxi[i, j], co) 

157 if minmax: 

158 return cor / draws, mini, maxi 

159 return cor / draws