Coverage for mlinsights/metrics/correlations.py: 100%

1"""

2@file

3@brief Correlations.

4"""

5import numpy

6from sklearn.model_selection import train_test_split

7from sklearn.preprocessing import scale

8from sklearn import clone

11def non_linear_correlations(df, model, draws=5, minmax=False):

12 """

13 Computes non linear correlations.

15 @param df :epkg:`pandas:DataFrame` or

16 :epkg:`numpy:array`

17 @param model machine learned model used to compute

18 the correlations

19 @param draws number of tries for :epkg:`bootstrap`,

20 the correlation is the average of the results

21 obtained at each draw

22 @param minmax if True, returns three matrices correlations, min, max,

23 only the correlation matrix if False

24 @return see parameter minmax

26 `Pearson Correlations <https://fr.wikipedia.org/wiki/Corr%C3%A9lation_(statistiques)>`_

27 is:

29 .. math::

31 cor(X_i, X_j) = \\frac{cov(X_i, Y_i)}{\\sigma(X_i)\\sigma(X_j)}

33 If variables are centered, :math:`\\mathbb{E}X_i=\\mathbb{E}X_j=0`,

34 it becomes:

36 .. math::

38 cor(X_i, X_j) = \\frac{\\mathbb{E}(X_i X_j)}{\\sqrt{\\mathbb{E}X_i^2 \\mathbb{E}X_j^2}}

40 If rescaled, :math:`\\mathbb{E}X_i^2=\\mathbb{E}X_j^2=1`,

41 then it becomes :math:`cor(X_i, X_j) = \\mathbb{E}(X_i X_j)`.

42 Let's assume we try to find a coefficient such as

43 :math:`\\alpha_{ij}` minimizes the standard deviation

44 of noise :math:`\\epsilon_{ij}`:

46 .. math::

48 X_j = \\alpha_{ij}X_i + \\epsilon_{ij}

50 It is like if coefficient :math:`\\alpha_{ij}` comes from a

51 a linear regression which minimizes

52 :math:`\\mathbb{E}(X_j - \\alpha_{ij}X_i)^2`.

53 If variable :math:`X_i`, :math:`X_j` are centered

54 and rescaled: :math:`\\alpha_{ij}^* = \\mathbb{E}(X_i X_j) = cor(X_i, X_j)`.

55 We extend that definition to function :math:`f` of parameter :math:`\\omega`

56 defined as: :math:`f(\\omega, X) \\rightarrow \\mathbb{R}`.

57 :math:`f` is not linear anymore.

58 Let's assume parameter :math:`\\omega^*` minimizes

59 quantity :math:`\\min_\\omega (X_j - f(\\omega, X_i))^2`.

60 Then :math:`X_j = \\alpha_{ij} \\frac{f(\\omega^*, X_i)}{\\alpha_{ij}} + \\epsilon_{ij}`

61 and we choose :math:`\\alpha_{ij}` such as

62 :math:`\\mathbb{E}\\left(\\frac{f(\\omega^*, X_i)^2}{\\alpha_{ij}^2}\\right) = 1`.

63 Let's define a non linear correlation bounded by :math:`f` as:

65 .. math::

67 cor^f(X_i, X_j) = \\sqrt{ \\mathbb{E} (f(\\omega, X_i)^2 )}

69 We can verify that this value is in interval`:math:`[0,1]``.

70 That also means that there is no negative correlation.

71 :math:`f` is a machine learned model and most of them

72 usually overfit the data. The database is split into

73 two parts, one is used to train the model, the other

74 one to compute the correlation. The same split are used

75 for every coefficient. The returned matrix is not

76 necessarily symmetric.

78 .. exref::

79 :title: Compute non linear correlations

81 The following example compute non linear correlations

82 on :epkg:`Iris` datasets based on a

83 :epkg:`RandomForestRegressor` model.

85 .. runpython::

86 :showcode:

87 :warningout: FutureWarning

89 import pandas

90 from sklearn import datasets

91 from sklearn.ensemble import RandomForestRegressor

92 from mlinsights.metrics import non_linear_correlations

94 iris = datasets.load_iris()

95 X = iris.data[:, :4]

96 df = pandas.DataFrame(X)

97 df.columns = ["X1", "X2", "X3", "X4"]

98 cor = non_linear_correlations(df, RandomForestRegressor())

99 print(cor)

100

101 """

102

103 if hasattr(df, 'iloc'):

104 cor = df.corr()

105 cor.iloc[:, :] = 0.

106 iloc = True

107 if minmax:

108 mini = cor.copy()

109 maxi = cor.copy()

110 else:

111 cor = numpy.corrcoef(df, rowvar=False)

112 cor[:, :] = 0.

113 iloc = False

114 if minmax:

115 mini = cor.copy()

116 maxi = cor.copy()

117 df = scale(df)

118

119 for k in range(0, draws):

120 df_train, df_test = train_test_split(df, test_size=0.5)

121 for i in range(cor.shape[0]):

122 xi_train = df_train[:, i:i + 1]

123 xi_test = df_test[:, i:i + 1]

124 for j in range(cor.shape[1]):

125 xj_train = df_train[:, j:j + 1]

126 xj_test = df_test[:, j:j + 1]

127 if len(xj_test) == 0 or len(xi_test) == 0:

128 raise ValueError( # pragma: no cover

129 f"One column is empty i={i} j={j}.")

130 mod = clone(model)

131 try:

132 mod.fit(xi_train, xj_train.ravel())

133 except Exception as e: # pragma: no cover

134 raise ValueError(

135 f"Unable to compute correlation for i={i} j={j}.") from e

136 v = mod.predict(xi_test)

137 c = (1 - numpy.var(v - xj_test.ravel()))

138 co = max(c, 0) ** 0.5

139 if iloc:

140 cor.iloc[i, j] += co

141 if minmax:

142 if k == 0:

143 mini.iloc[i, j] = co

144 maxi.iloc[i, j] = co

145 else:

146 mini.iloc[i, j] = min(mini.iloc[i, j], co)

147 maxi.iloc[i, j] = max(maxi.iloc[i, j], co)

148 else:

149 cor[i, j] += co

150 if minmax:

151 if k == 0:

152 mini[i, j] = co

153 maxi[i, j] = co

154 else:

155 mini[i, j] = min(mini[i, j], co)

156 maxi[i, j] = max(maxi[i, j], co)

157 if minmax:

158 return cor / draws, mini, maxi

159 return cor / draws