Coverage for aftercovid/preprocess/ts.py: 98%

101 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2024-03-28 03:09 +0100

1""" 

2Preprocesses timeseries about COVID. 

3""" 

4import numpy 

5 

6 

def ts_remove_decreasing_values(series):
    """
    Builds a series meant to be free of decreasing values.
    Cumulated data are sometimes corrected afterwards, which
    shows up as a drop in the series. Every drop is absorbed by
    shrinking the increments observed before it.

    :param series: series of dtype ``int32`` or ``int64``,
        a numpy array or an object exposing ``values``
    :return: new series, the corrected copy of the values
        (the input is left untouched)
    :raises NotImplementedError: for any other dtype
    """
    def _absorb_drop(data, where):
        # Works on increments, the series is rebuilt at the end
        # from its first value and the corrected increments.
        start = data[0]
        steps = data[1:] - data[:-1]
        excess = data[where - 1] - data[where]
        # Increment given to position *where* once the drop is removed.
        unit = int(float(excess) / where) + 1
        excess += unit
        previous = excess + 1
        # Keeps sweeping backwards as long as a sweep reduces
        # what remains to be absorbed.
        while 0 < excess < previous:
            previous = excess
            # The first increment (position 0) is never modified.
            for k in range(where - 2, 0, -1):
                if excess <= 0:
                    break
                if steps[k] > unit:
                    cut = min(unit, excess)
                    steps[k] -= cut
                    excess -= cut
                elif steps[k] > 1:
                    steps[k] -= 1
                    excess -= 1
        steps[where - 1] = unit
        data[1:] = start + steps.cumsum()

    if series.dtype not in (numpy.int64, numpy.int32):
        raise NotImplementedError(
            "Not implemented for real types.")

    source = series.values if hasattr(series, 'values') else series
    values = source.copy()
    # Drops are located once, on the raw values, then fixed
    # starting from the most recent one.
    drops = [i for i in range(1, len(values))
             if values[i] < values[i - 1]]
    for position in drops[::-1]:
        _absorb_drop(values, position)
    return values

55 

56 

def ts_moving_average(series, n=7, center=True):
    """
    Computes the moving average of a differential series.
    The function handles nan as well: a nan adds nothing to the
    sum and is not counted among the averaged observations.
    The output does not contain any nan unless a window only
    holds nan.

    :param series: timeseries, a numpy array or an object exposing
        ``values`` and ``index`` (``columns`` or ``name`` is used
        to rebuild an object of the same class)
    :param n: window
    :param center: centered average
    :return: moving average (of same size); an object of the
        input class when the input exposes ``values`` together
        with ``columns`` or a ``name`` which is not None,
        a numpy array otherwise
    :raises ValueError: if *center* is True and *n* is even
    """
    as_df = hasattr(series, 'values')
    if as_df:
        cls = series.__class__
        columns = getattr(series, 'columns', None)
        name = getattr(series, 'name', None)
        index = series.index
        series = series.values

    if center and n % 2 != 1:
        raise ValueError("If center is True, n should be odd.")

    dtype = numpy.float64 if series.dtype != numpy.float32 else numpy.float32

    # astype returns a copy, the caller's data is not modified below.
    series = series.astype(dtype)
    weights = numpy.ones(series.shape, dtype=dtype)
    isna = numpy.isnan(series)
    weights[isna] = 0
    series[isna] = 0

    # Cumulated sums: a window sum is the difference of two of them,
    # *wet* gives the number of non-nan observations of the window.
    ret = numpy.cumsum(series, axis=0)
    wet = numpy.cumsum(weights, axis=0)
    res = numpy.zeros(ret.shape, dtype)
    if center:
        d = n // 2
        res[d + 1:-d] = (ret[n:] - ret[:-n]) / (wet[n:] - wet[:-n])
        # Boundaries rely on truncated windows: the first (i + d)
        # observations on the left, the last (i + d) on the right.
        # NOTE(review): the last iteration also overwrites res[-d - 1],
        # already set by the slice above; kept to preserve the outputs.
        for i in range(0, d + 1):
            res[i] = numpy.divide(ret[i + d - 1], wet[i + d - 1])
            res[-i - 1] = numpy.divide(ret[-1] - ret[-(i + d) - 1],
                                       wet[-1] - wet[-(i + d) - 1])
    else:
        res[n:] = (ret[n:] - ret[:-n]) / (wet[n:] - wet[:-n])
        # The first n values average everything observed so far.
        for i in range(0, n):
            res[i] = numpy.divide(ret[i], wet[i])

    if as_df:
        # The container wraps the moving average *res*, not the
        # zero-filled copy of the input.
        if columns is not None:
            return cls(res, columns=columns, index=index)
        if name is not None:
            return cls(res, name=name, index=index)
    return res

112 

113 

def ts_normalise_negative_values(series, n=7, extreme=4):
    """
    *series* is a differential series which should not
    have any negative values. The function removes
    unexpectedly high values and negative values. These extremes
    are replaced by a local average. The corrected series is then
    rescaled by the ratio between the total of the original
    series (nan counted as zero) and the total of the corrected one.
    The function handles nan as well. The output
    does not contain any nan unless there are too many
    consecutive nans.

    :param series: differential values, a numpy array or an object
        exposing ``values`` and ``index``
    :param n: window of the centered moving average
    :param extreme: a value is replaced by the moving average
        when it is lower than ``average / extreme`` or higher
        than ``average * extreme``
    :return: corrected series; an object of the input class when
        the input exposes ``values`` together with ``columns`` or
        a ``name`` which is not None, a numpy array otherwise
    """
    as_df = hasattr(series, 'values')
    if as_df:
        cls = series.__class__
        columns = getattr(series, 'columns', None)
        name = getattr(series, 'name', None)
        index = series.index
        series = series.values

    mov = ts_moving_average(series, n=n, center=True)
    # astype returns a copy, the caller's data is not modified below.
    series = series.astype(mov.dtype)
    missing = numpy.isnan(series)
    series[missing] = 0
    total = numpy.sum(series, axis=0)

    # Missing, negative or extreme observations are replaced
    # by the local average.
    rep = (missing | (series < 0) |
           (mov / extreme > series) | (mov * extreme < series))
    series[rep] = mov[rep]

    # The moving average may itself be nan, these values
    # are ignored by the corrected total.
    nonan = series.copy()
    nonan[numpy.isnan(nonan)] = 0
    series = numpy.maximum(series, 0)
    new_total = numpy.sum(nonan, axis=0)

    # A null corrected total cannot be rescaled: the values are
    # left as they are instead of being turned into nan or inf
    # by a division by zero.
    with numpy.errstate(divide='ignore', invalid='ignore'):
        ratio = total / new_total
    series *= numpy.where(new_total == 0, 1, ratio)

    if as_df:
        if columns is not None:
            return cls(series, columns=columns, index=index)
        if name is not None:
            return cls(series, name=name, index=index)
    return series