Coverage for mlinsights/timeseries/patterns.py: 100%

49 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-08-09 08:45 +0200

1""" 

2@file 

3@brief Find patterns in timeseries. 

4""" 

5import numpy 

6import pandas 

7from sklearn.cluster import KMeans 

8from .agg import aggregate_timeseries 

9 

10 

def find_ts_group_pattern(ttime, values, names, name_subset=None,
                          per='week', unit='half-hour', agg='sum',
                          estimator=None, fLOG=None):
    """
    Clusters times series to find similar patterns.

    @param      ttime       time column (a :epkg:`numpy` array)
    @param      values      features to use to cluster (a :epkg:`numpy` array)
    @param      names       column which holds group name
                            (a :epkg:`numpy` array)
    @param      name_subset subset of groups to study, None for all
    @param      per         aggregation period,
                            forwarded to *aggregate_timeseries*
    @param      unit        forwarded to *aggregate_timeseries*
    @param      agg         forwarded to *aggregate_timeseries*
    @param      estimator   estimator used to find pattern, it must implement
                            *fit*, *predict* and *transform*, if None,
                            :epkg:`sklearn:cluster:KMeans` is used with
                            its default settings
    @param      fLOG        logging function
    @return                 found clusters, distances

    Both returned arrays have one row per row of *ttime*.
    Rows whose group is excluded by *name_subset* receive cluster ``-1``
    and distances filled with ``nan``.

    The function raises *TypeError* if *ttime*, *values* or *names*
    is not a :epkg:`numpy` array, *ValueError* if no group is left
    to cluster.
    """
    for var, na in zip([ttime, values, names], ['ttime', 'values', 'names']):
        if not isinstance(var, numpy.ndarray):
            raise TypeError(f"'{na}' must be an array not {type(var)}")

    # builds features
    # Groups are taken in order of first appearance: iterating over a set
    # would make the order of the feature rows (and therefore the result of
    # a seeded estimator) change from one process to another.
    gr_names = list(pandas.unique(names))
    if name_subset is not None:
        kept = set(name_subset)
        gr_names = [name for name in gr_names if name in kept]
    if fLOG:
        fLOG(  # pragma: no cover
            f'[find_ts_group_pattern] build features, {len(gr_names)} groups')
    if not gr_names:
        # Without this check, the failure shows up later as an obscure
        # "No objects to concatenate" error raised by pandas.
        raise ValueError(
            "No group to cluster: 'names' is empty or 'name_subset' "
            "does not match any value of 'names'.")

    n_rows = ttime.shape[0]
    # group_of_row[i] is the row of the feature matrix built for the group
    # of observation i, -1 if that group is not selected.
    group_of_row = numpy.full(n_rows, -1, dtype=numpy.intp)
    to_merge = []
    for index, name in enumerate(gr_names):
        indices = names == name
        group_of_row[indices] = index
        gr = aggregate_timeseries(None, ttime[indices], values[indices],
                                  unit=unit, agg=agg, per=per)
        gr.set_index(gr.columns[0], inplace=True)
        to_merge.append(gr)

    if fLOG:
        fLOG(  # pragma: no cover
            '[find_ts_group_pattern] merge features')
    all_merged = pandas.concat(to_merge, axis=1)
    all_merged.fillna(0, inplace=True)
    # Every group contributes the same number of columns, the features of
    # one group are its block of columns flattened into a single row.
    ncol = all_merged.shape[1] // len(gr_names)
    gr_feats = numpy.vstack([
        all_merged.iloc[:, i * ncol: (i + 1) * ncol].values.ravel()
        for i in range(len(gr_names))])

    # cluster
    if fLOG:
        fLOG(  # pragma: no cover
            f'[find_ts_group_pattern] clustering, shape={gr_feats.shape}')
    if estimator is None:
        estimator = KMeans()
    estimator.fit(gr_feats)

    # predicted clusters
    pred = estimator.predict(gr_feats)
    dist = estimator.transform(gr_feats)
    if fLOG:
        fLOG(  # pragma: no cover
            f'[find_ts_group_pattern] number of clusters: {len(set(pred))}')

    # Broadcasts the result obtained for each group to all its rows.
    known = group_of_row >= 0
    clusters = numpy.empty(n_rows, dtype=pred.dtype)
    dists = numpy.empty((n_rows, dist.shape[1]), dtype=dist.dtype)
    clusters[known] = pred[group_of_row[known]]
    dists[known, :] = dist[group_of_row[known], :]
    if not known.all():
        unknown = ~known
        clusters[unknown] = -1
        dists[unknown, :] = numpy.nan

    return clusters, dists