|
@@ -7,101 +7,118 @@ import matplotlib.pyplot as plt
|
7
|
7
|
import seaborn as sns
|
8
|
8
|
from scipy.spatial.distance import squareform
|
9
|
9
|
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
|
|
10
|
+from tqdm import tqdm
|
10
|
11
|
|
11
|
|
-Sourcedata = '../data/2017-sample.pkl'
|
|
12
|
def tqcorr(df):
    """Return the pairwise correlation matrix of ``df``'s columns.

    Intended as a drop-in for ``df.corr()`` that reports progress through
    ``tqdm`` while it works.  The previous version only wrote the diagonal
    and left every off-diagonal cell as NaN because the pairwise loop was
    commented out; this version fills the whole matrix.

    Parameters
    ----------
    df : DataFrame
        One series per column.  Columns are presumed numeric --
        ``Series.corr`` is applied to every pair; verify against the caller.

    Returns
    -------
    DataFrame
        Square float frame indexed and labelled by ``df.columns``.  The
        diagonal is set to 1 unconditionally (as the original did); each
        off-diagonal cell holds ``Series.corr`` of the two columns using
        that method's defaults.

    Notes
    -----
    ``p`` and ``tqdm`` are module-level names; ``p`` is assumed to be the
    file's pandas alias (its import is outside the visible chunk).
    """
    cols = df.columns
    ncols = len(cols)
    # Float dtype so downstream arithmetic such as ``1 - cmat`` does not
    # operate on an object-dtype frame.
    cdf = p.DataFrame(index=cols, columns=cols, dtype=float)
    for c in tqdm(range(ncols)):
        # Positional access throughout: label-based ``.loc`` would address a
        # whole block of cells if a column label were duplicated.
        cdf.iat[c, c] = 1
        left = df.iloc[:, c]
        # Correlation is symmetric, so compute the upper triangle only and
        # mirror each value across the diagonal.
        for i in range(c + 1, ncols):
            r = left.corr(df.iloc[:, i])
            cdf.iat[c, i] = r
            cdf.iat[i, c] = r
    return cdf
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+tqdm.pandas()
|
|
25
|
+
|
|
26
|
+Sourcedata = '../data/2017-all-wide.pkl'
|
12
|
27
|
lableddata = '../data/9-clusters.pkl'
|
13
|
28
|
aggdata = '../data/9-clusters.agg.pkl'
|
14
|
29
|
clustertable = '../data/9-clusters-sample-table.pkl'
|
15
|
30
|
|
16
|
31
|
numclusts = 9
|
17
|
32
|
df = p.read_pickle(Sourcedata)
|
18
|
|
-dforig = df
|
|
33
|
+# dforig = df
|
19
|
34
|
|
20
|
35
|
# print(df)
|
21
|
36
|
|
22
|
37
|
print(df.info())
|
23
|
|
-print(df.icp_id.nunique())
|
24
|
|
-print(df.read_time.nunique())
|
|
38
|
+tqcorr(df)
|
|
39
|
+# print(df.icp_id.nunique())
|
|
40
|
+# print(df.read_time.nunique())
|
25
|
41
|
# print(df.groupby('icp_id').read_time.nunique().nunique())
|
26
|
|
-df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
|
27
|
|
-print(df.info())
|
|
42
|
+# df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
|
|
43
|
+# print(df.info())
|
28
|
44
|
df = df[df.columns[df.max() != df.min()]]
|
29
|
45
|
print(df.info())
|
30
|
|
-cmat = df.corr()
|
|
46
|
+cmat = tqcorr(df)
|
|
47
|
+print(cmat)
|
31
|
48
|
print(cmat.info())
|
32
|
49
|
|
33
|
|
-lmat = squareform(1 - cmat)
|
34
|
|
-
|
35
|
|
-lobj = linkage(lmat, method = 'ward')
|
36
|
|
-print(lobj)
|
37
|
|
-print(cophenet(lobj, lmat))
|
38
|
|
-
|
39
|
|
-
|
40
|
|
-
|
41
|
|
-clabs = [x + 1 for x in range(numclusts)]
|
42
|
|
-cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))
|
43
|
|
-
|
44
|
|
-clusts = fcluster(lobj, numclusts, criterion='maxclust')
|
45
|
|
-print(clusts)
|
46
|
|
-print(cmat.index.values)
|
47
|
|
-clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
|
48
|
|
-print(clustdf)
|
49
|
|
-clustdf.to_pickle(clustertable)
|
50
|
|
-mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
|
51
|
|
-print(mdf)
|
52
|
|
-print(mdf.info())
|
53
|
|
-qlow = lambda x: x.quantile(0.250)
|
54
|
|
-qhigh = lambda x: x.quantile(0.750)
|
55
|
|
-print(mdf.cluster.describe())
|
56
|
|
-mdagg = mdf.groupby(['read_time', 'cluster']).agg({
|
57
|
|
- 'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
|
58
|
|
-}, q = 0.025)
|
59
|
|
-mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
|
60
|
|
-mdagg = mdagg.reset_index()
|
61
|
|
-print(mdagg)
|
62
|
|
-print(mdagg.info())
|
63
|
|
-print(mdagg.describe())
|
64
|
|
-# mdf.to_csv('~/windows/Documents/clusters-ward.csv')
|
65
|
|
-print("Saving")
|
66
|
|
-mdf.to_pickle(lableddata)
|
67
|
|
-mdagg.to_pickle(aggdata)
|
68
|
|
-print("saved")
|
69
|
|
-
|
70
|
|
-# Algorithm via
|
71
|
|
-# <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
|
72
|
|
-ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
|
73
|
|
-link_cols = {}
|
74
|
|
-for i, i12 in enumerate(lobj[:,:2].astype(int)):
|
75
|
|
- c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
|
76
|
|
- for x in i12)
|
77
|
|
- link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
|
78
|
|
-
|
79
|
|
-plt.figure(figsize = (25, 10))
|
80
|
|
-plt.title('ICP Clustering Dendrogram')
|
81
|
|
-plt.xlabel('ICP ID/(Number of ICPs)')
|
82
|
|
-plt.ylabel('distance')
|
83
|
|
-dendrogram(
|
84
|
|
- lobj,
|
85
|
|
- labels = cmat.index.values,
|
86
|
|
- leaf_rotation=90,
|
87
|
|
- leaf_font_size=8,
|
88
|
|
- # show_leaf_counts = True,
|
89
|
|
- # truncate_mode = 'lastp',
|
90
|
|
- # p = 50,
|
91
|
|
- # show_contracted = True,
|
92
|
|
- link_color_func = lambda x: link_cols[x],
|
93
|
|
- color_threshold = None
|
94
|
|
-)
|
95
|
|
-# plt.show()
|
96
|
|
-plt.savefig("../img/sample-9-dendro.png")
|
97
|
|
-
|
98
|
|
-sns.set()
|
99
|
|
-
|
100
|
|
-f, axes = plt.subplots(3,3)
|
101
|
|
-
|
102
|
|
-for i, c in enumerate(clabs):
|
103
|
|
- fds = mdagg[mdagg.cluster == c]
|
104
|
|
- sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
|
105
|
|
- axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
|
106
|
|
-# plt.show()
|
107
|
|
-plt.savefig("../img/sample-9-panedtrends.png")
|
|
50
|
+# lmat = squareform(1 - cmat)
|
|
51
|
+
|
|
52
|
+# lobj = linkage(lmat, method = 'ward')
|
|
53
|
+# print(lobj)
|
|
54
|
+# print(cophenet(lobj, lmat))
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+# clabs = [x + 1 for x in range(numclusts)]
|
|
59
|
+# cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))
|
|
60
|
+
|
|
61
|
+# clusts = fcluster(lobj, numclusts, criterion='maxclust')
|
|
62
|
+# print(clusts)
|
|
63
|
+# print(cmat.index.values)
|
|
64
|
+# clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
|
|
65
|
+# print(clustdf)
|
|
66
|
+# clustdf.to_pickle(clustertable)
|
|
67
|
+# mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
|
|
68
|
+# print(mdf)
|
|
69
|
+# print(mdf.info())
|
|
70
|
+# qlow = lambda x: x.quantile(0.250)
|
|
71
|
+# qhigh = lambda x: x.quantile(0.750)
|
|
72
|
+# print(mdf.cluster.describe())
|
|
73
|
+# mdagg = mdf.groupby(['read_time', 'cluster']).agg({
|
|
74
|
+# 'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
|
|
75
|
+# }, q = 0.025)
|
|
76
|
+# mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
|
|
77
|
+# mdagg = mdagg.reset_index()
|
|
78
|
+# print(mdagg)
|
|
79
|
+# print(mdagg.info())
|
|
80
|
+# print(mdagg.describe())
|
|
81
|
+# # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
|
|
82
|
+# print("Saving")
|
|
83
|
+# mdf.to_pickle(lableddata)
|
|
84
|
+# mdagg.to_pickle(aggdata)
|
|
85
|
+# print("saved")
|
|
86
|
+
|
|
87
|
+# # Algorithm via
|
|
88
|
+# # <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
|
|
89
|
+# ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
|
|
90
|
+# link_cols = {}
|
|
91
|
+# for i, i12 in enumerate(lobj[:,:2].astype(int)):
|
|
92
|
+# c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
|
|
93
|
+# for x in i12)
|
|
94
|
+# link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
|
|
95
|
+
|
|
96
|
+# plt.figure(figsize = (25, 10))
|
|
97
|
+# plt.title('ICP Clustering Dendrogram')
|
|
98
|
+# plt.xlabel('ICP ID/(Number of ICPs)')
|
|
99
|
+# plt.ylabel('distance')
|
|
100
|
+# dendrogram(
|
|
101
|
+# lobj,
|
|
102
|
+# labels = cmat.index.values,
|
|
103
|
+# leaf_rotation=90,
|
|
104
|
+# leaf_font_size=8,
|
|
105
|
+# # show_leaf_counts = True,
|
|
106
|
+# # truncate_mode = 'lastp',
|
|
107
|
+# # p = 50,
|
|
108
|
+# # show_contracted = True,
|
|
109
|
+# link_color_func = lambda x: link_cols[x],
|
|
110
|
+# color_threshold = None
|
|
111
|
+# )
|
|
112
|
+# # plt.show()
|
|
113
|
+# plt.savefig("../img/sample-9-dendro.png")
|
|
114
|
+
|
|
115
|
+# sns.set()
|
|
116
|
+
|
|
117
|
+# f, axes = plt.subplots(3,3)
|
|
118
|
+
|
|
119
|
+# for i, c in enumerate(clabs):
|
|
120
|
+# fds = mdagg[mdagg.cluster == c]
|
|
121
|
+# sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
|
|
122
|
+# axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
|
|
123
|
+# # plt.show()
|
|
124
|
+# plt.savefig("../img/sample-9-panedtrends.png")
|