Browse Source

Trying to make my own corr

Petra Lamborn 5 years ago
parent
commit
6231929972
1 changed file with 99 additions and 82 deletions
  1. 99
    82
      py/clustering.py

+ 99
- 82
py/clustering.py View File

@@ -7,101 +7,118 @@ import matplotlib.pyplot as plt
7 7
 import seaborn as sns
8 8
 from scipy.spatial.distance import squareform
9 9
 from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
10
+from tqdm import tqdm
10 11
 
11
-Sourcedata =   '../data/2017-sample.pkl'
12
def tqcorr(df):
    """Return the pairwise correlation matrix of ``df``'s columns.

    Stand-in for ``df.corr()`` that reports progress through ``tqdm`` while
    the column pairs are processed.

    Args:
        df: DataFrame whose columns are the series to correlate.

    Returns:
        A float DataFrame indexed and labelled by ``df.columns``. The
        diagonal is set to 1; each off-diagonal cell holds
        ``Series.corr`` of the two columns (Pearson by default, missing
        values excluded pairwise), mirrored so the matrix is symmetric.
        An input with no columns yields an empty frame.
    """
    cols = df.columns
    ncols = len(cols)
    # Float storage: cells start as NaN and the result stays numeric, so
    # downstream arithmetic such as ``1 - cmat`` works without a cast.
    cdf = p.DataFrame(index=cols, columns=cols, dtype=float)
    for c in tqdm(range(ncols)):
        # Positional access throughout, so repeated column labels cannot
        # turn a single-cell write into a block write.
        cdf.iloc[c, c] = 1.0
        left = df.iloc[:, c]
        # Only the upper triangle is computed; correlation is symmetric,
        # so each value is written to both (c, i) and (i, c).
        for i in range(c + 1, ncols):
            r = left.corr(df.iloc[:, i])
            cdf.iloc[c, i] = r
            cdf.iloc[i, c] = r
    return cdf
22
+
23
+
24
+tqdm.pandas()
25
+
26
+Sourcedata =   '../data/2017-all-wide.pkl'
12 27
 lableddata =   '../data/9-clusters.pkl'
13 28
 aggdata =      '../data/9-clusters.agg.pkl'
14 29
 clustertable = '../data/9-clusters-sample-table.pkl'
15 30
 
16 31
 numclusts = 9
17 32
 df = p.read_pickle(Sourcedata)
18
-dforig = df
33
+# dforig = df
19 34
 
20 35
 # print(df)
21 36
 
22 37
 print(df.info())
23
-print(df.icp_id.nunique())
24
-print(df.read_time.nunique())
38
+tqcorr(df)
39
+# print(df.icp_id.nunique())
40
+# print(df.read_time.nunique())
25 41
 # print(df.groupby('icp_id').read_time.nunique().nunique())
26
-df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
27
-print(df.info())
42
+# df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
43
+# print(df.info())
28 44
 df = df[df.columns[df.max() != df.min()]]
29 45
 print(df.info())
30
-cmat = df.corr()
46
+cmat = tqcorr(df)
47
+print(cmat)
31 48
 print(cmat.info())
32 49
 
33
-lmat = squareform(1 - cmat)
34
-
35
-lobj = linkage(lmat, method = 'ward')
36
-print(lobj)
37
-print(cophenet(lobj, lmat))
38
-
39
-
40
-
41
-clabs = [x + 1 for x in range(numclusts)]
42
-cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))
43
-
44
-clusts = fcluster(lobj, numclusts, criterion='maxclust')
45
-print(clusts)
46
-print(cmat.index.values)
47
-clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
48
-print(clustdf)
49
-clustdf.to_pickle(clustertable)
50
-mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
51
-print(mdf)
52
-print(mdf.info())
53
-qlow  = lambda x: x.quantile(0.250)
54
-qhigh = lambda x: x.quantile(0.750)
55
-print(mdf.cluster.describe())
56
-mdagg = mdf.groupby(['read_time', 'cluster']).agg({
57
-        'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
58
-}, q = 0.025)
59
-mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
60
-mdagg = mdagg.reset_index()
61
-print(mdagg)
62
-print(mdagg.info())
63
-print(mdagg.describe())
64
-# mdf.to_csv('~/windows/Documents/clusters-ward.csv')
65
-print("Saving")
66
-mdf.to_pickle(lableddata)
67
-mdagg.to_pickle(aggdata)
68
-print("saved")
69
-
70
-# Algorithm via 
71
-# <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
72
-ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
73
-link_cols = {}
74
-for i, i12 in enumerate(lobj[:,:2].astype(int)):
75
-  c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
76
-    for x in i12)
77
-  link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
78
-
79
-plt.figure(figsize = (25, 10))
80
-plt.title('ICP Clustering Dendrogram')
81
-plt.xlabel('ICP ID/(Number of ICPs)')
82
-plt.ylabel('distance')
83
-dendrogram(
84
-    lobj,
85
-    labels = cmat.index.values,
86
-    leaf_rotation=90,
87
-    leaf_font_size=8,
88
-    # show_leaf_counts = True,
89
-    # truncate_mode = 'lastp',
90
-    # p = 50,
91
-    # show_contracted = True,
92
-    link_color_func = lambda x: link_cols[x],
93
-    color_threshold = None
94
-)
95
-# plt.show()
96
-plt.savefig("../img/sample-9-dendro.png")
97
-
98
-sns.set()
99
-
100
-f, axes = plt.subplots(3,3)
101
-
102
-for i, c in enumerate(clabs):
103
-    fds = mdagg[mdagg.cluster == c]
104
-    sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
105
-    axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
106
-# plt.show()
107
-plt.savefig("../img/sample-9-panedtrends.png")
50
+# lmat = squareform(1 - cmat)
51
+
52
+# lobj = linkage(lmat, method = 'ward')
53
+# print(lobj)
54
+# print(cophenet(lobj, lmat))
55
+
56
+
57
+
58
+# clabs = [x + 1 for x in range(numclusts)]
59
+# cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))
60
+
61
+# clusts = fcluster(lobj, numclusts, criterion='maxclust')
62
+# print(clusts)
63
+# print(cmat.index.values)
64
+# clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
65
+# print(clustdf)
66
+# clustdf.to_pickle(clustertable)
67
+# mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
68
+# print(mdf)
69
+# print(mdf.info())
70
+# qlow  = lambda x: x.quantile(0.250)
71
+# qhigh = lambda x: x.quantile(0.750)
72
+# print(mdf.cluster.describe())
73
+# mdagg = mdf.groupby(['read_time', 'cluster']).agg({
74
+#         'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
75
+# }, q = 0.025)
76
+# mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
77
+# mdagg = mdagg.reset_index()
78
+# print(mdagg)
79
+# print(mdagg.info())
80
+# print(mdagg.describe())
81
+# # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
82
+# print("Saving")
83
+# mdf.to_pickle(lableddata)
84
+# mdagg.to_pickle(aggdata)
85
+# print("saved")
86
+
87
+# # Algorithm via 
88
+# # <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
89
+# ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
90
+# link_cols = {}
91
+# for i, i12 in enumerate(lobj[:,:2].astype(int)):
92
+#   c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
93
+#     for x in i12)
94
+#   link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
95
+
96
+# plt.figure(figsize = (25, 10))
97
+# plt.title('ICP Clustering Dendrogram')
98
+# plt.xlabel('ICP ID/(Number of ICPs)')
99
+# plt.ylabel('distance')
100
+# dendrogram(
101
+#     lobj,
102
+#     labels = cmat.index.values,
103
+#     leaf_rotation=90,
104
+#     leaf_font_size=8,
105
+#     # show_leaf_counts = True,
106
+#     # truncate_mode = 'lastp',
107
+#     # p = 50,
108
+#     # show_contracted = True,
109
+#     link_color_func = lambda x: link_cols[x],
110
+#     color_threshold = None
111
+# )
112
+# # plt.show()
113
+# plt.savefig("../img/sample-9-dendro.png")
114
+
115
+# sns.set()
116
+
117
+# f, axes = plt.subplots(3,3)
118
+
119
+# for i, c in enumerate(clabs):
120
+#     fds = mdagg[mdagg.cluster == c]
121
+#     sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
122
+#     axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
123
+# # plt.show()
124
+# plt.savefig("../img/sample-9-panedtrends.png")