|
@@ -1,3 +1,5 @@
|
|
1
|
+from pprint import pprint
|
|
2
|
+from argparse import ArgumentParser
|
1
|
3
|
from util import getQuery, pickleQuery
|
2
|
4
|
import numpy as np
|
3
|
5
|
import pandas as p
|
|
@@ -11,57 +13,83 @@ from tqdm import tqdm
|
11
|
13
|
from itertools import combinations
|
12
|
14
|
from math import factorial as f
|
13
|
15
|
|
14
|
|
def tqcorr(df):
    """Build a symmetric correlation-distance matrix over the columns of ``df``.

    Each off-diagonal cell holds ``1 - df[c1].corr(df[c2])`` (pandas'
    ``Series.corr`` with its default settings) and each diagonal cell holds 0.
    Progress bars are shown through ``tqdm`` and the frame's ``info()``
    summary is printed after each stage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared pairwise.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by ``df.columns``, stored as
        ``float16``.
    """
    cols = df.columns
    ncols = len(cols)
    cdf = p.DataFrame(index=cols, columns=cols, dtype=np.float16)
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly instead of being wrapped in print().
    cdf.info()

    # Every column is at distance zero from itself.
    for c in tqdm(cols):
        cdf.loc[c, c] = 0
    cdf.info()

    # Number of unordered column pairs ("ncols choose 2").  Computed in closed
    # form: the factorial formula raises for fewer than two columns and
    # builds astronomically large integers for thousands of columns.
    ncomb = ncols * (ncols - 1) // 2
    for c1, c2 in tqdm(combinations(cols, 2), total=ncomb):
        dv = 1 - df[c1].corr(df[c2])
        # Fill both triangles so the matrix stays symmetric.
        cdf.loc[c1, c2] = dv
        cdf.loc[c2, c1] = dv
    cdf.info()
    return cdf
|
30
|
|
-
|
31
|
|
-
|
32
|
|
-tqdm.pandas()
|
33
|
16
|
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+numclusts = 9
|
34
|
20
|
Sourcedata = '../data/2017-5k-wide.pkl'
|
|
21
|
+Sourcecorr = '../data/5kdcorrmatrix.pkl'
|
35
|
22
|
lableddata = '../data/9-clusters-5k.pkl'
|
36
|
23
|
aggdata = '../data/9-clusters-5k-agg.pkl'
|
37
|
24
|
clustertable = '../data/9-clusters-5k-table.pkl'
|
38
|
25
|
|
39
|
|
-numclusts = 9
|
40
|
|
-df = p.read_pickle(Sourcedata)
|
41
|
|
-# dforig = df
|
42
|
|
-
|
43
|
|
-# print(df)
|
|
26
|
+# sourcep = p.read_pickle(Sourcecorr)
|
44
|
27
|
|
45
|
|
-print(df.info())
|
46
|
|
-# print(df.icp_id.nunique())
|
47
|
|
-# print(df.read_time.nunique())
|
48
|
|
-# print(df.groupby('icp_id').read_time.nunique().nunique())
|
49
|
|
-# df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
|
50
|
|
-# print(df.info())
|
51
|
|
-df = df[df.columns[df.max() != df.min()]]
|
52
|
|
-print(df.info())
|
53
|
|
-cmat = tqcorr(df)
|
54
|
|
-print(cmat)
|
55
|
|
-print(cmat.info())
|
56
|
|
-
|
57
|
|
-cmat.to_pickle('../data/fulldcorrmatrix.pkl')
|
58
|
|
-
|
59
|
|
-# lmat = squareform(1 - cmat)
|
|
28
|
+# lmat = squareform(sourcep)
|
60
|
29
|
|
61
|
30
|
# lobj = linkage(lmat, method = 'ward')
|
62
|
31
|
# print(lobj)
|
63
|
32
|
# print(cophenet(lobj, lmat))
|
64
|
33
|
|
|
34
|
def cluster(dcmat, method, nclusters):
    """Hierarchically cluster the items of a square distance dataframe.

    Parameters
    ----------
    dcmat : pandas.DataFrame
        Square, symmetric distance matrix with a zero diagonal.  Its index
        values become the ``icp_id`` column of the result.
    method : str
        Linkage method name handed to ``scipy.cluster.hierarchy.linkage``
        (for example ``'ward'``).
    nclusters : int
        Maximum number of flat clusters to cut the tree into.

    Returns
    -------
    tuple
        ``(lobj, clustdf)`` where ``lobj`` is the linkage matrix and
        ``clustdf`` is a dataframe with columns ``icp_id`` and ``cluster``.
    """
    # linkage expects the condensed (vector) form of the distance matrix.
    lmat = squareform(dcmat)
    lobj = linkage(lmat, method=method)
    # Cut with the caller's requested count rather than a module-level
    # constant, so the ``nclusters`` argument actually takes effect.
    clusts = fcluster(lobj, nclusters, criterion='maxclust')
    clustdf = p.DataFrame({'icp_id': dcmat.index.values, 'cluster': clusts})
    return lobj, clustdf
|
|
43
|
+
|
|
44
|
def dendro(lobj, clustdf, numclusts, icps, fname):
    """Plot a cluster-coloured dendrogram of ``lobj`` and save it to ``fname``.

    Parameters
    ----------
    lobj : linkage matrix to draw.
    clustdf : dataframe with ``icp_id`` and ``cluster`` columns, one row per
        leaf, looked up by leaf position.
    numclusts : number of cluster labels (1..numclusts) to assign colours to.
    icps : leaf labels passed to the dendrogram.
    fname : path the figure is written to.
    """
    cluster_labels = list(range(1, numclusts + 1))
    hex_colours = sns.color_palette("colorblind", numclusts).as_hex()
    cpal = dict(zip(cluster_labels, hex_colours))

    # Link colouring approach via
    # <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
    leaf_colour = {}
    for icp_id, clust in zip(clustdf.icp_id, clustdf.cluster):
        leaf_colour[icp_id] = cpal[clust]

    nmerges = len(lobj)
    link_cols = {}
    for row, children in enumerate(lobj[:, :2].astype(int)):
        child_colours = []
        for node in children:
            if node > nmerges:
                # Node is an earlier merge: reuse the colour chosen for it.
                child_colours.append(link_cols[node])
            else:
                # Node is an original leaf: use its cluster colour.
                child_colours.append(leaf_colour[clustdf.icp_id[node]])
        left, right = child_colours
        # A merge keeps a colour only when both children agree; otherwise
        # it is drawn black.
        link_cols[row + 1 + nmerges] = left if left == right else '#000000'

    def colour_of_link(link_id):
        return link_cols[link_id]

    plt.figure(figsize=(25, 10))
    plt.title('ICP Clustering Dendrogram')
    plt.xlabel('ICP ID/(Number of ICPs)')
    plt.ylabel('distance')
    dendrogram(
        lobj,
        labels=icps,
        leaf_rotation=90,
        leaf_font_size=8,
        link_color_func=colour_of_link,
        color_threshold=None,
    )
    plt.savefig(fname)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
def main(argv=None):
    """Command-line entry point: cluster a pickled distance matrix.

    Reads a square distance dataframe from ``--input``, clusters it with
    ``cluster``, pickles the resulting ``icp_id``/``cluster`` table to
    ``--output`` and, when ``-d`` is given, draws a dendrogram to ``--tree``.

    Parameters
    ----------
    argv : list of str, optional
        Argument list to parse.  ``None`` (the default) makes argparse read
        ``sys.argv[1:]``.
    """
    parser = ArgumentParser(
        description='Cluster from pre-existing distance correlation matrix in pickled dataframe'
    )
    parser.add_argument(
        "-i", "--input", dest="input", metavar="[PATH]",
        default="../data/5kdcorrmatrix.pkl",
        help="input pickle path; default: %(default)s",
    )
    # The output default must differ from the input default; otherwise a
    # default run replaces the distance matrix with the cluster table.
    parser.add_argument(
        "-o", "--output", dest="output", metavar="[PATH]",
        default="../data/9-clusters-5k.pkl",
        help="output pickle path; default: %(default)s",
    )
    parser.add_argument(
        "--method", dest="method", metavar="[METHOD]", default="ward",
        help="clustering method; default: %(default)s",
    )
    parser.add_argument(
        "--clusters", dest="numclusters", metavar="[NUM]", default=9, type=int,
        help="number of clusters; default: %(default)s",
    )
    parser.add_argument(
        "-d", "--dendrogram", dest="incdendro", action="store_true",
        help="draw dendrogram",
    )
    parser.add_argument(
        "-t", "--tree", dest="treepath", metavar="[PATH]",
        default="../img/5k-9-dendro.png",
        help="Filename for dendrogram (if -d), default: %(default)s",
    )
    args = parser.parse_args(argv)
    if args.numclusters < 1:
        parser.error("--clusters must be at least 1")

    print("Clustering")
    # NOTE: read_pickle unpickles arbitrary objects; only use trusted files.
    sourcep = p.read_pickle(args.input)
    lobj, clustdf = cluster(sourcep, args.method, args.numclusters)
    clustdf.to_pickle(args.output)

    if args.incdendro:
        # Announce the drawing step only when it actually runs.
        print("Drawing dendrogram")
        icps = sourcep.index.values
        dendro(lobj, clustdf, args.numclusters, icps, args.treepath)
|
|
91
|
+
|
|
92
|
+
|
65
|
93
|
|
66
|
94
|
|
67
|
95
|
# clabs = [x + 1 for x in range(numclusts)]
|
|
@@ -131,3 +159,6 @@ cmat.to_pickle('../data/fulldcorrmatrix.pkl')
|
131
|
159
|
# axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
|
132
|
160
|
# # plt.show()
|
133
|
161
|
# plt.savefig("../img/sample-9-panedtrends.png")
|
|
162
|
+
|
|
163
|
+if __name__ == "__main__":
|
|
164
|
+ main()
|