Browse Source

Clustering file

Petra Lamborn 5 years ago
parent
commit
bcbb6fe5b9
3 changed files with 71 additions and 41 deletions
  1. BIN
      img/5k-9-dendro.png
  2. 70
    39
      py/clustering.py
  3. 1
    2
      py/dcorr.py

BIN
img/5k-9-dendro.png View File


+ 70
- 39
py/clustering.py View File

@@ -1,3 +1,5 @@
1
+from pprint import pprint
2
+from argparse import ArgumentParser
1 3
 from util import getQuery, pickleQuery
2 4
 import numpy as np
3 5
 import pandas as p
@@ -11,57 +13,83 @@ from tqdm import tqdm
11 13
 from itertools import combinations
12 14
 from math import factorial as f
13 15
 
14
-def tqcorr(df):
15
-    cols = df.columns
16
-    ncols = len(cols)
17
-    cdf = p.DataFrame(index = cols, columns = cols, dtype = np.float16)
18
-    print(cdf.info())
19
-    for c in tqdm(cols):
20
-        cdf.loc[c, c] = 0
21
-    print(cdf.info())
22
-    comb = combinations(cols, 2)
23
-    ncomb = f(ncols) // f(2) // f(ncols - 2)
24
-    for c1, c2 in tqdm(comb, total = ncomb):
25
-        dv = 1 - df[c1].corr(df[c2])
26
-        cdf.loc[c1, c2] = dv
27
-        cdf.loc[c2, c1] = dv
28
-    print(cdf.info())
29
-    return cdf
30
-
31
-
32
-tqdm.pandas()
33 16
 
17
+
18
+
19
+numclusts = 9
34 20
 Sourcedata =   '../data/2017-5k-wide.pkl'
21
+Sourcecorr =   '../data/5kdcorrmatrix.pkl'
35 22
 lableddata =   '../data/9-clusters-5k.pkl'
36 23
 aggdata =      '../data/9-clusters-5k-agg.pkl'
37 24
 clustertable = '../data/9-clusters-5k-table.pkl'
38 25
 
39
-numclusts = 9
40
-df = p.read_pickle(Sourcedata)
41
-# dforig = df
42
-
43
-# print(df)
26
+# sourcep = p.read_pickle(Sourcecorr)
44 27
 
45
-print(df.info())
46
-# print(df.icp_id.nunique())
47
-# print(df.read_time.nunique())
48
-# print(df.groupby('icp_id').read_time.nunique().nunique())
49
-# df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
50
-# print(df.info())
51
-df = df[df.columns[df.max() != df.min()]]
52
-print(df.info())
53
-cmat = tqcorr(df)
54
-print(cmat)
55
-print(cmat.info())
56
-
57
-cmat.to_pickle('../data/fulldcorrmatrix.pkl')
58
-
59
-# lmat = squareform(1 - cmat)
28
+# lmat = squareform(sourcep)
60 29
 
61 30
 # lobj = linkage(lmat, method = 'ward')
62 31
 # print(lobj)
63 32
 # print(cophenet(lobj, lmat))
64 33
 
34
+def cluster(dcmat, method, nclusters):
35
+    """Cluster provided correlation dataframe
36
+    """
37
+    lmat = squareform(dcmat)
38
+    lobj = linkage(lmat, method = method)
39
+    clabs = [x + 1 for x in range(numclusts)]
40
+    clusts = fcluster(lobj, numclusts, criterion='maxclust')
41
+    clustdf = p.DataFrame({'icp_id' : dcmat.index.values, 'cluster' : clusts})
42
+    return lobj, clustdf
43
+
44
def dendro(lobj, clustdf, numclusts, icps, fname):
    """Draw a dendrogram whose links are coloured by flat cluster and save it.

    Parameters
    ----------
    lobj : array
        Linkage matrix; its first two columns are read as the ids of the two
        nodes joined at each step, and the whole matrix is passed to
        ``dendrogram``.
    clustdf : pandas.DataFrame
        Frame with ``icp_id`` and ``cluster`` columns; ``icp_id`` is looked
        up by leaf id, and ``cluster`` values must be keys of the palette
        (``1 .. numclusts``).
    numclusts : int
        Number of colours taken from the seaborn "colorblind" palette.
    icps : sequence
        Leaf labels handed to ``dendrogram``.
    fname : str
        Path the figure is written to with ``plt.savefig``.
    """
    hex_colours = sns.color_palette("colorblind", numclusts).as_hex()
    cpal = dict(zip(range(1, numclusts + 1), hex_colours))

    # Algorithm via
    # <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
    leaf_colour = {
        icp: cpal[label]
        for icp, label in zip(clustdf.icp_id, clustdf.cluster)
    }
    nmerges = len(lobj)
    link_cols = {}
    for step, pair in enumerate(lobj[:, :2].astype(int)):
        child_colours = []
        for node in pair:
            if node > nmerges:
                # Id above the row count: a previously merged node, so its
                # colour was already recorded in link_cols.
                child_colours.append(link_cols[node])
            else:
                # Otherwise the id is a leaf position in clustdf.
                child_colours.append(leaf_colour[clustdf.icp_id[node]])
        first, second = child_colours
        # A link keeps its children's colour only when both agree; mixed
        # links are drawn black.
        if first == second:
            link_cols[step + 1 + nmerges] = first
        else:
            link_cols[step + 1 + nmerges] = '#000000'

    plt.figure(figsize=(25, 10))
    plt.title('ICP Clustering Dendrogram')
    plt.xlabel('ICP ID/(Number of ICPs)')
    plt.ylabel('distance')
    dendrogram(
        lobj,
        labels=icps,
        leaf_rotation=90,
        leaf_font_size=8,
        link_color_func=lambda node_id: link_cols[node_id],
        color_threshold=None,
    )
    plt.savefig(fname)
70
+
71
+
72
def main():
    """Command-line entry point: cluster a pickled distance matrix.

    Reads the dataframe at ``--input``, clusters it with ``cluster`` using
    ``--method`` and ``--clusters``, pickles the resulting cluster table to
    ``--output`` and, when ``-d`` is given, draws the dendrogram to
    ``--tree`` via ``dendro``.
    """
    parser = ArgumentParser(description='Cluster from pre-existing distance correlation matrix in pickled dataframe')
    # Help texts use argparse's %(default)s so they cannot drift away from
    # the real defaults.
    parser.add_argument("-i", "--input", dest="input",
                        help="input pickle path; default: %(default)s",
                        metavar="[PATH]", default="../data/5kdcorrmatrix.pkl")
    # The output must not default to the input path, otherwise a default run
    # overwrites the distance matrix with the cluster table.
    parser.add_argument("-o", "--output", dest="output",
                        help="output pickle path; default: %(default)s",
                        metavar="[PATH]", default="../data/9-clusters-5k.pkl")
    parser.add_argument("--method", dest="method",
                        help="clustering method; default '%(default)s'",
                        metavar="[METHOD]", default="ward")
    parser.add_argument("--clusters", dest="numclusters",
                        help="number of clusters; default: %(default)s",
                        metavar="[NUM]", default=9, type=int)
    parser.add_argument("-d", "--dendrogram", dest="incdendro",
                        help="draw dendrogram", action="store_true")
    parser.add_argument("-t", "--tree", dest="treepath",
                        help="Filename for dendrogram (if -d), default: %(default)s",
                        metavar="[PATH]", default="../img/5k-9-dendro.png")
    args = parser.parse_args()

    print("Clustering")
    # NOTE(review): read_pickle can execute arbitrary code; only point
    # --input at trusted files.
    sourcep = p.read_pickle(args.input)
    lobj, clustdf = cluster(sourcep, args.method, args.numclusters)
    clustdf.to_pickle(args.output)

    if args.incdendro:
        # Only announce the drawing step when it actually runs.
        print("Drawing dendrogram")
        icps = sourcep.index.values
        dendro(lobj, clustdf, args.numclusters, icps, args.treepath)
91
+
92
+
65 93
 
66 94
 
67 95
 # clabs = [x + 1 for x in range(numclusts)]
@@ -131,3 +159,6 @@ cmat.to_pickle('../data/fulldcorrmatrix.pkl')
131 159
 #     axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
132 160
 # # plt.show()
133 161
 # plt.savefig("../img/sample-9-panedtrends.png")
162
+
163
+if __name__ == "__main__":
164
+    main()

+ 1
- 2
py/dcorr.py View File

@@ -5,7 +5,6 @@ import pandas as p
5 5
 from tqdm import tqdm
6 6
 from itertools import combinations
7 7
 from math import factorial as f
8
-from pprint import pprint
9 8
 
10 9
 def tqcorr(df):
11 10
     """Calculates a (distance) correlation matrix for wide-form dataframe, with progressbar
@@ -39,6 +38,6 @@ def createCorr(source, output):
39 38
 if __name__ == "__main__":
40 39
     parser = ArgumentParser(description='Create distance correlation matrix from pickled wideform pandas dataframe')
41 40
     parser.add_argument("-i", "--input",  dest="input",  help = "input pickle path; default: ../data/2017-5k-wide.pkl",  metavar="[PATH]", default = "../data/2017-5k-wide.pkl")
42
-    parser.add_argument("-o", "--output", dest="output", help = "output pickle path; default: ../data/5kdcorrmatrix.pkl", metavar="[PATH]", default = "../data/fulldcorrmatrix.pkl")
41
+    parser.add_argument("-o", "--output", dest="output", help = "output pickle path; default: ../data/5kdcorrmatrix.pkl", metavar="[PATH]", default = "../data/5kdcorrmatrix.pkl")
43 42
     args = parser.parse_args()
44 43
     createCorr(args.input, args.output)