# Hierarchical clustering of a pickled distance-correlation matrix, with optional dendrogram output.
- from pprint import pprint
- from argparse import ArgumentParser
- from util import getQuery, pickleQuery
- import numpy as np
- import pandas as p
- import matplotlib
- matplotlib.use('agg')
- import matplotlib.pyplot as plt
- import seaborn as sns
- from scipy.spatial.distance import squareform
- from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
- from tqdm import tqdm
- from itertools import combinations
- from math import factorial as f
-
-
-
-
# Module-level defaults.  Only `numclusts` and `Sourcecorr` correspond to
# values used by the live code path (the CLI defaults in main()); the other
# paths are referenced solely by the commented-out exploratory code below.
# NOTE(review): other modules may import these names - confirm before removing.

# Default number of flat clusters to cut the hierarchy into.
numclusts = 9

# Wide-format source data pickle (not read anywhere in the visible code).
Sourcedata = "../data/2017-5k-wide.pkl"

# Pickled distance-correlation matrix that gets clustered.
Sourcecorr = "../data/5kdcorrmatrix.pkl"

# Output pickles written by the commented-out exploratory code:
# per-record cluster labels, per-cluster aggregates, and the id/cluster table.
lableddata = "../data/9-clusters-5k.pkl"
aggdata = "../data/9-clusters-5k-agg.pkl"
clustertable = "../data/9-clusters-5k-table.pkl"
-
- # sourcep = p.read_pickle(Sourcecorr)
-
- # lmat = squareform(sourcep)
-
- # lobj = linkage(lmat, method = 'ward')
- # print(lobj)
- # print(cophenet(lobj, lmat))
-
def cluster(dcmat, method, nclusters):
    """Hierarchically cluster a square distance dataframe into flat clusters.

    Args:
        dcmat: Square pandas DataFrame of pairwise distances.  It is passed
            to ``scipy.spatial.distance.squareform``, so it must be symmetric
            with a zero diagonal.  Its index supplies the ``icp_id`` labels.
        method: Linkage method name forwarded to
            ``scipy.cluster.hierarchy.linkage`` (e.g. ``'ward'``).
        nclusters: Maximum number of flat clusters to cut the tree into
            (``fcluster`` with ``criterion='maxclust'``).

    Returns:
        A ``(lobj, clustdf)`` tuple: ``lobj`` is the scipy linkage matrix and
        ``clustdf`` is a DataFrame with one row per input row, holding its
        ``icp_id`` and its 1-based ``cluster`` label.
    """
    # linkage() expects the condensed (upper-triangle) form of the matrix.
    lmat = squareform(dcmat)
    lobj = linkage(lmat, method=method)
    # Honour the caller's cluster count instead of the module-level default.
    clusts = fcluster(lobj, nclusters, criterion='maxclust')
    clustdf = p.DataFrame({'icp_id': dcmat.index.values, 'cluster': clusts})
    return lobj, clustdf
-
def dendro(lobj, clustdf, numclusts, icps, fname):
    """Draw a dendrogram coloured by flat-cluster membership and save it.

    Args:
        lobj: scipy linkage matrix (one row per merge).
        clustdf: DataFrame with a ``cluster`` column giving the 1-based
            cluster label of each original observation, in the same order
            as the observations used to build ``lobj``.
        numclusts: Number of clusters; determines how many palette colours
            are generated (labels ``1..numclusts``).
        icps: Leaf labels passed to ``scipy.cluster.hierarchy.dendrogram``.
        fname: Path the figure is saved to.
    """
    clabs = range(1, numclusts + 1)
    cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))

    # Algorithm via
    # <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
    # Look leaf colours up by position so the result does not depend on the
    # DataFrame's index labels or on icp_id values being unique.
    leaf_clusters = clustdf.cluster.values
    nlinks = len(lobj)
    link_cols = {}
    for i, children in enumerate(lobj[:, :2].astype(int)):
        # Ids 0..nlinks are original observations (there are nlinks + 1 of
        # them); anything larger is a cluster formed by an earlier merge.
        c1, c2 = (link_cols[x] if x > nlinks else cpal[leaf_clusters[x]]
                  for x in children)
        # A link keeps its colour only when both children agree; links that
        # join different clusters are drawn black.
        link_cols[i + 1 + nlinks] = c1 if c1 == c2 else '#000000'

    fig = plt.figure(figsize=(25, 10))
    try:
        plt.title('ICP Clustering Dendrogram')
        plt.xlabel('ICP ID/(Number of ICPs)')
        plt.ylabel('distance')
        dendrogram(
            lobj,
            labels=icps,
            leaf_rotation=90,
            leaf_font_size=8,
            link_color_func=lambda x: link_cols[x],
            color_threshold=None,
        )
        plt.savefig(fname)
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
-
-
def main():
    """Command-line entry point: cluster a pickled distance matrix.

    Reads the pickled DataFrame given by ``--input``, clusters it with
    ``cluster()``, pickles the resulting id/cluster table to ``--output`` and,
    when ``--dendrogram`` is given, saves a dendrogram image to ``--tree``.
    """
    parser = ArgumentParser(
        description='Cluster from pre-existing distance correlation matrix in pickled dataframe')
    parser.add_argument(
        "-i", "--input", dest="input",
        help="input pickle path; default: ../data/5kdcorrmatrix.pkl",
        metavar="[PATH]", default="../data/5kdcorrmatrix.pkl")
    # NOTE(review): the output default is identical to the input default, so
    # running with no arguments overwrites the input matrix with the cluster
    # table.  Left unchanged for compatibility - confirm the intended default.
    parser.add_argument(
        "-o", "--output", dest="output",
        help="output pickle path; default: ../data/5kdcorrmatrix.pkl",
        metavar="[PATH]", default="../data/5kdcorrmatrix.pkl")
    parser.add_argument(
        "--method", dest="method",
        help="clustering method; default 'ward'",
        metavar="[METHOD]", default="ward")
    parser.add_argument(
        "--clusters", dest="numclusters",
        help="number of clusters; default: 9",
        metavar="[NUM]", default=9, type=int)
    parser.add_argument(
        "-d", "--dendrogram", dest="incdendro",
        help="draw dendrogram", action="store_true")
    # Help text now matches the actual default path.
    parser.add_argument(
        "-t", "--tree", dest="treepath",
        help="Filename for dendrogram (if -d), default: ../img/5k-9-dendro.png",
        metavar="[PATH]", default="../img/5k-9-dendro.png")
    args = parser.parse_args()

    print("Clustering")
    sourcep = p.read_pickle(args.input)
    l, c = cluster(sourcep, args.method, args.numclusters)
    c.to_pickle(args.output)

    # Only announce (and do) the drawing when it was actually requested.
    if args.incdendro:
        print("Drawing dendrogram")
        icps = sourcep.index.values
        dendro(l, c, args.numclusters, icps, args.treepath)
-
-
-
-
- # clabs = [x + 1 for x in range(numclusts)]
- # cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))
-
- # clusts = fcluster(lobj, numclusts, criterion='maxclust')
- # print(clusts)
- # print(cmat.index.values)
- # clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
- # print(clustdf)
- # clustdf.to_pickle(clustertable)
- # mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
- # print(mdf)
- # print(mdf.info())
- # qlow = lambda x: x.quantile(0.250)
- # qhigh = lambda x: x.quantile(0.750)
- # print(mdf.cluster.describe())
- # mdagg = mdf.groupby(['read_time', 'cluster']).agg({
- # 'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
- # }, q = 0.025)
- # mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
- # mdagg = mdagg.reset_index()
- # print(mdagg)
- # print(mdagg.info())
- # print(mdagg.describe())
- # # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
- # print("Saving")
- # mdf.to_pickle(lableddata)
- # mdagg.to_pickle(aggdata)
- # print("saved")
-
- # # Algorithm via
- # # <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
- # ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
- # link_cols = {}
- # for i, i12 in enumerate(lobj[:,:2].astype(int)):
- # c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
- # for x in i12)
- # link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
-
- # plt.figure(figsize = (25, 10))
- # plt.title('ICP Clustering Dendrogram')
- # plt.xlabel('ICP ID/(Number of ICPs)')
- # plt.ylabel('distance')
- # dendrogram(
- # lobj,
- # labels = cmat.index.values,
- # leaf_rotation=90,
- # leaf_font_size=8,
- # # show_leaf_counts = True,
- # # truncate_mode = 'lastp',
- # # p = 50,
- # # show_contracted = True,
- # link_color_func = lambda x: link_cols[x],
- # color_threshold = None
- # )
- # # plt.show()
- # plt.savefig("../img/sample-9-dendro.png")
-
- # sns.set()
-
- # f, axes = plt.subplots(3,3)
-
- # for i, c in enumerate(clabs):
- # fds = mdagg[mdagg.cluster == c]
- # sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
- # axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
- # # plt.show()
- # plt.savefig("../img/sample-9-panedtrends.png")
-
# Script entry point: run the command-line interface only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
|