"""Correlate per-ICP load series and (eventually) cluster them hierarchically.

Reads a wide table from ``Sourcedata``, drops columns whose maximum equals
their minimum, and prints the pairwise correlation matrix of the remaining
columns. The clustering, aggregation and plotting stages that consume that
matrix are currently disabled and kept below as commented-out reference code.
"""
from util import getQuery, pickleQuery
import numpy as np
import pandas as p
import matplotlib
# The non-interactive backend must be selected before pyplot is imported.
matplotlib.use('agg')
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from tqdm import tqdm


def tqcorr(df):
    """Return the pairwise Pearson correlation matrix of the columns of *df*.

    The work is done one column at a time so that a tqdm progress bar can
    report progress on wide frames.

    Args:
        df: DataFrame whose columns are the numeric series to correlate.

    Returns:
        A square float DataFrame whose index and columns are ``df.columns``.
        The diagonal is 1 and entry (a, b) is ``Series.corr`` of columns a
        and b (Pearson; missing values are excluded pairwise). A pair for
        which the correlation is undefined is NaN.
    """
    cols = df.columns
    ncols = len(cols)
    # Accumulate into a plain ndarray and wrap it once at the end; the
    # identity start value provides the unit diagonal.
    corr = np.eye(ncols)
    for i in tqdm(range(ncols)):
        # Positional access keeps this correct even with duplicate labels.
        left = df.iloc[:, i]
        # Correlation is symmetric: compute the upper triangle, then mirror.
        for j in range(i + 1, ncols):
            r = left.corr(df.iloc[:, j])
            corr[i, j] = r
            corr[j, i] = r
    return p.DataFrame(corr, index=cols, columns=cols)


tqdm.pandas()

Sourcedata = '../data/2017-all-wide.pkl'
lableddata = '../data/9-clusters.pkl'
aggdata = '../data/9-clusters.agg.pkl'
clustertable = '../data/9-clusters-sample-table.pkl'

numclusts = 9

df = p.read_pickle(Sourcedata)
# dforig = df
# print(df)
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
df.info()

# print(df.icp_id.nunique())
# print(df.read_time.nunique())
# print(df.groupby('icp_id').read_time.nunique().nunique())
# df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
# print(df.info())

# Keep only the columns whose maximum differs from their minimum; a constant
# column has no defined correlation with anything.
df = df[df.columns[df.max() != df.min()]]
df.info()

# Correlations are computed once, on the filtered frame only.
cmat = tqcorr(df)
print(cmat)
cmat.info()

# --- Disabled downstream pipeline (kept for reference) ----------------------

# lmat = squareform(1 - cmat)
# lobj = linkage(lmat, method = 'ward')
# print(lobj)
# print(cophenet(lobj, lmat))

# clabs = [x + 1 for x in range(numclusts)]
# cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))

# clusts = fcluster(lobj, numclusts, criterion='maxclust')
# print(clusts)
# print(cmat.index.values)
# clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
# print(clustdf)
# clustdf.to_pickle(clustertable)

# mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
# print(mdf)
# print(mdf.info())
# qlow = lambda x: x.quantile(0.250)
# qhigh = lambda x: x.quantile(0.750)
# print(mdf.cluster.describe())
# mdagg = mdf.groupby(['read_time', 'cluster']).agg({
#     'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
# }, q = 0.025)
# mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
# mdagg = mdagg.reset_index()
# print(mdagg)
# print(mdagg.info())
# print(mdagg.describe())

# # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
# print("Saving")
# mdf.to_pickle(lableddata)
# mdagg.to_pickle(aggdata)
# print("saved")

# # Algorithm via
# #
# ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
# link_cols = {}
# for i, i12 in enumerate(lobj[:,:2].astype(int)):
#     c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
#               for x in i12)
#     link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'

# plt.figure(figsize = (25, 10))
# plt.title('ICP Clustering Dendrogram')
# plt.xlabel('ICP ID/(Number of ICPs)')
# plt.ylabel('distance')
# dendrogram(
#     lobj,
#     labels = cmat.index.values,
#     leaf_rotation=90,
#     leaf_font_size=8,
#     # show_leaf_counts = True,
#     # truncate_mode = 'lastp',
#     # p = 50,
#     # show_contracted = True,
#     link_color_func = lambda x: link_cols[x],
#     color_threshold = None
# )
# # plt.show()
# plt.savefig("../img/sample-9-dendro.png")

# sns.set()

# f, axes = plt.subplots(3,3)

# for i, c in enumerate(clabs):
#     fds = mdagg[mdagg.cluster == c]
#     sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
#     axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])

# # plt.show()
# plt.savefig("../img/sample-9-panedtrends.png")