from util import getQuery, pickleQuery
import numpy as np
import pandas as p
import matplotlib
# Select the non-interactive Agg backend before pyplot is imported.
matplotlib.use('agg')
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from tqdm import tqdm
from itertools import combinations
from math import factorial as f


def tqcorr(df):
    """Build a pairwise correlation-distance matrix, showing tqdm progress.

    For every unordered pair of columns in *df* the stored value is
    ``1 - df[c1].corr(df[c2])`` (``Series.corr`` defaults to Pearson), written
    symmetrically.  The diagonal is set to 0.

    Args:
        df: DataFrame whose columns are the series to compare.

    Returns:
        A square DataFrame indexed and columned by ``df.columns`` with dtype
        ``float16``, so values are rounded to half precision.
        NOTE(review): float16 was presumably chosen to keep the matrix small
        in memory -- confirm the precision loss is acceptable.
    """
    cols = df.columns
    ncols = len(cols)
    cdf = p.DataFrame(index=cols, columns=cols, dtype=np.float16)
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in print() only adds a stray "None" line.
    cdf.info()

    # A column is at distance 0 from itself.
    for c in tqdm(cols):
        cdf.loc[c, c] = 0
    cdf.info()

    # Number of unordered pairs, n * (n - 1) / 2.  The previous factorial
    # formula raised ValueError for fewer than two columns (factorial of a
    # negative number) and built huge integers for wide frames.
    ncomb = ncols * (ncols - 1) // 2
    for c1, c2 in tqdm(combinations(cols, 2), total=ncomb):
        dv = 1 - df[c1].corr(df[c2])
        # The measure is symmetric, so fill both triangles at once.
        cdf.loc[c1, c2] = dv
        cdf.loc[c2, c1] = dv
    cdf.info()
    return cdf


# Registers tqdm's progress_* methods on pandas objects.
tqdm.pandas()

# Input path, followed by output paths / settings that are only referenced by
# the commented-out clustering code further down.
Sourcedata = '../data/2017-5k-wide.pkl'
lableddata = '../data/9-clusters-5k.pkl'
aggdata = '../data/9-clusters-5k-agg.pkl'
clustertable = '../data/9-clusters-5k-table.pkl'
numclusts = 9

df = p.read_pickle(Sourcedata)
# dforig = df
# print(df)
df.info()
# print(df.icp_id.nunique())
# print(df.read_time.nunique())
# print(df.groupby('icp_id').read_time.nunique().nunique())
# df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
# print(df.info())

# Keep only columns whose values vary.  A constant column has zero variance,
# so its Pearson correlation with anything is undefined (NaN).
df = df[df.columns[df.max() != df.min()]]
df.info()

cmat = tqcorr(df)
print(cmat)
cmat.info()
cmat.to_pickle('../data/fulldcorrmatrix.pkl')

# --- Disabled: hierarchical clustering of the distance matrix ---------------
# NOTE(review): tqcorr() already returns 1 - corr, so `1 - cmat` below would
# turn the distances back into correlations -- confirm before re-enabling.
# lmat = squareform(1 - cmat)
# lobj = linkage(lmat, method = 'ward')
# print(lobj)
# print(cophenet(lobj, lmat))
# clabs = [x + 1 for x in range(numclusts)]
# cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))
# clusts = fcluster(lobj, numclusts, criterion='maxclust')
# print(clusts)
# print(cmat.index.values)
# clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
# print(clustdf)
# clustdf.to_pickle(clustertable)
# mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
# print(mdf)
# print(mdf.info())
#
# Lower-quartile helper: returns the 0.25 quantile of whatever it is given.
# NOTE(review): this is the only live statement in this section; its
# counterpart `qhigh` and the aggregation that used both are commented out
# below, so this line was presumably meant to be disabled too -- confirm.
qlow = lambda x: x.quantile(0.250)
# qhigh = lambda x: x.quantile(0.750)
# print(mdf.cluster.describe())
# mdagg = mdf.groupby(['read_time', 'cluster']).agg({
#     'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
# }, q = 0.025)
# mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
# mdagg = mdagg.reset_index()
# print(mdagg)
# print(mdagg.info())
# print(mdagg.describe())
# # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
# print("Saving")
# mdf.to_pickle(lableddata)
# mdagg.to_pickle(aggdata)
# print("saved")

# --- Disabled: dendrogram with per-cluster link colours ---------------------
# # Algorithm via
# # (source reference missing from this copy of the file)
# ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
# link_cols = {}
# for i, i12 in enumerate(lobj[:,:2].astype(int)):
#     c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
#         for x in i12)
#     link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
# plt.figure(figsize = (25, 10))
# plt.title('ICP Clustering Dendrogram')
# plt.xlabel('ICP ID/(Number of ICPs)')
# plt.ylabel('distance')
# dendrogram(
#     lobj,
#     labels = cmat.index.values,
#     leaf_rotation=90,
#     leaf_font_size=8,
#     # show_leaf_counts = True,
#     # truncate_mode = 'lastp',
#     # p = 50,
#     # show_contracted = True,
#     link_color_func = lambda x: link_cols[x],
#     color_threshold = None
# )
# # plt.show()
# plt.savefig("../img/sample-9-dendro.png")

# --- Disabled: 3x3 grid of per-cluster mean trends with quartile band -------
# NOTE(review): `f, axes = ...` below would rebind the module-level name `f`
# (imported as math.factorial) -- rename it if this is re-enabled.
# sns.set()
# f, axes = plt.subplots(3,3)
# for i, c in enumerate(clabs):
#     fds = mdagg[mdagg.cluster == c]
#     sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
#     axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
# # plt.show()
# plt.savefig("../img/sample-9-panedtrends.png")