"""Hierarchically cluster ICPs by the correlation of their ``kwh_tot`` series.

Pipeline:

1. Load the long-format sample (columns used: ``icp_id``, ``read_time``,
   ``kwh_tot``) and pivot it to one column per ICP.
2. Build a correlation-distance matrix and run Ward linkage on it.
3. Cut the tree into ``numclusts`` flat clusters and save the label table.
4. Merge the labels back onto the long data, aggregate per
   (``read_time``, ``cluster``) and save both frames.
5. Plot a dendrogram coloured by cluster and one mean/quartile-band panel
   per cluster.
"""
from util import getQuery, pickleQuery
import numpy as np
import pandas as p
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster

# Input / output locations, relative to the working directory.
Sourcedata = '../data/2017-sample.pkl'
lableddata = '../data/9-clusters.pkl'
aggdata = '../data/9-clusters.agg.pkl'
clustertable = '../data/9-clusters-sample-table.pkl'

# Number of flat clusters to cut the hierarchy into.
numclusts = 9

# ---------------------------------------------------------------------------
# Load and reshape
# ---------------------------------------------------------------------------
# NOTE(review): unpickling executes code embedded in the file; only load
# pickles that come from a trusted source.
df = p.read_pickle(Sourcedata)
dforig = df  # long-format frame, reused for the merge with cluster labels
# print(df)
df.info()  # info() prints by itself and returns None
print(df.icp_id.nunique())
print(df.read_time.nunique())
# print(df.groupby('icp_id').read_time.nunique().nunique())

# Wide format: one row per read_time, one column per icp_id.
df = df.pivot(index='read_time', columns='icp_id', values='kwh_tot')
df.info()

# Drop columns whose max equals their min (constant series); presumably
# because a zero-variance column has no defined correlation.
df = df[df.columns[df.max() != df.min()]]
df.info()

# ---------------------------------------------------------------------------
# Hierarchical clustering on correlation distance
# ---------------------------------------------------------------------------
cmat = df.corr()
cmat.info()

# Condensed distance vector with d = 1 - r for every pair of ICPs.
lmat = squareform(1 - cmat)
# NOTE(review): scipy documents 'ward' as correctly defined only for
# Euclidean distances; 1 - r is not Euclidean -- confirm this is intended.
lobj = linkage(lmat, method='ward')
print(lobj)
print(cophenet(lobj, lmat))

# Cluster labels 1..numclusts and a hex colour for each of them.
clabs = list(range(1, numclusts + 1))
cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))

clusts = fcluster(lobj, numclusts, criterion='maxclust')
print(clusts)
print(cmat.index.values)

# One row per ICP with its cluster label, in the same order as cmat.
clustdf = p.DataFrame({'icp_id': cmat.index.values, 'cluster': clusts})
print(clustdf)
clustdf.to_pickle(clustertable)

# ---------------------------------------------------------------------------
# Label the long data and aggregate per (read_time, cluster)
# ---------------------------------------------------------------------------
mdf = p.merge(clustdf, dforig, on='icp_id', how='left')
print(mdf)
mdf.info()


def qlow(x):
    """Return the 0.25 quantile (lower quartile) of *x*."""
    return x.quantile(0.250)


def qhigh(x):
    """Return the 0.75 quantile (upper quartile) of *x*."""
    return x.quantile(0.750)


print(mdf.cluster.describe())
# NOTE: despite the 'CI_' names, the band is the interquartile range. The
# names are kept because they become the saved column names.
mdagg = mdf.groupby(['read_time', 'cluster']).agg({
    'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)],
})
# Flatten the (column, statistic) MultiIndex to 'kwh_tot_<statistic>'.
mdagg.columns = ['_'.join(x) for x in mdagg.columns]
mdagg = mdagg.reset_index()
print(mdagg)
mdagg.info()
print(mdagg.describe())

# mdf.to_csv('~/windows/Documents/clusters-ward.csv')
print("Saving")
mdf.to_pickle(lableddata)
mdagg.to_pickle(aggdata)
print("saved")

# ---------------------------------------------------------------------------
# Dendrogram coloured by cluster
# ---------------------------------------------------------------------------
# Colour of every leaf, keyed by icp_id.
ldict = {
    icp_id: cpal[cluster]
    for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)
}

# Each linkage row i merges two nodes into new node i + 1 + len(lobj).
# Node ids <= len(lobj) are leaves, larger ids are earlier merges. A link
# inherits its children's colour when they agree, otherwise it is black.
nlinks = len(lobj)
link_cols = {}
for i, i12 in enumerate(lobj[:, :2].astype(int)):
    c1, c2 = (
        link_cols[x] if x > nlinks else ldict[clustdf.icp_id[x]]
        for x in i12
    )
    link_cols[i + 1 + nlinks] = c1 if c1 == c2 else '#000000'

plt.figure(figsize=(25, 10))
plt.title('ICP Clustering Dendrogram')
plt.xlabel('ICP ID/(Number of ICPs)')
plt.ylabel('distance')
dendrogram(
    lobj,
    labels=cmat.index.values,
    leaf_rotation=90,
    leaf_font_size=8,
    # show_leaf_counts = True,
    # truncate_mode = 'lastp',
    # p = 50,
    # show_contracted = True,
    link_color_func=lambda x: link_cols[x],
    color_threshold=None,
)
plt.show()

# ---------------------------------------------------------------------------
# One panel per cluster: mean line with quartile band
# ---------------------------------------------------------------------------
sns.set()
ncols = 3
nrows = -(-numclusts // ncols)  # ceiling division; 3 rows for 9 clusters
# squeeze=False keeps `axes` two-dimensional even for a single row.
f, axes = plt.subplots(nrows, ncols, squeeze=False)
panels = axes.ravel()  # row-major, i.e. panel i is axes[i // ncols][i % ncols]
for ax, c in zip(panels, clabs):
    fds = mdagg[mdagg.cluster == c]
    sns.lineplot(x='read_time', y='kwh_tot_mean', color=cpal[c],
                 ax=ax, data=fds)
    ax.fill_between(fds.read_time.dt.to_pydatetime(),
                    fds.kwh_tot_CI_low, fds.kwh_tot_CI_high,
                    alpha=0.1, color=cpal[c])
# Hide panels left over when numclusts is not a multiple of ncols.
for ax in panels[numclusts:]:
    ax.set_visible(False)
plt.show()