"""Cluster ICP load profiles by correlation and plot the resulting groups.

The script reads a pickled long-format table (the columns used here are
``icp_id``, ``read_time`` and ``kwh_tot``), builds a Ward linkage over the
distance ``1 - correlation`` between ICPs, cuts the tree into at most
``numclusts`` flat clusters, and then shows:

* a dendrogram whose links are coloured by cluster, and
* a per-cluster mean line plot with a 25%-75% quantile band.
"""
from util import getQuery, pickleQuery
import numpy as np
import pandas as p
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster

# --- Disabled database-backed exploration, kept for reference ---------------
# query = """
# SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
# FROM public.coup_tall_april WHERE icp_id LIKE (%s) AND read_date = to_date(%s, 'dd/mm/yyyy')
# ORDER BY icp_id, read_time;
# """
#
# qparams = ['%%1117', '20/04/2017']

# query = """
# SELECT read_date, period, AVG(kwh_tot) AS average
# FROM public.coup_tall_april
# GROUP BY read_date, period
# ORDER BY read_date, period;
# """
#
# qparams = []
#
# df = getQuery(query, qparams)
#
# print(df.info())
#
# sns.set()
#
# #sns.lineplot(x = 'read_time', y = 'kwh_tot', hue = 'icp_id', data = df)
# sns.lineplot(x = 'period', y = 'average', hue = 'read_date', data = df)
#
# plt.show()

# Maximum number of flat clusters requested from the hierarchy.
numclusts = 7

# NOTE: read_pickle executes arbitrary pickled code; only load trusted files.
df = p.read_pickle('../data/2017-20s.pkl')
# Keep a handle on the long-format frame; `df` is rebound to the pivot below.
dforig = df

# DataFrame.info() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
df.info()
print(df.icp_id.nunique())
print(df.read_time.nunique())
# Number of distinct "readings per ICP" counts; 1 means every ICP has the
# same number of distinct read times.
print(df.groupby('icp_id').read_time.nunique().nunique())

# Wide layout: one row per read_time, one column per ICP.
df = df.pivot(index='read_time', columns='icp_id', values='kwh_tot')
# Keep only ICPs whose readings vary (max != min); presumably because a
# constant series has no defined correlation -- confirm intent.
df = df[df.columns[df.max() != df.min()]]
df.info()

# Pairwise correlation between ICP columns.
cmat = df.corr()
cmat.info()

# Condensed distance vector built from (1 - correlation).
lmat = squareform(1 - cmat)
lobj = linkage(lmat, method='ward')
print(lobj)
print(cophenet(lobj, lmat))

# Cluster labels 1..numclusts, each mapped to a hex colour.
clabs = [x + 1 for x in range(numclusts)]
cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))

clusts = fcluster(lobj, numclusts, criterion='maxclust')
print(clusts)
print(cmat.index.values)

clustdf = p.DataFrame({'icp_id': cmat.index.values, 'cluster': clusts})
print(clustdf)

# Attach the cluster label to every original reading.
mdf = p.merge(clustdf, dforig, on='icp_id', how='left')
print(mdf)
mdf.info()


def qlow(x):
    """Return the 25% quantile of *x*."""
    return x.quantile(0.250)


def qhigh(x):
    """Return the 75% quantile of *x*."""
    return x.quantile(0.750)


print(mdf.cluster.describe())

# Per (read_time, cluster) summary of kwh_tot. The (name, func) tuples name
# the quantile columns. No extra keyword arguments are passed to agg(): the
# quantile levels are fixed inside qlow/qhigh.
mdagg = mdf.groupby(['read_time', 'cluster']).agg({
    'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
})
# Flatten the two-level column index into names such as 'kwh_tot_mean';
# iterating the column MultiIndex yields one tuple per column.
mdagg.columns = ['_'.join(x) for x in mdagg.columns]
mdagg = mdagg.reset_index()
print(mdagg)
mdagg.info()
print(mdagg.describe())

# mdf.to_csv('~/windows/Documents/clusters-ward.csv')

# Colour every dendrogram link by the cluster of its two children.
# (The original "Algorithm via" attribution was left blank in the source.)
ldict = {icp_id: cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
link_cols = {}
for i, i12 in enumerate(lobj[:, :2].astype(int)):
    # Ids up to len(lobj) are looked up as leaves (rows of clustdf); larger
    # ids refer to links already stored in link_cols under i + 1 + len(lobj).
    c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]] for x in i12)
    # A link joining two differently coloured children is drawn black.
    link_cols[i + 1 + len(lobj)] = c1 if c1 == c2 else '#000000'

plt.figure(figsize=(25, 10))
plt.title('ICP Clustering Dendrogram')
plt.xlabel('ICP ID/(Number of ICPs)')
plt.ylabel('distance')
dendrogram(
    lobj,
    labels=cmat.index.values,
    leaf_rotation=90,
    leaf_font_size=8,
    # show_leaf_counts = True,
    # truncate_mode = 'lastp',
    # p = 50,
    # show_contracted = True,
    link_color_func=lambda x: link_cols[x],
    color_threshold=None
)
plt.show()

sns.set()

# Mean load per cluster over time, with a shaded 25%-75% quantile band.
ax = sns.lineplot(x='read_time', y='kwh_tot_mean', hue='cluster', data=mdagg, palette=cpal)
for c in clabs:
    fds = mdagg[mdagg.cluster == c]
    ax.fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low,
                    fds.kwh_tot_CI_high, alpha=0.1, color=cpal[c])
plt.show()