# An algorithm for assigning a dataset to pre-existing clusters
#
# Each ICP (column of the new dataset) is compared against the mean profile of
# every pre-existing cluster using the Pearson correlation over read_time, and
# is assigned to the cluster it correlates with most strongly.  ICPs whose
# series is constant are assigned to the placeholder cluster -1.  The
# assignment table and a per-cluster aggregate of the new dataset are pickled.
import pandas as p
from pprint import pprint

# Pre-existing aggregated clusters
clusfile = '../data/9-clusters.agg.pkl'
# A new dataset
ndsfile = '../data/2016-17-sample.pkl'
# Table of assigned clusters
aclusfile = '../data/1617-asgn-table.pkl'
# Aggregated dataset
aggfile = '../data/1617-agg.pkl'


def _best_cluster(series, cluster_profiles):
    """Return ``(cluster, r)`` for the profile best correlated with *series*.

    ``cluster_profiles`` is a DataFrame with one column per cluster.  The
    search starts from ``(-1, -1)`` and only accepts strictly greater
    correlations, so the first cluster wins ties, and NaN correlations (which
    never compare greater) are skipped.  If nothing beats -1 the placeholder
    cluster -1 is returned.
    """
    best_cluster = -1
    best_r = -1
    for cluster in cluster_profiles.columns:
        r = series.corr(cluster_profiles.loc[:, cluster], method='pearson')
        if r > best_r:
            best_r = r
            best_cluster = cluster
    return best_cluster, best_r


def qlow(x):
    """Lower quartile, reported as the CI_low column."""
    return x.quantile(0.250)


def qhigh(x):
    """Upper quartile, reported as the CI_high column."""
    return x.quantile(0.750)


# Wide table of cluster mean profiles: one row per read_time, one column per
# cluster.
clusdf = p.read_pickle(clusfile)
clusdf = clusdf.pivot(index='read_time', columns='cluster',
                      values='kwh_tot_mean')
# Drop the 'cluster' label left on the column index by pivot.  Assigning None
# is used because `del index.name` raises AttributeError on pandas >= 1.0.
clusdf.columns.name = None
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print().
clusdf.info()

# Wide table of the new dataset: one row per read_time, one column per ICP,
# aligned to the cluster table's read_time index.  reindex() fills read_times
# absent from the new dataset with NaN instead of raising KeyError as
# .loc does on pandas >= 1.0.
newdf = p.read_pickle(ndsfile).pivot(
    index='read_time', columns='icp_id', values='kwh_tot'
).reindex(clusdf.index)
print(newdf)
newdf.info()

clusdict = {}
clusters = list(clusdf.columns)
icps = list(newdf.columns)
print(clusters)

for i in icps:
    bestc = -1
    s = newdf.loc[:, i]
    if s.min() != s.max():
        bestc, bestr = _best_cluster(s, clusdf)
        print('Assigning ICP {} to cluster {} with correlation {}.'.format(i, bestc, bestr))
    else:
        # Correlation is undefined for a constant series.
        print('ICP {} has constant value; assigning to cluster -1'.format(i))
    clusdict[i] = bestc

# Assignment table with columns icp_id, cluster.
newclusdf = p.DataFrame.from_dict(clusdict, orient='index',
                                  columns=['cluster'])
newclusdf.index.name = 'icp_id'
newclusdf = newclusdf.reset_index()
print(newclusdf)
newclusdf.to_pickle(aclusfile)

# Back to long format (read_time, icp_id, kwh) and annotate every reading
# with the cluster assigned to its ICP.
newdf = p.melt(newdf.reset_index(), 'read_time', var_name='icp_id',
               value_name='kwh')
newdf.info()
newclusdf.info()
anndf = newdf.set_index('icp_id').join(
    newclusdf.set_index('icp_id')
).reset_index()
print(anndf)

# Per (read_time, cluster) summary of kwh.
newagg = anndf.groupby(['read_time', 'cluster']).agg({
    'kwh': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
})
# Flatten the (column, statistic) MultiIndex into names such as
# 'kwh_tot_mean'.  Iterating the MultiIndex yields the tuples directly, which
# avoids the deprecated Index.ravel().
newagg.columns = ['_tot_'.join(col) for col in newagg.columns]
newagg = newagg.reset_index()
print(newagg)
newagg.to_pickle(aggfile)