1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- # An algorithm for assigning a dataset to pre-existing clusters
- import pandas as p
- from pprint import pprint
-
# Input: pickled, aggregated profiles of the clusters that already exist.
clusfile = '../data/9-clusters.agg.pkl'

# Input: pickled readings of the new dataset to be matched to those clusters.
ndsfile = '../data/2016-17-sample.pkl'

# Output: pickle receiving the ICP-to-cluster assignment table.
aclusfile = '../data/1617-asgn-table.pkl'

# Output: pickle receiving the new dataset aggregated per cluster.
aggfile = '../data/1617-agg.pkl'
-
-
# Load the aggregated cluster profiles and reshape them to wide form: one row
# per read_time, one column per cluster, each cell holding kwh_tot_mean.
clusdf = p.read_pickle(clusfile)
clusdf = clusdf.pivot(index='read_time', columns='cluster', values='kwh_tot_mean')
# pivot() labels the column axis 'cluster'; clear that label by assignment.
# `del clusdf.columns.name` raises AttributeError on recent pandas releases,
# where Index.name is a property without a deleter.
clusdf.columns.name = None
print(clusdf.info())
-
-
-
# Load the new readings, spread them into one kwh_tot column per ICP, then
# select the cluster table's read_time labels so both frames share one index.
newdf = p.read_pickle(ndsfile)
newdf = newdf.pivot(index='read_time', columns='icp_id', values='kwh_tot')
newdf = newdf.loc[clusdf.index, :]
print(newdf)
print(newdf.info())
-
# Map each ICP to the cluster whose profile it correlates with most strongly.
clusdict = {}

clusters = list(clusdf.columns)
icps = list(newdf.columns)

print(clusters)

for icp in icps:
    readings = newdf.loc[:, icp]
    # -1 marks "no cluster"; it survives unless a correlation beats the floor.
    chosen = -1
    if readings.min() == readings.max():
        # Equal extremes mean a flat series, which is left unassigned.
        print('ICP {} has constant value; assigning to cluster -1'.format(icp))
    else:
        top_r = -1
        for label in clusters:
            r = readings.corr(clusdf.loc[:, label], method='pearson')
            # Strict '>' keeps the first cluster on ties; a NaN correlation
            # never compares greater, so it is skipped.
            if r > top_r:
                top_r, chosen = r, label
        print('Assigning ICP {} to cluster {} with correlation {}.'.format(icp, chosen, top_r))
    clusdict[icp] = chosen
-
# Turn the ICP -> cluster mapping into a two-column table (icp_id, cluster),
# show it, and write it to the assignment pickle.
newclusdf = (
    p.DataFrame.from_dict(clusdict, orient='index', columns=['cluster'])
    .rename_axis('icp_id')
    .reset_index()
)
print(newclusdf)
newclusdf.to_pickle(aclusfile)
-
-
# Back to long form: one (read_time, icp_id, kwh) row per reading.
newdf = p.melt(
    newdf.reset_index(),
    id_vars='read_time',
    var_name='icp_id',
    value_name='kwh',
)

print(newdf.info())
print(newclusdf.info())

# Annotate every reading with its ICP's assigned cluster by joining on icp_id.
anndf = (
    newdf.set_index('icp_id')
    .join(newclusdf.set_index('icp_id'))
    .reset_index()
)
print(anndf)
-
def qlow(x):
    """Return the 0.25 quantile (lower quartile) of x."""
    return x.quantile(0.250)


def qhigh(x):
    """Return the 0.75 quantile (upper quartile) of x."""
    return x.quantile(0.750)


# Summarise kwh for every (read_time, cluster) pair: median, mean, and the
# two quartiles, which are labelled CI_low / CI_high in the output.
newagg = anndf.groupby(['read_time', 'cluster']).agg({
    'kwh': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
})
# Flatten the ('kwh', <stat>) column pairs into 'kwh_tot_<stat>'. Iterating
# the column MultiIndex yields those tuples directly; Index.ravel() is
# avoided because it was deprecated and its return type changed between
# pandas versions.
newagg.columns = ['_tot_'.join(pair) for pair in newagg.columns]
newagg = newagg.reset_index()

print(newagg)
newagg.to_pickle(aggfile)
|