"""Assign the ICPs of a new dataset to pre-existing clusters.

Each ICP's series in the new dataset is correlated (Pearson) against every
aggregated cluster profile; the ICP is assigned to the cluster with the
highest correlation.  ICPs whose series is constant get the sentinel
cluster ``-1``.  The result is a long-format table of readings annotated
with the assigned cluster.
"""
from pprint import pprint

import pandas as p

# Pre-existing aggregated clusters
clusfile = '../data/9-clusters.agg.pkl'

# A new dataset
ndsfile = '../data/2016-17-sample.pkl'

# Cluster label used when an ICP cannot be matched to any cluster.
UNASSIGNED = -1


def load_cluster_profiles(path):
    """Read the aggregated clusters and pivot them to one column per cluster.

    The pickle is expected to hold the columns ``read_time``, ``cluster`` and
    ``kwh_tot_mean``.  The returned frame is indexed by ``read_time``.
    """
    clusdf = p.read_pickle(path)
    clusdf = clusdf.pivot(index='read_time', columns='cluster',
                          values='kwh_tot_mean')
    # `del clusdf.columns.name` fails on current pandas; clearing the name
    # by assignment has the same effect and works on every version.
    clusdf.columns.name = None
    return clusdf


def load_new_dataset(path, read_times):
    """Read the new dataset and pivot it to one column per ICP.

    The pickle is expected to hold the columns ``read_time``, ``icp_id`` and
    ``kwh_tot``.  Rows are restricted to (and ordered by) ``read_times`` so
    they line up with the cluster profiles.
    """
    widedf = p.read_pickle(path).pivot(index='read_time', columns='icp_id',
                                       values='kwh_tot')
    return widedf.loc[read_times, :]


def best_cluster(series, clusdf):
    """Return ``(cluster, r)`` for the profile best correlated with ``series``.

    ``clusdf`` has one column per cluster.  If no correlation exceeds -1
    (for example every correlation is NaN), ``(UNASSIGNED, -1)`` is returned.
    """
    bestc = UNASSIGNED
    bestr = -1
    for c in clusdf.columns:
        thisr = series.corr(clusdf[c], method='pearson')
        # A NaN correlation compares False here and is therefore skipped.
        if thisr > bestr:
            bestr = thisr
            bestc = c
    return bestc, bestr


def assign_clusters(newdf, clusdf):
    """Map every column (ICP) of ``newdf`` to its best-matching cluster.

    Returns a dict ``{icp_id: cluster}``.  A constant series has no defined
    correlation, so such ICPs are mapped to ``UNASSIGNED`` without comparing.
    """
    clusdict = {}
    for i in newdf.columns:
        s = newdf[i]
        if s.min() != s.max():
            bestc, bestr = best_cluster(s, clusdf)
            print('Assigning ICP {} to cluster {} with correlation {}.'
                  .format(i, bestc, bestr))
        else:
            bestc = UNASSIGNED
            print('ICP {} has constant value; assigning to cluster -1'
                  .format(i))
        clusdict[i] = bestc
    return clusdict


def build_cluster_table(clusdict):
    """Turn ``{icp_id: cluster}`` into a frame with columns icp_id, cluster."""
    newclusdf = p.DataFrame.from_dict(clusdict, orient='index',
                                      columns=['cluster'])
    newclusdf.index.name = 'icp_id'
    # Cluster labels must be integers (the sentinel -1 included).
    newclusdf['cluster'] = newclusdf['cluster'].astype(int)
    return newclusdf.reset_index()


def melt_readings(widedf):
    """Convert the wide per-ICP frame to long format.

    The result has the columns ``read_time``, ``icp_id`` and ``kwh``.
    """
    return p.melt(widedf.reset_index(), 'read_time',
                  var_name='icp_id', value_name='kwh')


def attach_clusters(longdf, newclusdf):
    """Annotate every reading in ``longdf`` with its ICP's cluster.

    ``DataFrame.join`` aligns on the *index* of the right-hand frame.
    ``newclusdf`` carries ``icp_id`` as an ordinary column (its index is a
    plain 0..n-1 range), so it has to be re-indexed by ``icp_id`` first;
    joining it as-is matches ICP ids against row numbers and yields NaN (or
    wrong) clusters.
    """
    return longdf.set_index('icp_id').join(newclusdf.set_index('icp_id'))


if __name__ == '__main__':
    clusdf = load_cluster_profiles(clusfile)
    # info() prints its report itself and returns None, so it is not wrapped
    # in print().
    clusdf.info()

    newdf = load_new_dataset(ndsfile, clusdf.index)
    print(newdf)
    newdf.info()

    clusters = list(clusdf.columns)
    icps = list(newdf.columns)
    print(clusters)

    clusdict = assign_clusters(newdf, clusdf)
    newclusdf = build_cluster_table(clusdict)

    longdf = melt_readings(newdf)
    anndf = attach_clusters(longdf, newclusdf)
    print(anndf)