# petra / ampli
#
# An algorithm for assigning a dataset to pre-existing clusters
import pandas as p
from pprint import pprint

# Pre-existing aggregated clusters
clusfile = '../data/9-clusters.agg.pkl'

# A new dataset
ndsfile = '../data/2016-17-sample.pkl'


# Load the aggregated cluster profiles and reshape them to wide form:
# one row per read_time, one column per cluster, values = kwh_tot_mean.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
clusdf = p.read_pickle(clusfile)
clusdf = clusdf.pivot(index='read_time', columns='cluster', values='kwh_tot_mean')

# Drop the 'cluster' label that pivot() leaves on the columns axis.
# Index.name is a property without a deleter in current pandas, so
# `del clusdf.columns.name` raises AttributeError; assigning None is the
# portable way to clear it.
clusdf.columns.name = None

# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only adds a stray "None" line.
clusdf.info()


# Load the new readings and reshape them to wide form (one row per
# read_time, one column per icp_id, values = kwh_tot), then keep only the
# read_time rows that the cluster profiles have, in the same order.
# NOTE(review): on pandas >= 1.0, .loc raises KeyError if any read_time in
# clusdf.index is absent from the new data; switch to
# .reindex(clusdf.index) if NaN-filled rows are preferred -- confirm intent.
newdf = p.read_pickle(ndsfile).pivot(index='read_time',
                                     columns='icp_id',
                                     values='kwh_tot').loc[clusdf.index, :]
print(newdf)

# DataFrame.info() prints its own report and returns None; calling it
# bare avoids the extra "None" line that print(newdf.info()) produced.
newdf.info()

# Maps each ICP id to the label of the cluster whose profile it correlates
# with best; -1 marks an ICP that was not matched to any cluster.
clusdict = {}

clusters = list(clusdf.columns)
icps = list(newdf.columns)

print(clusters)

for icp in icps:
    series = newdf.loc[:, icp]

    # A flat series has no variation to correlate against, so it is
    # reported and left unassigned.
    if series.min() == series.max():
        print('ICP {} has constant value; assigning to cluster -1'.format(icp))
        clusdict[icp] = -1
        continue

    # Pearson correlation of this ICP against every cluster profile,
    # computed in cluster-column order.
    scores = [
        (label, series.corr(clusdf.loc[:, label], method='pearson'))
        for label in clusters
    ]

    # Keep the first cluster with the strictly highest correlation. The
    # starting floor of -1 means a NaN score, or one not above -1, never
    # wins, in which case the ICP stays at cluster -1.
    best_label, best_score = -1, -1
    for label, score in scores:
        if score > best_score:
            best_label, best_score = label, score

    print('Assigning ICP {} to cluster {} with correlation {}.'.format(icp, best_label, best_score))
    clusdict[icp] = best_label

# TODO: make sure the 'cluster' column ends up integer-typed.
# One row per ICP holding its assigned cluster label (-1 = none assigned),
# with icp_id exposed as an ordinary column.
newclusdf = p.DataFrame.from_dict(clusdict, orient='index', columns=['cluster'])
newclusdf.index.name = 'icp_id'
newclusdf = newclusdf.reset_index()

# Back to long form: one row per (read_time, icp_id) reading, value in 'kwh'.
newdf = p.melt(newdf.reset_index(), 'read_time', var_name='icp_id', value_name='kwh')

# Attach the cluster label to every reading. DataFrame.join matches the
# caller's index against the other frame's index, so the right-hand frame
# has to be keyed by icp_id as well. After reset_index() newclusdf carries
# a 0..n-1 RangeIndex, so joining it directly compared icp_ids with row
# numbers and also dragged in a second 'icp_id' column.
anndf = newdf.set_index('icp_id').join(newclusdf.set_index('icp_id'))
print(anndf)