# petra / ampli
# An algorithm for assigning a dataset to pre-existing clusters
import pandas as p
from pprint import pprint

# Input: pickle of the pre-existing aggregated clusters. It is loaded with
# read_pickle below and pivoted on its 'read_time', 'cluster' and
# 'kwh_tot_mean' columns.
clusfile = '../data/9-clusters.agg.pkl'

# Input: pickle of the new dataset to be assigned to those clusters. It is
# pivoted below on its 'read_time', 'icp_id' and 'kwh_tot' columns.
ndsfile = '../data/2016-17-sample.pkl'

# Output: table of assigned clusters (one 'icp_id' / 'cluster' row per ICP),
# written with to_pickle after the assignment loop.
aclusfile = '../data/1617-asgn-table.pkl'

# Output: the new dataset aggregated per 'read_time' and assigned 'cluster',
# written with to_pickle at the end of the script.
aggfile = '../data/1617-agg.pkl'


# Load the aggregated clusters and reshape them to wide form: one row per
# read_time, one column per cluster, each cell holding kwh_tot_mean.
clusdf = p.read_pickle(clusfile)
clusdf = clusdf.pivot(index='read_time', columns='cluster', values='kwh_tot_mean')

# pivot() leaves the columns axis named 'cluster'. Clear the name by
# assignment: `del clusdf.columns.name` raises AttributeError on pandas
# versions where Index.name is a property without a deleter.
clusdf.columns.name = None

# info() prints its own summary and returns None, so call it directly;
# wrapping it in print() would add a stray "None" line to the output.
clusdf.info()


# Load the new dataset and reshape it to wide form: one row per read_time,
# one column per icp_id, each cell holding kwh_tot.
newdf = p.read_pickle(ndsfile)
newdf = newdf.pivot(index='read_time', columns='icp_id', values='kwh_tot')

# Select the rows labelled by the cluster table's index so both tables share
# the same read_time rows, in the same order.
# NOTE(review): label-based .loc is expected to raise KeyError on recent
# pandas if a read_time in clusdf is absent from the new dataset - confirm
# whether such rows should instead be filled with NaN via reindex().
newdf = newdf.loc[clusdf.index, :]
print(newdf)

# info() prints its own summary and returns None, so call it directly;
# wrapping it in print() would add a stray "None" line to the output.
newdf.info()

# Candidate cluster labels and the ICPs (columns of the new dataset) to place.
clusters = list(clusdf.columns)
icps = list(newdf.columns)

# Maps each icp_id to the label of its assigned cluster; -1 means "none".
clusdict = {}

print(clusters)

for icp in icps:
    profile = newdf[icp]

    # The ICP is labelled -1 without computing any correlation when its
    # minimum equals its maximum.
    if profile.min() == profile.max():
        print('ICP {} has constant value; assigning to cluster -1'.format(icp))
        clusdict[icp] = -1
        continue

    # Keep the cluster whose profile has the highest Pearson correlation with
    # this ICP. Starting from -1 with a strict '>' means the first cluster
    # wins ties, and a correlation that is NaN or exactly -1 never replaces
    # the initial "no cluster" label.
    best_cluster, best_corr = -1, -1
    for cluster in clusters:
        corr = profile.corr(clusdf[cluster], method='pearson')
        if corr > best_corr:
            best_cluster, best_corr = cluster, corr

    print('Assigning ICP {} to cluster {} with correlation {}.'.format(icp, best_cluster, best_corr))
    clusdict[icp] = best_cluster

# Turn the {icp_id: cluster} mapping into a two-column table: the dict keys
# become the index, which is named 'icp_id' and then moved into a regular
# column next to 'cluster'.
newclusdf = (
    p.DataFrame.from_dict(clusdict, orient='index', columns=['cluster'])
    .rename_axis('icp_id')
    .reset_index()
)
print(newclusdf)

# Persist the assignment table.
newclusdf.to_pickle(aclusfile)


# Reshape the new dataset back to long form: one row per (read_time, icp_id)
# with the reading in a 'kwh' column.
newdf = p.melt(newdf.reset_index(), id_vars='read_time', var_name='icp_id', value_name='kwh')

# info() prints its own summary and returns None, so call it directly;
# wrapping it in print() would add a stray "None" line after each summary.
newdf.info()
newclusdf.info()

# Annotate every reading with the cluster assigned to its ICP by joining the
# two tables on icp_id, then restore icp_id as a regular column.
anndf = newdf.set_index('icp_id').join(newclusdf.set_index('icp_id')).reset_index()
print(anndf)

def qlow(x):
    """Return the 0.25 quantile (lower quartile) of x."""
    return x.quantile(0.250)


def qhigh(x):
    """Return the 0.75 quantile (upper quartile) of x."""
    return x.quantile(0.750)


# Aggregate kwh per (read_time, cluster). Each (name, func) tuple gives the
# result column an explicit name. Despite the 'CI_' prefix, those two columns
# hold the 25th and 75th percentiles computed by qlow/qhigh.
newagg = anndf.groupby(['read_time', 'cluster']).agg({
    'kwh': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
})

# Flatten the two-level column index ('kwh', <stat>) into 'kwh_tot_<stat>'.
# Iterating the MultiIndex yields the label tuples directly, so the
# deprecated Index.ravel() call is not needed.
newagg.columns = ['_tot_'.join(x) for x in newagg.columns]
newagg = newagg.reset_index()

print(newagg)

# Persist the aggregated dataset.
newagg.to_pickle(aggfile)