Repository for Petra's work at ampli Jan-Feb 2019

clusAssign.py 1.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  # An algorithm for assigning a dataset to pre-existing clusters
from pprint import pprint

import pandas as p

# Pre-existing aggregated clusters
clusfile = '../data/9-clusters.agg.pkl'
# A new dataset
ndsfile = '../data/2016-17-sample.pkl'


def assign_clusters(newdf, clusdf):
    """Assign each column of *newdf* to its best-correlated cluster.

    Both frames are wide and share the same row index: *newdf* has one
    column per ICP, *clusdf* one column per cluster.  Each ICP series is
    compared with every cluster series using Pearson correlation and goes
    to the cluster with the highest coefficient.

    An ICP whose series is constant has no defined correlation and is
    assigned the sentinel cluster -1.  The sentinel is also kept when no
    cluster yields a correlation greater than -1 (for example when every
    correlation is NaN).

    Returns a dict mapping each ICP column label to its cluster label.
    """
    clusters = list(clusdf.columns)
    clusdict = {}
    for i in newdf.columns:
        bestc = -1
        s = newdf.loc[:, i]
        if s.min() != s.max():
            bestr = -1
            for c in clusters:
                thisr = s.corr(clusdf.loc[:, c], method='pearson')
                # A NaN correlation compares False here and is skipped.
                if thisr > bestr:
                    bestr = thisr
                    bestc = c
            print('Assigning ICP {} to cluster {} with correlation {}.'.format(i, bestc, bestr))
        else:
            print('ICP {} has constant value; assigning to cluster -1'.format(i))
        clusdict[i] = bestc
    return clusdict


def cluster_table(clusdict):
    """Turn an {icp_id: cluster} dict into a frame with columns icp_id, cluster."""
    newclusdf = p.DataFrame.from_dict(clusdict, orient='index', columns=['cluster'])
    newclusdf.index.name = 'icp_id'
    return newclusdf.reset_index()


def annotate(longdf, newclusdf):
    """Attach each ICP's cluster to every row of the long-format readings.

    *longdf* must have an 'icp_id' column; *newclusdf* must have 'icp_id'
    and 'cluster' columns.  The result is indexed by icp_id.

    DataFrame.join aligns on the right frame's index, so the lookup table
    has to be keyed on icp_id.  Joining against its default RangeIndex
    matches nothing and leaves 'cluster' as NaN, which also forces the
    column to float.  With every ICP matched the integer dtype is kept.
    """
    lookup = newclusdf.set_index('icp_id')
    return longdf.set_index('icp_id').join(lookup)


if __name__ == '__main__':
    # NOTE: read_pickle can execute code stored in the file; only load
    # pickles that come from a trusted source.
    clusdf = p.read_pickle(clusfile)
    clusdf = clusdf.pivot(index='read_time', columns='cluster', values='kwh_tot_mean')
    # Clear the columns' name by assignment; `del clusdf.columns.name`
    # raises AttributeError on current pandas.
    clusdf.columns.name = None
    # info() prints its own report and returns None, so it is not wrapped
    # in print().
    clusdf.info()

    # Restrict the new dataset to the timestamps the clusters cover.
    newdf = p.read_pickle(ndsfile).pivot(index='read_time',
                                         columns='icp_id',
                                         values='kwh_tot').loc[clusdf.index, :]
    print(newdf)
    newdf.info()

    clusters = list(clusdf.columns)
    print(clusters)
    clusdict = assign_clusters(newdf, clusdf)

    newclusdf = cluster_table(clusdict)
    # print(newclusdf)
    # Wide (one column per ICP) -> long (one row per read_time and ICP).
    newdf = p.melt(newdf.reset_index(), 'read_time', var_name='icp_id', value_name='kwh')
    anndf = annotate(newdf, newclusdf)
    print(anndf)