Repository for Petra's work at ampli Jan-Feb 2019

clusAssign.py 2.1KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. # An algorithm for assigning a dataset to pre-existing clusters
  2. import pandas as p
  3. from pprint import pprint
  4. # Pre-existing aggregated clusters
  5. clusfile = '../data/9-clusters.agg.pkl'
  6. # A new dataset
  7. ndsfile = '../data/2016-17-sample.pkl'
  8. # Table of assigned clusters
  9. aclusfile = '../data/1617-asgn-table.pkl'
  10. # Aggregated dataset
  11. aggfile = '../data/1617-agg.pkl'
  12. clusdf = p.read_pickle(clusfile)
  13. clusdf = clusdf.pivot(index = 'read_time', columns = 'cluster', values = 'kwh_tot_mean')
  14. del clusdf.columns.name
  15. print(clusdf.info())
  16. newdf = p.read_pickle(ndsfile).pivot(index = 'read_time',
  17. columns = 'icp_id',
  18. values = 'kwh_tot').loc[clusdf.index, :]
  19. print(newdf)
  20. print(newdf.info())
  21. clusdict = {}
  22. clusters = list(clusdf.columns)
  23. icps = list(newdf.columns)
  24. print(clusters)
  25. for i in icps:
  26. bestc = -1
  27. s = newdf.loc[:, i]
  28. if (s.min() != s.max()):
  29. bestr = -1
  30. for c in clusters:
  31. thisr = s.corr(clusdf.loc[:, c], method = 'pearson')
  32. if thisr > bestr:
  33. bestr = thisr
  34. bestc = c
  35. print('Assigning ICP {} to cluster {} with correlation {}.'.format(i, bestc, bestr))
  36. else:
  37. print('ICP {} has constant value; assigning to cluster -1'.format(i))
  38. clusdict[i] = bestc
  39. newclusdf = p.DataFrame.from_dict(clusdict, orient = 'index', columns = ['cluster'])
  40. newclusdf.index.name = 'icp_id'
  41. newclusdf = newclusdf.reset_index()
  42. print(newclusdf)
  43. newclusdf.to_pickle(aclusfile)
  44. newdf = p.melt(newdf.reset_index(), 'read_time', var_name = 'icp_id', value_name = 'kwh')
  45. print(newdf.info())
  46. print(newclusdf.info())
  47. anndf = newdf.set_index('icp_id').join(newclusdf.set_index('icp_id')).reset_index()
  48. print(anndf)
  49. qlow = lambda x: x.quantile(0.250)
  50. qhigh = lambda x: x.quantile(0.750)
  51. newagg = anndf.groupby(['read_time', 'cluster']).agg({
  52. 'kwh': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
  53. })
  54. newagg.columns = ['_tot_'.join(x) for x in newagg.columns.ravel()]
  55. newagg = newagg.reset_index()
  56. print(newagg)
  57. newagg.to_pickle(aggfile)