Repository for Petra's work at ampli Jan-Feb 2019

clusAssign.py 2.1KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
# An algorithm for assigning a dataset to pre-existing clusters
from argparse import ArgumentParser
from pprint import pprint

import numpy as np
import pandas as p
from tqdm import tqdm
  7. def AssignClusters(df, agg, threshold = -1):
  8. agg = agg.pivot(index = 'read_time', columns = 'cluster', values = 'kwh_tot_mean')
  9. del agg.columns.name
  10. df = df.loc[agg.index, :]
  11. clusdict = {}
  12. clusters = list(agg.columns)
  13. icps = list(df.columns)
  14. for i in tqdm(icps):
  15. bestc = -1
  16. s = df.loc[:, i]
  17. if (s.min() != s.max()):
  18. bestr = -1
  19. for c in clusters:
  20. thisr = s.corr(agg.loc[:, c], method = 'pearson')
  21. if thisr > bestr:
  22. bestr = thisr
  23. bestc = c
  24. if bestr > threshold:
  25. clusdict[i] = bestc
  26. else:
  27. clusdict[i] = -1
  28. newclusdf = p.DataFrame.from_dict(clusdict, orient = 'index', columns = ['cluster'])
  29. newclusdf.index.name = 'icp_id'
  30. newclusdf = newclusdf.reset_index()
  31. return newclusdf
  32. def main():
  33. parser = ArgumentParser(description='Assign clusters found from one dataset to the values of another')
  34. parser.add_argument("-i", "--input", dest="input", help = "input pickle path", metavar="PATH", required = True)
  35. parser.add_argument("-a", "--agg", dest="agg", help = "Aggregated dataset to take cluster information from", metavar="PATH", required = True)
  36. parser.add_argument("-c", "--clusters", dest="clusfile", help = "output cluster pickle path", metavar="PATH", required = True)
  37. parser.add_argument("-t", "--threshold", dest = "threshold", help = "Set threshold for clustering; default = -1, range from -1 to 1", default = -1, metavar = "NUM", type = np.float32)
  38. args = parser.parse_args()
  39. if args.threshold < -1 or args.threshold >= 1:
  40. parser.error("-t/--threshold must be at least -1 and less than 1, is {}".format(args.threshold))
  41. idf = p.read_pickle(args.input)
  42. adf = p.read_pickle(args.agg)
  43. cdf = AssignClusters(idf, adf, args.threshold)
  44. cdf.to_pickle(args.clusfile)
  45. if __name__ == "__main__":
  46. main()