Repository for Petra's work at ampli Jan-Feb 2019

agg.py 2.1KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  1. # Aggregate given clusters
  2. from argparse import ArgumentParser
  3. import pandas as p
  4. from tqdm import tqdm
  5. def aggregator(widedf, clusdf):
  6. """Aggregate a (wide-form) dataframe by the cluster mappings in a second dataframe
  7. """
  8. clusters = clusdf['cluster'].unique()
  9. clusters.sort()
  10. dflis = []
  11. qlow = lambda x: x.quantile(0.250)
  12. qhigh = lambda x: x.quantile(0.750)
  13. for c in tqdm(clusters):
  14. icps = clusdf[clusdf.cluster == c].icp_id.unique()
  15. subdf = widedf[icps]
  16. aggmed = subdf.agg(func = 'median', axis = 1)
  17. aggmen = subdf.agg(func = 'mean', axis = 1)
  18. aggupq = subdf.agg(func = qlow, axis = 1)
  19. aggdwq = subdf.agg(func = qhigh, axis = 1)
  20. agged = p.DataFrame(data = {
  21. "cluster": c,
  22. "kwh_tot_median": aggmed,
  23. "kwh_tot_mean": aggmen,
  24. "kwh_tot_CI_low": aggupq,
  25. "kwh_tot_CI_high": aggdwq,
  26. }).reset_index()
  27. dflis.append(agged)
  28. adf = p.concat(dflis, axis = 0, ignore_index = True)
  29. return adf
  30. def main():
  31. parser = ArgumentParser(description='Aggregate dataframe by specified clusters')
  32. parser.add_argument("-i", "--input", dest="input", help = "input pickle path; default: ../data/2017-5k-wide.pkl", metavar="PATH", default = "../data/2017-5k-wide.pkl")
  33. parser.add_argument("-c", "--clusters", dest="clusfile", help = "cluster pickle path; default: ../data/5kclustable.pkl", metavar="PATH", default = "../data/5kclustable.pkl")
  34. parser.add_argument("-o", "--output", dest="output", help = "output pickle path; default: ../data/5k-ag.pkl", metavar="PATH", default = "../data/5k-ag.pkl")
  35. parser.add_argument("-p", "--pivot", dest = "istall", help = "input dataframe is in tall format and must be pivoted", action ="store_true")
  36. args = parser.parse_args()
  37. wd = p.read_pickle(args.input)
  38. cd = p.read_pickle(args.clusfile)
  39. if (args.istall):
  40. wd = wd.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
  41. agged = aggregator(wd, cd)
  42. agged.to_pickle(args.output)
  43. if __name__ == "__main__":
  44. main()