Repository for Petra's work at ampli Jan-Feb 2019

projprocess.py

# This file simply takes future kwh data for the thousand previously
# sampled ICPs and calculates new aggregated measures for each cluster
#
# Note: This is now redundant with agg.py
import pandas as pd

# Load new data
newdat = pd.read_pickle("../data/2018-proj-sample.pkl")
print(newdat.info())

# Get clusters dataframe
clusters = pd.read_pickle("../data/9-clusters.pkl")
print(clusters)
print(clusters.info())

# Keep one row per ICP with its cluster assignment
clusters = clusters.drop(['read_time', 'kwh_tot'],
                         axis=1).drop_duplicates().reset_index(drop=True)

# Join dataframes on icp_id
newdat = clusters.set_index('icp_id').join(newdat.set_index('icp_id'),
                                           how='left').reset_index()
print(newdat)
print(newdat.info())

# Aggregate median and mean (only really want mean)
newagg = newdat.groupby(['read_time', 'cluster']).agg({
    'kwh_tot': ['median', 'mean']
})

# Flatten the MultiIndex columns, e.g. ('kwh_tot', 'mean') -> 'kwh_tot_mean'
newagg.columns = ['_'.join(x) for x in newagg.columns.ravel()]
newagg = newagg.reset_index()
print(newagg)
print(newagg.info())

# Save data
newagg.to_pickle("../data/9-proj-agg.pkl")
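
As a reference, here is a minimal sketch of how the saved aggregate could be inspected downstream. It is not part of the repository: the relative path simply mirrors the one written above, and the pivot on kwh_tot_mean assumes the flattened column names this script produces.

# Hypothetical downstream check (not part of this repository): load the
# aggregate written by projprocess.py and view mean kwh per cluster over time.
import pandas as pd

agg = pd.read_pickle("../data/9-proj-agg.pkl")  # path as written above

# After the column flattening step, the columns are read_time, cluster,
# kwh_tot_median and kwh_tot_mean; pivot so each cluster becomes its own column.
mean_by_cluster = agg.pivot(index='read_time', columns='cluster',
                            values='kwh_tot_mean')
print(mean_by_cluster.head())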