Repository for Petra's work at ampli Jan-Feb 2019

projprocess.py 990B

12345678910111213141516171819202122232425262728293031323334
  1. # This file simply takes future kwh data for the thousand previously
  2. # sampled ICPs and calculates new aggregated measures for each cluster
  3. import pandas as p
  4. # Load new data
  5. newdat = p.read_pickle("../data/2018-proj-sample.pkl")
  6. print(newdat.info())
  7. # Get cluseters dataframe
  8. clusters = p.read_pickle("../data/9-clusters.pkl")
  9. print(clusters)
  10. print(clusters.info())
  11. clusters = clusters.drop(['read_time', 'kwh_tot'],
  12. axis = 1).drop_duplicates().reset_index(drop = True)
  13. # Join dataframes
  14. newdat = clusters.set_index('icp_id').join(newdat.set_index('icp_id'), how = 'left').reset_index()
  15. print(newdat)
  16. print(newdat.info())
  17. # Aggregate median and mean (only really want mean)
  18. newagg = newdat.groupby(['read_time', 'cluster']).agg({
  19. 'kwh_tot': ['median', 'mean']
  20. })
  21. newagg.columns = ['_'.join(x) for x in newagg.columns.ravel()]
  22. newagg = newagg.reset_index()
  23. print(newagg)
  24. print(newagg.info())
  25. # Save data
  26. newagg.to_pickle("../data/9-proj-agg.pkl")