# Take future kWh data for the thousand previously sampled ICPs and
# calculate new aggregated measures (median and mean of kwh_tot) for each
# cluster at each read time.
#
# Note: This is now redundant with agg.py

import pandas as p

# Load new data
newdat = p.read_pickle("../data/2018-proj-sample.pkl")
# DataFrame.info() writes its report itself and returns None, so it is
# called directly rather than wrapped in print() (which would emit "None").
newdat.info()

# Get clusters dataframe
clusters = p.read_pickle("../data/9-clusters.pkl")
print(clusters)
clusters.info()

# Drop the per-reading columns and de-duplicate what is left, so the old
# 'read_time' / 'kwh_tot' values do not clash with the new ones in the join.
clusters = (
    clusters.drop(['read_time', 'kwh_tot'], axis=1)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Join dataframes on 'icp_id'. A left join keeps every row of `clusters`,
# even when the new data has no readings for that icp_id.
newdat = (
    clusters.set_index('icp_id')
    .join(newdat.set_index('icp_id'), how='left')
    .reset_index()
)

print(newdat)
newdat.info()

# Aggregate median and mean (only really want mean)
newagg = newdat.groupby(['read_time', 'cluster']).agg({
    'kwh_tot': ['median', 'mean'],
})
# Flatten the two-level (column, statistic) header into single names:
# 'kwh_tot_median' and 'kwh_tot_mean'. Iterating the MultiIndex yields the
# tuples directly, avoiding Index.ravel(), which is deprecated in newer pandas.
newagg.columns = ['_'.join(col) for col in newagg.columns]
newagg = newagg.reset_index()

print(newagg)
newagg.info()

# Save data
newagg.to_pickle("../data/9-proj-agg.pkl")