# This file simply takes future kwh data for the thousand previously
# sampled ICPs and calculates new aggregated measures for each cluster
import pandas as p


def _attach_clusters(clusters: p.DataFrame, newdat: p.DataFrame) -> p.DataFrame:
    """Left-join the new readings onto the de-duplicated cluster rows.

    The 'read_time' and 'kwh_tot' columns are dropped from ``clusters`` and
    the remaining rows are de-duplicated before joining on 'icp_id'.
    Because the join is a left join from the cluster side, an ICP that has
    no row in ``newdat`` is kept with missing values in the new columns.
    """
    lookup = (
        clusters.drop(columns=['read_time', 'kwh_tot'])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    joined = lookup.set_index('icp_id').join(
        newdat.set_index('icp_id'), how='left'
    )
    return joined.reset_index()


def _aggregate_by_cluster(newdat: p.DataFrame) -> p.DataFrame:
    """Return median and mean 'kwh_tot' per ('read_time', 'cluster') group.

    The two-level result columns are flattened to 'kwh_tot_median' and
    'kwh_tot_mean', and the group keys are restored as ordinary columns.
    """
    # Aggregate median and mean (only really want mean)
    newagg = newdat.groupby(['read_time', 'cluster']).agg({
        'kwh_tot': ['median', 'mean'],
    })
    # Iterating the MultiIndex yields (column, statistic) tuples directly,
    # so no intermediate ``.ravel()`` call is needed to flatten the names.
    newagg.columns = ['_'.join(x) for x in newagg.columns]
    return newagg.reset_index()


def main(
    new_data_path: str = "../data/2018-proj-sample.pkl",
    clusters_path: str = "../data/9-clusters.pkl",
    output_path: str = "../data/9-proj-agg.pkl",
) -> None:
    """Load the new sample and cluster frames, aggregate, and save.

    Args:
        new_data_path: Pickled DataFrame holding the new readings.
        clusters_path: Pickled DataFrame holding the cluster rows.
        output_path: Destination pickle for the aggregated frame.

    Diagnostic summaries of each intermediate frame are printed along the
    way.
    """
    # NOTE: read_pickle can execute arbitrary code while loading; only use
    # it on files from a trusted source.

    # Load new data
    newdat = p.read_pickle(new_data_path)
    # DataFrame.info() prints its own report and returns None, so it is
    # called directly rather than wrapped in print() (which would emit a
    # stray "None" line).
    newdat.info()

    # Get clusters dataframe
    clusters = p.read_pickle(clusters_path)
    print(clusters)
    clusters.info()

    # Join dataframes
    newdat = _attach_clusters(clusters, newdat)
    print(newdat)
    newdat.info()

    newagg = _aggregate_by_cluster(newdat)
    print(newagg)
    newagg.info()

    # Save data
    newagg.to_pickle(output_path)


if __name__ == "__main__":
    main()