# An algorithm for assigning each series in a dataset to pre-existing clusters
- import pandas as p
- import numpy as np
- from pprint import pprint
- from argparse import ArgumentParser
- from tqdm import tqdm
-
def AssignClusters(df, agg, threshold = -1):
    """Assign every column of ``df`` to its best-correlated cluster profile.

    ``agg`` is pivoted into one column per cluster (index ``read_time``,
    values ``kwh_tot_mean``).  Each column of ``df`` is then compared with
    every cluster column using the Pearson correlation, and the cluster with
    the highest correlation wins.

    Args:
        df: DataFrame with one column per series to classify.  It is
            restricted to the ``read_time`` labels present in ``agg`` via
            ``.loc``, so those labels must exist in its index.
        agg: Long-format DataFrame with the columns ``read_time``,
            ``cluster`` and ``kwh_tot_mean``.
        threshold: The best correlation must be strictly greater than this
            value for the series to be assigned; otherwise it is labelled -1.

    Returns:
        DataFrame with the columns ``icp_id`` (the column labels of ``df``)
        and ``cluster`` (the chosen cluster label, or -1 when no correlation
        exceeded ``threshold``).

    NOTE(review): series whose minimum equals their maximum (constant
    series) are skipped and therefore do not appear in the result.  The
    indentation of the original source was lost, so confirm that omitting
    them (rather than labelling them -1) is the intended behaviour.
    """
    profiles = agg.pivot(index = 'read_time', columns = 'cluster', values = 'kwh_tot_mean')
    # ``del profiles.columns.name`` fails on current pandas because
    # ``Index.name`` is a property without a deleter; resetting it to None
    # clears the axis label on every pandas version.
    profiles.columns.name = None
    df = df.loc[profiles.index, :]

    clusdict = {}

    clusters = list(profiles.columns)
    icps = list(df.columns)

    for i in tqdm(icps):
        s = df.loc[:, i]
        # The Pearson correlation is undefined for a constant series.
        if s.min() == s.max():
            continue

        bestc = -1
        bestr = -1
        for c in clusters:
            thisr = s.corr(profiles.loc[:, c], method = 'pearson')
            # A NaN correlation compares False here and is ignored.
            if thisr > bestr:
                bestr = thisr
                bestc = c

        clusdict[i] = bestc if bestr > threshold else -1

    newclusdf = p.DataFrame.from_dict(clusdict, orient = 'index', columns = ['cluster'])
    newclusdf.index.name = 'icp_id'
    return newclusdf.reset_index()
-
def main():
    """Command-line entry point.

    Parses the input, aggregate and output pickle paths plus an optional
    correlation threshold, validates that the threshold lies in [-1, 1),
    runs ``AssignClusters`` on the two loaded frames and pickles the result
    to the requested output path.
    """
    cli = ArgumentParser(description='Assign clusters found from one dataset to the values of another')
    cli.add_argument(
        "-i", "--input",
        dest="input",
        metavar="PATH",
        required = True,
        help = "input pickle path",
    )
    cli.add_argument(
        "-a", "--agg",
        dest="agg",
        metavar="PATH",
        required = True,
        help = "Aggregated dataset to take cluster information from",
    )
    cli.add_argument(
        "-c", "--clusters",
        dest="clusfile",
        metavar="PATH",
        required = True,
        help = "output cluster pickle path",
    )
    cli.add_argument(
        "-t", "--threshold",
        dest = "threshold",
        metavar = "NUM",
        type = np.float32,
        default = -1,
        help = "Set threshold for clustering; default = -1, range from -1 to 1",
    )
    opts = cli.parse_args()

    # Reject thresholds outside the half-open interval [-1, 1).
    below_range = opts.threshold < -1
    at_or_above_one = opts.threshold >= 1
    if below_range or at_or_above_one:
        cli.error("-t/--threshold must be at least -1 and less than 1, is {}".format(opts.threshold))

    # NOTE: unpickling can execute arbitrary code; only pass trusted files.
    series_frame = p.read_pickle(opts.input)
    cluster_frame = p.read_pickle(opts.agg)

    assignments = AssignClusters(series_frame, cluster_frame, opts.threshold)
    assignments.to_pickle(opts.clusfile)
-
-
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|