"""An algorithm for assigning a dataset to pre-existing clusters."""
import pandas as p
import numpy as np
from pprint import pprint
from argparse import ArgumentParser

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; keep working without it
    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is not installed: iterate silently."""
        return iterable

# Label written for a series that could not be matched to any cluster.
_UNASSIGNED = -1


def AssignClusters(df, agg, threshold=-1):
    """Assign each column of *df* to the best-correlated cluster profile.

    Args:
        df: Wide DataFrame whose index holds the read times and whose columns
            are the series to classify; the column labels become ``icp_id``
            in the result. It must contain every read time present in *agg*.
        agg: Long-format DataFrame with the columns ``read_time``,
            ``cluster`` and ``kwh_tot_mean``; it is pivoted into one profile
            column per cluster.
        threshold: A cluster is only assigned when the best Pearson
            correlation is strictly greater than this value.

    Returns:
        DataFrame with the columns ``icp_id`` and ``cluster``, one row per
        column of *df*. ``cluster`` is -1 when the series is constant (its
        correlation is undefined) or when no cluster correlates above
        *threshold*.
    """
    profiles = agg.pivot(index='read_time', columns='cluster',
                         values='kwh_tot_mean')
    # `del profiles.columns.name` raises AttributeError on current pandas
    # (Index.name is a property without a deleter); clear the name instead.
    profiles.columns.name = None

    # Only the read times that the cluster profiles cover are compared.
    df = df.loc[profiles.index, :]

    # Hoisted out of the loop: (label, profile series) for every cluster.
    cluster_profiles = list(profiles.items())

    assignments = {}
    for icp, series in tqdm(df.items(), total=df.shape[1]):
        best_cluster = _UNASSIGNED
        best_r = -1
        # A constant series has zero variance, so it is left unassigned
        # without computing any correlation.
        if series.min() != series.max():
            for cluster, profile in cluster_profiles:
                r = series.corr(profile, method='pearson')
                # A NaN correlation compares False and is skipped; on ties
                # the first cluster encountered wins.
                if r > best_r:
                    best_r = r
                    best_cluster = cluster
        assignments[icp] = best_cluster if best_r > threshold else _UNASSIGNED

    return p.DataFrame({
        'icp_id': list(assignments.keys()),
        'cluster': list(assignments.values()),
    })


def main():
    """Command-line entry point: read both pickles, assign, write the result."""
    parser = ArgumentParser(description='Assign clusters found from one dataset to the values of another')
    parser.add_argument("-i", "--input", dest="input",
                        help="input pickle path", metavar="PATH",
                        required=True)
    parser.add_argument("-a", "--agg", dest="agg",
                        help="Aggregated dataset to take cluster information from",
                        metavar="PATH", required=True)
    parser.add_argument("-c", "--clusters", dest="clusfile",
                        help="output cluster pickle path", metavar="PATH",
                        required=True)
    # Parsed as a plain float: np.float32 would round the cutoff before it
    # is compared with the float64 correlations.
    parser.add_argument("-t", "--threshold", dest="threshold",
                        help="Set threshold for clustering; default = -1, range from -1 to 1",
                        default=-1, metavar="NUM", type=float)
    args = parser.parse_args()

    # Written as a chained comparison so that NaN is rejected as well.
    if not -1 <= args.threshold < 1:
        parser.error("-t/--threshold must be at least -1 and less than 1, is {}".format(args.threshold))

    # NOTE: unpickling can execute arbitrary code; only pass trusted files.
    idf = p.read_pickle(args.input)
    adf = p.read_pickle(args.agg)

    cdf = AssignClusters(idf, adf, args.threshold)
    cdf.to_pickle(args.clusfile)


if __name__ == "__main__":
    main()