# An algorithm for assigning a dataset to pre-existing clusters
import pandas as p
import numpy as np
from pprint import pprint
from argparse import ArgumentParser
from tqdm import tqdm

def AssignClusters(df, agg, threshold = -1):
    """Assign every column of *df* to the cluster profile it correlates with best.

    Args:
        df: DataFrame with one column per item to classify (the output labels
            these 'icp_id'). Its index must contain every 'read_time' value
            present in *agg*, because the rows are selected by label.
        agg: Long-format DataFrame with 'read_time', 'cluster' and
            'kwh_tot_mean' columns; it is pivoted to one column per cluster.
        threshold: The best Pearson correlation must be strictly greater than
            this value for the cluster to be assigned.

    Returns:
        DataFrame with columns 'icp_id' and 'cluster'. 'cluster' is -1 for
        constant columns, for columns with no usable correlation, and for
        columns whose best correlation does not exceed *threshold*.
    """
    agg = agg.pivot(index = 'read_time', columns = 'cluster', values = 'kwh_tot_mean')
    # Clear the columns-axis label left behind by pivot. `del agg.columns.name`
    # fails on current pandas (the property has no deleter); assigning None
    # works on every version.
    agg.columns.name = None
    # Restrict (and order) the rows to the timestamps the cluster profiles cover.
    df = df.loc[agg.index, :]

    clusdict = {}

    clusters = list(agg.columns)
    icps = list(df.columns)

    for i in tqdm(icps):
        # Reset both trackers for every column. Previously `bestr` was only set
        # inside the branch below, so a constant first column raised
        # UnboundLocalError and later constant columns reused a stale value.
        bestc = -1
        bestr = -1
        s = df.loc[:, i]
        # A constant series has zero variance, so its correlation is undefined.
        if s.min() != s.max():
            for c in clusters:
                thisr = s.corr(agg.loc[:, c], method = 'pearson')
                # A NaN correlation compares False here and is skipped.
                if thisr > bestr:
                    bestr = thisr
                    bestc = c
        clusdict[i] = bestc if bestr > threshold else -1

    newclusdf = p.DataFrame.from_dict(clusdict, orient = 'index', columns = ['cluster'])
    newclusdf.index.name = 'icp_id'
    newclusdf = newclusdf.reset_index()
    return newclusdf

def main():
    """Command-line entry point.

    Parses the arguments, loads the input and aggregated pickles, assigns a
    cluster to every column of the input via AssignClusters, and writes the
    resulting table to the output pickle path.
    """
    parser = ArgumentParser(description='Assign clusters found from one dataset to the values of another')
    parser.add_argument("-i", "--input",  dest="input",      help = "input pickle path",  metavar="PATH", required = True)
    parser.add_argument("-a", "--agg", dest="agg",     help = "Aggregated dataset to take cluster information from", metavar="PATH", required = True)
    parser.add_argument("-c", "--clusters", dest="clusfile", help = "output cluster pickle path", metavar="PATH", required = True)
    parser.add_argument("-t", "--threshold", dest = "threshold", help = "Set threshold for clustering; default = -1, range from -1 to 1", default = -1, metavar = "NUM", type = np.float32)
    args = parser.parse_args()
    # Negated chained comparison rather than `< -1 or >= 1`: NaN compares False
    # against everything, so the old form accepted `-t nan`, which then made
    # every column fall through to cluster -1.
    if not (-1 <= args.threshold < 1):
        parser.error("-t/--threshold must be at least -1 and less than 1, is {}".format(args.threshold))

    # NOTE(security): unpickling can execute arbitrary code; only pass pickle
    # files that come from a trusted source.
    idf = p.read_pickle(args.input)
    adf = p.read_pickle(args.agg)

    cdf = AssignClusters(idf, adf, args.threshold)
    cdf.to_pickle(args.clusfile)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()