# Takes the correlation part out of the clustering file.
"""Build a pairwise correlation-distance matrix from a pickled DataFrame.

The distance between two columns is ``1 - r`` where ``r`` is the value
returned by ``pandas.Series.corr`` (Pearson by default), so perfectly
correlated columns are at distance 0 and perfectly anti-correlated columns
are at distance 2.
"""
from argparse import ArgumentParser
from itertools import combinations
from math import factorial as f  # no longer used here; kept so the module namespace is unchanged

import numpy as np
import pandas as p

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; run without it
    def tqdm(iterable, **_kwargs):
        """Pass-through stand-in used when tqdm is not installed."""
        return iterable


def tqcorr(df):
    """Calculate a correlation-distance matrix for a wide-form dataframe,
    showing a progress bar over the column pairs.

    Args:
        df: wide-form ``pandas.DataFrame``; every column is one series to
            compare against every other column.

    Returns:
        A square ``float16`` DataFrame indexed and labelled by
        ``df.columns``. Cell ``(a, b)`` holds ``1 - df[a].corr(df[b])``;
        the diagonal is 0. A pair whose correlation is undefined yields NaN.
        Frames with zero or one column are handled (empty / 1x1 result).
    """
    cols = df.columns
    ncols = len(cols)

    # Fill a plain NumPy buffer and wrap it once at the end: per-cell .loc
    # writes are slow, and writing float64 values into float16 columns is an
    # incompatible-dtype assignment on recent pandas. Zero-initialising also
    # gives the diagonal (distance of a column to itself) for free.
    dist = np.zeros((ncols, ncols), dtype=np.float16)

    # Pull the columns out by position once, so duplicate labels are fine.
    series = [df.iloc[:, i] for i in range(ncols)]

    # C(ncols, 2) without factorials; evaluates to 0 for ncols < 2.
    ncomb = ncols * (ncols - 1) // 2
    for i, j in tqdm(combinations(range(ncols), 2), total=ncomb):
        dv = 1 - series[i].corr(series[j])
        # The matrix is symmetric, so mirror each value.
        dist[i, j] = dv
        dist[j, i] = dv

    return p.DataFrame(dist, index=cols, columns=cols)


def createCorr(source, output, piv):
    """Load a pickled dataframe from source, run tqcorr() on it and pickle
    the resulting matrix to output.

    Args:
        source: path of the input pickle.
        output: path the distance matrix is pickled to.
        piv: when true, the input is in tall format and is pivoted to wide
            form first (index ``read_time``, columns ``icp_id``, values
            ``kwh_tot``).
    """
    # NOTE: unpickling can execute arbitrary code; only use trusted files.
    df = p.read_pickle(source)
    if piv:
        df = df.pivot(index='read_time', columns='icp_id', values='kwh_tot')

    # Drop columns whose max equals their min (constant columns): their
    # correlation with anything is undefined.
    varying = df.columns[df.max() != df.min()]
    df = df[varying]

    cmat = tqcorr(df)
    cmat.to_pickle(output)


def main(argv=None):
    """Parse command-line arguments and run createCorr().

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = ArgumentParser(
        description='Create distance correlation matrix from pickled wideform pandas dataframe')
    parser.add_argument("-i", "--input", dest="input",
                        help="input pickle path", metavar="PATH",
                        required=True)
    parser.add_argument("-o", "--output", dest="output",
                        help="output pickle path", metavar="PATH",
                        required=True)
    parser.add_argument("-p", "--pivot", dest="istall",
                        help="input dataframe is in tall format and must be pivoted",
                        action="store_true")
    args = parser.parse_args(argv)
    createCorr(args.input, args.output, args.istall)


if __name__ == "__main__":
    main()