Repository for Petra's work at ampli Jan-Feb 2019

dcorr.py 1.7KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. # Takes the correlation part out of clustering file
  2. from argparse import ArgumentParser
  3. import numpy as np
  4. import pandas as p
  5. from tqdm import tqdm
  6. from itertools import combinations
  7. from math import factorial as f
  8. def tqcorr(df):
  9. """Calculates a (distance) correlation matrix for wide-form dataframe, with progressbar
  10. """
  11. cols = df.columns
  12. ncols = len(cols)
  13. cdf = p.DataFrame(index = cols, columns = cols, dtype = np.float16)
  14. for c in tqdm(cols):
  15. cdf.loc[c, c] = 0
  16. comb = combinations(cols, 2)
  17. ncomb = f(ncols) // f(2) // f(ncols - 2)
  18. for c1, c2 in tqdm(comb, total = ncomb):
  19. dv = 1 - df[c1].corr(df[c2])
  20. cdf.loc[c1, c2] = dv
  21. cdf.loc[c2, c1] = dv
  22. return cdf
  23. def createCorr(source, output, piv):
  24. """Load a pkl in wide form from source, process, run tqcorr() and save response to output
  25. """
  26. df = p.read_pickle(source)
  27. if piv:
  28. df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
  29. df = df[df.columns[df.max() != df.min()]]
  30. cmat = tqcorr(df)
  31. cmat.to_pickle(output)
  32. if __name__ == "__main__":
  33. parser = ArgumentParser(description='Create distance correlation matrix from pickled wideform pandas dataframe')
  34. parser.add_argument("-i", "--input", dest="input", help = "input pickle path", metavar="PATH", required = True)
  35. parser.add_argument("-o", "--output", dest="output", help = "output pickle path", metavar="PATH", required = True)
  36. parser.add_argument("-p", "--pivot", dest = "istall", help = "input dataframe is in tall format and must be pivoted", action ="store_true")
  37. args = parser.parse_args()
  38. createCorr(args.input, args.output, args.istall)