Repository for Petra's work at ampli Jan-Feb 2019

dcorr.py 1.6KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. # Correlation step, taken out of the clustering file into its own script
  2. from argparse import ArgumentParser
  3. import numpy as np
  4. import pandas as p
  5. from tqdm import tqdm
  6. from itertools import combinations
  7. from math import factorial as f
  8. def tqcorr(df):
  9. """Calculates a (distance) correlation matrix for wide-form dataframe, with progressbar
  10. """
  11. cols = df.columns
  12. ncols = len(cols)
  13. cdf = p.DataFrame(index = cols, columns = cols, dtype = np.float16)
  14. print(cdf.info())
  15. for c in tqdm(cols):
  16. cdf.loc[c, c] = 0
  17. print(cdf.info())
  18. comb = combinations(cols, 2)
  19. ncomb = f(ncols) // f(2) // f(ncols - 2)
  20. for c1, c2 in tqdm(comb, total = ncomb):
  21. dv = 1 - df[c1].corr(df[c2])
  22. cdf.loc[c1, c2] = dv
  23. cdf.loc[c2, c1] = dv
  24. print(cdf.info())
  25. return cdf
  26. def createCorr(source, output):
  27. """Load a pkl in wide form from source, process, run tqcorr() and save response to output
  28. """
  29. df = p.read_pickle(source)
  30. df = df[df.columns[df.max() != df.min()]]
  31. cmat = tqcorr(df)
  32. cmat.to_pickle(output)
  33. if __name__ == "__main__":
  34. parser = ArgumentParser(description='Create distance correlation matrix from pickled wideform pandas dataframe')
  35. parser.add_argument("-i", "--input", dest="input", help = "input pickle path; default: ../data/2017-5k-wide.pkl", metavar="[PATH]", default = "../data/2017-5k-wide.pkl")
  36. parser.add_argument("-o", "--output", dest="output", help = "output pickle path; default: ../data/5kdcorrmatrix.pkl", metavar="[PATH]", default = "../data/5kdcorrmatrix.pkl")
  37. args = parser.parse_args()
  38. createCorr(args.input, args.output)