Repository for Petra's work at ampli Jan-Feb 2019

dcorr.py 1.6KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344
  1. # Takes the correlation part out of clustering file
  2. from argparse import ArgumentParser
  3. import numpy as np
  4. import pandas as p
  5. from tqdm import tqdm
  6. from itertools import combinations
  7. from math import factorial as f
  8. from pprint import pprint
  def tqcorr(df):
      """Build a correlation-distance matrix for a wide-form dataframe.

      For every unordered pair of columns the value
      ``1 - df[c1].corr(df[c2])`` is stored symmetrically in the result;
      the diagonal is set to 0. Progress bars are shown via tqdm, and
      ``DataFrame.info()`` of the result is printed after allocation,
      after the diagonal pass and after the pairwise pass.

      Args:
          df: wide-form pandas DataFrame, one column per variable.

      Returns:
          Square DataFrame (created with dtype float16) whose index and
          columns are both ``df.columns``.
      """
      cols = df.columns
      ncols = len(cols)
      # NOTE(review): float16 was presumably chosen to keep the full matrix
      # small in memory; it limits the precision of the stored distances.
      cdf = p.DataFrame(index=cols, columns=cols, dtype=np.float16)
      # info() prints its own report and returns None, so wrapping it in
      # print() only added a stray "None" line to the output.
      cdf.info()
      for c in tqdm(cols):
          cdf.loc[c, c] = 0
      cdf.info()
      # Number of unordered column pairs ("n choose 2") in closed form.
      # The factorial formula raised ValueError for fewer than two columns
      # and built enormous integers for wide frames.
      ncomb = ncols * (ncols - 1) // 2
      for c1, c2 in tqdm(combinations(cols, 2), total=ncomb):
          dv = 1 - df[c1].corr(df[c2])
          # The distance is symmetric: fill both triangles.
          cdf.loc[c1, c2] = dv
          cdf.loc[c2, c1] = dv
      cdf.info()
      return cdf
  def createCorr(source, output):
      """Turn a pickled wide-form dataframe into a pickled distance matrix.

      Reads the dataframe from ``source``, keeps only the columns whose
      maximum differs from their minimum, passes the remainder to
      ``tqcorr()`` and pickles the returned matrix to ``output``.
      """
      frame = p.read_pickle(source)
      # Columns where max == min carry no variation and are dropped.
      varying = frame.max() != frame.min()
      kept_columns = frame.columns[varying]
      distances = tqcorr(frame[kept_columns])
      distances.to_pickle(output)
  if __name__ == "__main__":
      # Command-line entry point: parse the input/output pickle paths and
      # run the conversion. Help texts use argparse's %(default)s so the
      # advertised default can never drift from the real one.
      parser = ArgumentParser(description='Create distance correlation matrix from pickled wideform pandas dataframe')
      parser.add_argument(
          "-i", "--input", dest="input", metavar="[PATH]",
          default="../data/2017-5k-wide.pkl",
          help="input pickle path; default: %(default)s",
      )
      parser.add_argument(
          "-o", "--output", dest="output", metavar="[PATH]",
          default="../data/fulldcorrmatrix.pkl",
          # The help used to claim ../data/5kdcorrmatrix.pkl, which did not
          # match the default actually in effect.
          help="output pickle path; default: %(default)s",
      )
      args = parser.parse_args()
      createCorr(args.input, args.output)