|
@@ -0,0 +1,44 @@
|
|
1
|
+# Takes the correlation part out of clustering file
|
|
2
|
+from argparse import ArgumentParser
|
|
3
|
+import numpy as np
|
|
4
|
+import pandas as p
|
|
5
|
+from tqdm import tqdm
|
|
6
|
+from itertools import combinations
|
|
7
|
+from math import factorial as f
|
|
8
|
+from pprint import pprint
|
|
9
|
+
|
|
10
|
def tqcorr(df):
    """Build a pairwise correlation-distance matrix for a wide-form dataframe.

    Every off-diagonal entry is ``1 - corr(col_a, col_b)``, where ``corr`` is
    pandas' ``Series.corr`` with its default method; the diagonal is 0.
    A tqdm progress bar tracks the pairwise loop, and a summary of the
    finished frame is written to stdout via ``DataFrame.info()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide-form data; each column is one variable to correlate.

    Returns
    -------
    pandas.DataFrame
        Square, symmetric float16 frame indexed and labelled by ``df.columns``.
        Pairs whose correlation is undefined are left as NaN. A frame with
        zero or one column yields a 0x0 or 1x1 result instead of raising.
    """
    cols = df.columns
    ncols = len(cols)

    # Accumulate into a plain float16 buffer and wrap it once at the end:
    # per-cell ``.loc`` writes are slow and their dtype handling for float64
    # values going into a float16 frame depends on the pandas version.
    dist = np.full((ncols, ncols), np.nan, dtype=np.float16)
    np.fill_diagonal(dist, 0)

    # "n choose 2" in closed form; the factorial formula raised ValueError
    # for fewer than two columns and built huge integers for wide frames.
    ncomb = ncols * (ncols - 1) // 2

    # Work by position so that the buffer indices and the columns line up.
    for i, j in tqdm(combinations(range(ncols), 2), total=ncomb):
        dv = 1 - df.iloc[:, i].corr(df.iloc[:, j])
        dist[i, j] = dv
        dist[j, i] = dv

    cdf = p.DataFrame(dist, index=cols, columns=cols)
    # info() prints by itself and returns None, so it must not be wrapped
    # in print() (that emitted a stray "None" line).
    cdf.info()
    return cdf
|
|
28
|
+
|
|
29
|
+
|
|
30
|
def createCorr(source, output):
    """Read a pickled wide-form dataframe, run tqcorr() on it and pickle the result.

    source -- path of the pickle to read
    output -- path the resulting matrix is pickled to
    """
    # NOTE(review): read_pickle should only be pointed at trusted files.
    frame = p.read_pickle(source)

    # Keep only the columns whose maximum and minimum differ.
    varies = frame.max() != frame.min()
    frame = frame[frame.columns[varies]]

    tqcorr(frame).to_pickle(output)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
if __name__ == "__main__":
    # Command-line entry point: parse the input/output pickle paths and run
    # the whole pipeline. The help strings use argparse's %(default)s
    # placeholder so the advertised default always matches the real one
    # (the --output help previously named a different file than its default).
    parser = ArgumentParser(description='Create distance correlation matrix from pickled wideform pandas dataframe')
    parser.add_argument(
        "-i", "--input",
        dest="input",
        help="input pickle path; default: %(default)s",
        metavar="[PATH]",
        default="../data/2017-5k-wide.pkl",
    )
    parser.add_argument(
        "-o", "--output",
        dest="output",
        help="output pickle path; default: %(default)s",
        metavar="[PATH]",
        default="../data/fulldcorrmatrix.pkl",
    )
    args = parser.parse_args()
    createCorr(args.input, args.output)
|