Browse Source

Make separate dcorr script

Petra Lamborn 5 years ago
parent
commit
060efd02d4
1 changed file with 44 additions and 0 deletions
  1. 44
    0
      py/dcorr.py

+ 44
- 0
py/dcorr.py View File

@@ -0,0 +1,44 @@
1
+# Takes the correlation part out of clustering file
2
+from argparse import ArgumentParser
3
+import numpy as np
4
+import pandas as p
5
+from tqdm import tqdm
6
+from itertools import combinations
7
+from math import factorial as f
8
+from pprint import pprint
9
+
10
def tqcorr(df):
    """Calculates a (distance) correlation matrix for wide-form dataframe, with progressbar

    The distance between two columns is ``1 - df[c1].corr(df[c2])``; the
    diagonal is set to 0. The result is a square dataframe indexed and
    labelled by the columns of ``df`` (allocated with dtype float16).
    A summary of the result frame is printed via ``DataFrame.info()`` after
    allocation, after filling the diagonal, and after filling the pairs.

    Frames with fewer than two columns are handled: there are simply no
    pairs to compute.
    """
    cols = df.columns
    ncols = len(cols)
    cdf = p.DataFrame(index=cols, columns=cols, dtype=np.float16)
    # info() writes its report itself and returns None, so it is called
    # directly rather than wrapped in print() (which would emit "None").
    cdf.info()
    for c in tqdm(cols):
        cdf.loc[c, c] = 0
    cdf.info()
    # Number of unordered column pairs (n choose 2), used only as the
    # progress bar total. The closed form is valid for ncols of 0 or 1,
    # where a factorial-based formula would be given a negative argument.
    ncomb = ncols * (ncols - 1) // 2
    for c1, c2 in tqdm(combinations(cols, 2), total=ncomb):
        dv = 1 - df[c1].corr(df[c2])
        # The matrix is symmetric: fill both triangles from one computation.
        cdf.loc[c1, c2] = dv
        cdf.loc[c2, c1] = dv
    cdf.info()
    return cdf
28
+
29
+
30
def createCorr(source, output):
    """Build a distance correlation matrix from a pickled wide-form dataframe.

    Reads the pickle at ``source``, keeps only the columns whose maximum
    differs from their minimum, passes the result to tqcorr() and pickles
    the returned matrix to ``output``.
    """
    wide = p.read_pickle(source)
    # Boolean mask per column: True where the column is not constant.
    varies = wide.max() != wide.min()
    kept = wide[wide.columns[varies]]
    tqcorr(kept).to_pickle(output)
37
+
38
+
39
if __name__ == "__main__":
    # Command-line entry point: parse the input/output pickle paths and
    # build the matrix. The help strings use argparse's %(default)s so the
    # advertised default always matches the actual default value.
    parser = ArgumentParser(description='Create distance correlation matrix from pickled wideform pandas dataframe')
    parser.add_argument("-i", "--input", dest="input",
                        help="input pickle path; default: %(default)s",
                        metavar="[PATH]", default="../data/2017-5k-wide.pkl")
    parser.add_argument("-o", "--output", dest="output",
                        help="output pickle path; default: %(default)s",
                        metavar="[PATH]", default="../data/fulldcorrmatrix.pkl")
    args = parser.parse_args()
    createCorr(args.input, args.output)