Browse Source

Algorithm for calculating correlation matrix for 5k

Petra Lamborn 5 years ago
parent
commit
e9a112ad75
1 changed file with 7 additions and 5 deletions
  1. 7
    5
      py/clustering.py

+ 7
- 5
py/clustering.py View File

@@ -9,6 +9,7 @@ from scipy.spatial.distance import squareform
9 9
 from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
10 10
 from tqdm import tqdm
11 11
 from itertools import combinations
12
+from math import factorial as f
12 13
 
13 14
 def tqcorr(df):
14 15
     cols = df.columns
@@ -19,7 +20,8 @@ def tqcorr(df):
19 20
         cdf.loc[c, c] = 0
20 21
     print(cdf.info())
21 22
     comb = combinations(cols, 2)
22
-    for c1, c2 in tqdm(comb):
23
+    ncomb = f(ncols) // f(2) // f(ncols - 2)
24
+    for c1, c2 in tqdm(comb, total = ncomb):
23 25
         dv = 1 - df[c1].corr(df[c2])
24 26
         cdf.loc[c1, c2] = dv
25 27
         cdf.loc[c2, c1] = dv
@@ -29,10 +31,10 @@ def tqcorr(df):
29 31
 
30 32
 tqdm.pandas()
31 33
 
32
-Sourcedata =   '../data/2017-all-wide.pkl'
33
-lableddata =   '../data/9-clusters-all.pkl'
34
-aggdata =      '../data/9-clusters-all-agg.pkl'
35
-clustertable = '../data/9-clusters-all-table.pkl'
34
+Sourcedata =   '../data/2017-5k-wide.pkl'
35
+lableddata =   '../data/9-clusters-5k.pkl'
36
+aggdata =      '../data/9-clusters-5k-agg.pkl'
37
+clustertable = '../data/9-clusters-5k-table.pkl'
36 38
 
37 39
 numclusts = 9
38 40
 df = p.read_pickle(Sourcedata)