Browse Source

Algorithm for calculating correlation matrix for 5k

Petra Lamborn 5 years ago
parent
commit
e9a112ad75
1 changed file with 7 additions and 5 deletions
  1. +7 -5 py/clustering.py

+ 7
- 5
py/clustering.py View File

9
 from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
9
 from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
10
 from tqdm import tqdm
10
 from tqdm import tqdm
11
 from itertools import combinations
11
 from itertools import combinations
12
+from math import factorial as f
12
 
13
 
13
 def tqcorr(df):
14
 def tqcorr(df):
14
     cols = df.columns
15
     cols = df.columns
19
         cdf.loc[c, c] = 0
20
         cdf.loc[c, c] = 0
20
     print(cdf.info())
21
     print(cdf.info())
21
     comb = combinations(cols, 2)
22
     comb = combinations(cols, 2)
22
-    for c1, c2 in tqdm(comb):
23
+    ncomb = f(ncols) // f(2) // f(ncols - 2)
24
+    for c1, c2 in tqdm(comb, total = ncomb):
23
         dv = 1 - df[c1].corr(df[c2])
25
         dv = 1 - df[c1].corr(df[c2])
24
         cdf.loc[c1, c2] = dv
26
         cdf.loc[c1, c2] = dv
25
         cdf.loc[c2, c1] = dv
27
         cdf.loc[c2, c1] = dv
29
 
31
 
30
 tqdm.pandas()
32
 tqdm.pandas()
31
 
33
 
32
-Sourcedata =   '../data/2017-all-wide.pkl'
33
-lableddata =   '../data/9-clusters-all.pkl'
34
-aggdata =      '../data/9-clusters-all-agg.pkl'
35
-clustertable = '../data/9-clusters-all-table.pkl'
34
+Sourcedata =   '../data/2017-5k-wide.pkl'
35
+lableddata =   '../data/9-clusters-5k.pkl'
36
+aggdata =      '../data/9-clusters-5k-agg.pkl'
37
+clustertable = '../data/9-clusters-5k-table.pkl'
36
 
38
 
37
 numclusts = 9
39
 numclusts = 9
38
 df = p.read_pickle(Sourcedata)
40
 df = p.read_pickle(Sourcedata)