|
@@ -9,6 +9,7 @@ from scipy.spatial.distance import squareform
|
9
|
9
|
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
|
10
|
10
|
from tqdm import tqdm
|
11
|
11
|
from itertools import combinations
|
|
12
|
+from math import factorial as f
|
12
|
13
|
|
13
|
14
|
def tqcorr(df):
|
14
|
15
|
cols = df.columns
|
|
@@ -19,7 +20,8 @@ def tqcorr(df):
|
19
|
20
|
cdf.loc[c, c] = 0
|
20
|
21
|
print(cdf.info())
|
21
|
22
|
comb = combinations(cols, 2)
|
22
|
|
- for c1, c2 in tqdm(comb):
|
|
23
|
+ ncomb = f(ncols) // f(2) // f(ncols - 2)
|
|
24
|
+ for c1, c2 in tqdm(comb, total = ncomb):
|
23
|
25
|
dv = 1 - df[c1].corr(df[c2])
|
24
|
26
|
cdf.loc[c1, c2] = dv
|
25
|
27
|
cdf.loc[c2, c1] = dv
|
|
@@ -29,10 +31,10 @@ def tqcorr(df):
|
29
|
31
|
|
30
|
32
|
tqdm.pandas()
|
31
|
33
|
|
32
|
|
-Sourcedata = '../data/2017-all-wide.pkl'
|
33
|
|
-lableddata = '../data/9-clusters-all.pkl'
|
34
|
|
-aggdata = '../data/9-clusters-all-agg.pkl'
|
35
|
|
-clustertable = '../data/9-clusters-all-table.pkl'
|
|
34
|
+Sourcedata = '../data/2017-5k-wide.pkl'
|
|
35
|
+lableddata = '../data/9-clusters-5k.pkl'
|
|
36
|
+aggdata = '../data/9-clusters-5k-agg.pkl'
|
|
37
|
+clustertable = '../data/9-clusters-5k-table.pkl'
|
36
|
38
|
|
37
|
39
|
numclusts = 9
|
38
|
40
|
df = p.read_pickle(Sourcedata)
|