Browse Source

Ok, so this should make a correlation-distance matrix (1 - correlation) of everything

Unfortunately, it would take approximately a week on an r5.large, so
I'm going to work with a large sample instead
Petra Lamborn 5 years ago
parent
commit
4c3c2ff9d0
1 changed file with 6 additions and 4 deletions
  1. 6
    4
      py/clustering.py

+ 6
- 4
py/clustering.py View File

@@ -20,7 +20,9 @@ def tqcorr(df):
20 20
     print(cdf.info())
21 21
     comb = combinations(cols, 2)
22 22
     for c1, c2 in tqdm(comb):
23
-        cdf.loc[c1, c2] = 1 - df[c1].corr(df[c2])
23
+        dv = 1 - df[c1].corr(df[c2])
24
+        cdf.loc[c1, c2] = dv
25
+        cdf.loc[c2, c1] = dv
24 26
     print(cdf.info())
25 27
     return cdf
26 28
 
@@ -28,9 +30,9 @@ def tqcorr(df):
28 30
 tqdm.pandas()
29 31
 
30 32
 Sourcedata =   '../data/2017-all-wide.pkl'
31
-lableddata =   '../data/9-clusters.pkl'
32
-aggdata =      '../data/9-clusters.agg.pkl'
33
-clustertable = '../data/9-clusters-sample-table.pkl'
33
+lableddata =   '../data/9-clusters-all.pkl'
34
+aggdata =      '../data/9-clusters-all-agg.pkl'
35
+clustertable = '../data/9-clusters-all-table.pkl'
34 36
 
35 37
 numclusts = 9
36 38
 df = p.read_pickle(Sourcedata)