Browse Source

OK, so this should make a correlation-distance matrix (1 - correlation) for every pair of columns

Unfortunately, it would take approximately a week on an r5.large, so
I'm going to work with a large sample instead
Petra Lamborn 5 years ago
parent
commit
4c3c2ff9d0
1 changed file with 6 additions and 4 deletions
  1. 6
    4
      py/clustering.py

+ 6
- 4
py/clustering.py View File

20
     print(cdf.info())
20
     print(cdf.info())
21
     comb = combinations(cols, 2)
21
     comb = combinations(cols, 2)
22
     for c1, c2 in tqdm(comb):
22
     for c1, c2 in tqdm(comb):
23
-        cdf.loc[c1, c2] = 1 - df[c1].corr(df[c2])
23
+        dv = 1 - df[c1].corr(df[c2])
24
+        cdf.loc[c1, c2] = dv
25
+        cdf.loc[c2, c1] = dv
24
     print(cdf.info())
26
     print(cdf.info())
25
     return cdf
27
     return cdf
26
 
28
 
28
 tqdm.pandas()
30
 tqdm.pandas()
29
 
31
 
30
 Sourcedata =   '../data/2017-all-wide.pkl'
32
 Sourcedata =   '../data/2017-all-wide.pkl'
31
-lableddata =   '../data/9-clusters.pkl'
32
-aggdata =      '../data/9-clusters.agg.pkl'
33
-clustertable = '../data/9-clusters-sample-table.pkl'
33
+lableddata =   '../data/9-clusters-all.pkl'
34
+aggdata =      '../data/9-clusters-all-agg.pkl'
35
+clustertable = '../data/9-clusters-all-table.pkl'
34
 
36
 
35
 numclusts = 9
37
 numclusts = 9
36
 df = p.read_pickle(Sourcedata)
38
 df = p.read_pickle(Sourcedata)