|
@@ -36,7 +36,7 @@ from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
|
36
|
36
|
# plt.show()
|
37
|
37
|
|
38
|
38
|
numclusts = 9
|
39
|
|
-df = p.read_pickle('../data/2017-sample.pkl')
|
|
39
|
+df = p.read_pickle('../data/2016-17-sample.pkl')
|
40
|
40
|
dforig = df
|
41
|
41
|
|
42
|
42
|
print(df.info())
|
|
@@ -81,36 +81,36 @@ print(mdagg.info())
|
81
|
81
|
print(mdagg.describe())
|
82
|
82
|
# mdf.to_csv('~/windows/Documents/clusters-ward.csv')
|
83
|
83
|
print("Saving")
|
84
|
|
-mdf.to_pickle('../data/9-clusters.pkl')
|
85
|
|
-mdagg.to_pickle('../data/9-clusters.agg.pkl')
|
|
84
|
+mdf.to_pickle('../data/9-clusters-1617.pkl')
|
|
85
|
+mdagg.to_pickle('../data/9-clusters-1617.agg.pkl')
|
86
|
86
|
print("saved")
|
87
|
87
|
|
88
|
88
|
# Algorithm via
|
89
|
89
|
# <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
|
90
|
|
-# ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
|
91
|
|
-# link_cols = {}
|
92
|
|
-# for i, i12 in enumerate(lobj[:,:2].astype(int)):
|
93
|
|
-# c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
|
94
|
|
-# for x in i12)
|
95
|
|
-# link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
|
96
|
|
-#
|
97
|
|
-# plt.figure(figsize = (25, 10))
|
98
|
|
-# plt.title('ICP Clustering Dendrogram')
|
99
|
|
-# plt.xlabel('ICP ID/(Number of ICPs)')
|
100
|
|
-# plt.ylabel('distance')
|
101
|
|
-# dendrogram(
|
102
|
|
-# lobj,
|
103
|
|
-# labels = cmat.index.values,
|
104
|
|
-# leaf_rotation=90,
|
105
|
|
-# leaf_font_size=8,
|
106
|
|
-# # show_leaf_counts = True,
|
107
|
|
-# # truncate_mode = 'lastp',
|
108
|
|
-# # p = 50,
|
109
|
|
-# # show_contracted = True,
|
110
|
|
-# link_color_func = lambda x: link_cols[x],
|
111
|
|
-# color_threshold = None
|
112
|
|
-# )
|
113
|
|
-# plt.show()
|
|
90
|
+ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
|
|
91
|
+link_cols = {}
|
|
92
|
+for i, i12 in enumerate(lobj[:,:2].astype(int)):
|
|
93
|
+ c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
|
|
94
|
+ for x in i12)
|
|
95
|
+ link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
|
|
96
|
+
|
|
97
|
+plt.figure(figsize = (25, 10))
|
|
98
|
+plt.title('ICP Clustering Dendrogram')
|
|
99
|
+plt.xlabel('ICP ID/(Number of ICPs)')
|
|
100
|
+plt.ylabel('distance')
|
|
101
|
+dendrogram(
|
|
102
|
+ lobj,
|
|
103
|
+ labels = cmat.index.values,
|
|
104
|
+ leaf_rotation=90,
|
|
105
|
+ leaf_font_size=8,
|
|
106
|
+ # show_leaf_counts = True,
|
|
107
|
+ # truncate_mode = 'lastp',
|
|
108
|
+ # p = 50,
|
|
109
|
+ # show_contracted = True,
|
|
110
|
+ link_color_func = lambda x: link_cols[x],
|
|
111
|
+ color_threshold = None
|
|
112
|
+)
|
|
113
|
+plt.show()
|
114
|
114
|
|
115
|
115
|
# sns.set()
|
116
|
116
|
#
|