5 years ago · 6231929972
--- a/py/clustering.py
+++ b/py/clustering.py
@@ -7,101 +7,118 @@ import matplotlib.pyplot as plt
 
				 import seaborn as sns
			
 
				 from scipy.spatial.distance import squareform
			
 
				 from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
			
 
				+from tqdm import tqdm
			
 
				 
			
 
				-Sourcedata =   '../data/2017-sample.pkl'
			
 
				+def tqcorr(df):
			
 
				+    cols = df.columns
			
 
				+    ncols = len(cols)
			
 
				+    cdf = p.DataFrame(index = cols, columns = cols)
			
 
				+    for c in tqdm(range(ncols)):
			
 
				+        cind = cols[c]
			
 
				+        cdf.loc[cind, cind] = 1
			
 
				+        # for i in range(c + 1, ncols):
			
 
				+        #     print("ZZZ")
			
 
				+    return cdf
			
 
				+
			
 
				+
			
 
				+tqdm.pandas()
			
 
				+
			
 
				+Sourcedata =   '../data/2017-all-wide.pkl'
			
 
				 lableddata =   '../data/9-clusters.pkl'
			
 
				 aggdata =      '../data/9-clusters.agg.pkl'
			
 
				 clustertable = '../data/9-clusters-sample-table.pkl'
			
 
				 
			
 
				 numclusts = 9
			
 
				 df = p.read_pickle(Sourcedata)
			
 
				-dforig = df
			
 
				+# dforig = df
			
 
				 
			
 
				 # print(df)
			
 
				 
			
 
				 print(df.info())
			
 
				-print(df.icp_id.nunique())
			
 
				-print(df.read_time.nunique())
			
 
				+tqcorr(df)
			
 
				+# print(df.icp_id.nunique())
			
 
				+# print(df.read_time.nunique())
			
 
				 # print(df.groupby('icp_id').read_time.nunique().nunique())
			
 
				-df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
			
 
				-print(df.info())
			
 
				+# df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
			
 
				+# print(df.info())
			
 
				 df = df[df.columns[df.max() != df.min()]]
			
 
				 print(df.info())
			
 
				-cmat = df.corr()
			
 
				+cmat = tqcorr(df)
			
 
				+print(cmat)
			
 
				 print(cmat.info())
			
 
				 
			
 
				-lmat = squareform(1 - cmat)
			
 
				-
			
 
				-lobj = linkage(lmat, method = 'ward')
			
 
				-print(lobj)
			
 
				-print(cophenet(lobj, lmat))
			
 
				-
			
 
				-
			
 
				-
			
 
				-clabs = [x + 1 for x in range(numclusts)]
			
 
				-cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))
			
 
				-
			
 
				-clusts = fcluster(lobj, numclusts, criterion='maxclust')
			
 
				-print(clusts)
			
 
				-print(cmat.index.values)
			
 
				-clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
			
 
				-print(clustdf)
			
 
				-clustdf.to_pickle(clustertable)
			
 
				-mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
			
 
				-print(mdf)
			
 
				-print(mdf.info())
			
 
				-qlow  = lambda x: x.quantile(0.250)
			
 
				-qhigh = lambda x: x.quantile(0.750)
			
 
				-print(mdf.cluster.describe())
			
 
				-mdagg = mdf.groupby(['read_time', 'cluster']).agg({
			
 
				-        'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
			
 
				-}, q = 0.025)
			
 
				-mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
			
 
				-mdagg = mdagg.reset_index()
			
 
				-print(mdagg)
			
 
				-print(mdagg.info())
			
 
				-print(mdagg.describe())
			
 
				-# mdf.to_csv('~/windows/Documents/clusters-ward.csv')
			
 
				-print("Saving")
			
 
				-mdf.to_pickle(lableddata)
			
 
				-mdagg.to_pickle(aggdata)
			
 
				-print("saved")
			
 
				-
			
 
				-# Algorithm via 
			
 
				-# <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
			
 
				-ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
			
 
				-link_cols = {}
			
 
				-for i, i12 in enumerate(lobj[:,:2].astype(int)):
			
 
				-  c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
			
 
				-    for x in i12)
			
 
				-  link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
			
 
				-
			
 
				-plt.figure(figsize = (25, 10))
			
 
				-plt.title('ICP Clustering Dendrogram')
			
 
				-plt.xlabel('ICP ID/(Number of ICPs)')
			
 
				-plt.ylabel('distance')
			
 
				-dendrogram(
			
 
				-    lobj,
			
 
				-    labels = cmat.index.values,
			
 
				-    leaf_rotation=90,
			
 
				-    leaf_font_size=8,
			
 
				-    # show_leaf_counts = True,
			
 
				-    # truncate_mode = 'lastp',
			
 
				-    # p = 50,
			
 
				-    # show_contracted = True,
			
 
				-    link_color_func = lambda x: link_cols[x],
			
 
				-    color_threshold = None
			
 
				-)
			
 
				-# plt.show()
			
 
				-plt.savefig("../img/sample-9-dendro.png")
			
 
				-
			
 
				-sns.set()
			
 
				-
			
 
				-f, axes = plt.subplots(3,3)
			
 
				-
			
 
				-for i, c in enumerate(clabs):
			
 
				-    fds = mdagg[mdagg.cluster == c]
			
 
				-    sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
			
 
				-    axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
			
 
				-# plt.show()
			
 
				-plt.savefig("../img/sample-9-panedtrends.png")
			
 
				+# lmat = squareform(1 - cmat)
			
 
				+
			
 
				+# lobj = linkage(lmat, method = 'ward')
			
 
				+# print(lobj)
			
 
				+# print(cophenet(lobj, lmat))
			
 
				+
			
 
				+
			
 
				+
			
 
				+# clabs = [x + 1 for x in range(numclusts)]
			
 
				+# cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))
			
 
				+
			
 
				+# clusts = fcluster(lobj, numclusts, criterion='maxclust')
			
 
				+# print(clusts)
			
 
				+# print(cmat.index.values)
			
 
				+# clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
			
 
				+# print(clustdf)
			
 
				+# clustdf.to_pickle(clustertable)
			
 
				+# mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
			
 
				+# print(mdf)
			
 
				+# print(mdf.info())
			
 
				+# qlow  = lambda x: x.quantile(0.250)
			
 
				+# qhigh = lambda x: x.quantile(0.750)
			
 
				+# print(mdf.cluster.describe())
			
 
				+# mdagg = mdf.groupby(['read_time', 'cluster']).agg({
			
 
				+#         'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
			
 
				+# }, q = 0.025)
			
 
				+# mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
			
 
				+# mdagg = mdagg.reset_index()
			
 
				+# print(mdagg)
			
 
				+# print(mdagg.info())
			
 
				+# print(mdagg.describe())
			
 
				+# # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
			
 
				+# print("Saving")
			
 
				+# mdf.to_pickle(lableddata)
			
 
				+# mdagg.to_pickle(aggdata)
			
 
				+# print("saved")
			
 
				+
			
 
				+# # Algorithm via 
			
 
				+# # <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
			
 
				+# ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
			
 
				+# link_cols = {}
			
 
				+# for i, i12 in enumerate(lobj[:,:2].astype(int)):
			
 
				+#   c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
			
 
				+#     for x in i12)
			
 
				+#   link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
			
 
				+
			
 
				+# plt.figure(figsize = (25, 10))
			
 
				+# plt.title('ICP Clustering Dendrogram')
			
 
				+# plt.xlabel('ICP ID/(Number of ICPs)')
			
 
				+# plt.ylabel('distance')
			
 
				+# dendrogram(
			
 
				+#     lobj,
			
 
				+#     labels = cmat.index.values,
			
 
				+#     leaf_rotation=90,
			
 
				+#     leaf_font_size=8,
			
 
				+#     # show_leaf_counts = True,
			
 
				+#     # truncate_mode = 'lastp',
			
 
				+#     # p = 50,
			
 
				+#     # show_contracted = True,
			
 
				+#     link_color_func = lambda x: link_cols[x],
			
 
				+#     color_threshold = None
			
 
				+# )
			
 
				+# # plt.show()
			
 
				+# plt.savefig("../img/sample-9-dendro.png")
			
 
				+
			
 
				+# sns.set()
			
 
				+
			
 
				+# f, axes = plt.subplots(3,3)
			
 
				+
			
 
				+# for i, c in enumerate(clabs):
			
 
				+#     fds = mdagg[mdagg.cluster == c]
			
 
				+#     sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
			
 
				+#     axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
			
 
				+# # plt.show()
			
 
				+# plt.savefig("../img/sample-9-panedtrends.png")