Repository for Petra's work at ampli Jan-Feb 2019

clustering.py 3.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. from util import getQuery, pickleQuery
  2. import numpy as np
  3. import pandas as p
  4. import matplotlib
  5. matplotlib.use('agg')
  6. import matplotlib.pyplot as plt
  7. import seaborn as sns
  8. from scipy.spatial.distance import squareform
  9. from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
  10. Sourcedata = '../data/2017-sample.pkl'
  11. lableddata = '../data/9-clusters.pkl'
  12. aggdata = '../data/9-clusters.agg.pkl'
  13. clustertable = '../data/9-clusters-sample-table.pkl'
  14. numclusts = 9
  15. df = p.read_pickle(Sourcedata)
  16. dforig = df
  17. # print(df)
  18. print(df.info())
  19. print(df.icp_id.nunique())
  20. print(df.read_time.nunique())
  21. # print(df.groupby('icp_id').read_time.nunique().nunique())
  22. df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
  23. print(df.info())
  24. df = df[df.columns[df.max() != df.min()]]
  25. print(df.info())
  26. cmat = df.corr()
  27. print(cmat.info())
  28. lmat = squareform(1 - cmat)
  29. lobj = linkage(lmat, method = 'ward')
  30. print(lobj)
  31. print(cophenet(lobj, lmat))
  32. clabs = [x + 1 for x in range(numclusts)]
  33. cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))
  34. clusts = fcluster(lobj, numclusts, criterion='maxclust')
  35. print(clusts)
  36. print(cmat.index.values)
  37. clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
  38. print(clustdf)
  39. clustdf.to_pickle(clustertable)
  40. mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
  41. print(mdf)
  42. print(mdf.info())
  43. qlow = lambda x: x.quantile(0.250)
  44. qhigh = lambda x: x.quantile(0.750)
  45. print(mdf.cluster.describe())
  46. mdagg = mdf.groupby(['read_time', 'cluster']).agg({
  47. 'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
  48. }, q = 0.025)
  49. mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
  50. mdagg = mdagg.reset_index()
  51. print(mdagg)
  52. print(mdagg.info())
  53. print(mdagg.describe())
  54. # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
  55. print("Saving")
  56. mdf.to_pickle(lableddata)
  57. mdagg.to_pickle(aggdata)
  58. print("saved")
  59. # Algorithm via
  60. # <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
  61. ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
  62. link_cols = {}
  63. for i, i12 in enumerate(lobj[:,:2].astype(int)):
  64. c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
  65. for x in i12)
  66. link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
  67. plt.figure(figsize = (25, 10))
  68. plt.title('ICP Clustering Dendrogram')
  69. plt.xlabel('ICP ID/(Number of ICPs)')
  70. plt.ylabel('distance')
  71. dendrogram(
  72. lobj,
  73. labels = cmat.index.values,
  74. leaf_rotation=90,
  75. leaf_font_size=8,
  76. # show_leaf_counts = True,
  77. # truncate_mode = 'lastp',
  78. # p = 50,
  79. # show_contracted = True,
  80. link_color_func = lambda x: link_cols[x],
  81. color_threshold = None
  82. )
  83. # plt.show()
  84. plt.savefig("../img/sample-9-dendro.png")
  85. sns.set()
  86. f, axes = plt.subplots(3,3)
  87. for i, c in enumerate(clabs):
  88. fds = mdagg[mdagg.cluster == c]
  89. sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
  90. axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
  91. # plt.show()
  92. plt.savefig("../img/sample-9-panedtrends.png")