Repository for Petra's work at ampli Jan-Feb 2019

clustering.py 3.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. from util import getQuery, pickleQuery
  2. import numpy as np
  3. import pandas as p
  4. import matplotlib
  5. matplotlib.use('agg')
  6. import matplotlib.pyplot as plt
  7. import seaborn as sns
  8. from scipy.spatial.distance import squareform
  9. from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
  10. from tqdm import tqdm
  11. from itertools import combinations
  12. from math import factorial as f
  def tqcorr(df):
      """Build a pairwise correlation-distance matrix, with progress bars.

      For every unordered pair of columns in ``df`` the distance ``1 - r`` is
      stored symmetrically, where ``r`` is ``Series.corr`` of the two columns
      called with its default arguments. The diagonal is set to 0.

      Parameters
      ----------
      df : pandas.DataFrame
          Frame whose columns are the series to compare.

      Returns
      -------
      pandas.DataFrame
          Square frame indexed and labelled by ``df.columns`` holding the
          distances as ``np.float16``. Pairs whose correlation is undefined
          stay NaN.
      """
      cols = df.columns
      ncols = len(cols)
      cdf = p.DataFrame(index=cols, columns=cols, dtype=np.float16)
      # DataFrame.info() writes its report itself and returns None, so it is
      # called directly instead of being wrapped in print().
      cdf.info()
      for c in tqdm(cols):
          cdf.loc[c, c] = 0
      cdf.info()
      # Number of unordered pairs, "ncols choose 2", in closed form. The
      # factorial-based version raised ValueError for fewer than two columns
      # (factorial of a negative number) and built a huge integer for wide frames.
      ncomb = ncols * (ncols - 1) // 2
      for c1, c2 in tqdm(combinations(cols, 2), total=ncomb):
          # Cast explicitly to the frame's dtype: newer pandas releases no
          # longer silently narrow a float64 on assignment into float16 columns.
          dv = np.float16(1 - df[c1].corr(df[c2]))
          cdf.loc[c1, c2] = dv
          cdf.loc[c2, c1] = dv
      cdf.info()
      return cdf
  # Hook tqdm into pandas (per the tqdm docs this enables the progress_*
  # methods); nothing in the active code below uses them yet.
  tqdm.pandas()

  # Input and output locations, relative to the working directory. Only
  # Sourcedata is read by the active code; the other paths and numclusts are
  # referenced by the commented-out clustering pipeline further down.
  Sourcedata = '../data/2017-5k-wide.pkl'
  lableddata = '../data/9-clusters-5k.pkl'
  aggdata = '../data/9-clusters-5k-agg.pkl'
  clustertable = '../data/9-clusters-5k-table.pkl'
  numclusts = 9

  # NOTE: unpickling can execute arbitrary code; only load files you trust.
  df = p.read_pickle(Sourcedata)
  # dforig = df
  # print(df)
  # DataFrame.info() prints its own report and returns None, so it is called
  # directly rather than through print(), which only added a stray "None".
  df.info()
  # print(df.icp_id.nunique())
  # print(df.read_time.nunique())
  # print(df.groupby('icp_id').read_time.nunique().nunique())
  # df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
  # print(df.info())

  # Keep only the columns whose maximum differs from their minimum; a constant
  # column has no defined correlation with anything.
  df = df[df.columns[df.max() != df.min()]]
  df.info()

  # cmat holds distances (1 - correlation) with a zero diagonal, not raw
  # correlations.
  cmat = tqcorr(df)
  print(cmat)
  cmat.info()
  cmat.to_pickle('../data/fulldcorrmatrix.pkl')
  50. # lmat = squareform(1 - cmat)
  51. # lobj = linkage(lmat, method = 'ward')
  52. # print(lobj)
  53. # print(cophenet(lobj, lmat))
  54. # clabs = [x + 1 for x in range(numclusts)]
  55. # cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))
  56. # clusts = fcluster(lobj, numclusts, criterion='maxclust')
  57. # print(clusts)
  58. # print(cmat.index.values)
  59. # clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
  60. # print(clustdf)
  61. # clustdf.to_pickle(clustertable)
  62. # mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
  63. # print(mdf)
  64. # print(mdf.info())
  65. # qlow = lambda x: x.quantile(0.250)
  66. # qhigh = lambda x: x.quantile(0.750)
  67. # print(mdf.cluster.describe())
  68. # mdagg = mdf.groupby(['read_time', 'cluster']).agg({
  69. # 'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
  70. # }, q = 0.025)
  71. # mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
  72. # mdagg = mdagg.reset_index()
  73. # print(mdagg)
  74. # print(mdagg.info())
  75. # print(mdagg.describe())
  76. # # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
  77. # print("Saving")
  78. # mdf.to_pickle(lableddata)
  79. # mdagg.to_pickle(aggdata)
  80. # print("saved")
  81. # # Algorithm via
  82. # # <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
  83. # ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
  84. # link_cols = {}
  85. # for i, i12 in enumerate(lobj[:,:2].astype(int)):
  86. # c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
  87. # for x in i12)
  88. # link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
  89. # plt.figure(figsize = (25, 10))
  90. # plt.title('ICP Clustering Dendrogram')
  91. # plt.xlabel('ICP ID/(Number of ICPs)')
  92. # plt.ylabel('distance')
  93. # dendrogram(
  94. # lobj,
  95. # labels = cmat.index.values,
  96. # leaf_rotation=90,
  97. # leaf_font_size=8,
  98. # # show_leaf_counts = True,
  99. # # truncate_mode = 'lastp',
  100. # # p = 50,
  101. # # show_contracted = True,
  102. # link_color_func = lambda x: link_cols[x],
  103. # color_threshold = None
  104. # )
  105. # # plt.show()
  106. # plt.savefig("../img/sample-9-dendro.png")
  107. # sns.set()
  108. # f, axes = plt.subplots(3,3)
  109. # for i, c in enumerate(clabs):
  110. # fds = mdagg[mdagg.cluster == c]
  111. # sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
  112. # axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
  113. # # plt.show()
  114. # plt.savefig("../img/sample-9-panedtrends.png")