Repository for Petra's work at ampli Jan-Feb 2019

clustering.py 2.2KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. from util import getQuery, pickleQuery
  2. import pandas as p
  3. import matplotlib.pyplot as plt
  4. import seaborn as sns
  5. from scipy.spatial.distance import squareform
  6. from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
  7. # query = """
  8. # SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
  9. # FROM public.coup_tall_april WHERE icp_id LIKE (%s) AND read_date = to_date(%s, 'dd/mm/yyyy')
  10. # ORDER BY icp_id, read_time;
  11. # """
  12. #
  13. # qparams = ['%%1117', '20/04/2017']
  14. # query = """
  15. # SELECT read_date, period, AVG(kwh_tot) AS average
  16. # FROM public.coup_tall_april
  17. # GROUP BY read_date, period
  18. # ORDER BY read_date, period;
  19. # """
  20. #
  21. # qparams = []
  22. #
  23. # df = getQuery(query, qparams)
  24. #
  25. # print(df.info())
  26. #
  27. # sns.set()
  28. #
  29. # #sns.lineplot(x = 'read_time', y = 'kwh_tot', hue = 'icp_id', data = df)
  30. # sns.lineplot(x = 'period', y = 'average', hue = 'read_date', data = df)
  31. #
  32. # plt.show()
  33. df = p.read_pickle('../data/jan19s.pkl')
  34. dforig = df
  35. print(df.info())
  36. print(df.icp_id.nunique())
  37. print(df.read_time.nunique())
  38. print(df.groupby('icp_id').read_time.nunique().nunique())
  39. df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
  40. df = df[df.columns[df.max() != df.min()]]
  41. print(df.info())
  42. cmat = df.corr()
  43. print(cmat.info())
  44. lmat = squareform(1 - cmat)
  45. lobj = linkage(lmat, method = 'ward')
  46. print(lobj)
  47. print(cophenet(lobj, lmat))
  48. #plt.figure(figsize = (25, 10))
  49. #plt.title('ICP Clustering Dendrogram')
  50. #plt.xlabel('ICP ID/(Number of ICPs)')
  51. #plt.ylabel('distance')
  52. #dendrogram(
  53. # lobj,
  54. # labels = cmat.index.values,
  55. # leaf_rotation=90,
  56. # leaf_font_size=8,
  57. # #show_leaf_counts = True,
  58. # #truncate_mode = 'lastp',
  59. # #p = 50,
  60. # #show_contracted = True,
  61. # color_threshold = 1.9
  62. #)
  63. #plt.show()
  64. clusts = fcluster(lobj, 6, criterion='maxclust')
  65. print(clusts)
  66. print(cmat.index.values)
  67. clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : [chr(x + ord('A') - 1) for x in clusts]})
  68. print(clustdf)
  69. mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
  70. print(mdf)
  71. print(mdf.info())
  72. print(mdf.cluster.describe())
  73. mdf.to_csv('~/windows/Documents/clusters-ward.csv')
  74. sns.set()
  75. sns.lineplot(x = 'read_time', y = 'kwh_tot', hue = 'cluster', data = mdf)
  76. plt.show()