5 years ago · 2ea6c125b2
--- a/py/clustering.py
+++ b/py/clustering.py
 
															 from scipy.spatial.distance import squareform
														
 
															 from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
														
 
															-
														
 
															-# query = """
														
 
															-# SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
														
 
															-# FROM public.coup_tall_april WHERE icp_id LIKE (%s) AND read_date = to_date(%s, 'dd/mm/yyyy') 
														
 
															-# ORDER BY icp_id, read_time;
														
 
															-# """
														
 
															-# 
														
 
															-# qparams = ['%%1117', '20/04/2017']
														
 
															-
														
 
															-# query = """
														
 
															-# SELECT read_date, period, AVG(kwh_tot) AS average
														
 
															-# FROM public.coup_tall_april
														
 
															-# GROUP BY read_date, period
														
 
															-# ORDER BY read_date, period;
														
 
															-# """
														
 
															-# 
														
 
															-# qparams = []
														
 
															-# 
														
 
															-# df = getQuery(query, qparams)
														
 
															-# 
														
 
															-# print(df.info())
														
 
															-# 
														
 
															-# sns.set()
														
 
															-# 
														
 
															-# #sns.lineplot(x = 'read_time', y = 'kwh_tot', hue = 'icp_id', data = df)
														
 
															-# sns.lineplot(x = 'period', y = 'average', hue = 'read_date', data = df)
														
 
															-# 
														
 
															-# plt.show()
														
 
															+Sourcedata =   '../data/2017-sample.pkl'
														
 
															+lableddata =   '../data/9-clusters.pkl'
														
 
															+aggdata =      '../data/9-clusters.agg.pkl'
														
 
															+clustertable = '../data/9-clusters-sample-table.pkl'
														
 
															 numclusts = 9
														
 
															-df = p.read_pickle('../data/2016-17-sample.pkl')
														
 
															+df = p.read_pickle(Sourcedata)
														
 
															 dforig = df
														
 
															 # print(df)
														
 
															 print(cmat.index.values)
														
 
															 clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
														
 
															 print(clustdf)
														
 
															+clustdf.to_pickle(clustertable)
														
 
															 mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
														
 
															 print(mdf)
														
 
															 print(mdf.info())
														
 
															 print(mdagg.describe())
														
 
															 # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
														
 
															 print("Saving")
														
 
															-mdf.to_pickle('../data/9-clusters-1617.pkl')
														
 
															-mdagg.to_pickle('../data/9-clusters-1617.agg.pkl')
														
 
															+mdf.to_pickle(lableddata)
														
 
															+mdagg.to_pickle(aggdata)
														
 
															 print("saved")
														
 
															 # Algorithm via 
														
 
															 )
														
 
															 plt.show()
														
 
															-# sns.set()
														
 
															-# 
														
 
															-# f, axes = plt.subplots(3,3)
														
 
															-# 
														
 
															-# for i, c in enumerate(clabs):
														
 
															-#     fds = mdagg[mdagg.cluster == c]
														
 
															-#     sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
														
 
															-#     axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
														
 
															-# plt.show()
														
 
															+sns.set()
														
 
															+
														
 
															+f, axes = plt.subplots(3,3)
														
 
															+
														
 
															+for i, c in enumerate(clabs):
														
 
															+    fds = mdagg[mdagg.cluster == c]
														
 
															+    sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
														
 
															+    axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
														
 
															+plt.show()