5 years ago · 2ea6c125b2
--- a/py/clustering.py
+++ b/py/clustering.py
@@ -6,37 +6,13 @@ import seaborn as sns
 
				 from scipy.spatial.distance import squareform
			
 
				 from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
			
 
				 
			
 
				-
			
 
				-# query = """
			
 
				-# SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
			
 
				-# FROM public.coup_tall_april WHERE icp_id LIKE (%s) AND read_date = to_date(%s, 'dd/mm/yyyy') 
			
 
				-# ORDER BY icp_id, read_time;
			
 
				-# """
			
 
				-# 
			
 
				-# qparams = ['%%1117', '20/04/2017']
			
 
				-
			
 
				-# query = """
			
 
				-# SELECT read_date, period, AVG(kwh_tot) AS average
			
 
				-# FROM public.coup_tall_april
			
 
				-# GROUP BY read_date, period
			
 
				-# ORDER BY read_date, period;
			
 
				-# """
			
 
				-# 
			
 
				-# qparams = []
			
 
				-# 
			
 
				-# df = getQuery(query, qparams)
			
 
				-# 
			
 
				-# print(df.info())
			
 
				-# 
			
 
				-# sns.set()
			
 
				-# 
			
 
				-# #sns.lineplot(x = 'read_time', y = 'kwh_tot', hue = 'icp_id', data = df)
			
 
				-# sns.lineplot(x = 'period', y = 'average', hue = 'read_date', data = df)
			
 
				-# 
			
 
				-# plt.show()
			
 
				+Sourcedata =   '../data/2017-sample.pkl'
			
 
				+lableddata =   '../data/9-clusters.pkl'
			
 
				+aggdata =      '../data/9-clusters.agg.pkl'
			
 
				+clustertable = '../data/9-clusters-sample-table.pkl'
			
 
				 
			
 
				 numclusts = 9
			
 
				-df = p.read_pickle('../data/2016-17-sample.pkl')
			
 
				+df = p.read_pickle(Sourcedata)
			
 
				 dforig = df
			
 
				 
			
 
				 # print(df)
			
@@ -68,6 +44,7 @@ print(clusts)
 
				 print(cmat.index.values)
			
 
				 clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
			
 
				 print(clustdf)
			
 
				+clustdf.to_pickle(clustertable)
			
 
				 mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
			
 
				 print(mdf)
			
 
				 print(mdf.info())
			
@@ -84,8 +61,8 @@ print(mdagg.info())
 
				 print(mdagg.describe())
			
 
				 # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
			
 
				 print("Saving")
			
 
				-mdf.to_pickle('../data/9-clusters-1617.pkl')
			
 
				-mdagg.to_pickle('../data/9-clusters-1617.agg.pkl')
			
 
				+mdf.to_pickle(lableddata)
			
 
				+mdagg.to_pickle(aggdata)
			
 
				 print("saved")
			
 
				 
			
 
				 # Algorithm via 
			
@@ -115,12 +92,12 @@ dendrogram(
 
				 )
			
 
				 plt.show()
			
 
				 
			
 
				-# sns.set()
			
 
				-# 
			
 
				-# f, axes = plt.subplots(3,3)
			
 
				-# 
			
 
				-# for i, c in enumerate(clabs):
			
 
				-#     fds = mdagg[mdagg.cluster == c]
			
 
				-#     sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
			
 
				-#     axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
			
 
				-# plt.show()
			
 
				+sns.set()
			
 
				+
			
 
				+f, axes = plt.subplots(3,3)
			
 
				+
			
 
				+for i, c in enumerate(clabs):
			
 
				+    fds = mdagg[mdagg.cluster == c]
			
 
				+    sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
			
 
				+    axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
			
 
				+plt.show()