|
@@ -6,37 +6,13 @@ import seaborn as sns
|
6
|
6
|
from scipy.spatial.distance import squareform
|
7
|
7
|
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
|
8
|
8
|
|
9
|
|
-
|
10
|
|
-# query = """
|
11
|
|
-# SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
|
12
|
|
-# FROM public.coup_tall_april WHERE icp_id LIKE (%s) AND read_date = to_date(%s, 'dd/mm/yyyy')
|
13
|
|
-# ORDER BY icp_id, read_time;
|
14
|
|
-# """
|
15
|
|
-#
|
16
|
|
-# qparams = ['%%1117', '20/04/2017']
|
17
|
|
-
|
18
|
|
-# query = """
|
19
|
|
-# SELECT read_date, period, AVG(kwh_tot) AS average
|
20
|
|
-# FROM public.coup_tall_april
|
21
|
|
-# GROUP BY read_date, period
|
22
|
|
-# ORDER BY read_date, period;
|
23
|
|
-# """
|
24
|
|
-#
|
25
|
|
-# qparams = []
|
26
|
|
-#
|
27
|
|
-# df = getQuery(query, qparams)
|
28
|
|
-#
|
29
|
|
-# print(df.info())
|
30
|
|
-#
|
31
|
|
-# sns.set()
|
32
|
|
-#
|
33
|
|
-# #sns.lineplot(x = 'read_time', y = 'kwh_tot', hue = 'icp_id', data = df)
|
34
|
|
-# sns.lineplot(x = 'period', y = 'average', hue = 'read_date', data = df)
|
35
|
|
-#
|
36
|
|
-# plt.show()
|
|
9
|
+Sourcedata = '../data/2017-sample.pkl'
|
|
10
|
+lableddata = '../data/9-clusters.pkl'
|
|
11
|
+aggdata = '../data/9-clusters.agg.pkl'
|
|
12
|
+clustertable = '../data/9-clusters-sample-table.pkl'
|
37
|
13
|
|
38
|
14
|
numclusts = 9
|
39
|
|
-df = p.read_pickle('../data/2016-17-sample.pkl')
|
|
15
|
+df = p.read_pickle(Sourcedata)
|
40
|
16
|
dforig = df
|
41
|
17
|
|
42
|
18
|
# print(df)
|
|
@@ -68,6 +44,7 @@ print(clusts)
|
68
|
44
|
print(cmat.index.values)
|
69
|
45
|
clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
|
70
|
46
|
print(clustdf)
|
|
47
|
+clustdf.to_pickle(clustertable)
|
71
|
48
|
mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
|
72
|
49
|
print(mdf)
|
73
|
50
|
print(mdf.info())
|
|
@@ -84,8 +61,8 @@ print(mdagg.info())
|
84
|
61
|
print(mdagg.describe())
|
85
|
62
|
# mdf.to_csv('~/windows/Documents/clusters-ward.csv')
|
86
|
63
|
print("Saving")
|
87
|
|
-mdf.to_pickle('../data/9-clusters-1617.pkl')
|
88
|
|
-mdagg.to_pickle('../data/9-clusters-1617.agg.pkl')
|
|
64
|
+mdf.to_pickle(lableddata)
|
|
65
|
+mdagg.to_pickle(aggdata)
|
89
|
66
|
print("saved")
|
90
|
67
|
|
91
|
68
|
# Algorithm via
|
|
@@ -115,12 +92,12 @@ dendrogram(
|
115
|
92
|
)
|
116
|
93
|
plt.show()
|
117
|
94
|
|
118
|
|
-# sns.set()
|
119
|
|
-#
|
120
|
|
-# f, axes = plt.subplots(3,3)
|
121
|
|
-#
|
122
|
|
-# for i, c in enumerate(clabs):
|
123
|
|
-# fds = mdagg[mdagg.cluster == c]
|
124
|
|
-# sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
|
125
|
|
-# axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
|
126
|
|
-# plt.show()
|
|
95
|
+sns.set()
|
|
96
|
+
|
|
97
|
+f, axes = plt.subplots(3,3)
|
|
98
|
+
|
|
99
|
+for i, c in enumerate(clabs):
|
|
100
|
+ fds = mdagg[mdagg.cluster == c]
|
|
101
|
+ sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
|
|
102
|
+ axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
|
|
103
|
+plt.show()
|