Browse Source

Refactor out filenames for data pickles

Petra Lamborn 5 years ago
parent
commit
2ea6c125b2
1 changed file with 17 additions and 40 deletions
  1. py/clustering.py (17 additions, 40 deletions)

+ 17
- 40
py/clustering.py View File

6
 from scipy.spatial.distance import squareform
6
 from scipy.spatial.distance import squareform
7
 from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
7
 from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
8
 
8
 
9
-
10
-# query = """
11
-# SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
12
-# FROM public.coup_tall_april WHERE icp_id LIKE (%s) AND read_date = to_date(%s, 'dd/mm/yyyy') 
13
-# ORDER BY icp_id, read_time;
14
-# """
15
-# 
16
-# qparams = ['%%1117', '20/04/2017']
17
-
18
-# query = """
19
-# SELECT read_date, period, AVG(kwh_tot) AS average
20
-# FROM public.coup_tall_april
21
-# GROUP BY read_date, period
22
-# ORDER BY read_date, period;
23
-# """
24
-# 
25
-# qparams = []
26
-# 
27
-# df = getQuery(query, qparams)
28
-# 
29
-# print(df.info())
30
-# 
31
-# sns.set()
32
-# 
33
-# #sns.lineplot(x = 'read_time', y = 'kwh_tot', hue = 'icp_id', data = df)
34
-# sns.lineplot(x = 'period', y = 'average', hue = 'read_date', data = df)
35
-# 
36
-# plt.show()
9
+Sourcedata =   '../data/2017-sample.pkl'
10
+lableddata =   '../data/9-clusters.pkl'
11
+aggdata =      '../data/9-clusters.agg.pkl'
12
+clustertable = '../data/9-clusters-sample-table.pkl'
37
 
13
 
38
 numclusts = 9
14
 numclusts = 9
39
-df = p.read_pickle('../data/2016-17-sample.pkl')
15
+df = p.read_pickle(Sourcedata)
40
 dforig = df
16
 dforig = df
41
 
17
 
42
 # print(df)
18
 # print(df)
68
 print(cmat.index.values)
44
 print(cmat.index.values)
69
 clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
45
 clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
70
 print(clustdf)
46
 print(clustdf)
47
+clustdf.to_pickle(clustertable)
71
 mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
48
 mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
72
 print(mdf)
49
 print(mdf)
73
 print(mdf.info())
50
 print(mdf.info())
84
 print(mdagg.describe())
61
 print(mdagg.describe())
85
 # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
62
 # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
86
 print("Saving")
63
 print("Saving")
87
-mdf.to_pickle('../data/9-clusters-1617.pkl')
88
-mdagg.to_pickle('../data/9-clusters-1617.agg.pkl')
64
+mdf.to_pickle(lableddata)
65
+mdagg.to_pickle(aggdata)
89
 print("saved")
66
 print("saved")
90
 
67
 
91
 # Algorithm via 
68
 # Algorithm via 
115
 )
92
 )
116
 plt.show()
93
 plt.show()
117
 
94
 
118
-# sns.set()
119
-# 
120
-# f, axes = plt.subplots(3,3)
121
-# 
122
-# for i, c in enumerate(clabs):
123
-#     fds = mdagg[mdagg.cluster == c]
124
-#     sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
125
-#     axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
126
-# plt.show()
95
+sns.set()
96
+
97
+f, axes = plt.subplots(3,3)
98
+
99
+for i, c in enumerate(clabs):
100
+    fds = mdagg[mdagg.cluster == c]
101
+    sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
102
+    axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
103
+plt.show()