Browse Source

Refactor out filenames for data pickles

Petra Lamborn 5 years ago
parent
commit
2ea6c125b2
1 changed file with 17 additions and 40 deletions
  1. 17
    40
      py/clustering.py

+ 17
- 40
py/clustering.py View File

@@ -6,37 +6,13 @@ import seaborn as sns
6 6
 from scipy.spatial.distance import squareform
7 7
 from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
8 8
 
9
-
10
-# query = """
11
-# SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
12
-# FROM public.coup_tall_april WHERE icp_id LIKE (%s) AND read_date = to_date(%s, 'dd/mm/yyyy') 
13
-# ORDER BY icp_id, read_time;
14
-# """
15
-# 
16
-# qparams = ['%%1117', '20/04/2017']
17
-
18
-# query = """
19
-# SELECT read_date, period, AVG(kwh_tot) AS average
20
-# FROM public.coup_tall_april
21
-# GROUP BY read_date, period
22
-# ORDER BY read_date, period;
23
-# """
24
-# 
25
-# qparams = []
26
-# 
27
-# df = getQuery(query, qparams)
28
-# 
29
-# print(df.info())
30
-# 
31
-# sns.set()
32
-# 
33
-# #sns.lineplot(x = 'read_time', y = 'kwh_tot', hue = 'icp_id', data = df)
34
-# sns.lineplot(x = 'period', y = 'average', hue = 'read_date', data = df)
35
-# 
36
-# plt.show()
9
+Sourcedata =   '../data/2017-sample.pkl'
10
+lableddata =   '../data/9-clusters.pkl'
11
+aggdata =      '../data/9-clusters.agg.pkl'
12
+clustertable = '../data/9-clusters-sample-table.pkl'
37 13
 
38 14
 numclusts = 9
39
-df = p.read_pickle('../data/2016-17-sample.pkl')
15
+df = p.read_pickle(Sourcedata)
40 16
 dforig = df
41 17
 
42 18
 # print(df)
@@ -68,6 +44,7 @@ print(clusts)
68 44
 print(cmat.index.values)
69 45
 clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
70 46
 print(clustdf)
47
+clustdf.to_pickle(clustertable)
71 48
 mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
72 49
 print(mdf)
73 50
 print(mdf.info())
@@ -84,8 +61,8 @@ print(mdagg.info())
84 61
 print(mdagg.describe())
85 62
 # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
86 63
 print("Saving")
87
-mdf.to_pickle('../data/9-clusters-1617.pkl')
88
-mdagg.to_pickle('../data/9-clusters-1617.agg.pkl')
64
+mdf.to_pickle(lableddata)
65
+mdagg.to_pickle(aggdata)
89 66
 print("saved")
90 67
 
91 68
 # Algorithm via 
@@ -115,12 +92,12 @@ dendrogram(
115 92
 )
116 93
 plt.show()
117 94
 
118
-# sns.set()
119
-# 
120
-# f, axes = plt.subplots(3,3)
121
-# 
122
-# for i, c in enumerate(clabs):
123
-#     fds = mdagg[mdagg.cluster == c]
124
-#     sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
125
-#     axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
126
-# plt.show()
95
+sns.set()
96
+
97
+f, axes = plt.subplots(3,3)
98
+
99
+for i, c in enumerate(clabs):
100
+    fds = mdagg[mdagg.cluster == c]
101
+    sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
102
+    axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
103
+plt.show()