Dendrogram and fast line graph with constant colours

Petra Lamborn 5 years ago
parent commit 702287cfd0
4 changed files with 204 additions and 57 deletions
  1. py/clustering.py   +50  -22
  2. py/downkwh.py      +42  -35
  3. py/util.py         +66  -0
  4. sql/queries.pgsql  +46  -0

py/clustering.py  +50  -22

@@ -1,4 +1,5 @@
 from util import getQuery, pickleQuery
+import numpy as np
 import pandas as p
 import matplotlib.pyplot as plt
 import seaborn as sns
@@ -34,7 +35,8 @@ from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
 # 
 # plt.show()
 
-df = p.read_pickle('../data/jan19s.pkl')
+numclusts = 7
+df = p.read_pickle('../data/2017-20s.pkl')
 dforig = df
 
 print(df.info())
@@ -53,36 +55,62 @@ lobj = linkage(lmat, method = 'ward')
 print(lobj)
 print(cophenet(lobj, lmat))
 
-#plt.figure(figsize = (25, 10))
-#plt.title('ICP Clustering Dendrogram')
-#plt.xlabel('ICP ID/(Number of ICPs)')
-#plt.ylabel('distance')
-#dendrogram(
-#    lobj,
-#    labels = cmat.index.values,
-#    leaf_rotation=90,
-#    leaf_font_size=8,
-#    #show_leaf_counts = True,
-#    #truncate_mode = 'lastp',
-#    #p = 50,
-#    #show_contracted = True,
-#    color_threshold = 1.9
-#)
-#plt.show()
 
-clusts = fcluster(lobj, 6, criterion='maxclust')
+
+clabs = [x + 1 for x in range(numclusts)]
+cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))
+
+clusts = fcluster(lobj, numclusts, criterion='maxclust')
 print(clusts)
 print(cmat.index.values)
-clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : [chr(x + ord('A') - 1) for x in clusts]})
+clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
 print(clustdf)
 mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
 print(mdf)
 print(mdf.info())
+qlow  = lambda x: x.quantile(0.250)
+qhigh = lambda x: x.quantile(0.750)
 print(mdf.cluster.describe())
+mdagg = mdf.groupby(['read_time', 'cluster']).agg({
+        'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
+}, q = 0.025)
+mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
+mdagg = mdagg.reset_index()
+print(mdagg)
+print(mdagg.info())
+print(mdagg.describe())
+# mdf.to_csv('~/windows/Documents/clusters-ward.csv')
 
-mdf.to_csv('~/windows/Documents/clusters-ward.csv')
+# Algorithm via 
+# <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
+ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
+link_cols = {}
+for i, i12 in enumerate(lobj[:,:2].astype(int)):
+  c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
+    for x in i12)
+  link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
 
-sns.set()
+plt.figure(figsize = (25, 10))
+plt.title('ICP Clustering Dendrogram')
+plt.xlabel('ICP ID/(Number of ICPs)')
+plt.ylabel('distance')
+dendrogram(
+    lobj,
+    labels = cmat.index.values,
+    leaf_rotation=90,
+    leaf_font_size=8,
+    # show_leaf_counts = True,
+    # truncate_mode = 'lastp',
+    # p = 50,
+    # show_contracted = True,
+    link_color_func = lambda x: link_cols[x],
+    color_threshold = None
)
+plt.show()
 
-sns.lineplot(x = 'read_time', y = 'kwh_tot', hue = 'cluster', data = mdf)
+sns.set()
+ax = sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', hue = 'cluster', data = mdagg, palette = cpal)
+for c in clabs:
+    fds = mdagg[mdagg.cluster == c]
+    ax.fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
 plt.show()
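Note on the link colouring added above: following the linked StackOverflow answer, each leaf takes the colour of its flat cluster, and each merge link takes its children's colour when they agree, falling back to black when the link straddles two clusters. A self-contained sketch of the same idea on synthetic data (the data, cluster count, and palette below are placeholders, not the ICP series):

    import numpy as np
    from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
    import matplotlib.pyplot as plt
    import seaborn as sns

    rng = np.random.default_rng(1)
    X = np.vstack([rng.normal(loc, 0.3, size=(20, 2)) for loc in (0, 3, 6)])
    Z = linkage(X, method = 'ward')
    k = 3
    clusts = fcluster(Z, k, criterion = 'maxclust')
    cpal = dict(zip(range(1, k + 1), sns.color_palette('colorblind', k).as_hex()))

    leaf_cols = {i: cpal[c] for i, c in enumerate(clusts)}  # leaf index -> cluster colour
    link_cols = {}
    for i, (a, b) in enumerate(Z[:, :2].astype(int)):
        # child indices > len(Z) refer to earlier merges, not original leaves
        c1, c2 = (link_cols[x] if x > len(Z) else leaf_cols[x] for x in (a, b))
        link_cols[i + 1 + len(Z)] = c1 if c1 == c2 else '#000000'

    dendrogram(Z, link_color_func = lambda x: link_cols[x], color_threshold = None)
    plt.show()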

py/downkwh.py  +42  -35

@@ -1,40 +1,47 @@
-from util import getQuery, pickleQuery
+from util import getQuery, pickleQuery, getkwh
 import pandas as p
 import matplotlib.pyplot as plt
 import seaborn as sns
 
-query = """
-SELECT comb.icp_id, comb.read_time, COALESCE(kwh_tot, 0) AS kwh_tot
-FROM
-(
-    SELECT read_time, icp_id
-    FROM
-    (
-        SELECT read_time 
-        FROM GENERATE_SERIES('2017-01-01 00:30:00'::timestamp, '2017-02-01 00:00:00'::timestamp, 
-            '30 minutes'::interval) read_time
-    ) AS tsdata CROSS JOIN
-    (
-        SELECT *
-        FROM
-        (
-            SELECT icp_id, COUNT(DISTINCT read_date) AS data_days 
-            FROM coup_prd.coupdatamaster
-            WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
-                AND read_date <  to_date('01/01/2018','dd/mm/yyyy')
-                AND content_code = 'UN'
-                AND icp_id LIKE '%%19'
-            GROUP BY icp_id
-        ) AS cir 
-        WHERE data_days >= 360
-    ) AS qual_icp
-) AS comb
-LEFT JOIN
-(
-    SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
-    FROM public.coup_tall_jan
-) AS tall_timestamp 
-ON comb.read_time = tall_timestamp.read_time AND comb.icp_id = tall_timestamp.icp_id;
-"""
+# query = """
+# SELECT comb.icp_id, comb.read_time, COALESCE(kwh_tot, 0) AS kwh_tot
+# FROM
+# (
+#     SELECT read_time, icp_id
+#     FROM
+#     (
+#         SELECT read_time 
+#         FROM GENERATE_SERIES('2017-01-01 00:30:00'::timestamp, '2017-02-01 00:00:00'::timestamp, 
+#             '30 minutes'::interval) read_time
+#     ) AS tsdata CROSS JOIN
+#     (
+#         SELECT *
+#         FROM
+#         (
+#             SELECT icp_id, COUNT(DISTINCT read_date) AS data_days 
+#             FROM coup_prd.coupdatamaster
+#             WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
+#                 AND read_date <  to_date('01/01/2018','dd/mm/yyyy')
+#                 AND content_code = 'UN'
+#                 AND icp_id LIKE '%%19'
+#             GROUP BY icp_id
+#         ) AS cir 
+#         WHERE data_days >= 360
+#     ) AS qual_icp
+# ) AS comb
+# LEFT JOIN
+# (
+#     SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
+#     FROM public.coup_tall_jan
+# ) AS tall_timestamp 
+# ON comb.read_time = tall_timestamp.read_time AND comb.icp_id = tall_timestamp.icp_id;
+# """
+# 
+# pickleQuery(query, "../data/jan19s.pkl")
 
-pickleQuery(query, "../data/jan19s.pkl")
+kwhdata = getkwh('2017-01-01', '2018-01-01', '2017-01-01 00:30:00', '2018-01-01 00:00:00', '%%20')
+
+print(kwhdata.info())
+
+print("Pickling")
+kwhdata.to_pickle("../data/2017-20s.pkl")
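Because the query cross-joins a complete half-hourly series against the qualifying ICPs and COALESCEs missing readings to zero, every sampled ICP should come back with a full 2017 grid of 365 × 48 = 17520 rows. A quick sanity check of the pickled output (a sketch, assuming the file path written above):

    import pandas as p

    kwhdata = p.read_pickle("../data/2017-20s.pkl")
    counts = kwhdata.groupby('icp_id').read_time.count()
    print(counts.describe())
    print((counts == 365 * 48).all())   # expect True: 17520 half-hours per ICP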

py/util.py  +66  -0

@@ -1,6 +1,7 @@
 import psycopg2 as pg
 from configparser import ConfigParser
 import pandas.io.sql as psql
+import datetime as dt
 
 
 def config(filename='database.ini', section='postgresql'):
@@ -39,6 +40,7 @@ def getQuery(query, qparams=[]):
         cur = conn.cursor()
 
         # Get table
+        print("Retrieving table")
         dataframe = psql.read_sql(query, conn, params=qparams)
 
         cur.close()
@@ -63,6 +65,70 @@ def pickleQuery(query, path, qparams=[]):
     print("Table pickled")
 
 
+def getkwh(datestart, dateend, timestart, timeend, subset):
+    query = """
+    SELECT comb.icp_id, comb.read_time, COALESCE(kwh_tot, 0) AS kwh_tot
+    FROM
+    (
+        SELECT read_time, icp_id
+        FROM
+        (
+            SELECT read_time 
+            FROM GENERATE_SERIES(%(tsstart)s::timestamp, %(tsend)s::timestamp, 
+                '30 minutes'::interval) read_time
+        ) AS tsdata CROSS JOIN
+        (
+            SELECT *
+            FROM
+            (
+                SELECT icp_id, COUNT(DISTINCT read_date) AS data_days 
+                FROM coup_prd.coupdatamaster
+                WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
+                    AND read_date <  to_date('01/01/2018','dd/mm/yyyy')
+                    AND content_code = 'UN'
+                    AND icp_id LIKE %(subset)s
+                GROUP BY icp_id
+            ) AS cir 
+            WHERE data_days >= 360
+        ) AS qual_icp
+    ) AS comb
+    LEFT JOIN
+    (
+        SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
+        FROM (
+            SELECT  a.icp_id
+                 , a.read_date
+                 , c.period
+                 , sum(c.read_kwh) as kwh_tot
+                 , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un
+                 , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn
+            FROM    coup_prd.coupdatamaster a,
+                unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
+            WHERE   a.read_date >= to_date(%(datestart)s,'yyyy-mm-dd')
+             and   a.read_date <  to_date(%(dateend)s,'yyyy-mm-dd')
+             and   a.content_code  ~ ('UN|CN|EG')
+             AND   a.icp_id LIKE %(subset)s
+            GROUP BY 1, 2, 3
+        ) AS coup_tall
+    ) AS tall_timestamp 
+    ON comb.read_time = tall_timestamp.read_time AND comb.icp_id = tall_timestamp.icp_id;
+    """
+    pdict = {
+        'datestart': datestart,
+        'dateend': dateend,
+        'tsstart': timestart,
+        'tsend': timeend,
+        'subset': subset
+    }
+    print("Getting data with parameters:")
+    print(pdict)
+    qdf = getQuery(query, pdict)
+    print("Done")
+    return(qdf)
+
+
+
+
 if __name__ == "__main__":
     dv = getQuery('SELECT version()').version[0]
     print('PostgreSQL database version:')
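getkwh() relies on getQuery() passing its parameter dict through to psycopg2, which uses pyformat placeholders: %(name)s markers in the SQL text are bound from the dict, and any literal percent sign in the statement (such as the modulo in period %% 2) must be doubled to survive substitution; bound values themselves need no doubling. A minimal sketch of the same binding against coupdatamaster (the connection details here are hypothetical):

    import psycopg2 as pg
    import pandas.io.sql as psql

    sql = """
    SELECT icp_id, COUNT(DISTINCT read_date) AS data_days
    FROM coup_prd.coupdatamaster
    WHERE read_date >= to_date(%(datestart)s, 'yyyy-mm-dd')
      AND icp_id LIKE %(subset)s
    GROUP BY icp_id;
    """

    conn = pg.connect(host="localhost", dbname="coup", user="analyst")  # hypothetical connection
    df = psql.read_sql(sql, conn, params={'datestart': '2017-01-01', 'subset': '%20'})
    print(df.head())
    conn.close()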

sql/queries.pgsql  +46  -0

@@ -298,3 +298,49 @@ WHERE   a.read_date >= to_date('01/01/2017','dd/mm/yyyy')
  and   a.content_code  ~ ('UN|CN|EG')
 GROUP BY 1, 2, 3
 ORDER BY 1, 2, 3;
+
+-- Really big query (jan)
+SELECT comb.icp_id, comb.read_time, COALESCE(kwh_tot, 0) AS kwh_tot
+FROM
+(
+    SELECT read_time, icp_id
+    FROM
+    (
+        SELECT read_time 
+        FROM GENERATE_SERIES('2017-01-01 00:30:00'::timestamp, '2017-02-01 00:00:00'::timestamp, 
+            '30 minutes'::interval) read_time
+    ) AS tsdata CROSS JOIN
+    (
+        SELECT *
+        FROM
+        (
+            SELECT icp_id, COUNT(DISTINCT read_date) AS data_days 
+            FROM coup_prd.coupdatamaster
+            WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
+                AND read_date <  to_date('01/01/2018','dd/mm/yyyy')
+                AND content_code = 'UN'
+                AND icp_id LIKE '%%19'
+            GROUP BY icp_id
+        ) AS cir 
+        WHERE data_days >= 360
+    ) AS qual_icp
+) AS comb
+LEFT JOIN
+(
+    SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
+    FROM (
+        SELECT  a.icp_id
+             , a.read_date
+             , c.period
+             , sum(c.read_kwh) as kwh_tot
+             , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un
+             , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn
+        FROM    coup_prd.coupdatamaster a,
+            unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
+        WHERE   a.read_date >= to_date('01/01/2017','dd/mm/yyyy')
+         and   a.read_date <  to_date('01/02/2017','dd/mm/yyyy')
+         and   a.content_code  ~ ('UN|CN|EG')
+        GROUP BY 1, 2, 3
+    ) AS coup_tall
+) AS tall_timestamp 
+ON comb.read_time = tall_timestamp.read_time AND comb.icp_id = tall_timestamp.icp_id;
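The expression read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time turns a half-hour period number into an interval-ending timestamp: integer division gives the hour, the remainder gives 0 or 30 minutes, so period p lands 30·p minutes after midnight (period 48 rolling over to the next day's 00:00). A small Python sketch of the same arithmetic, assuming periods run 1-48 per read_date:

    from datetime import date, datetime, timedelta

    def period_to_read_time(read_date, period):
        # Same mapping as the SQL: floor(period/2) hours + (period % 2) * 30 minutes
        # after midnight, i.e. 30 * period minutes in total.
        return datetime.combine(read_date, datetime.min.time()) + timedelta(minutes=30 * period)

    for period in (1, 2, 47, 48):
        print(period, period_to_read_time(date(2017, 1, 1), period))
    # 1 -> 2017-01-01 00:30, 2 -> 01:00, 47 -> 23:30, 48 -> 2017-01-02 00:00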