Browse Source

Moving from getting a longer dataset to some more decomposition stuff

Petra Lamborn 5 years ago
parent
commit
2355086ff5

+ 19
- 19
R/clusterviz.R View File

27
 p <- import("pandas")
27
 p <- import("pandas")
28
 sns <- import("seaborn")
28
 sns <- import("seaborn")
29
 cbp <- as.character(p$Series(sns$color_palette("colorblind", as.integer(9))$as_hex()))
29
 cbp <- as.character(p$Series(sns$color_palette("colorblind", as.integer(9))$as_hex()))
30
-aggdf <- p$read_pickle("../data/9-clusters.agg.pkl")
30
+aggdf <- p$read_pickle("../data/9-clusters-1617.agg.pkl")
31
 # aggdf <- as.data.frame(aggdf)
31
 # aggdf <- as.data.frame(aggdf)
32
 aggdf$cluster <- factor(aggdf$cluster)
32
 aggdf$cluster <- factor(aggdf$cluster)
33
 str(aggdf)
33
 str(aggdf)
37
 
37
 
38
 facall <- ggplot(aggdf, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) + 
38
 facall <- ggplot(aggdf, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) + 
39
     geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) +
39
     geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) +
40
-    labs(title = "Cluster behaviour over full year, 2017", x = "Date", y = "kwh") +
40
+    labs(title = "Cluster behaviour over 2016 and 2017", x = "Date", y = "kwh") +
41
     scale_color_manual(values = cbp) +
41
     scale_color_manual(values = cbp) +
42
     scale_fill_manual(values = cbp) +
42
     scale_fill_manual(values = cbp) +
43
     theme(legend.position = "none") +
43
     theme(legend.position = "none") +
44
-    scale_x_datetime(date_breaks = "1 month", date_labels = "%-d %B")
44
+    scale_x_datetime(date_breaks = "1 month", date_labels = "%-d %b %y")
45
 
45
 
46
 allcon <- facall + facet_grid(cluster ~ .)
46
 allcon <- facall + facet_grid(cluster ~ .)
47
 allfre <- facall + facet_grid(cluster ~ ., scales = "free")
47
 allfre <- facall + facet_grid(cluster ~ ., scales = "free")
101
 octcon <- facoct + facet_grid(cluster ~ .)
101
 octcon <- facoct + facet_grid(cluster ~ .)
102
 octfre <- facoct + facet_grid(cluster ~ ., scales = "free")
102
 octfre <- facoct + facet_grid(cluster ~ ., scales = "free")
103
 
103
 
104
-ggsave("all-9-fix.png", allcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
105
-ggsave("all-9-fre.png", allfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
106
-ggsave("jan-9-fix.png", jancon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
107
-ggsave("jan-9-fre.png", janfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
108
-ggsave("apr-9-fix.png",  apcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
109
-ggsave("apr-9-fre.png",  apfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
110
-ggsave("jul-9-fix.png", julcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
111
-ggsave("jul-9-fre.png", julfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
112
-ggsave("oct-9-fix.png", octcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
113
-ggsave("oct-9-fre.png", octfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
104
+ggsave("all-9-fix-1617.png", allcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
105
+ggsave("all-9-fre-1617.png", allfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
106
+ggsave("jan-9-fix-1617.png", jancon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
107
+ggsave("jan-9-fre-1617.png", janfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
108
+ggsave("apr-9-fix-1617.png",  apcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
109
+ggsave("apr-9-fre-1617.png",  apfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
110
+ggsave("jul-9-fix-1617.png", julcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
111
+ggsave("jul-9-fre-1617.png", julfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
112
+ggsave("oct-9-fix-1617.png", octcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
113
+ggsave("oct-9-fre-1617.png", octfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
114
 
114
 
115
 
115
 
116
 # ----
116
 # ----
142
     theme(legend.position = "none") + coord_cartesian(xlim = c(0, 15), expand = FALSE) +
142
     theme(legend.position = "none") + coord_cartesian(xlim = c(0, 15), expand = FALSE) +
143
     labs(title = "Autocorrelation plot (two weeks)", y = "Autocorrelation", x = "lag (days)")
143
     labs(title = "Autocorrelation plot (two weeks)", y = "Autocorrelation", x = "lag (days)")
144
 
144
 
145
-ggsave("full-autocorr.png", fcorr, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
146
-ggsave("week-autocorr.png", wcorr, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
145
+ggsave("full-autocorr-1617.png", fcorr, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
146
+ggsave("week-autocorr-1617.png", wcorr, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
147
 
147
 
148
 perd <- bind_rows(perd)
148
 perd <- bind_rows(perd)
149
 
149
 
160
 plot(forecast(ctbats, h = 48 * 7 * 4))
160
 plot(forecast(ctbats, h = 48 * 7 * 4))
161
 
161
 
162
 c9ts <- filter(aggdf, cluster == "9")$kwh_tot_mean
162
 c9ts <- filter(aggdf, cluster == "9")$kwh_tot_mean
163
-ctsnp <- msts(c9ts, c(48, 48*7))
163
+ctsnp <- msts(c9ts, c(48, 48*7, 48*365.25))  # seasonal periods in half-hour readings: day, week, year
164
 ctbats <- tbats(ctsnp)
164
 ctbats <- tbats(ctsnp)
165
-plot(forecast(ctbats, h = 48 * 7 * 4))
165
+plot(forecast(ctbats, h = 48 * 7))
166
 
166
 
167
 p <- periodogram(c1ts)
167
 p <- periodogram(c1ts)
168
 dd <- data.frame(freq = p$freq, spec = p$spec) %>% mutate(per = 1/freq)
168
 dd <- data.frame(freq = p$freq, spec = p$spec) %>% mutate(per = 1/freq)
172
 
172
 
173
 ggplot(c9ts, aes(x = read_time, y = kwh_tot_mean)) + geom_line()
173
 ggplot(c9ts, aes(x = read_time, y = kwh_tot_mean)) + geom_line()
174
 
174
 
175
-nft <- fextract(c9ts$read_time, c9ts$kwh_tot_mean, keep = 10)
175
+nft <- fextract(c9ts$read_time, c9ts$kwh_tot_mean, keep = 15)
176
 ggplot(nft, aes(x, y)) + geom_line() + 
176
 ggplot(nft, aes(x, y)) + geom_line() + 
177
     geom_line(aes(x, f), color = "blue") +
177
     geom_line(aes(x, f), color = "blue") +
178
     scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y") +
178
     scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y") +
179
-    coord_cartesian(xlim = c(as.POSIXct("2017-07-16", tz = "UTC"), as.POSIXct("2017-07-23", tz = "UTC")), expand = TRUE)
179
+    coord_cartesian(xlim = c(as.POSIXct("2016-07-16", tz = "UTC"), as.POSIXct("2016-07-23", tz = "UTC")), expand = TRUE)
180
 
180
 
181
 clus <- "9"
181
 clus <- "9"
182
 kp <- 50
182
 kp <- 50

BIN
img/all-9-fix-1617.png View File


BIN
img/all-9-fre-1617.png View File


BIN
img/apr-9-fix-1617.png View File


BIN
img/apr-9-fre-1617.png View File


BIN
img/full-autocorr-1617.png View File


BIN
img/jan-9-fix-1617.png View File


BIN
img/jan-9-fre-1617.png View File


BIN
img/jul-9-fix-1617.png View File


BIN
img/jul-9-fre-1617.png View File


BIN
img/oct-9-fix-1617.png View File


BIN
img/oct-9-fre-1617.png View File


BIN
img/week-autocorr-1617.png View File


+ 3
- 0
py/clustering.py View File

39
 df = p.read_pickle('../data/2016-17-sample.pkl')
39
 df = p.read_pickle('../data/2016-17-sample.pkl')
40
 dforig = df
40
 dforig = df
41
 
41
 
42
+# print(df)
43
+
42
 print(df.info())
44
 print(df.info())
43
 print(df.icp_id.nunique())
45
 print(df.icp_id.nunique())
44
 print(df.read_time.nunique())
46
 print(df.read_time.nunique())
45
 # print(df.groupby('icp_id').read_time.nunique().nunique())
47
 # print(df.groupby('icp_id').read_time.nunique().nunique())
46
 df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
48
 df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
49
+print(df.info())
47
 df = df[df.columns[df.max() != df.min()]]
50
 df = df[df.columns[df.max() != df.min()]]
48
 print(df.info())
51
 print(df.info())
49
 cmat = df.corr()
52
 cmat = df.corr()

+ 2
- 2
py/util.py View File

78
             SELECT read_time 
78
             SELECT read_time 
79
             FROM GENERATE_SERIES(%(tsstart)s::timestamp, %(tsend)s::timestamp, 
79
             FROM GENERATE_SERIES(%(tsstart)s::timestamp, %(tsend)s::timestamp, 
80
                 '30 minutes'::interval) read_time
80
                 '30 minutes'::interval) read_time
81
-        ) AS tsdata CROSS JOIN public.icp_sample
81
+        ) AS tsdata CROSS JOIN public.icp_sample_1618
82
     ) AS comb
82
     ) AS comb
83
     LEFT JOIN
83
     LEFT JOIN
84
     (
84
     (
96
              and   a.read_date <  to_date(%(dateend)s,'yyyy-mm-dd')
96
              and   a.read_date <  to_date(%(dateend)s,'yyyy-mm-dd')
97
              and   a.content_code  ~ ('UN|CN|EG')
97
              and   a.content_code  ~ ('UN|CN|EG')
98
              AND   a.icp_id IN (
98
              AND   a.icp_id IN (
99
-                SELECT icp_id FROM public.icp_sample
99
+                SELECT icp_id FROM public.icp_sample_1618
100
              )
100
              )
101
             GROUP BY 1, 2, 3
101
             GROUP BY 1, 2, 3
102
         ) AS coup_tall
102
         ) AS coup_tall

+ 46
- 0
sql/queries.pgsql View File

368
 -- range of date values
368
 -- range of date values
369
 SELECT MIN(read_date) AS mindate, MAX(read_date) AS maxdate
369
 SELECT MIN(read_date) AS mindate, MAX(read_date) AS maxdate
370
 FROM coup_prd.coupdatamaster;
370
 FROM coup_prd.coupdatamaster;
371
+
372
+-- View for best data in 18 month set
373
+CREATE VIEW public.best_icp_18m AS
374
+SELECT *
375
+FROM
376
+(
377
+    SELECT icp_id, COUNT(DISTINCT read_date) AS data_days 
378
+    FROM coup_prd.coupdatamaster
379
+    WHERE read_date >= to_date('01/07/2016','dd/mm/yyyy')
380
+        AND read_date <  to_date('01/01/2018','dd/mm/yyyy')
381
+        AND content_code = 'UN'
382
+    GROUP BY icp_id
383
+) AS cir 
384
+WHERE data_days >= 540;
385
+
386
+-- Produces sample table for 18m data
387
+CREATE TABLE public.icp_sample_18m AS
388
+SELECT * FROM public.best_icp_18m
389
+ORDER BY random()
390
+LIMIT 1000;
391
+
392
+-- Number of ICPs with data for each day
393
+SELECT read_date, COUNT(DISTINCT icp_id) as icps
394
+FROM coup_prd.coupdatamaster
395
+GROUP BY read_date
396
+ORDER BY read_date;
397
+
398
+-- View for best data in 2016-18 set
399
+CREATE VIEW public.best_icp_1618 AS
400
+SELECT *
401
+FROM
402
+(
403
+    SELECT icp_id, COUNT(DISTINCT read_date) AS data_days 
404
+    FROM coup_prd.coupdatamaster
405
+    WHERE read_date >= to_date('01/04/2016','dd/mm/yyyy')
406
+        AND read_date <  to_date('01/04/2018','dd/mm/yyyy')
407
+        AND content_code = 'UN'
408
+    GROUP BY icp_id
409
+) AS cir 
410
+WHERE data_days >= 720;
411
+
412
+-- Produces sample table for 1618 data
413
+CREATE TABLE public.icp_sample_1618 AS
414
+SELECT * FROM public.best_icp_1618
415
+ORDER BY random()
416
+LIMIT 1000;