Repository for Petra's work at ampli Jan-Feb 2019

clusterviz.R 7.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. library(reticulate)
  2. library(ggplot2)
  3. library(dplyr)
  4. library(tidyr)
  5. library(TSA)
  6. library(forecast)
  7. theme_set(theme_bw())
  8. use_virtualenv("../venv/")
  9. p <- import("pandas")
  10. sns <- import("seaborn")
  11. cbp <- as.character(p$Series(sns$color_palette("colorblind", as.integer(9))$as_hex()))
  12. aggdf <- p$read_pickle("../data/9-clusters.agg.pkl")
  13. aggdf <- as.data.frame(aggdf)
  14. aggdf$cluster <- factor(aggdf$cluster)
  15. str(aggdf)
  16. clusters = levels(aggdf$cluster)
  17. ggplot(aggdf, aes(y = kwh_tot_mean, x = cluster)) + geom_boxplot()
  18. facall <- ggplot(aggdf, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) +
  19. geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) +
  20. labs(title = "Cluster behaviour over full year, 2017", x = "Date", y = "kwh") +
  21. scale_color_manual(values = cbp) +
  22. scale_fill_manual(values = cbp) +
  23. theme(legend.position = "none") +
  24. scale_x_datetime(date_breaks = "1 month", date_labels = "%-d %B")
  25. allcon <- facall + facet_grid(cluster ~ .)
  26. allfre <- facall + facet_grid(cluster ~ ., scales = "free")
  27. midjan <- filter(aggdf, read_time >= as.POSIXct("2017-01-15", tz = "UTC"), read_time <= as.POSIXct("2017-01-22", tz = "UTC"))
  28. facjan <- ggplot(midjan, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) +
  29. geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) +
  30. labs(title = "Cluster behaviour over third week of January", x = "Date", y = "kwh") +
  31. scale_color_manual(values = cbp) +
  32. scale_fill_manual(values = cbp) +
  33. theme(legend.position = "none") +
  34. scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y")
  35. jancon <- facjan + facet_grid(cluster ~ .)
  36. janfre <- facjan + facet_grid(cluster ~ ., scales = "free")
  37. midap <- filter(aggdf, read_time >= as.POSIXct("2017-04-16", tz = "UTC"), read_time <= as.POSIXct("2017-04-23", tz = "UTC"))
  38. facap <- ggplot(midap, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) +
  39. geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) +
  40. labs(title = "Cluster behaviour over third week of April 2017", x = "Date", y = "kwh") +
  41. scale_color_manual(values = cbp) +
  42. scale_fill_manual(values = cbp) +
  43. theme(legend.position = "none") +
  44. scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y")
  45. apcon <- facap + facet_grid(cluster ~ .)
  46. apfre <- facap + facet_grid(cluster ~ ., scales = "free")
  47. midjul <- filter(aggdf, read_time >= as.POSIXct("2017-07-16", tz = "UTC"), read_time <= as.POSIXct("2017-07-23", tz = "UTC"))
  48. facjul <- ggplot(midjul, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) +
  49. geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) +
  50. labs(title = "Cluster behaviour over third week of July 2017", x = "Date", y = "kwh") +
  51. scale_color_manual(values = cbp) +
  52. scale_fill_manual(values = cbp) +
  53. theme(legend.position = "none") +
  54. scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y")
  55. julcon <- facjul + facet_grid(cluster ~ .)
  56. julfre <- facjul + facet_grid(cluster ~ ., scales = "free")
  57. midoct <- filter(aggdf, read_time >= as.POSIXct("2017-10-15", tz = "UTC"), read_time <= as.POSIXct("2017-10-22", tz = "UTC"))
  58. facoct <- ggplot(midoct, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) +
  59. geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) +
  60. labs(title = "Cluster behaviour over third week of October 2017", x = "Date", y = "kwh") +
  61. scale_color_manual(values = cbp) +
  62. scale_fill_manual(values = cbp) +
  63. theme(legend.position = "none") +
  64. scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y")
  65. octcon <- facoct + facet_grid(cluster ~ .)
  66. octfre <- facoct + facet_grid(cluster ~ ., scales = "free")
  67. ggsave("all-9-fix.png", allcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  68. ggsave("all-9-fre.png", allfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  69. ggsave("jan-9-fix.png", jancon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  70. ggsave("jan-9-fre.png", janfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  71. ggsave("apr-9-fix.png", apcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  72. ggsave("apr-9-fre.png", apfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  73. ggsave("jul-9-fix.png", julcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  74. ggsave("jul-9-fre.png", julfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  75. ggsave("oct-9-fix.png", octcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  76. ggsave("oct-9-fre.png", octfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  77. # ----
  78. cacf <- list()
  79. perd <- list()
  80. for (c in clusters) {
  81. cagg <- filter(aggdf, cluster == c)
  82. cacf[[c]] <- acf(cagg$kwh_tot_mean, lag.max = 48*365, plot = FALSE)$acf
  83. per <- periodogram(cagg$kwh_tot_mean, plot = FALSE)
  84. perd[[c]] <- data.frame(freq = per$freq, spec = per$spec) %>% mutate(cluster = c, period = (1/freq))
  85. #%>% arrange(desc(spec)) %>% head(5)
  86. }
  87. acfm <- sapply(cacf, as.numeric) %>% as.data.frame() %>% mutate(hour = ((1:length(`1`)) - 1)/2) %>%
  88. gather(key = "cluster", value = "acorr", clusters) %>% mutate(day = hour / 24, week = hour / (24 * 7))
  89. fcorr <- ggplot(acfm, aes(x = week, y = acorr, color = cluster)) + geom_line(size = 1.5) +
  90. scale_color_manual(values = cbp) + facet_grid(cluster ~ .) + coord_cartesian(expand = FALSE) +
  91. theme(legend.position = "none") + labs(title = "Autocorrelation plot (full year)",
  92. y = "Autocorrelation", x = "lag (weeks)")
  93. wcorr <- ggplot(acfm, aes(x = day, y = acorr, color = cluster)) + geom_line(size = 1.5) +
  94. scale_color_manual(values = cbp) + facet_grid(cluster ~ .) +
  95. scale_x_continuous(breaks = unique(floor(acfm$day / 7)) * 7) +
  96. theme(legend.position = "none") + coord_cartesian(xlim = c(0, 15), expand = FALSE) +
  97. labs(title = "Autocorrelation plot (two weeks)", y = "Autocorrelation", x = "lag (days)")
  98. ggsave("full-autocorr.png", fcorr, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  99. ggsave("week-autocorr.png", wcorr, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  100. perd <- bind_rows(perd)
  101. ggplot(perd, aes(x = freq, y = spec)) +
  102. geom_line() + facet_grid(cluster ~ ., scales = "free") +
  103. scale_x_continuous(breaks = 1 / (c(48, 48*7, 48*7*4, 48*365)))
  104. c1ts <- filter(aggdf, cluster == "1")$kwh_tot_mean
  105. cts <- ts(c1ts, frequency = 48, start = c(1, 1))
  106. # carima <- auto.arima(cts, trace = TRUE)
  107. # plot(forecast(carima, h = 480))
  108. ctsnp <- msts(c1ts, c(48, 48*7))
  109. ctbats <- tbats(ctsnp)
  110. plot(forecast(ctbats, h = 48 * 7 * 4))
  111. c9ts <- filter(aggdf, cluster == "9")$kwh_tot_mean
  112. ctsnp <- msts(c9ts, c(48, 48*7))
  113. ctbats <- tbats(ctsnp)
  114. plot(forecast(ctbats, h = 48 * 7 * 4))