Repository for Petra's work at ampli Jan-Feb 2019

clusterviz.R 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312
  1. library(reticulate)
  2. library(ggplot2)
  3. library(dplyr)
  4. library(tidyr)
  5. library(TSA)
  6. library(forecast)
  7. theme_set(theme_bw())
  8. use_virtualenv("../venv/")
  9. fextract <- function(x, y, keep = 1, top = TRUE) {
  10. sdy <- sd(y)
  11. my <- mean(y)
  12. stany <- (y - my) / sdy
  13. ftf <- fft(stany)
  14. if (top) {
  15. ftf[rank(-abs(ftf)) > keep] <- 0
  16. } else {
  17. ftf[(keep + 1):length(ftf)] <- 0
  18. }
  19. rfv <- Re(fft(ftf, inverse = TRUE))
  20. dfr <- data.frame(x = x, y = y, f = (rfv - mean(rfv)) / sd(rfv) * sdy + my)
  21. dfr$res <- dfr$y - dfr$f
  22. return(dfr)
  23. }
  24. p <- import("pandas")
  25. sns <- import("seaborn")
  26. cbp <- as.character(p$Series(sns$color_palette("colorblind", as.integer(9))$as_hex()))
  27. aggdf <- p$read_pickle("../data/9-clusters.agg.pkl")
  28. # aggdf <- as.data.frame(aggdf)
  29. aggdf$cluster <- factor(aggdf$cluster)
  30. str(aggdf)
  31. clusters = levels(aggdf$cluster)
  32. mtempdf <- read.csv("../data/weatherharm.csv", stringsAsFactors = FALSE) %>% mutate(x = as.POSIXct(x, tz = "UTC"))
  33. ggplot(aggdf, aes(y = kwh_tot_mean, x = cluster)) + geom_boxplot()
  34. facall <- ggplot(aggdf, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) +
  35. geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) +
  36. labs(title = "Cluster behaviour over 2017", x = "Date", y = "kwh") +
  37. scale_color_manual(values = cbp) +
  38. scale_fill_manual(values = cbp) +
  39. theme(legend.position = "none") +
  40. scale_x_datetime(date_breaks = "1 month", date_labels = "%-d %b %y")
  41. allcon <- facall + facet_grid(cluster ~ .)
  42. allfre <- facall + facet_grid(cluster ~ ., scales = "free")
  43. midjan <- filter(aggdf, read_time >= as.POSIXct("2017-01-15", tz = "UTC"), read_time <= as.POSIXct("2017-01-22", tz = "UTC"))
  44. facjan <- ggplot(midjan, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) +
  45. geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) +
  46. labs(title = "Cluster behaviour over third week of January", x = "Date", y = "kwh") +
  47. scale_color_manual(values = cbp) +
  48. scale_fill_manual(values = cbp) +
  49. theme(legend.position = "none") +
  50. scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y")
  51. jancon <- facjan + facet_grid(cluster ~ .)
  52. janfre <- facjan + facet_grid(cluster ~ ., scales = "free")
  53. midap <- filter(aggdf, read_time >= as.POSIXct("2017-04-16", tz = "UTC"), read_time <= as.POSIXct("2017-04-23", tz = "UTC"))
  54. facap <- ggplot(midap, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) +
  55. geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) +
  56. labs(title = "Cluster behaviour over third week of April 2017", x = "Date", y = "kwh") +
  57. scale_color_manual(values = cbp) +
  58. scale_fill_manual(values = cbp) +
  59. theme(legend.position = "none") +
  60. scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y")
  61. apcon <- facap + facet_grid(cluster ~ .)
  62. apfre <- facap + facet_grid(cluster ~ ., scales = "free")
  63. midjul <- filter(aggdf, read_time >= as.POSIXct("2017-07-16", tz = "UTC"), read_time <= as.POSIXct("2017-07-23", tz = "UTC"))
  64. facjul <- ggplot(midjul, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) +
  65. geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) +
  66. labs(title = "Cluster behaviour over third week of July 2017", x = "Date", y = "kwh") +
  67. scale_color_manual(values = cbp) +
  68. scale_fill_manual(values = cbp) +
  69. theme(legend.position = "none") +
  70. scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y")
  71. julcon <- facjul + facet_grid(cluster ~ .)
  72. julfre <- facjul + facet_grid(cluster ~ ., scales = "free")
  73. midoct <- filter(aggdf, read_time >= as.POSIXct("2017-10-15", tz = "UTC"), read_time <= as.POSIXct("2017-10-22", tz = "UTC"))
  74. facoct <- ggplot(midoct, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) +
  75. geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) +
  76. labs(title = "Cluster behaviour over third week of October 2017", x = "Date", y = "kwh") +
  77. scale_color_manual(values = cbp) +
  78. scale_fill_manual(values = cbp) +
  79. theme(legend.position = "none") +
  80. scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y")
  81. octcon <- facoct + facet_grid(cluster ~ .)
  82. octfre <- facoct + facet_grid(cluster ~ ., scales = "free")
  83. # ggsave("all-9-fix-1617.png", allcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  84. # ggsave("all-9-fre-1617.png", allfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  85. # ggsave("jan-9-fix-1617.png", jancon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  86. # ggsave("jan-9-fre-1617.png", janfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  87. # ggsave("apr-9-fix-1617.png", apcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  88. # ggsave("apr-9-fre-1617.png", apfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  89. # ggsave("jul-9-fix-1617.png", julcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  90. # ggsave("jul-9-fre-1617.png", julfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  91. # ggsave("oct-9-fix-1617.png", octcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  92. # ggsave("oct-9-fre-1617.png", octfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  93. # ----
  94. cacf <- list()
  95. perd <- list()
  96. for (c in clusters) {
  97. cagg <- filter(aggdf, cluster == c)
  98. cacf[[c]] <- acf(cagg$kwh_tot_mean, lag.max = 48*365, plot = FALSE)$acf
  99. per <- periodogram(cagg$kwh_tot_mean, plot = FALSE)
  100. perd[[c]] <- data.frame(freq = per$freq, spec = per$spec) %>% mutate(cluster = c, period = (1/freq))
  101. #%>% arrange(desc(spec)) %>% head(5)
  102. }
  103. acfm <- sapply(cacf, as.numeric) %>% as.data.frame() %>% mutate(hour = ((1:length(`1`)) - 1)/2) %>%
  104. gather(key = "cluster", value = "acorr", clusters) %>% mutate(day = hour / 24, week = hour / (24 * 7))
  105. fcorr <- ggplot(acfm, aes(x = week, y = acorr, color = cluster)) + geom_line(size = 1.5) +
  106. scale_color_manual(values = cbp) + facet_grid(cluster ~ .) + coord_cartesian(expand = FALSE) +
  107. theme(legend.position = "none") + labs(title = "Autocorrelation plot (full year)",
  108. y = "Autocorrelation", x = "lag (weeks)")
  109. wcorr <- ggplot(acfm, aes(x = day, y = acorr, color = cluster)) + geom_line(size = 1.5) +
  110. scale_color_manual(values = cbp) + facet_grid(cluster ~ .) +
  111. scale_x_continuous(breaks = unique(floor(acfm$day / 7)) * 7) +
  112. theme(legend.position = "none") + coord_cartesian(xlim = c(0, 15), expand = FALSE) +
  113. labs(title = "Autocorrelation plot (two weeks)", y = "Autocorrelation", x = "lag (days)")
  114. # ggsave("full-autocorr-1617.png", fcorr, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  115. # ggsave("week-autocorr-1617.png", wcorr, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
  116. perd <- bind_rows(perd)
  117. ggplot(perd, aes(x = freq, y = spec)) +
  118. geom_line() + facet_grid(cluster ~ ., scales = "free") +
  119. scale_x_continuous(breaks = 1 / (c(48, 48*7, 48*7*4, 48*365)))
  120. c1ts <- filter(aggdf, cluster == "1")$kwh_tot_mean
  121. cts <- ts(c1ts, frequency = 48, start = c(1, 1))
  122. # carima <- auto.arima(cts, trace = TRUE)
  123. # plot(forecast(carima, h = 480))
  124. ctsnp <- msts(c1ts, c(48, 48*7))
  125. ctbats <- tbats(ctsnp)
  126. plot(forecast(ctbats, h = 48 * 7 * 4))
  127. c9ts <- filter(aggdf, cluster == "9")$kwh_tot_mean
  128. ctsnp <- msts(c9ts, c(48, 48*7, 48*7*365.25))
  129. ctbats <- tbats(ctsnp)
  130. plot(forecast(ctbats, h = 48 * 7))
  131. p <- periodogram(c1ts)
  132. dd <- data.frame(freq = p$freq, spec = p$spec) %>% mutate(per = 1/freq)
  133. dd %>% arrange(desc(spec)) %>% mutate(d = per / 48) %>% head()
  134. c9ts <- filter(aggdf, cluster == "9")
  135. ggplot(c9ts, aes(x = read_time, y = kwh_tot_mean)) + geom_line()
  136. nft <- fextract(c9ts$read_time, c9ts$kwh_tot_mean, keep = 15)
  137. ggplot(nft, aes(x, y)) + geom_line() +
  138. geom_line(aes(x, f), color = "blue") +
  139. scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y") +
  140. coord_cartesian(xlim = c(as.POSIXct("2016-07-16", tz = "UTC"), as.POSIXct("2016-07-23", tz = "UTC")), expand = TRUE)
  141. clus <- "9"
  142. kp <- 50
  143. cts <- filter(aggdf, cluster == clus)
  144. # ggplot(cts, aes(x = read_time, y = kwh_tot_mean)) + geom_line()
  145. nft <- fextract(cts$read_time, cts$kwh_tot_mean, keep = kp)
  146. ggplot(nft, aes(x, y)) + geom_line() +
  147. geom_line(aes(x, f), color = "blue") +
  148. scale_x_datetime(date_breaks = "2 days", date_labels = "%a, %-d %b %Y") +
  149. coord_cartesian(xlim = c(as.POSIXct("2017-09-14", tz = "UTC"), as.POSIXct("2017-09-28", tz = "UTC")), expand = TRUE)
  150. ggplot(nft, aes(x, res)) + geom_line() + scale_x_datetime(date_breaks = "1 week") +
  151. coord_cartesian(xlim = c(as.POSIXct("2017-08-01", tz = "UTC"), as.POSIXct("2017-09-28", tz = "UTC")), expand = TRUE)
  152. ggplot(nft, aes(y = res)) + geom_boxplot()
  153. ggplot(nft, aes(x = res)) + geom_histogram()
  154. c9ts <- filter(aggdf, cluster == "9")$kwh_tot_mean
  155. c9ft <- as.numeric(abs(fft(c9ts)))
  156. c9ftdf <- data.frame(x = 1:length(c9ft), ft = c9ft)
  157. # %>% arrange(desc(ft)) %>% mutate(xt = 1/x)
  158. c9df <- data.frame(x = 1:length(c9ts), y = c9ts, ly = log(c9ts + min(c9ts) + 1))
  159. c9harm. <- harmonic(ts(c9ts, frequency = 17544), 2)
  160. sinmod <- lm(c9df$y ~ c9harm.)
  161. c9df$fit <- fitted(sinmod)
  162. c9df$res <- resid(sinmod)
  163. ggplot(c9df, aes(x = x, y = ly)) + geom_point() +
  164. geom_line(aes(y = fit), size = 2, colour = "blue")
  165. plot(c(c9df$res))
  166. hist(log(c9df$y), breaks = 30)
  167. n <- 1.5
  168. ytra <- c9df$y / c9df$fit^n - 1/c9df$fit^(n-1)
  169. plot(ytra)
  170. yts <- msts(ytra, seasonal.periods = c(48, 48*7))
  171. autoplot(mstl(yts))
  172. ytsd <- ts(ytra, frequency = 48)
  173. ydharm. <- harmonic(ytsd, 10)
  174. sdmod <- lm(ytra ~ ydharm.)
  175. sddf <- data.frame(x = 1:length(ytra), y = ytra, f = fitted(sdmod), r = resid(sdmod))
  176. ggplot(sddf[1:(48*7*2), ], aes(x = x, y = y)) + geom_point() +
  177. geom_line(aes(y = f), colour = "blue", size = 2)
  178. plot(resid(sdmod)[1:(48*7*2)])
  179. c9tsy <- ts(c9ts, frequency = floor(48 * 365.25))
  180. c9tsw <- ts(c9ts, frequency = floor(48 * 7))
  181. c9tsd <- ts(c9ts, frequency = floor(48))
  182. harm.y <- harmonic(c9tsy, 2)
  183. harm.w <- harmonic(c9tsw, 2)
  184. harm.d <- harmonic(c9tsd, 6)
  185. cmod <- lm(c9ts ~ harm.y + harm.w + harm.d)
  186. summary(cmod)
  187. plot(fitted(cmod))
  188. plot(resid(cmod))
  189. cmodi <- lm(c9ts ~ harm.y * harm.w * harm.d)
  190. summary(cmodi)
  191. plot(fitted(cmodi))
  192. plot(c9ts)
  193. plot(resid(cmodi))
  194. hist(resid(cmodi), breaks = 100)
  195. # Heavy tails
  196. qqnorm(resid(cmodi))
  197. clust = "5"
  198. cts <- filter(aggdf, cluster == clust)$kwh_tot_mean
  199. ctsy <- ts(cts, frequency = floor(48 * 365.25))
  200. ctsw <- ts(cts, frequency = floor(48 * 7))
  201. ctsd <- ts(cts, frequency = floor(48))
  202. harm.y <- harmonic(ctsy, 4)
  203. harm.w <- harmonic(ctsw, 3)
  204. harm.d <- harmonic(ctsd, 4)
  205. cmodi <- lm(cts ~ harm.y * harm.w * harm.d)
  206. summary(cmodi)
  207. cldf <- data.frame(x = unique(aggdf$read_time), y = cts, f = fitted(cmodi), r = resid(cmodi))
  208. ggplot(cldf, aes(x = x, y = y)) + geom_line(aes(y = f), color = "blue", size = 2) + geom_point() +
  209. geom_point(aes(y = r), color = "darkgreen") +
  210. coord_cartesian(xlim = c(as.POSIXct("2017-01-01", tz = "UTC"), as.POSIXct("2017-12-31", tz = "UTC")))
  211. cwcombdf <- left_join(cldf, mtempdf, by = "x")
  212. str(cwcombdf)
  213. # A variety of models
  214. plot(f.x ~ f.y, data = cwcombdf)
  215. plot(r.x ~ r.y, data = cwcombdf)
  216. plot(r.x ~ y.y, data = cwcombdf)
  217. plot(y.x ~ r.y, data = cwcombdf)
  218. plot(y.x ~ x, data = cwcombdf)
  219. plot(y.x ~ y.y, data = cwcombdf)
  220. plot(y.x ~ y.y, data = cwcombdf)
  221. plot(y.y ~ r.x, data = cwcombdf)
  222. plot(y.y ~ x, data = cwcombdf)
  223. summary(lm(f.x ~ f.y, data = cwcombdf))
  224. summary(lm(r.x ~ r.y, data = cwcombdf))
  225. summary(lm(r.x ~ y.y, data = cwcombdf))
  226. summary(lm(y.x ~ r.y, data = cwcombdf))
  227. summary(lm(y.x ~ x, data = cwcombdf))
  228. summary(lm(y.x ~ y.y, data = cwcombdf))
  229. summary(lm(y.x ~ y.y, data = cwcombdf))
  230. summary(lm(y.y ~ r.x, data = cwcombdf))
  231. summary(lm(y.y ~ x, data = cwcombdf))
  232. summary(lm(r.x ~ r.y * f.y, data = cwcombdf))
  233. imod <- lm(r.x ~ r.y * f.y, data = cwcombdf)
  234. plot(fitted(imod))
  235. plot(resid(imod))
  236. plot(cwcombdf$r.x)
  237. plot(cwcombdf$r.x ~ resid(imod))