library(reticulate) library(ggplot2) library(dplyr) library(tidyr) library(TSA) library(forecast) theme_set(theme_bw()) use_virtualenv("../venv/") fextract <- function(x, y, keep = 1, top = TRUE) { sdy <- sd(y) my <- mean(y) stany <- (y - my) / sdy ftf <- fft(stany) if (top) { ftf[rank(-abs(ftf)) > keep] <- 0 } else { ftf[(keep + 1):length(ftf)] <- 0 } rfv <- Re(fft(ftf, inverse = TRUE)) dfr <- data.frame(x = x, y = y, f = (rfv - mean(rfv)) / sd(rfv) * sdy + my) dfr$res <- dfr$y - dfr$f return(dfr) } p <- import("pandas") sns <- import("seaborn") cbp <- as.character(p$Series(sns$color_palette("colorblind", as.integer(9))$as_hex())) aggdf <- p$read_pickle("../data/9-clusters.agg.pkl") # aggdf <- as.data.frame(aggdf) aggdf$cluster <- factor(aggdf$cluster) str(aggdf) clusters = levels(aggdf$cluster) mtempdf <- read.csv("../data/weatherharm.csv", stringsAsFactors = FALSE) %>% mutate(x = as.POSIXct(x, tz = "UTC")) ggplot(aggdf, aes(y = kwh_tot_mean, x = cluster)) + geom_boxplot() facall <- ggplot(aggdf, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) + geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) + labs(title = "Cluster behaviour over 2017", x = "Date", y = "kwh") + scale_color_manual(values = cbp) + scale_fill_manual(values = cbp) + theme(legend.position = "none") + scale_x_datetime(date_breaks = "1 month", date_labels = "%-d %b %y") allcon <- facall + facet_grid(cluster ~ .) allfre <- facall + facet_grid(cluster ~ ., scales = "free") midjan <- filter(aggdf, read_time >= as.POSIXct("2017-01-15", tz = "UTC"), read_time <= as.POSIXct("2017-01-22", tz = "UTC")) facjan <- ggplot(midjan, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) + geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) + labs(title = "Cluster behaviour over third week of January", x = "Date", y = "kwh") + scale_color_manual(values = cbp) + scale_fill_manual(values = cbp) + theme(legend.position = "none") + scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y") jancon <- facjan + facet_grid(cluster ~ .) janfre <- facjan + facet_grid(cluster ~ ., scales = "free") midap <- filter(aggdf, read_time >= as.POSIXct("2017-04-16", tz = "UTC"), read_time <= as.POSIXct("2017-04-23", tz = "UTC")) facap <- ggplot(midap, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) + geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) + labs(title = "Cluster behaviour over third week of April 2017", x = "Date", y = "kwh") + scale_color_manual(values = cbp) + scale_fill_manual(values = cbp) + theme(legend.position = "none") + scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y") apcon <- facap + facet_grid(cluster ~ .) apfre <- facap + facet_grid(cluster ~ ., scales = "free") midjul <- filter(aggdf, read_time >= as.POSIXct("2017-07-16", tz = "UTC"), read_time <= as.POSIXct("2017-07-23", tz = "UTC")) facjul <- ggplot(midjul, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) + geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) + labs(title = "Cluster behaviour over third week of July 2017", x = "Date", y = "kwh") + scale_color_manual(values = cbp) + scale_fill_manual(values = cbp) + theme(legend.position = "none") + scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y") julcon <- facjul + facet_grid(cluster ~ .) julfre <- facjul + facet_grid(cluster ~ ., scales = "free") midoct <- filter(aggdf, read_time >= as.POSIXct("2017-10-15", tz = "UTC"), read_time <= as.POSIXct("2017-10-22", tz = "UTC")) facoct <- ggplot(midoct, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) + geom_line(size = 1.5) + geom_ribbon(aes(ymin = kwh_tot_CI_low, ymax = kwh_tot_CI_high), alpha = 0.2, color = NA) + labs(title = "Cluster behaviour over third week of October 2017", x = "Date", y = "kwh") + scale_color_manual(values = cbp) + scale_fill_manual(values = cbp) + theme(legend.position = "none") + scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y") octcon <- facoct + facet_grid(cluster ~ .) octfre <- facoct + facet_grid(cluster ~ ., scales = "free") # ggsave("all-9-fix-1617.png", allcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm") # ggsave("all-9-fre-1617.png", allfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm") # ggsave("jan-9-fix-1617.png", jancon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm") # ggsave("jan-9-fre-1617.png", janfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm") # ggsave("apr-9-fix-1617.png", apcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm") # ggsave("apr-9-fre-1617.png", apfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm") # ggsave("jul-9-fix-1617.png", julcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm") # ggsave("jul-9-fre-1617.png", julfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm") # ggsave("oct-9-fix-1617.png", octcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm") # ggsave("oct-9-fre-1617.png", octfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm") # ---- cacf <- list() perd <- list() for (c in clusters) { cagg <- filter(aggdf, cluster == c) cacf[[c]] <- acf(cagg$kwh_tot_mean, lag.max = 48*365, plot = FALSE)$acf per <- periodogram(cagg$kwh_tot_mean, plot = FALSE) perd[[c]] <- data.frame(freq = per$freq, spec = per$spec) %>% mutate(cluster = c, period = (1/freq)) #%>% arrange(desc(spec)) %>% head(5) } acfm <- sapply(cacf, as.numeric) %>% as.data.frame() %>% mutate(hour = ((1:length(`1`)) - 1)/2) %>% gather(key = "cluster", value = "acorr", clusters) %>% mutate(day = hour / 24, week = hour / (24 * 7)) fcorr <- ggplot(acfm, aes(x = week, y = acorr, color = cluster)) + geom_line(size = 1.5) + scale_color_manual(values = cbp) + facet_grid(cluster ~ .) + coord_cartesian(expand = FALSE) + theme(legend.position = "none") + labs(title = "Autocorrelation plot (full year)", y = "Autocorrelation", x = "lag (weeks)") wcorr <- ggplot(acfm, aes(x = day, y = acorr, color = cluster)) + geom_line(size = 1.5) + scale_color_manual(values = cbp) + facet_grid(cluster ~ .) + scale_x_continuous(breaks = unique(floor(acfm$day / 7)) * 7) + theme(legend.position = "none") + coord_cartesian(xlim = c(0, 15), expand = FALSE) + labs(title = "Autocorrelation plot (two weeks)", y = "Autocorrelation", x = "lag (days)") # ggsave("full-autocorr-1617.png", fcorr, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm") # ggsave("week-autocorr-1617.png", wcorr, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm") perd <- bind_rows(perd) ggplot(perd, aes(x = freq, y = spec)) + geom_line() + facet_grid(cluster ~ ., scales = "free") + scale_x_continuous(breaks = 1 / (c(48, 48*7, 48*7*4, 48*365))) c1ts <- filter(aggdf, cluster == "1")$kwh_tot_mean cts <- ts(c1ts, frequency = 48, start = c(1, 1)) # carima <- auto.arima(cts, trace = TRUE) # plot(forecast(carima, h = 480)) ctsnp <- msts(c1ts, c(48, 48*7)) ctbats <- tbats(ctsnp) plot(forecast(ctbats, h = 48 * 7 * 4)) c9ts <- filter(aggdf, cluster == "9")$kwh_tot_mean ctsnp <- msts(c9ts, c(48, 48*7, 48*7*365.25)) ctbats <- tbats(ctsnp) plot(forecast(ctbats, h = 48 * 7)) p <- periodogram(c1ts) dd <- data.frame(freq = p$freq, spec = p$spec) %>% mutate(per = 1/freq) dd %>% arrange(desc(spec)) %>% mutate(d = per / 48) %>% head() c9ts <- filter(aggdf, cluster == "9") ggplot(c9ts, aes(x = read_time, y = kwh_tot_mean)) + geom_line() nft <- fextract(c9ts$read_time, c9ts$kwh_tot_mean, keep = 15) ggplot(nft, aes(x, y)) + geom_line() + geom_line(aes(x, f), color = "blue") + scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y") + coord_cartesian(xlim = c(as.POSIXct("2016-07-16", tz = "UTC"), as.POSIXct("2016-07-23", tz = "UTC")), expand = TRUE) clus <- "9" kp <- 50 cts <- filter(aggdf, cluster == clus) # ggplot(cts, aes(x = read_time, y = kwh_tot_mean)) + geom_line() nft <- fextract(cts$read_time, cts$kwh_tot_mean, keep = kp) ggplot(nft, aes(x, y)) + geom_line() + geom_line(aes(x, f), color = "blue") + scale_x_datetime(date_breaks = "2 days", date_labels = "%a, %-d %b %Y") + coord_cartesian(xlim = c(as.POSIXct("2017-09-14", tz = "UTC"), as.POSIXct("2017-09-28", tz = "UTC")), expand = TRUE) ggplot(nft, aes(x, res)) + geom_line() + scale_x_datetime(date_breaks = "1 week") + coord_cartesian(xlim = c(as.POSIXct("2017-08-01", tz = "UTC"), as.POSIXct("2017-09-28", tz = "UTC")), expand = TRUE) ggplot(nft, aes(y = res)) + geom_boxplot() ggplot(nft, aes(x = res)) + geom_histogram() c9ts <- filter(aggdf, cluster == "9")$kwh_tot_mean c9ft <- as.numeric(abs(fft(c9ts))) c9ftdf <- data.frame(x = 1:length(c9ft), ft = c9ft) # %>% arrange(desc(ft)) %>% mutate(xt = 1/x) c9df <- data.frame(x = 1:length(c9ts), y = c9ts, ly = log(c9ts + min(c9ts) + 1)) c9harm. <- harmonic(ts(c9ts, frequency = 17544), 2) sinmod <- lm(c9df$y ~ c9harm.) c9df$fit <- fitted(sinmod) c9df$res <- resid(sinmod) ggplot(c9df, aes(x = x, y = ly)) + geom_point() + geom_line(aes(y = fit), size = 2, colour = "blue") plot(c(c9df$res)) hist(log(c9df$y), breaks = 30) n <- 1.5 ytra <- c9df$y / c9df$fit^n - 1/c9df$fit^(n-1) plot(ytra) yts <- msts(ytra, seasonal.periods = c(48, 48*7)) autoplot(mstl(yts)) ytsd <- ts(ytra, frequency = 48) ydharm. <- harmonic(ytsd, 10) sdmod <- lm(ytra ~ ydharm.) sddf <- data.frame(x = 1:length(ytra), y = ytra, f = fitted(sdmod), r = resid(sdmod)) ggplot(sddf[1:(48*7*2), ], aes(x = x, y = y)) + geom_point() + geom_line(aes(y = f), colour = "blue", size = 2) plot(resid(sdmod)[1:(48*7*2)]) c9tsy <- ts(c9ts, frequency = floor(48 * 365.25)) c9tsw <- ts(c9ts, frequency = floor(48 * 7)) c9tsd <- ts(c9ts, frequency = floor(48)) harm.y <- harmonic(c9tsy, 2) harm.w <- harmonic(c9tsw, 2) harm.d <- harmonic(c9tsd, 6) cmod <- lm(c9ts ~ harm.y + harm.w + harm.d) summary(cmod) plot(fitted(cmod)) plot(resid(cmod)) cmodi <- lm(c9ts ~ harm.y * harm.w * harm.d) summary(cmodi) plot(fitted(cmodi)) plot(c9ts) plot(resid(cmodi)) hist(resid(cmodi), breaks = 100) # Heavy tails qqnorm(resid(cmodi)) clust = "5" cts <- filter(aggdf, cluster == clust)$kwh_tot_mean ctsy <- ts(cts, frequency = floor(48 * 365.25)) ctsw <- ts(cts, frequency = floor(48 * 7)) ctsd <- ts(cts, frequency = floor(48)) harm.y <- harmonic(ctsy, 4) harm.w <- harmonic(ctsw, 3) harm.d <- harmonic(ctsd, 4) cmodi <- lm(cts ~ harm.y * harm.w * harm.d) summary(cmodi) cldf <- data.frame(x = unique(aggdf$read_time), y = cts, f = fitted(cmodi), r = resid(cmodi)) ggplot(cldf, aes(x = x, y = y)) + geom_line(aes(y = f), color = "blue", size = 2) + geom_point() + geom_point(aes(y = r), color = "darkgreen") + coord_cartesian(xlim = c(as.POSIXct("2017-01-01", tz = "UTC"), as.POSIXct("2017-12-31", tz = "UTC"))) cwcombdf <- left_join(cldf, mtempdf, by = "x") str(cwcombdf) # A variety of models plot(f.x ~ f.y, data = cwcombdf) plot(r.x ~ r.y, data = cwcombdf) plot(r.x ~ y.y, data = cwcombdf) plot(y.x ~ r.y, data = cwcombdf) plot(y.x ~ x, data = cwcombdf) plot(y.x ~ y.y, data = cwcombdf) plot(y.x ~ y.y, data = cwcombdf) plot(y.y ~ r.x, data = cwcombdf) plot(y.y ~ x, data = cwcombdf) summary(lm(f.x ~ f.y, data = cwcombdf)) summary(lm(r.x ~ r.y, data = cwcombdf)) summary(lm(r.x ~ y.y, data = cwcombdf)) summary(lm(y.x ~ r.y, data = cwcombdf)) summary(lm(y.x ~ x, data = cwcombdf)) summary(lm(y.x ~ y.y, data = cwcombdf)) summary(lm(y.x ~ y.y, data = cwcombdf)) summary(lm(y.y ~ r.x, data = cwcombdf)) summary(lm(y.y ~ x, data = cwcombdf)) summary(lm(r.x ~ r.y * f.y, data = cwcombdf)) imod <- lm(r.x ~ r.y * f.y, data = cwcombdf) plot(fitted(imod)) plot(resid(imod)) plot(cwcombdf$r.x) plot(cwcombdf$r.x ~ resid(imod))