# Combined models
# Continuation of clusterviz.R and weathmod.R
#
# Fits harmonic (yearly / weekly / daily) + weather-residual linear models to
# the mean kWh series of one cluster, inspects the fit, predicts onto a second
# aggregated data set, and finally fits and saves one model per cluster.

library(TSA)
library(caTools)
library(dplyr)
library(ggplot2)
library(reticulate)
library(tidyr)
library(MASS)

theme_set(theme_bw())
use_virtualenv("../venv/")
p <- import("pandas")
sns <- import("seaborn")

# Helpers ----

# Build a named harmonic regressor matrix.
#
# n      : number of time points in the series.
# freq   : samples per seasonal cycle, used as the `ts` frequency.
# m      : number of harmonics, passed through to TSA::harmonic().
# prefix : season label used in the column names ("year", "week", "day").
# offset : number of sampling periods between the reference origin and the
#          first point of this series; 0 gives start = c(1, 1), which is the
#          same as the `ts` default start of 1.
#
# Returns the matrix produced by harmonic() with columns renamed to
# "<prefix>.cos.<k>" for the first half and "<prefix>.sin.<k>" for the second.
# NOTE(review): this naming presumes harmonic() returns all cosine columns
# before all sine columns - confirm against the TSA documentation.
make_harmonics <- function(n, freq, m, prefix, offset = 0) {
  harm <- ts(seq_len(n), frequency = freq,
             start = c(offset %/% freq + 1, offset %% freq + 1)) %>%
    harmonic(m)
  half <- ncol(harm) / 2
  colnames(harm) <- sprintf("%s.%s.%s", prefix,
                            rep(c("cos", "sin"), each = half),
                            rep(seq_len(half), times = 2))
  harm
}

# Assemble the modelling frame for one cluster.
#
# agg     : aggregated data with columns cluster, read_time, kwh_tot_mean.
# clus    : cluster level to keep.
# weather : weather table joined on read_time.
# harm_y, harm_w, harm_d : harmonic matrices bound on column-wise.
#
# Returns a data.frame with read_time, kwh, the weather columns and the
# harmonic columns. Stops if the joined frame and the harmonic matrices have
# different row counts: data.frame()/cbind() would otherwise recycle rows
# silently whenever one count is an exact multiple of the other.
build_cluster_df <- function(agg, clus, weather, harm_y, harm_w, harm_d) {
  # dplyr::select is spelled out because MASS (attached later) masks select().
  joined <- filter(agg, cluster == clus) %>%
    dplyr::select(read_time, kwh = kwh_tot_mean) %>%
    left_join(weather, by = "read_time")
  stopifnot(
    nrow(joined) == nrow(harm_y),
    nrow(joined) == nrow(harm_w),
    nrow(joined) == nrow(harm_d)
  )
  cbind(joined, harm_y, harm_w, harm_d)
}

# Plot observed values (black points), fitted values (blue line) and residuals
# (dark green points) from a frame with columns x, y, f and r.
# NOTE(review): ggplot2 >= 3.4 prefers `linewidth` for lines; `size` is kept
# so the script still runs on older ggplot2 versions.
plot_fit <- function(df) {
  ggplot(df, aes(x = x, y = y)) +
    geom_line(aes(y = f), color = "blue", size = 2) +
    geom_point() +
    geom_point(aes(y = r), color = "darkgreen")
}

# Data ----

aggdf <- p$read_pickle("../data/9-clusters.agg.pkl")
aggdf$cluster <- factor(aggdf$cluster)
clusters <- levels(aggdf$cluster)
str(aggdf)

mtempdf <- read.csv("../data/weatherharm.csv", stringsAsFactors = FALSE) %>%
  mutate(x = as.POSIXct(x, tz = "UTC")) %>%
  rename(read_time = x,
         rollingmin = y.min, fitmin = f.min, resmin = r.min,
         rollingmax = y.max, fitmax = f.max, resmax = r.max)
str(mtempdf)

cbp <- as.character(
  p$Series(sns$color_palette("colorblind", as.integer(9))$as_hex())
)
ntps <- length(unique(aggdf$read_time))

# Harmonic regressors for the training period ----

clus <- "1"
# Samples per cycle; 48 per day matches the 30-minute period used for
# `perdiff` further down.
yfreq <- floor(48 * 365.25)
wfreq <- floor(48 * 7)
dfreq <- floor(48)
harmonics <- c(2, 3, 3)  # number of harmonics for year, week, day

harm.y <- make_harmonics(ntps, yfreq, harmonics[1], "year")
harm.w <- make_harmonics(ntps, wfreq, harmonics[2], "week")
harm.d <- make_harmonics(ntps, dfreq, harmonics[3], "day")

clusdf <- build_cluster_df(aggdf, clus, mtempdf, harm.y, harm.w, harm.d)
str(clusdf)

# Model formulas ----

ycols <- paste(colnames(harm.y), collapse = " + ")
wcols <- paste(colnames(harm.w), collapse = " + ")
dcols <- paste(colnames(harm.d), collapse = " + ")

# Full: all main effects, all three pairwise season interactions, and both
# weather residuals interacted with every season.
nform.full <- sprintf(
  paste0("kwh ~ %s + %s + %s + (%s):(%s) + (%s):(%s) + (%s):(%s) + resmin",
         " + resmin:(%s) + resmin:(%s) + resmin:(%s)",
         " + resmax + resmax:(%s) + resmax:(%s) + resmax:(%s)"),
  ycols, wcols, dcols,
  ycols, wcols, ycols, dcols, wcols, dcols,
  ycols, wcols, dcols,
  ycols, wcols, dcols
) %>%
  formula()

# Comp: as full, but without the year:week interaction.
nform.comp <- sprintf(
  paste0("kwh ~ %s + %s + %s + (%s):(%s) + (%s):(%s) + resmin + resmin:(%s) + resmin:(%s) + resmin:(%s)",
         " + resmax + resmax:(%s) + resmax:(%s) + resmax:(%s)"),
  ycols, wcols, dcols,
  ycols, dcols, wcols, dcols,
  ycols, wcols, dcols,
  ycols, wcols, dcols
) %>%
  formula()

# Now: seasonal terms only (no weather residuals).
nform.now <- sprintf(
  "kwh ~ %s + %s + %s + (%s):(%s) + (%s):(%s)",
  ycols, wcols, dcols,
  ycols, dcols, wcols, dcols
) %>%
  formula()

nform.min <- formula("kwh ~ 1")
nform.start <- sprintf("kwh ~ %s + %s + %s + resmin", ycols, wcols, dcols) %>%
  formula()

# Fit and in-sample diagnostics ----

# charmmod <- lm(kwh ~ resmin + harm.y * harm.w * harm.d + resmin:harm.y, data = clusdf)
charmmod <- lm(nform.comp, data = clusdf)
# charmmod <- lm(nform.full, data = clusdf)
# charmmod <- lm(kwh ~ ., data = clusdf)
summary(charmmod)

# Each comparison model is fitted once; charmmod already is the nform.comp fit.
res.now <- lm(nform.now, data = clusdf)$residuals
res.comp <- charmmod$residuals
res.full <- lm(nform.full, data = clusdf)$residuals

mean(abs(res.now))
mean(abs(res.comp))
mean(abs(res.full))
sd(res.now)
sd(res.comp)
sd(res.full)

cmdf <- data.frame(x = clusdf$read_time, y = clusdf$kwh,
                   f = fitted(charmmod), r = resid(charmmod))
cmplot <- plot_fit(cmdf)
cmplot
cmplot + coord_cartesian(xlim = c(as.POSIXct("2017-03-01", tz = "UTC"),
                                  as.POSIXct("2017-04-01", tz = "UTC")))

# sres <- stepAIC(charmmod, scope = list(upper = nform.full, lower = nform.min),
#                 direction = "both", steps = 300)

# Out-of-sample prediction ----

newagg <- p$read_pickle("../data/1617-agg.pkl")
newagg$cluster <- factor(newagg$cluster)
str(newagg)

ptps <- length(unique(newagg$read_time))
# Offset of the new series from the training origin, in 30-minute periods, so
# the prediction harmonics stay in phase with the training harmonics.
perdiff <- as.numeric(min(newagg$read_time) - min(aggdf$read_time),
                      units = "mins") / 30

pharm.y <- make_harmonics(ptps, yfreq, harmonics[1], "year", offset = perdiff)
pharm.w <- make_harmonics(ptps, wfreq, harmonics[2], "week", offset = perdiff)
pharm.d <- make_harmonics(ptps, dfreq, harmonics[3], "day", offset = perdiff)

pclusdf <- build_cluster_df(newagg, clus, mtempdf, pharm.y, pharm.w, pharm.d)
str(pclusdf)

ptestdata <- dplyr::select(pclusdf, -kwh)
str(ptestdata)

predvals <- predict(charmmod, ptestdata)
predf <- data.frame(x = pclusdf$read_time, y = pclusdf$kwh,
                    f = predvals, r = pclusdf$kwh - predvals)
predplot <- plot_fit(predf)
predplot
predplot + coord_cartesian(xlim = c(as.POSIXct("2017-03-01", tz = "UTC"),
                                    as.POSIXct("2017-04-01", tz = "UTC")))
mean(abs(predf$r))
sd(predf$r)

# number of icps per cluster
ocdf <- p$read_pickle("../data/9-clusters-sample-table.pkl")
ncdf <- p$read_pickle("../data/1617-asgn-table.pkl")
table(ocdf$cluster)
table(ncdf$cluster)

# One model per cluster ----

# Preallocated and named by cluster level so each fit lands in its own slot.
allmods <- setNames(vector("list", length(clusters)), clusters)
for (clus in clusters) {
  clusdf <- build_cluster_df(aggdf, clus, mtempdf, harm.y, harm.w, harm.d)
  # model = FALSE / qr = FALSE drop the model frame and QR decomposition from
  # the stored fit.
  # NOTE(review): predict() on an lm fitted with qr = FALSE appears to fail
  # for lack of the QR component, so these saved fits look usable only for
  # their coefficients - confirm against whatever reads 1kmods.rds.
  largm <- lm(nform.comp, data = clusdf, model = FALSE, qr = FALSE)
  allmods[[clus]] <- largm
}
saveRDS(allmods, "../models/1kmods.rds")