Repository for Petra's work at ampli Jan-Feb 2019

combmodels.R 6.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. # Combined models
  2. # Continuation of clusterviz.R and weathmod.R
  3. library(TSA)
  4. library(caTools)
  5. library(dplyr)
  6. library(ggplot2)
  7. library(reticulate)
  8. library(tidyr)
  9. library(MASS)
  10. theme_set(theme_bw())
  11. use_virtualenv("../venv/")
  12. p <- import("pandas")
  13. sns <- import("seaborn")
  14. aggdf <- p$read_pickle("../data/9-clusters.agg.pkl")
  15. aggdf$cluster <- factor(aggdf$cluster)
  16. clusters <- levels(aggdf$cluster)
  17. str(aggdf)
  18. mtempdf <- read.csv("../data/weatherharm.csv", stringsAsFactors = FALSE) %>%
  19. mutate(x = as.POSIXct(x, tz = "UTC")) %>%
  20. rename(read_time = x, rollingmin = y.min, fitmin = f.min, resmin = r.min,
  21. rollingmax = y.max, fitmax = f.max, resmax = r.max)
  22. str(mtempdf)
  23. sns <- import("seaborn")
  24. cbp <- as.character(p$Series(sns$color_palette("colorblind", as.integer(9))$as_hex()))
  25. ntps <- length(unique(aggdf$read_time))
  26. clus = "1"
  27. yfreq <- floor(48 * 365.25)
  28. wfreq <- floor(48 * 7)
  29. dfreq <- floor(48)
  30. harmonics <- c(2, 3, 3)
  31. harm.y <- ts(1:ntps, frequency = yfreq) %>% harmonic(harmonics[1])
  32. harm.w <- ts(1:ntps, frequency = wfreq) %>% harmonic(harmonics[2])
  33. harm.d <- ts(1:ntps, frequency = dfreq) %>% harmonic(harmonics[3])
  34. colnames(harm.y) <- sprintf("%s.%s.%s", "year", rep(c("cos", "sin"), each = ncol(harm.y)/2), rep(1:(ncol(harm.y)/2), times = 2))
  35. colnames(harm.w) <- sprintf("%s.%s.%s", "week", rep(c("cos", "sin"), each = ncol(harm.w)/2), rep(1:(ncol(harm.w)/2), times = 2))
  36. colnames(harm.d) <- sprintf("%s.%s.%s", "day", rep(c("cos", "sin"), each = ncol(harm.d)/2), rep(1:(ncol(harm.d)/2), times = 2))
  37. clusdf <- filter(aggdf, cluster == clus) %>%
  38. dplyr::select(read_time, kwh = kwh_tot_mean) %>%
  39. left_join(mtempdf, by = "read_time") %>% cbind(harm.y, harm.w, harm.d)
  40. str(clusdf)
  41. ycols <- paste(colnames(harm.y), collapse = " + ")
  42. wcols <- paste(colnames(harm.w), collapse = " + ")
  43. dcols <- paste(colnames(harm.d), collapse = " + ")
  44. nform.full <- sprintf(paste0("kwh ~ %s + %s + %s + (%s):(%s) + (%s):(%s) + (%s):(%s) + resmin",
  45. " + resmin:(%s) + resmin:(%s) + resmin:(%s)",
  46. " + resmax + resmax:(%s) + resmax:(%s) + resmax:(%s)"),
  47. ycols, wcols, dcols, ycols, wcols, ycols, dcols, wcols, dcols, ycols, wcols, dcols, ycols, wcols, dcols) %>% formula()
  48. nform.comp <- sprintf(paste0("kwh ~ %s + %s + %s + (%s):(%s) + (%s):(%s) + resmin + resmin:(%s) + resmin:(%s) + resmin:(%s)",
  49. " + resmax + resmax:(%s) + resmax:(%s) + resmax:(%s)"),
  50. ycols, wcols, dcols, ycols, dcols, wcols, dcols, ycols, wcols, dcols, ycols, wcols, dcols) %>% formula()
  51. nform.now <- sprintf("kwh ~ %s + %s + %s + (%s):(%s) + (%s):(%s)",
  52. ycols, wcols, dcols, ycols, dcols, wcols, dcols) %>% formula()
  53. nform.min <- formula("kwh ~ 1")
  54. nform.start <- sprintf("kwh ~ %s + %s + %s + resmin",
  55. ycols, wcols, dcols) %>% formula()
  56. # charmmod <- lm(kwh ~ resmin + harm.y * harm.w * harm.d + resmin:harm.y, data = clusdf)
  57. charmmod <- lm(nform.comp, data = clusdf)
  58. # charmmod <- lm(nform.full, data = clusdf)
  59. # charmmod <- lm(kwh ~ ., data = clusdf)
  60. summary(charmmod)
  61. mean(abs(lm(nform.now, data = clusdf)$residuals))
  62. mean(abs(lm(nform.comp, data = clusdf)$residuals))
  63. mean(abs(lm(nform.full, data = clusdf)$residuals))
  64. sd(lm(nform.now, data = clusdf)$residuals)
  65. sd(lm(nform.comp, data = clusdf)$residuals)
  66. sd(lm(nform.full, data = clusdf)$residuals)
  67. cmdf <- data.frame(x = clusdf$read_time, y = clusdf$kwh, f = fitted(charmmod), r = resid(charmmod))
  68. cmplot <-ggplot(cmdf, aes(x = x, y = y)) + geom_line(aes(y = f), color = "blue", size = 2) + geom_point() +
  69. geom_point(aes(y = r), color = "darkgreen")
  70. cmplot
  71. cmplot + coord_cartesian(xlim = c(as.POSIXct("2017-03-01", tz = "UTC"), as.POSIXct("2017-04-01", tz = "UTC")))
  72. # sres <- stepAIC(charmmod, scope = list(upper = nform.full, lower = nform.min),
  73. # direction = "both", steps = 300)
  74. newagg <- p$read_pickle("../data/1617-agg.pkl")
  75. newagg$cluster <- factor(newagg$cluster)
  76. str(newagg)
  77. ptps <- length(unique(newagg$read_time))
  78. perdiff <- as.numeric(min(newagg$read_time) - min(aggdf$read_time), units = "mins") / 30
  79. pharm.y <- ts(1:ptps, frequency = yfreq, start = c(perdiff %/% yfreq + 1, perdiff %% yfreq + 1)) %>% harmonic(harmonics[1])
  80. pharm.w <- ts(1:ptps, frequency = wfreq, start = c(perdiff %/% wfreq + 1, perdiff %% wfreq + 1)) %>% harmonic(harmonics[2])
  81. pharm.d <- ts(1:ptps, frequency = dfreq, start = c(perdiff %/% dfreq + 1, perdiff %% dfreq + 1)) %>% harmonic(harmonics[3])
  82. colnames(pharm.y) <- sprintf("%s.%s.%s", "year", rep(c("cos", "sin"), each = ncol(pharm.y)/2), rep(1:(ncol(pharm.y)/2), times = 2))
  83. colnames(pharm.w) <- sprintf("%s.%s.%s", "week", rep(c("cos", "sin"), each = ncol(pharm.w)/2), rep(1:(ncol(pharm.w)/2), times = 2))
  84. colnames(pharm.d) <- sprintf("%s.%s.%s", "day", rep(c("cos", "sin"), each = ncol(pharm.d)/2), rep(1:(ncol(pharm.d)/2), times = 2))
  85. pclusdf <- filter(newagg, cluster == clus) %>%
  86. dplyr::select(read_time, kwh = kwh_tot_mean) %>%
  87. left_join(mtempdf, by = "read_time") %>% cbind(pharm.y, pharm.w, pharm.d)
  88. str(pclusdf)
  89. ptestdata <- dplyr::select(pclusdf, -kwh)
  90. str(ptestdata)
  91. predvals <- predict(charmmod, ptestdata)
  92. predf <- data.frame(x = pclusdf$read_time, y = pclusdf$kwh, f = predvals, r = pclusdf$kwh - predvals)
  93. predplot <-ggplot(predf, aes(x = x, y = y)) + geom_line(aes(y = f), color = "blue", size = 2) + geom_point() +
  94. geom_point(aes(y = r), color = "darkgreen")
  95. predplot
  96. predplot + coord_cartesian(xlim = c(as.POSIXct("2017-03-01", tz = "UTC"), as.POSIXct("2017-04-01", tz = "UTC")))
  97. mean(abs(predf$r))
  98. sd(predf$r)
  99. # number of icps per cluster
  100. ocdf <- p$read_pickle('../data/9-clusters-sample-table.pkl')
  101. ncdf <- p$read_pickle('../data/1617-asgn-table.pkl')
  102. table(ocdf$cluster)
  103. table(ncdf$cluster)
  104. allmods <- list()
  105. for (clus in clusters) {
  106. clusdf <- filter(aggdf, cluster == clus) %>%
  107. dplyr::select(read_time, kwh = kwh_tot_mean) %>%
  108. left_join(mtempdf, by = "read_time") %>% cbind(harm.y, harm.w, harm.d)
  109. largm <- lm(nform.comp, data = clusdf, model = FALSE, qr = FALSE)
  110. allmods[[clus]] <- largm
  111. }
  112. saveRDS(allmods, '../models/1kmods.rds')