Repository for Petra's work at ampli Jan-Feb 2019

weathmod.R 3.3KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. library(TSA)
  2. library(caTools)
  3. library(dplyr)
  4. library(forecast)
  5. library(ggplot2)
  6. library(reticulate)
  7. library(tidyr)
  8. theme_set(theme_bw())
  9. use_virtualenv("../venv/")
  10. # Load, transform data
  11. p <- import("pandas")
  12. tempdf <- p$read_pickle("../data/2016-18-weather.pkl") %>%
  13. select(-record_no, -station) %>%
  14. mutate(temp_date = as.Date(temp_date))
  15. str(tempdf)
  16. # Get a data frame of all the dates/times we want
  17. wdts <- seq.POSIXt(min(tempdf$temp_timestamp), max(tempdf$temp_timestamp), by = "30 mins")
  18. hhdt <- as.difftime("30", format = "%M")
  19. wantdf <- data.frame(temp_timestamp = wdts, temp_date = as.Date(wdts)) %>% arrange(temp_timestamp)
  20. str(wantdf)
  21. # Recursively impute NAs from previous non-NA value; if first value is NA replaced by median but this shouldn't happen
  22. # If all values are NA will hang, but this shouldn't happen either
  23. fulltemp <- left_join(wantdf, tempdf, by = c("temp_timestamp", "temp_date"))
  24. str(fulltemp)
  25. for (coln in names(fulltemp)[3:7]) {
  26. tc <- fulltemp[[coln]]
  27. if (is.na(tc[1])) {
  28. print("First value of column is NA; Imputing from median")
  29. tc[1] <- median(tc, na.rm = TRUE)
  30. }
  31. while (sum(is.na(tc)) != 0){
  32. tc <- ifelse(is.na(tc), lag(tc, 1), tc)
  33. }
  34. fulltemp[[coln]] <- tc
  35. }
  36. sum(is.na(fulltemp$tmin_c))
  37. fulltemp$runmin <- runmin(fulltemp$tmin_c, 48, endrule = "min", align = "right")
  38. fulltemp$runmax <- runmax(fulltemp$tmax_c, 48, endrule = "max", align = "right")
  39. str(fulltemp)
  40. # Temperature plots
  41. tplot <- ggplot(fulltemp, aes(x = temp_timestamp, y = tmin_c)) + geom_line() +
  42. geom_line(aes(y = tmax_c), color = "magenta") +
  43. geom_line(aes(y = runmin), color = "blue") +
  44. geom_line(aes(y = runmax), color = "red")
  45. tplot
  46. tplot + coord_cartesian(xlim = c(as.POSIXct("2016-12-01"), as.POSIXct("2017-03-01")))
  47. # Create a harmonic (sine wave) model for minimum temperature
  48. yharm <- harmonic(ts(1:nrow(fulltemp), frequency = floor(365.25 * 48)), 2)
  49. #dharm <- harmonic(ts(1:nrow(fulltemp), frequency = floor(48)), 1)
  50. hmod <- lm(fulltemp$runmin ~ yharm)
  51. summary(hmod)
  52. hmdf <- data.frame(x = fulltemp$temp_timestamp, y = fulltemp$runmin, f = fitted(hmod), r = resid(hmod))
  53. tmplot <- ggplot(hmdf, aes(x = x, y = y)) + geom_line(aes(y = f), color = "blue", size = 2) + geom_point() +
  54. geom_point(aes(y = r), color = "darkgreen")
  55. tmplot
  56. tmplot + coord_cartesian(xlim = c(as.POSIXct("2017-05-01", tz = "UTC"), as.POSIXct("2017-06-01", tz = "UTC")))
  57. maxhmod <- lm(fulltemp$runmax ~ yharm)
  58. summary(maxhmod)
  59. mhmdf <- data.frame(x = fulltemp$temp_timestamp, y = fulltemp$runmax, f = fitted(maxhmod), r = resid(maxhmod))
  60. mtmplot <- ggplot(mhmdf, aes(x = x, y = y)) + geom_line(aes(y = f), color = "blue", size = 2) + geom_point() +
  61. geom_point(aes(y = r), color = "darkgreen")
  62. mtmplot
  63. mtmplot + coord_cartesian(xlim = c(as.POSIXct("2017-05-01", tz = "UTC"), as.POSIXct("2017-06-01", tz = "UTC")))
  64. hmdf.comb <- left_join(hmdf, mhmdf, by = "x", suffix = c(".min", ".max"))
  65. ctmplot <- ggplot(hmdf.comb, aes(x = x, y = f.min)) + geom_line(color = "blue", size = 2) +
  66. geom_line(aes(y = f.max), color = "red", size = 2) +
  67. geom_point(aes(y = y.max), color = "magenta") +
  68. geom_point(aes(y = y.min), color = "lightblue")
  69. ctmplot
  70. ctmplot + coord_cartesian(xlim = c(as.POSIXct("2017-05-01", tz = "UTC"), as.POSIXct("2017-06-01", tz = "UTC")))
  71. write.csv(hmdf.comb, "../data/weatherharm.csv", row.names = FALSE)