1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465 |
# Fit a yearly harmonic model to half-hourly minimum temperature.
#
# Steps: load the pickled weather observations through pandas (reticulate),
# expand them onto a regular 30-minute grid, forward-fill the gaps, compute a
# running minimum of `tmin_c`, regress that running minimum on two yearly
# harmonics, plot the results, and write observed / fitted / residual values
# to "../data/weatherharm.csv".

# The attach order is left exactly as it was: packages attached later mask
# same-named functions from packages attached earlier.
library(TSA)
library(caTools)
library(dplyr)
library(forecast)
library(ggplot2)
library(reticulate)
library(tidyr)
theme_set(theme_bw())
use_virtualenv("../venv/")

# Forward-fill missing values in a vector (last observation carried forward).
#
# x: vector that may contain NA.
# Returns a vector of the same length in which every NA has been replaced by
# the closest preceding non-NA value. If the first element is NA it is seeded
# with the median of the non-missing values (and a notice is printed).
# Stops with an error when `x` holds no usable value at all; the previous
# loop-based implementation never terminated in that case.
fill_forward <- function(x) {
  if (length(x) == 0L) {
    return(x)
  }
  if (is.na(x[1])) {
    print("First value of column is NA; Imputing from median")
    x[1] <- median(x, na.rm = TRUE)
  }
  if (is.na(x[1])) {
    stop("Cannot impute a column that contains only NA values", call. = FALSE)
  }
  # Position of each non-NA element (0 where NA); the running maximum is then
  # the index of the most recent non-NA value at or before every position.
  # x[1] is guaranteed non-NA here, so every index is >= 1.
  last_seen <- cummax(seq_along(x) * !is.na(x))
  x[last_seen]
}

# Load, transform data
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# from a trusted source.
p <- import("pandas")
tempdf <- p$read_pickle("../data/2016-18-weather.pkl") %>%
  select(-record_no, -station) %>%
  mutate(temp_date = as.Date(temp_date))
str(tempdf)

# Get a data frame of all the dates/times we want: one row every 30 minutes
# between the first and the last observed timestamp.
wdts <- seq.POSIXt(
  min(tempdf$temp_timestamp),
  max(tempdf$temp_timestamp),
  by = "30 mins"
)
# Half-hour step as a difftime; not referenced again in this script.
hhdt <- as.difftime("30", format = "%M")
wantdf <- data.frame(temp_timestamp = wdts, temp_date = as.Date(wdts)) %>%
  arrange(temp_timestamp)
str(wantdf)

# Put the observations on the regular grid; grid rows without a matching
# observation come back as NA and are forward-filled below.
fulltemp <- left_join(wantdf, tempdf, by = c("temp_timestamp", "temp_date"))
str(fulltemp)
# Columns are selected by position.
# NOTE(review): assumes columns 3-7 are the measurement columns -- confirm
# against the pickle's schema.
for (coln in names(fulltemp)[3:7]) {
  fulltemp[[coln]] <- fill_forward(fulltemp[[coln]])
}
# Sanity check: number of NAs left in tmin_c after imputation.
sum(is.na(fulltemp$tmin_c))
# Running minimum over a right-aligned window of 48 samples (48 half-hour
# steps = 24 hours); endrule = "min" applies min to the shorter windows at
# the edge of the series.
fulltemp$runmin <- runmin(
  fulltemp$tmin_c, 48,
  endrule = "min", align = "right"
)
str(fulltemp)

# Zoom window shared by both detail plots. The time zone is stated explicitly
# so the window does not shift with the machine's local zone.
# NOTE(review): assumes temp_timestamp is in UTC -- confirm.
may_2017 <- as.POSIXct(c("2017-05-01", "2017-06-01"), tz = "UTC")

# Temperature plots: raw tmin_c (black) with its running minimum (blue)
tplot <- ggplot(fulltemp, aes(x = temp_timestamp, y = tmin_c)) +
  geom_line() +
  geom_line(aes(y = runmin), color = "blue")

tplot

tplot + coord_cartesian(xlim = may_2017)

# Create a harmonic (sine wave) model for the running minimum temperature.
# frequency = floor(365.25 * 48) is the number of half-hour samples per year,
# so the two harmonic pairs describe the yearly cycle.
yharm <- harmonic(
  ts(seq_len(nrow(fulltemp)), frequency = floor(365.25 * 48)),
  2
)
#dharm <- harmonic(ts(1:nrow(fulltemp), frequency = floor(48)), 1)
hmod <- lm(fulltemp$runmin ~ yharm)
summary(hmod)

# Observed running minimum (y), fitted values (f) and residuals (r) per
# timestamp (x).
hmdf <- data.frame(
  x = fulltemp$temp_timestamp,
  y = fulltemp$runmin,
  f = fitted(hmod),
  r = resid(hmod)
)
# Blue line: fitted values; black points: observed; dark green: residuals.
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for lines in favour of
# `linewidth`; kept as-is for compatibility with older ggplot2.
tmplot <- ggplot(hmdf, aes(x = x, y = y)) +
  geom_line(aes(y = f), color = "blue", size = 2) +
  geom_point() +
  geom_point(aes(y = r), color = "darkgreen")

tmplot

tmplot + coord_cartesian(xlim = may_2017)

write.csv(hmdf, "../data/weatherharm.csv", row.names = FALSE)
|