library(TSA) library(caTools) library(dplyr) library(forecast) library(ggplot2) library(reticulate) library(tidyr) theme_set(theme_bw()) use_virtualenv("../venv/") # Load, transform data p <- import("pandas") tempdf <- p$read_pickle("../data/2016-18-weather.pkl") %>% select(-record_no, -station) %>% mutate(temp_date = as.Date(temp_date)) str(tempdf) # Get a data frame of all the dates/times we want wdts <- seq.POSIXt(min(tempdf$temp_timestamp), max(tempdf$temp_timestamp), by = "30 mins") hhdt <- as.difftime("30", format = "%M") wantdf <- data.frame(temp_timestamp = wdts, temp_date = as.Date(wdts)) %>% arrange(temp_timestamp) str(wantdf) # Recursively impute NAs from previous non-NA value; if first value is NA replaced by median but this shouldn't happen # If all values are NA will hang, but this shouldn't happen either fulltemp <- left_join(wantdf, tempdf, by = c("temp_timestamp", "temp_date")) str(fulltemp) fulltemp <- fill(fulltemp, tmax_c:rhmean) sum(is.na(fulltemp$tmin_c)) fulltemp$runmin <- runmin(fulltemp$tmin_c, 48, endrule = "min", align = "right") fulltemp$runmax <- runmax(fulltemp$tmax_c, 48, endrule = "max", align = "right") str(fulltemp) # Temperature plots tplot <- ggplot(fulltemp, aes(x = temp_timestamp, y = tmin_c)) + geom_line() + geom_line(aes(y = tmax_c), color = "magenta") + geom_line(aes(y = runmin), color = "blue") + geom_line(aes(y = runmax), color = "red") # tplot # tplot + coord_cartesian(xlim = c(as.POSIXct("2016-12-01"), as.POSIXct("2017-03-01"))) # Create a harmonic (sine wave) model for minimum temperature yharm <- harmonic(ts(1:nrow(fulltemp), frequency = floor(365.25 * 48)), 2) %>% as.data.frame() #dharm <- harmonic(ts(1:nrow(fulltemp), frequency = floor(48)), 1) hmod <- lm(fulltemp$runmin ~ ., data = yharm) summary(hmod) hmdf <- data.frame(x = fulltemp$temp_timestamp, y = fulltemp$runmin, f = fitted(hmod), r = resid(hmod)) tmplot <- ggplot(hmdf, aes(x = x, y = y)) + geom_line(aes(y = f), color = "blue", size = 2) + geom_point() + geom_point(aes(y = r), color = "darkgreen") # tmplot # tmplot + coord_cartesian(xlim = c(as.POSIXct("2017-05-01", tz = "UTC"), as.POSIXct("2017-06-01", tz = "UTC"))) maxhmod <- lm(fulltemp$runmax ~ ., data = yharm) summary(maxhmod) mhmdf <- data.frame(x = fulltemp$temp_timestamp, y = fulltemp$runmax, f = fitted(maxhmod), r = resid(maxhmod)) mtmplot <- ggplot(mhmdf, aes(x = x, y = y)) + geom_line(aes(y = f), color = "blue", size = 2) + geom_point() + geom_point(aes(y = r), color = "darkgreen") # mtmplot # mtmplot + coord_cartesian(xlim = c(as.POSIXct("2017-05-01", tz = "UTC"), as.POSIXct("2017-06-01", tz = "UTC"))) hmdf.comb <- left_join(hmdf, mhmdf, by = "x", suffix = c(".min", ".max")) ctmplot <- ggplot(hmdf.comb, aes(x = x, y = f.min)) + geom_line(color = "blue", size = 2) + geom_line(aes(y = f.max), color = "red", size = 2) + geom_point(aes(y = y.max), color = "magenta") + geom_point(aes(y = y.min), color = "lightblue") # ctmplot # ctmplot + coord_cartesian(xlim = c(as.POSIXct("2017-05-01", tz = "UTC"), as.POSIXct("2017-06-01", tz = "UTC"))) write.csv(hmdf.comb, "../data/weatherharm.csv", row.names = FALSE) saveRDS(hmod, "../models/weatherMinharmonic.rds") saveRDS(maxhmod, "../models/weatherMaxharmonic.rds")