|
@@ -0,0 +1,65 @@
|
|
1
|
+library(TSA)
|
|
2
|
+library(caTools)
|
|
3
|
+library(dplyr)
|
|
4
|
+library(forecast)
|
|
5
|
+library(ggplot2)
|
|
6
|
+library(reticulate)
|
|
7
|
+library(tidyr)
|
|
8
|
+theme_set(theme_bw())
|
|
9
|
+use_virtualenv("../venv/")
|
|
10
|
+
|
|
11
|
+# Load, transform data
|
|
12
|
+p <- import("pandas")
|
|
13
|
+tempdf <- p$read_pickle("../data/2016-18-weather.pkl") %>%
|
|
14
|
+ select(-record_no, -station) %>%
|
|
15
|
+ mutate(temp_date = as.Date(temp_date))
|
|
16
|
+str(tempdf)
|
|
17
|
+
|
|
18
|
+# Get a data frame of all the dates/times we want
|
|
19
|
+wdts <- seq.POSIXt(min(tempdf$temp_timestamp), max(tempdf$temp_timestamp), by = "30 mins")
|
|
20
|
+hhdt <- as.difftime("30", format = "%M")
|
|
21
|
+wantdf <- data.frame(temp_timestamp = wdts, temp_date = as.Date(wdts)) %>% arrange(temp_timestamp)
|
|
22
|
+str(wantdf)
|
|
23
|
+
|
|
24
|
+# Recursively impute NAs from previous non-NA value; if first value is NA replaced by median but this shouldn't happen
|
|
25
|
+# If all values are NA will hang, but this shouldn't happen either
|
|
26
|
+fulltemp <- left_join(wantdf, tempdf, by = c("temp_timestamp", "temp_date"))
|
|
27
|
+str(fulltemp)
|
|
28
|
+for (coln in names(fulltemp)[3:7]) {
|
|
29
|
+ tc <- fulltemp[[coln]]
|
|
30
|
+ if (is.na(tc[1])) {
|
|
31
|
+ print("First value of column is NA; Imputing from median")
|
|
32
|
+ tc[1] <- median(tc, na.rm = TRUE)
|
|
33
|
+ }
|
|
34
|
+ while (sum(is.na(tc)) != 0){
|
|
35
|
+ tc <- ifelse(is.na(tc), lag(tc, 1), tc)
|
|
36
|
+ }
|
|
37
|
+ fulltemp[[coln]] <- tc
|
|
38
|
+}
|
|
39
|
+sum(is.na(fulltemp$tmin_c))
|
|
40
|
+fulltemp$runmin <- runmin(fulltemp$tmin_c, 48, endrule = "min", align = "right")
|
|
41
|
+str(fulltemp)
|
|
42
|
+
|
|
43
|
+# Temperature plots
|
|
44
|
+tplot <- ggplot(fulltemp, aes(x = temp_timestamp, y = tmin_c)) + geom_line() +
|
|
45
|
+ geom_line(aes(y = runmin), color = "blue")
|
|
46
|
+
|
|
47
|
+tplot
|
|
48
|
+
|
|
49
|
+tplot + coord_cartesian(xlim = c(as.POSIXct("2017-05-01"), as.POSIXct("2017-06-01")))
|
|
50
|
+
|
|
51
|
+# Create a harmonic (sine wave) model for minimum temperature
|
|
52
|
+yharm <- harmonic(ts(1:nrow(fulltemp), frequency = floor(365.25 * 48)), 2)
|
|
53
|
+#dharm <- harmonic(ts(1:nrow(fulltemp), frequency = floor(48)), 1)
|
|
54
|
+hmod <- lm(fulltemp$runmin ~ yharm)
|
|
55
|
+summary(hmod)
|
|
56
|
+
|
|
57
|
+hmdf <- data.frame(x = fulltemp$temp_timestamp, y = fulltemp$runmin, f = fitted(hmod), r = resid(hmod))
|
|
58
|
+tmplot <- ggplot(hmdf, aes(x = x, y = y)) + geom_line(aes(y = f), color = "blue", size = 2) + geom_point() +
|
|
59
|
+ geom_point(aes(y = r), color = "darkgreen")
|
|
60
|
+
|
|
61
|
+tmplot
|
|
62
|
+
|
|
63
|
+tmplot + coord_cartesian(xlim = c(as.POSIXct("2017-05-01", tz = "UTC"), as.POSIXct("2017-06-01", tz = "UTC")))
|
|
64
|
+
|
|
65
|
+write.csv(hmdf, "../data/weatherharm.csv", row.names = FALSE)
|