
A harmonic model for annual patterns in rolling 24-hour minimum temperature

Petra Lamborn, 5 years ago
commit 654fe4c15d
5 changed files with 130 additions and 0 deletions
  1. .gitignore  +3 -0
  2. R/weathmod.R  +65 -0
  3. py/downweather.py  +10 -0
  4. py/util.py  +27 -0
  5. sql/weather.pgsql  +25 -0

.gitignore  +3 -0

@@ -1,3 +1,6 @@
+# Ignore data folder completely
+data/
+
 # ---> Python
 # Byte-compiled / optimized / DLL files
 __pycache__/

R/weathmod.R  +65 -0

@@ -0,0 +1,65 @@
+library(TSA)
+library(caTools)
+library(dplyr)
+library(forecast)
+library(ggplot2)
+library(reticulate)
+library(tidyr)
+theme_set(theme_bw())
+use_virtualenv("../venv/")
+
+# Load, transform data
+p <- import("pandas")
+tempdf <- p$read_pickle("../data/2016-18-weather.pkl") %>%
+    select(-record_no, -station) %>%
+    mutate(temp_date = as.Date(temp_date))
+str(tempdf)
+
+# Get a data frame of all the dates/times we want
+wdts <- seq.POSIXt(min(tempdf$temp_timestamp), max(tempdf$temp_timestamp), by = "30 mins")
+hhdt <- as.difftime("30", format = "%M")
+wantdf <- data.frame(temp_timestamp = wdts, temp_date = as.Date(wdts)) %>% arrange(temp_timestamp)
+str(wantdf)
+
+# Fill NAs forward from the previous non-NA value; if the first value of a column is NA it is imputed from the column median (shouldn't happen in practice)
+# If a column were entirely NA the while loop below would never terminate, but that shouldn't happen either
+fulltemp <- left_join(wantdf, tempdf, by = c("temp_timestamp", "temp_date"))
+str(fulltemp)
+for (coln in names(fulltemp)[3:7]) {
+    tc <- fulltemp[[coln]]
+    if (is.na(tc[1])) {
+        print("First value of column is NA; imputing from median")
+        tc[1] <- median(tc, na.rm = TRUE)
+    }
+    while (sum(is.na(tc)) != 0) {
+        tc <- ifelse(is.na(tc), lag(tc, 1), tc)
+    }
+    fulltemp[[coln]] <- tc
+}
+sum(is.na(fulltemp$tmin_c))
+fulltemp$runmin <- runmin(fulltemp$tmin_c, 48, endrule = "min", align = "right")  # rolling 24-hour minimum (48 half-hourly steps)
+str(fulltemp)
+
+# Temperature plots
+tplot <- ggplot(fulltemp, aes(x = temp_timestamp, y = tmin_c)) + geom_line() +
+    geom_line(aes(y = runmin), color = "blue")
+
+tplot
+
+tplot + coord_cartesian(xlim = c(as.POSIXct("2017-05-01"), as.POSIXct("2017-06-01")))
+
+# Create a harmonic (sine wave) model for minimum temperature
+yharm <- harmonic(ts(1:nrow(fulltemp), frequency = floor(365.25 * 48)), 2)
+#dharm <- harmonic(ts(1:nrow(fulltemp), frequency = floor(48)), 1)
+hmod <- lm(fulltemp$runmin ~ yharm)
+summary(hmod)
+
+hmdf <- data.frame(x = fulltemp$temp_timestamp, y = fulltemp$runmin, f = fitted(hmod), r = resid(hmod))
+tmplot <- ggplot(hmdf, aes(x = x, y = y)) + geom_line(aes(y = f), color = "blue", size = 2) + geom_point() +
+    geom_point(aes(y = r), color = "darkgreen")
+
+tmplot
+
+tmplot + coord_cartesian(xlim = c(as.POSIXct("2017-05-01", tz = "UTC"), as.POSIXct("2017-06-01", tz = "UTC")))
+
+write.csv(hmdf, "../data/weatherharm.csv", row.names = FALSE)
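
Note on the model: TSA::harmonic() only builds sine/cosine regressors, so the fit above is an ordinary linear regression of the rolling 24-hour minimum on two sine/cosine pairs whose period is one year of half-hourly steps (floor(365.25 * 48)). A minimal equivalent sketch written out explicitly, assuming the fulltemp data frame constructed in this file (the object names below are illustrative only):

obs_per_year <- floor(365.25 * 48)          # half-hourly observations in one year
t <- seq_len(nrow(fulltemp))

harm <- data.frame(
    y  = fulltemp$runmin,                   # rolling 24-hour minimum from above
    c1 = cos(2 * pi * t / obs_per_year),    # first (annual) harmonic
    s1 = sin(2 * pi * t / obs_per_year),
    c2 = cos(4 * pi * t / obs_per_year),    # second harmonic
    s2 = sin(4 * pi * t / obs_per_year)
)

hmod_explicit <- lm(y ~ c1 + s1 + c2 + s2, data = harm)
summary(hmod_explicit)                      # coefficients should closely match hmod above

Including the second harmonic lets the annual cycle deviate from a pure sinusoid, e.g. a sharper winter trough than summer peak.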

py/downweather.py  +10 -0

@@ -0,0 +1,10 @@
+from util import gettemp
+import pandas as p
+
+tempdata = gettemp('2016-04-01', '2019-01-01', 2006)
+
+print(tempdata.info())
+print(tempdata.describe())
+
+print("Pickling")
+tempdata.to_pickle("../data/2016-18-weather.pkl")

py/util.py  +27 -0

@@ -1,6 +1,7 @@
 import psycopg2 as pg
 from configparser import ConfigParser
 import pandas.io.sql as psql
+import pandas as p
 import datetime as dt
 import numpy as np
 from pprint import pprint
@@ -120,6 +121,32 @@ def getkwh(datestart, dateend, timestart, timeend, subset):
     return(qdf)
 
 
+def gettemp(datestart, dateend, station):
+    query = """
+    SELECT record_no, station, temp_date, temp_date + temp_time AS temp_timestamp, tmax_c, tmin_c,
+        tgmin, tmean, rhmean
+    FROM weather.temperature_fact
+    WHERE station = %(station)s AND
+        temp_date >= to_date(%(datestart)s, 'yyyy-mm-dd') AND
+        temp_date <  to_date(%(dateend)s,   'yyyy-mm-dd')
+    ORDER BY temp_date, temp_time;
+    """
+    pdict = {
+        'datestart': datestart,
+        'dateend':   dateend,
+        'station':   station
+    }
+    print("Getting data with parameters:")
+    pprint(pdict)
+    qdf = getQuery(query, pdict)
+    print("converting")
+    qdf.temp_date = p.to_datetime(qdf.temp_date)
+    # qdf.temp_time = qdf.temp_time.to_timestamp()
+    print('Done')
+    return qdf
+
+
+
 
 
 if __name__ == "__main__":

sql/weather.pgsql  +25 -0

@@ -0,0 +1,25 @@
+-------------------------------
+-- Querying the weather data --
+-------------------------------
+
+-- Get currently operating stations
+SELECT agent, network, start_date, end_date, station_name,
+    lat_dec_deg, long_dec_deg
+FROM weather.station_dim
+WHERE end_date IS NULL AND
+    start_date <= to_date('2015-01-01', 'yyyy-mm-dd')
+ORDER BY start_date;
+
+-- 1 agent per network and vice versa?
+SELECT COUNT(DISTINCT agent) as agents, network
+FROM weather.station_dim
+GROUP BY network
+ORDER BY agents DESC;
+
+SELECT record_no, station, temp_date, temp_time, tmax_c, tmin_c,
+    tgmin, tmean, rhmean
+FROM weather.temperature_fact
+WHERE station = 2006 AND
+    temp_date >= to_date('2016-01-01', 'yyyy-mm-dd') AND
+    temp_date <  to_date('2019-01-01', 'yyyy-mm-dd')
+ORDER BY temp_date, temp_time;
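
The last query above is the one that gettemp() in py/util.py parameterises. For reference, a sketch of pulling the same rows straight into R with DBI/RPostgres instead of via the pandas pickle might look like the following; the connection settings here are placeholders, not taken from this repo (the real settings are read elsewhere, e.g. via ConfigParser in util.py):

library(DBI)

# Placeholder connection details, for illustration only
con <- dbConnect(RPostgres::Postgres(), dbname = "weather", host = "localhost")

tempdf <- dbGetQuery(con, "
    SELECT record_no, station, temp_date, temp_date + temp_time AS temp_timestamp,
           tmax_c, tmin_c, tgmin, tmean, rhmean
    FROM weather.temperature_fact
    WHERE station = $1
      AND temp_date >= to_date($2, 'yyyy-mm-dd')
      AND temp_date <  to_date($3, 'yyyy-mm-dd')
    ORDER BY temp_date, temp_time",
    params = list(2006L, "2016-04-01", "2019-01-01"))

dbDisconnect(con)

This would avoid the reticulate/pickle round-trip in R/weathmod.R, at the cost of handling database credentials from R as well.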