Browse Source

Starting proper clusterviz

Petra Lamborn 5 years ago
parent
commit
43fd22ad18
2 changed files with 28 additions and 225 deletions
  1. 27
    224
      R/clusterviz.R
  2. 1
    1
      README.md

+ 27
- 224
R/clusterviz.R View File

@@ -1,3 +1,16 @@
1
+#!/usr/bin/env Rscript
2
+library(argparse)
3
+
4
+parser <- ArgumentParser(description="Create plots for aggregated cluster data")
5
+parser$add_argument("cluster_file", help = "file to visualise")
6
+parser$add_argument("-i", "--img-path", dest = "img_path", default = "../img/", help = "path to store plots in; default: ../img/")
7
+parser$add_argument("-p", "--postfix", dest = "postfix", default = "_plot", help = "postfix for files, default: _plot")
8
+parser$add_argument("-v", "--virtualenv", dest = "virtualenv", default = "../venv/", help = "path of virtualenv; default: ../venv/; '-' for none")
9
+parser$add_argument("--width", dest = "width", default = 40, help = "width (cm), default 40", type = "double")
10
+parser$add_argument("--height", dest = "height", default = 25, help = "height (cm), default 25", type = "double")
11
+
12
+args <- parser$parse_args()
13
+
1 14
 library(reticulate)
2 15
 library(ggplot2)
3 16
 library(dplyr)
@@ -5,36 +18,21 @@ library(tidyr)
5 18
 library(TSA)
6 19
 library(forecast)
7 20
 theme_set(theme_bw())
8
-use_virtualenv("../venv/")
9 21
 
10
-fextract <- function(x, y, keep = 1, top = TRUE) {
11
-    sdy <- sd(y)
12
-    my <- mean(y)
13
-    stany <- (y - my) / sdy
14
-    ftf <- fft(stany)
15
-    if (top) {
16
-        ftf[rank(-abs(ftf)) > keep] <- 0
17
-    } else {
18
-        ftf[(keep + 1):length(ftf)] <- 0
19
-    }
20
-    rfv <- Re(fft(ftf, inverse = TRUE))
21
-    dfr <- data.frame(x = x, y = y, f = (rfv - mean(rfv)) / sd(rfv) * sdy + my)
22
-    dfr$res <- dfr$y - dfr$f
23
-    return(dfr)
22
+if (args$virtualenv != "-") {
23
+    print(args$virtualenv)
24
+    use_virtualenv(args$virtualenv)
24 25
 }
25 26
 
26
-
27 27
 p <- import("pandas")
28 28
 sns <- import("seaborn")
29 29
 cbp <- as.character(p$Series(sns$color_palette("colorblind", as.integer(9))$as_hex()))
30
-aggdf <- p$read_pickle("../data/9-clusters.agg.pkl")
30
+aggdf <- p$read_pickle(args$cluster_file)
31 31
 # aggdf <- as.data.frame(aggdf)
32 32
 aggdf$cluster <- factor(aggdf$cluster)
33 33
 str(aggdf)
34 34
 clusters = levels(aggdf$cluster)
35 35
 
36
-mtempdf <- read.csv("../data/weatherharm.csv", stringsAsFactors = FALSE) %>% mutate(x = as.POSIXct(x, tz = "UTC"))
37
-
38 36
 ggplot(aggdf, aes(y = kwh_tot_mean, x = cluster)) + geom_boxplot()
39 37
 
40 38
 facall <- ggplot(aggdf, aes(x = read_time, y = kwh_tot_mean, color = cluster, fill = cluster)) + 
@@ -103,210 +101,15 @@ facoct <- ggplot(midoct, aes(x = read_time, y = kwh_tot_mean, color = cluster, f
103 101
 octcon <- facoct + facet_grid(cluster ~ .)
104 102
 octfre <- facoct + facet_grid(cluster ~ ., scales = "free")
105 103
 
106
-# ggsave("all-9-fix-1617.png", allcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
107
-# ggsave("all-9-fre-1617.png", allfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
108
-# ggsave("jan-9-fix-1617.png", jancon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
109
-# ggsave("jan-9-fre-1617.png", janfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
110
-# ggsave("apr-9-fix-1617.png",  apcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
111
-# ggsave("apr-9-fre-1617.png",  apfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
112
-# ggsave("jul-9-fix-1617.png", julcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
113
-# ggsave("jul-9-fre-1617.png", julfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
114
-# ggsave("oct-9-fix-1617.png", octcon, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
115
-# ggsave("oct-9-fre-1617.png", octfre, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
116
-
117
-
118
-# ----
119
-
120
-
121
-cacf <- list()
122
-perd <- list()
123
-
124
-for (c in clusters) {
125
-    cagg <- filter(aggdf, cluster == c)
126
-    cacf[[c]] <- acf(cagg$kwh_tot_mean, lag.max = 48*365, plot = FALSE)$acf
127
-    per <- periodogram(cagg$kwh_tot_mean, plot = FALSE)
128
-    perd[[c]] <- data.frame(freq = per$freq, spec = per$spec) %>% mutate(cluster = c, period = (1/freq)) 
129
-    #%>% arrange(desc(spec)) %>% head(5)
130
-}
131
-
132
-acfm <- sapply(cacf, as.numeric) %>% as.data.frame() %>% mutate(hour = ((1:length(`1`)) - 1)/2) %>% 
133
-    gather(key = "cluster", value = "acorr", clusters) %>% mutate(day = hour / 24, week = hour / (24 * 7))
134
-
135
-
136
-fcorr <- ggplot(acfm, aes(x = week, y = acorr, color = cluster)) + geom_line(size = 1.5) +
137
-    scale_color_manual(values = cbp) + facet_grid(cluster ~ .) + coord_cartesian(expand = FALSE) +
138
-    theme(legend.position = "none") + labs(title = "Autocorrelation plot (full year)", 
139
-                                           y = "Autocorrelation", x = "lag (weeks)")
140
-
141
-wcorr <- ggplot(acfm, aes(x = day, y = acorr, color = cluster)) + geom_line(size = 1.5) +
142
-    scale_color_manual(values = cbp) + facet_grid(cluster ~ .) +
143
-    scale_x_continuous(breaks = unique(floor(acfm$day / 7)) * 7) +
144
-    theme(legend.position = "none") + coord_cartesian(xlim = c(0, 15), expand = FALSE) +
145
-    labs(title = "Autocorrelation plot (two weeks)", y = "Autocorrelation", x = "lag (days)")
146
-
147
-# ggsave("full-autocorr-1617.png", fcorr, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
148
-# ggsave("week-autocorr-1617.png", wcorr, path = "../img/", dpi = "retina", width = 40, height = 25, units = "cm")
149
-
150
-perd <- bind_rows(perd)
151
-
152
-ggplot(perd, aes(x = freq, y = spec)) + 
153
-    geom_line() + facet_grid(cluster ~ ., scales = "free") +
154
-    scale_x_continuous(breaks = 1 / (c(48, 48*7, 48*7*4, 48*365)))
155
-
156
-c1ts <- filter(aggdf, cluster == "1")$kwh_tot_mean
157
-cts <- ts(c1ts, frequency = 48, start = c(1, 1))
158
-# carima <- auto.arima(cts, trace = TRUE)
159
-# plot(forecast(carima, h = 480))
160
-ctsnp <- msts(c1ts, c(48, 48*7))
161
-ctbats <- tbats(ctsnp)
162
-plot(forecast(ctbats, h = 48 * 7 * 4))
163
-
164
-c9ts <- filter(aggdf, cluster == "9")$kwh_tot_mean
165
-ctsnp <- msts(c9ts, c(48, 48*7, 48*7*365.25))
166
-ctbats <- tbats(ctsnp)
167
-plot(forecast(ctbats, h = 48 * 7))
168
-
169
-p <- periodogram(c1ts)
170
-dd <- data.frame(freq = p$freq, spec = p$spec) %>% mutate(per = 1/freq)
171
-dd %>% arrange(desc(spec)) %>% mutate(d = per / 48) %>% head()
172
-
173
-c9ts <- filter(aggdf, cluster == "9")
174
-
175
-ggplot(c9ts, aes(x = read_time, y = kwh_tot_mean)) + geom_line()
176
-
177
-nft <- fextract(c9ts$read_time, c9ts$kwh_tot_mean, keep = 15)
178
-ggplot(nft, aes(x, y)) + geom_line() + 
179
-    geom_line(aes(x, f), color = "blue") +
180
-    scale_x_datetime(date_breaks = "1 day", date_labels = "%a, %-d %B %Y") +
181
-    coord_cartesian(xlim = c(as.POSIXct("2016-07-16", tz = "UTC"), as.POSIXct("2016-07-23", tz = "UTC")), expand = TRUE)
182
-
183
-clus <- "9"
184
-kp <- 50
185
-cts <- filter(aggdf, cluster == clus)
186
-# ggplot(cts, aes(x = read_time, y = kwh_tot_mean)) + geom_line()
187
-nft <- fextract(cts$read_time, cts$kwh_tot_mean, keep = kp)
188
-ggplot(nft, aes(x, y)) + geom_line() + 
189
-    geom_line(aes(x, f), color = "blue") +
190
-    scale_x_datetime(date_breaks = "2 days", date_labels = "%a, %-d %b %Y") +
191
-    coord_cartesian(xlim = c(as.POSIXct("2017-09-14", tz = "UTC"), as.POSIXct("2017-09-28", tz = "UTC")), expand = TRUE)
192
-
193
-ggplot(nft, aes(x, res)) + geom_line() + scale_x_datetime(date_breaks = "1 week") +
194
-    coord_cartesian(xlim = c(as.POSIXct("2017-08-01", tz = "UTC"), as.POSIXct("2017-09-28", tz = "UTC")), expand = TRUE)
195
-
196
-ggplot(nft, aes(y = res)) + geom_boxplot()
197
-ggplot(nft, aes(x = res)) + geom_histogram()
198
-
199
-
200
-c9ts <- filter(aggdf, cluster == "9")$kwh_tot_mean
201
-c9ft <- as.numeric(abs(fft(c9ts)))
202
-c9ftdf <- data.frame(x = 1:length(c9ft), ft = c9ft)
203
-# %>% arrange(desc(ft)) %>% mutate(xt = 1/x)
204
-
205
-c9df <- data.frame(x = 1:length(c9ts), y = c9ts, ly = log(c9ts + min(c9ts) + 1))
206
-c9harm. <- harmonic(ts(c9ts, frequency = 17544), 2)
207
-sinmod <- lm(c9df$y ~ c9harm.)
208
-c9df$fit <- fitted(sinmod)
209
-c9df$res <- resid(sinmod)
210
-
211
-ggplot(c9df, aes(x = x, y = ly)) + geom_point() +
212
-    geom_line(aes(y = fit), size = 2, colour = "blue")
213
-
214
-plot(c(c9df$res))
215
-hist(log(c9df$y), breaks = 30)
216
-
217
-n <- 1.5
218
-ytra <- c9df$y / c9df$fit^n - 1/c9df$fit^(n-1)
219
-plot(ytra)
220
-
221
-
222
-yts <- msts(ytra, seasonal.periods = c(48, 48*7))
223
-autoplot(mstl(yts))
224
-
225
-ytsd <- ts(ytra, frequency = 48)
226
-ydharm. <- harmonic(ytsd, 10)
227
-sdmod <- lm(ytra ~ ydharm.)
228
-sddf <- data.frame(x = 1:length(ytra), y = ytra, f = fitted(sdmod), r = resid(sdmod))
229
-ggplot(sddf[1:(48*7*2), ], aes(x = x, y = y)) + geom_point() +
230
-    geom_line(aes(y = f), colour = "blue", size = 2)
231
-
232
-plot(resid(sdmod)[1:(48*7*2)])
233
-
234
-
235
-
236
-
237
-
238
-
239
-c9tsy <- ts(c9ts, frequency = floor(48 * 365.25))
240
-c9tsw <- ts(c9ts, frequency = floor(48 * 7))
241
-c9tsd <- ts(c9ts, frequency = floor(48))
242
-
243
-harm.y <- harmonic(c9tsy, 2)
244
-harm.w <- harmonic(c9tsw, 2)
245
-harm.d <- harmonic(c9tsd, 6)
246
-
247
-cmod <- lm(c9ts ~ harm.y + harm.w + harm.d)
248
-summary(cmod)
249
-plot(fitted(cmod))
250
-plot(resid(cmod))
251
-
252
-
253
-cmodi <- lm(c9ts ~ harm.y * harm.w * harm.d)
254
-
255
-summary(cmodi)
256
-
257
-plot(fitted(cmodi))
258
-plot(c9ts)
259
-plot(resid(cmodi))
260
-hist(resid(cmodi), breaks = 100)
261
-# Heavy tails
262
-qqnorm(resid(cmodi))
263
-
264
-
265
-clust = "5"
266
-
267
-cts <- filter(aggdf, cluster == clust)$kwh_tot_mean
268
-ctsy <- ts(cts, frequency = floor(48 * 365.25))
269
-ctsw <- ts(cts, frequency = floor(48 * 7))
270
-ctsd <- ts(cts, frequency = floor(48))
271
-harm.y <- harmonic(ctsy, 4)
272
-harm.w <- harmonic(ctsw, 3)
273
-harm.d <- harmonic(ctsd, 4)
274
-cmodi <- lm(cts ~ harm.y * harm.w * harm.d)
275
-summary(cmodi)
276
-cldf <- data.frame(x = unique(aggdf$read_time), y = cts, f = fitted(cmodi), r = resid(cmodi))
277
-ggplot(cldf, aes(x = x, y = y)) + geom_line(aes(y = f), color = "blue", size = 2) + geom_point() +
278
-    geom_point(aes(y = r), color = "darkgreen") +
279
-    coord_cartesian(xlim = c(as.POSIXct("2017-01-01", tz = "UTC"), as.POSIXct("2017-12-31", tz = "UTC")))
280
-
281
-cwcombdf <- left_join(cldf, mtempdf, by = "x")
282
-str(cwcombdf)
283
-
284
-
285
-# A variety of models
286
-plot(f.x ~ f.y, data = cwcombdf)
287
-plot(r.x ~ r.y, data = cwcombdf)
288
-plot(r.x ~ y.y, data = cwcombdf)
289
-plot(y.x ~ r.y, data = cwcombdf)
290
-plot(y.x ~ x, data = cwcombdf)
291
-plot(y.x ~ y.y, data = cwcombdf)
292
-plot(y.x ~ y.y, data = cwcombdf)
293
-plot(y.y ~ r.x, data = cwcombdf)
294
-plot(y.y ~ x, data = cwcombdf)
295
-
104
+ggsave(paste0("all_fix", args$postfix, ".png"), allcon, path = args$img_path, dpi = "retina", width = args$width, height = args$height, units = "cm")
105
+ggsave(paste0("all_fre", args$postfix, ".png"), allfre, path = args$img_path, dpi = "retina", width = args$width, height = args$height, units = "cm")
106
+ggsave(paste0("jan_fix", args$postfix, ".png"), jancon, path = args$img_path, dpi = "retina", width = args$width, height = args$height, units = "cm")
107
+ggsave(paste0("jan_fre", args$postfix, ".png"), janfre, path = args$img_path, dpi = "retina", width = args$width, height = args$height, units = "cm")
108
+ggsave(paste0("apr_fix", args$postfix, ".png"),  apcon, path = args$img_path, dpi = "retina", width = args$width, height = args$height, units = "cm")
109
+ggsave(paste0("apr_fre", args$postfix, ".png"),  apfre, path = args$img_path, dpi = "retina", width = args$width, height = args$height, units = "cm")
110
+ggsave(paste0("jul_fix", args$postfix, ".png"), julcon, path = args$img_path, dpi = "retina", width = args$width, height = args$height, units = "cm")
111
+ggsave(paste0("jul_fre", args$postfix, ".png"), julfre, path = args$img_path, dpi = "retina", width = args$width, height = args$height, units = "cm")
112
+ggsave(paste0("oct_fix", args$postfix, ".png"), octcon, path = args$img_path, dpi = "retina", width = args$width, height = args$height, units = "cm")
113
+ggsave(paste0("oct_fre", args$postfix, ".png"), octfre, path = args$img_path, dpi = "retina", width = args$width, height = args$height, units = "cm")
296 114
 
297
-summary(lm(f.x ~ f.y, data = cwcombdf))
298
-summary(lm(r.x ~ r.y, data = cwcombdf))
299
-summary(lm(r.x ~ y.y, data = cwcombdf))
300
-summary(lm(y.x ~ r.y, data = cwcombdf))
301
-summary(lm(y.x ~ x, data = cwcombdf))
302
-summary(lm(y.x ~ y.y, data = cwcombdf))
303
-summary(lm(y.x ~ y.y, data = cwcombdf))
304
-summary(lm(y.y ~ r.x, data = cwcombdf))
305
-summary(lm(y.y ~ x, data = cwcombdf))
306 115
 
307
-summary(lm(r.x ~ r.y * f.y, data = cwcombdf))
308
-imod <- lm(r.x ~ r.y * f.y, data = cwcombdf)
309
-plot(fitted(imod))
310
-plot(resid(imod))
311
-plot(cwcombdf$r.x)
312
-plot(cwcombdf$r.x ~ resid(imod))

+ 1
- 1
README.md View File

@@ -200,5 +200,5 @@ sudo ln -s /usr/lib64/libquadmath.so.0 /usr/lib64/libquadmath.so
200 200
 To install the packages needed for these scripts you should run the following from within `R` itself:
201 201
 
202 202
 ```R
203
-install.packages(c("dplyr", "tidyr", "ggplot2", "forecast", "TSA", "reticulate", "caTools", "scales", "optparse"))
203
+install.packages(c("dplyr", "tidyr", "ggplot2", "forecast", "TSA", "reticulate", "caTools", "scales", "argparse"))
204 204
 ```