Repository for Petra's work at ampli Jan-Feb 2019

pymodels.py 6.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. from argparse import ArgumentParser, FileType
  2. import numpy as np
  3. import pandas as p
  4. import statsmodels.formula.api as smf
  5. import datetime as dt
  6. import pickle
  7. from tqdm import tqdm
  8. epoch = dt.datetime(2017, 1, 1)
  9. def thirtyoffset(date, epoch=epoch):
  10. """Get the offset from the epoch, in multiples of 30 minutes,
  11. for a particular date/time
  12. """
  13. return (date - epoch).total_seconds() / (60 * 30)
  14. def harmonic(length, period, start=0, harmonics=1, prefix=""):
  15. x = np.arange(length) + start
  16. y = np.ndarray((harmonics * 2, length), dtype=np.float64)
  17. ns = []
  18. nc = []
  19. for i in range(harmonics):
  20. ysi = np.sin(x/period * 2 * np.pi * (i + 1))
  21. yci = np.cos(x/period * 2 * np.pi * (i + 1))
  22. y[i] = ysi
  23. ns.append("{}sin{}".format(prefix, i + 1))
  24. y[i + harmonics] = yci
  25. nc.append("{}cos{}".format(prefix, i + 1))
  26. y = p.DataFrame.from_records(y.transpose(), columns=ns + nc)
  27. return y
  28. def fitWeather(weather, harmonics):
  29. """Fit a weather model with a specified number of harmonics
  30. in order to detrend Returns models for minimum and maximum
  31. rolling temperature, along with a summary dataframe
  32. """
  33. # Make sure weather dataset is complete
  34. wantedtimes = p.date_range(weather.temp_timestamp.min(),
  35. weather.temp_timestamp.max(), freq="30 min")
  36. wantedtimes = p.Series(wantedtimes.round(freq="30 min"))
  37. wanteddf = p.DataFrame(index=wantedtimes)
  38. weather = wanteddf.join(weather.set_index(['temp_timestamp']),
  39. how='left').drop('temp_date', axis=1)
  40. weather.index.name = "read_time"
  41. weather = weather.fillna(method='ffill', axis=0)
  42. # Add rolling columns
  43. weather['min_rolling'] = weather['tmin_c'].rolling("1D").min()
  44. weather['max_rolling'] = weather['tmax_c'].rolling("1D").max()
  45. # Calculate harmonics
  46. wharmstart = thirtyoffset(weather.index.min())
  47. wharm = harmonic(weather.shape[0], period=365.25 * 48, start=wharmstart,
  48. harmonics=harmonics)
  49. # Weather harmonic column names in format for model
  50. weatherharms = " + ".join(wharm.columns.tolist())
  51. # Set up dataframe for modeling
  52. wharm.index = weather.index
  53. wharm['min_rolling'] = weather['min_rolling']
  54. wharm['max_rolling'] = weather['max_rolling']
  55. # Fitting models
  56. minmodel = smf.ols("min_rolling ~ " + weatherharms, data=wharm).fit()
  57. maxmodel = smf.ols("max_rolling ~ " + weatherharms, data=wharm).fit()
  58. # Residuals and fitted values
  59. sumdf = p.DataFrame(data={
  60. "max_rolling": wharm["max_rolling"],
  61. "max_resid": maxmodel.resid,
  62. "max_fitted": maxmodel.fittedvalues,
  63. "min_rolling": wharm["min_rolling"],
  64. "min_resid": minmodel.resid,
  65. "min_fitted": minmodel.fittedvalues
  66. })
  67. return minmodel, maxmodel, sumdf
  68. def predweather(model, datestart, dateend, harmonics=2):
  69. """Predict weather values from a weather model and date range
  70. Note: it's the residuals (true value - prediction) that go
  71. into the demand model
  72. """
  73. drange = p.date_range(datestart, dateend, freq="30min")
  74. pharm = harmonic(len(drange),
  75. period=365.25 * 48,
  76. start=thirtyoffset(drange.min()),
  77. harmonics=harmonics)
  78. pharm.index = drange
  79. return model.predict(pharm)
  80. def fitdemand(df, wmodsum, harmonics=[2, 3, 3]):
  81. harmstart = thirtyoffset(df.index.min())
  82. clusters = df.cluster.unique()
  83. clusters.sort()
  84. utv = df.index.unique()
  85. harmlen = len(utv)
  86. yharm = harmonic(harmlen, period=365.25 * 48, start=harmstart,
  87. harmonics=harmonics[0], prefix="y")
  88. yharm.index = utv
  89. wharm = harmonic(harmlen, period=7 * 48, start=harmstart,
  90. harmonics=harmonics[1], prefix="w")
  91. wharm.index = utv
  92. dharm = harmonic(harmlen, period=48, start=harmstart,
  93. harmonics=harmonics[2], prefix="d")
  94. dharm.index = utv
  95. y_params = " + ".join(yharm.columns.tolist())
  96. w_params = " + ".join(wharm.columns.tolist())
  97. d_params = " + ".join(dharm.columns.tolist())
  98. hcomb = p.concat([yharm, wharm, dharm], axis=1)
  99. modellist = {}
  100. modelstring = f"""kwh_tot_mean ~ {y_params} + {w_params} + {d_params}
  101. + ({y_params}):({d_params})
  102. + ({w_params}):({d_params})
  103. + max_resid + max_resid:({y_params})
  104. + max_resid:({w_params}) + max_resid:({d_params})
  105. + min_resid + min_resid:({y_params})
  106. + min_resid:({w_params}) + min_resid:({d_params})
  107. """.replace("\n", "").replace(" ", "")
  108. for c in tqdm(clusters):
  109. dfc = df[df['cluster'] == c].join(hcomb,
  110. how='left').join(wmodsum,
  111. how='left')
  112. # print(dfc)
  113. cmod = smf.ols(modelstring, data=dfc).fit()
  114. modellist[c] = cmod
  115. return modellist
  116. def main():
  117. parser = ArgumentParser(description='Harmonic models of all '
  118. 'clusters in specified pickle')
  119. parser.add_argument("-i", "--input", dest="input",
  120. help="input aggregated ICP pickle path",
  121. metavar="PATH", required=True,
  122. type=FileType('rb'))
  123. parser.add_argument("-w", "--weather", dest="weather",
  124. help="input weather pickle path",
  125. metavar="PATH", required=True,
  126. type=FileType('rb'))
  127. parser.add_argument("-m", "--model-file", dest="model_file",
  128. help="filename for model to store",
  129. required=True,
  130. type=FileType('wb'))
  131. parser.add_argument("--weather-harmonics", dest="weather_harmonics",
  132. help="number of harmonics for weather; default: 2",
  133. type=int, default=2, metavar="NUM")
  134. parser.add_argument("--icp-harmonics", dest="icp_harmonics", nargs=3,
  135. help="harmonics for icp fitting, default 2 3 3",
  136. default=[2, 3, 3], type=int,
  137. metavar="NUM")
  138. args = parser.parse_args()
  139. wdat = p.read_pickle(args.weather).drop(['record_no', 'station'], axis=1)
  140. icpdat = p.read_pickle(args.input).set_index('read_time')
  141. minm, maxm, sumdf = fitWeather(wdat, args.weather_harmonics)
  142. clusters = icpdat.cluster.unique()
  143. clusters.sort()
  144. mods = {
  145. "icp": fitdemand(icpdat, sumdf, args.icp_harmonics),
  146. "min_temp": minm,
  147. "max_temp": maxm,
  148. "clusters": clusters,
  149. "weather_harmonics": args.weather_harmonics,
  150. "icp_harmonics": args.icp_harmonics
  151. }
  152. pickle.dump(mods, args.model_file)
  153. args.input.close()
  154. args.weather.close()
  155. args.model_file.close()
  156. if __name__ == "__main__":
  157. main()