Repository for Petra's work at ampli Jan-Feb 2019

queries.pgsql 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. -- Create "tall" table for april 2017 (from Jason)
  2. CREATE TABLE public.coup_tall_april AS
  3. SELECT a.icp_id
  4. , a.read_date
  5. , c.period
  6. , sum(c.read_kwh) as kwh_tot
  7. , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un
  8. , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn
  9. FROM coup_prd.coupdatamaster a,
  10. unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
  11. WHERE a.read_date >= to_date('01/04/2017','dd/mm/yyyy')
  12. and a.read_date < to_date('01/05/2017','dd/mm/yyyy')
  13. and a.content_code ~ ('UN|CN|EG')
  14. GROUP BY 1, 2, 3
  15. ORDER BY 1, 2, 3;
  16. -- "Head" of the data
  17. SELECT * FROM public.coup_tall_april limit 10;
  18. -- Count distinct ICPs for each date
  19. SELECT read_date, COUNT(DISTINCT icp_id) AS d_icp
  20. FROM public.coup_tall_april
  21. GROUP BY read_date;
  22. -- Calculate mininmums, maximums, averages for each date
  23. SELECT read_date, MIN(kwh_tot), AVG(kwh_tot), MAX(kwh_tot)
  24. FROM public.coup_tall_april
  25. GROUP BY read_date;
  26. -- Find negative values
  27. SELECT * FROM public.coup_tall_april WHERE kwh_tot < 0 OR kwh_un < 0 OR kwh_cn < 0;
  28. -- Count number of ICPs that end in "17"
  29. SELECT COUNT (DISTINCT icp_id) FROM public.coup_tall_april WHERE icp_id LIKE '%17';
  30. -- List first 10 ICPs that end with number 17, sorted
  31. SELECT DISTINCT icp_id FROM public.coup_tall_april WHERE icp_id LIKE '%17' ORDER BY icp_id LIMIT 10;
  32. -- Verify that all ICP IDs are of length 7
  33. SELECT MIN(CHAR_LENGTH(icp_id)) as minl, MAX(CHAR_LENGTH(icp_id)) as maxl FROM public.coup_tall_april;
  34. -- String conversion
  35. SELECT DISTINCT CAST(SUBSTRING(icp_id FROM 2 FOR 6) AS int) AS icp_val, icp_id FROM public.coup_tall_april
  36. -- Count nulls in cast (none)
  37. SELECT COUNT(*) - COUNT(CAST(SUBSTRING(icp_id FROM 2 FOR 6) AS int)) FROM public.coup_tall_april;
  38. -- Verify that there are only "period" values from 1 to 48
  39. SELECT DISTINCT period FROM public.coup_tall_april ORDER BY period;
  40. -- Look at the 2nd specifically (DST ends, 2017)
  41. SELECT * FROM public.coup_tall_april
  42. WHERE read_date = to_date('02/04/2017', 'dd/mm/yy')
  43. ORDER BY icp_id, period
  44. LIMIT 60;
  45. -- September interlude STARTS
  46. -- Create "tall" table for Septeber 24 2017 (mod from Jason, above) to look at DST starting
  47. CREATE TABLE public.coup_dst_2017 AS
  48. SELECT a.icp_id
  49. , a.read_date
  50. , c.period
  51. , sum(c.read_kwh) as kwh_tot
  52. , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un
  53. , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn
  54. FROM coup_prd.coupdatamaster a,
  55. unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
  56. WHERE a.read_date = to_date('24/09/2017','dd/mm/yyyy')
  57. and a.content_code ~ ('UN|CN|EG')
  58. GROUP BY 1, 2, 3
  59. ORDER BY 1, 2, 3;
  60. -- Look at September 24 2017 table
  61. SELECT * FROM public.coup_dst_2017
  62. ORDER BY icp_id, period
  63. LIMIT 60;
  64. -- Verify that there are only "period" values from 1 to 48
  65. SELECT COUNT(DISTINCT period) FROM public.coup_dst_2017;
  66. -- September interlude ENDS
  67. -- Timestamp conversion
  68. SELECT DISTINCT DATE_PART('year', read_date), DATE_PART('month', read_date),
  69. DATE_PART('day', read_date) FROM public.coup_tall_april ORDER BY 1, 2, 3;
  70. -- Creating "time" values
  71. SELECT *, (CONCAT(q.h, ':', q.m, ':00'))::time AS t FROM
  72. (
  73. SELECT DISTINCT period, period / 2 as h, period % 2 * 30 AS m
  74. FROM public.coup_tall_april
  75. ) AS q ORDER BY period;
  76. -- Creating timestamp column
  77. SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
  78. FROM public.coup_tall_april ORDER BY icp_id, read_time limit 50;
  79. -- Possible subset for further testing
  80. SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
  81. FROM public.coup_tall_april WHERE icp_id LIKE '%1117' ORDER BY icp_id, read_time;
  82. -- Calculate averages for each half-hour
  83. SELECT read_date, period, AVG(kwh_tot) AS average
  84. FROM public.coup_tall_april
  85. GROUP BY read_date, period
  86. ORDER BY read_date, period;
  87. -- Gerate timestamp list
  88. SELECT read_date
  89. FROM GENERATE_SERIES('2017-04-01'::timestamp, '2017-04-02'::timestamp, '30 minutes'::interval) read_date;
  90. -- Fraction of icp's in april with 1440 entries
  91. SELECT SUM(CASE WHEN isum.c = 1440 THEN 1 ELSE 0 END)::numeric / COUNT(*)::numeric As frac FROM
  92. (
  93. SELECT COUNT(*) AS c, icp_id FROM public.coup_tall_april
  94. GROUP BY icp_id
  95. ) AS isum;
  96. -- Get numbers of ICPs present over whole dataset
  97. SELECT COUNT(*) as c, icp_id, read_date FROM coup_prd.coupdatamaster
  98. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  99. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  100. AND content_code = 'UN'
  101. GROUP BY icp_id, read_date ORDER BY c DESC, read_date LIMIT 40;
  102. -- Get the range of dates present in the dataset
  103. SELECT MIN(read_date) as min_date, MAX(read_date) as max_date FROM coup_prd.coupdatamaster;
  104. -- Because some people have multiple meters, for a variety of reasons, this query does not work
  105. SELECT SUM(CASE WHEN cir.c > 365 THEN 1 ELSE 0 END), COUNT(*)
  106. FROM
  107. (
  108. SELECT COUNT(*) as c, icp_id FROM coup_prd.coupdatamaster
  109. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  110. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  111. AND content_code = 'UN'
  112. GROUP BY icp_id ORDER BY c DESC
  113. ) AS cir;
  114. -- Find the number of ICPs with less than 360 days of data in 2017
  115. SELECT SUM(CASE WHEN cir.c < 360 THEN 1 ELSE 0 END), COUNT(*)
  116. FROM
  117. (
  118. SELECT icp_id, COUNT(DISTINCT read_date) AS c FROM coup_prd.coupdatamaster
  119. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  120. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  121. AND content_code = 'UN'
  122. GROUP BY icp_id ORDER BY c DESC
  123. ) AS cir;
  124. -- Produces a table of all the icp's with at least 360 days of data, along with a column of days of data
  125. SELECT *
  126. FROM
  127. (
  128. SELECT icp_id, COUNT(DISTINCT read_date) AS data_days
  129. FROM coup_prd.coupdatamaster
  130. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  131. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  132. AND content_code = 'UN'
  133. GROUP BY icp_id
  134. ) AS cir
  135. WHERE data_days >= 360;
  136. -- Gerate timestamp list for 2017
  137. SELECT read_date
  138. FROM GENERATE_SERIES('2017-01-01 00:30:00'::timestamp, '2018-01-01 00:00:00'::timestamp,
  139. '30 minutes'::interval) read_date;
  140. -- --------------------------------
  141. -- Processed dataset for April 2017
  142. -- --------------------------------
  143. -- All combinations of read_time and icp_id for April
  144. SELECT read_time, icp_id
  145. FROM
  146. (
  147. SELECT read_time
  148. FROM GENERATE_SERIES('2017-04-01 00:30:00'::timestamp, '2017-05-01 00:00:00'::timestamp,
  149. '30 minutes'::interval) read_time
  150. ) AS tsdata CROSS JOIN
  151. (
  152. SELECT *
  153. FROM
  154. (
  155. SELECT icp_id, COUNT(DISTINCT read_date) AS data_days
  156. FROM coup_prd.coupdatamaster
  157. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  158. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  159. AND content_code = 'UN'
  160. GROUP BY icp_id
  161. ) AS cir
  162. WHERE data_days >= 360
  163. ) AS qual_icp;
  164. -- Creating timestamp column on April dataset
  165. SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
  166. FROM public.coup_tall_april;
  167. -- Combined query (for april only)
  168. SELECT comb.icp_id, comb.read_time, COALESCE(kwh_tot, 0) AS kwh_tot
  169. FROM
  170. (
  171. SELECT read_time, icp_id
  172. FROM
  173. (
  174. SELECT read_time
  175. FROM GENERATE_SERIES('2017-04-01 00:30:00'::timestamp, '2017-05-01 00:00:00'::timestamp,
  176. '30 minutes'::interval) read_time
  177. ) AS tsdata CROSS JOIN
  178. (
  179. SELECT *
  180. FROM
  181. (
  182. SELECT icp_id, COUNT(DISTINCT read_date) AS data_days
  183. FROM coup_prd.coupdatamaster
  184. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  185. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  186. AND content_code = 'UN'
  187. GROUP BY icp_id
  188. ) AS cir
  189. WHERE data_days >= 360
  190. ) AS qual_icp
  191. ) AS comb
  192. LEFT JOIN
  193. (
  194. SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
  195. FROM public.coup_tall_april
  196. ) AS tall_timestamp
  197. ON comb.read_time = tall_timestamp.read_time AND comb.icp_id = tall_timestamp.icp_id;
  198. -- Combined first week testing query
  199. -- SELECT comb.icp_id, COUNT(*), COUNT(kwh_tot)
  200. SELECT comb.icp_id, comb.read_time, COALESCE(kwh_tot, 0) AS kwh_tot
  201. FROM
  202. (
  203. SELECT read_time, icp_id
  204. FROM
  205. (
  206. SELECT read_time
  207. FROM GENERATE_SERIES('2017-04-01 00:30:00'::timestamp, '2017-08-01 00:00:00'::timestamp,
  208. '30 minutes'::interval) read_time
  209. ) AS tsdata CROSS JOIN
  210. (
  211. SELECT *
  212. FROM
  213. (
  214. SELECT icp_id, COUNT(DISTINCT read_date) AS data_days
  215. FROM coup_prd.coupdatamaster
  216. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  217. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  218. AND content_code = 'UN'
  219. AND icp_id LIKE '%17'
  220. GROUP BY icp_id
  221. ) AS cir
  222. WHERE data_days >= 360
  223. ) AS qual_icp
  224. ) AS comb
  225. LEFT JOIN
  226. (
  227. SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
  228. FROM public.coup_tall_april
  229. ) AS tall_timestamp
  230. ON comb.read_time = tall_timestamp.read_time AND comb.icp_id = tall_timestamp.icp_id
  231. GROUP BY comb.icp_id;
  232. -- Move to July
  233. -- Create "tall" table for July 2017 (mod from above)
  234. CREATE TABLE public.coup_tall_july AS
  235. SELECT a.icp_id
  236. , a.read_date
  237. , c.period
  238. , sum(c.read_kwh) as kwh_tot
  239. , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un
  240. , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn
  241. FROM coup_prd.coupdatamaster a,
  242. unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
  243. WHERE a.read_date >= to_date('01/07/2017','dd/mm/yyyy')
  244. and a.read_date < to_date('01/08/2017','dd/mm/yyyy')
  245. and a.content_code ~ ('UN|CN|EG')
  246. GROUP BY 1, 2, 3
  247. ORDER BY 1, 2, 3;
  248. -- Move to January
  249. -- Create "tall" table for Jan 2017 (mod from above)
  250. CREATE TABLE public.coup_tall_jan AS
  251. SELECT a.icp_id
  252. , a.read_date
  253. , c.period
  254. , sum(c.read_kwh) as kwh_tot
  255. , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un
  256. , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn
  257. FROM coup_prd.coupdatamaster a,
  258. unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
  259. WHERE a.read_date >= to_date('01/01/2017','dd/mm/yyyy')
  260. and a.read_date < to_date('01/02/2017','dd/mm/yyyy')
  261. and a.content_code ~ ('UN|CN|EG')
  262. GROUP BY 1, 2, 3
  263. ORDER BY 1, 2, 3;
  264. -- Really big query (jan)
  265. SELECT comb.icp_id, comb.read_time, COALESCE(kwh_tot, 0) AS kwh_tot
  266. FROM
  267. (
  268. SELECT read_time, icp_id
  269. FROM
  270. (
  271. SELECT read_time
  272. FROM GENERATE_SERIES('2017-01-01 00:30:00'::timestamp, '2017-02-01 00:00:00'::timestamp,
  273. '30 minutes'::interval) read_time
  274. ) AS tsdata CROSS JOIN
  275. (
  276. SELECT *
  277. FROM
  278. (
  279. SELECT icp_id, COUNT(DISTINCT read_date) AS data_days
  280. FROM coup_prd.coupdatamaster
  281. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  282. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  283. AND content_code = 'UN'
  284. AND icp_id LIKE '%%19'
  285. GROUP BY icp_id
  286. ) AS cir
  287. WHERE data_days >= 360
  288. ) AS qual_icp
  289. ) AS comb
  290. LEFT JOIN
  291. (
  292. SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
  293. FROM (
  294. SELECT a.icp_id
  295. , a.read_date
  296. , c.period
  297. , sum(c.read_kwh) as kwh_tot
  298. , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un
  299. , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn
  300. FROM coup_prd.coupdatamaster a,
  301. unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
  302. WHERE a.read_date >= to_date('01/01/2017','dd/mm/yyyy')
  303. and a.read_date < to_date('01/02/2017','dd/mm/yyyy')
  304. and a.content_code ~ ('UN|CN|EG')
  305. GROUP BY 1, 2, 3
  306. ) AS coup_tall
  307. ) AS tall_timestamp
  308. ON comb.read_time = tall_timestamp.read_time AND comb.icp_id = tall_timestamp.icp_id;
  309. -- Produces a view of all the icp's with at least 360 days of data, along with a column of days of data
  310. CREATE VIEW public.best_icp AS
  311. SELECT *
  312. FROM
  313. (
  314. SELECT icp_id, COUNT(DISTINCT read_date) AS data_days
  315. FROM coup_prd.coupdatamaster
  316. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  317. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  318. AND content_code = 'UN'
  319. GROUP BY icp_id
  320. ) AS cir
  321. WHERE data_days >= 360;
  322. -- Produces sample table
  323. CREATE TABLE public.icp_sample AS
  324. SELECT * FROM public.best_icp
  325. ORDER BY random()
  326. LIMIT 1000;
  327. -- range of date values
  328. SELECT MIN(read_date) AS mindate, MAX(read_date) AS maxdate
  329. FROM coup_prd.coupdatamaster;
  330. -- View for best data in 18 month set
  331. CREATE VIEW public.best_icp_18m AS
  332. SELECT *
  333. FROM
  334. (
  335. SELECT icp_id, COUNT(DISTINCT read_date) AS data_days
  336. FROM coup_prd.coupdatamaster
  337. WHERE read_date >= to_date('01/07/2016','dd/mm/yyyy')
  338. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  339. AND content_code = 'UN'
  340. GROUP BY icp_id
  341. ) AS cir
  342. WHERE data_days >= 540;
  343. -- Produces sample table for 18m data
  344. CREATE TABLE public.icp_sample_18m AS
  345. SELECT * FROM public.best_icp_18m
  346. ORDER BY random()
  347. LIMIT 1000;
  348. -- Number of ICPs with data for each day
  349. SELECT read_date, COUNT(DISTINCT icp_id) as icps
  350. FROM coup_prd.coupdatamaster
  351. GROUP BY read_date
  352. ORDER BY read_date;
  353. -- View for best data in 2016-18 set
  354. CREATE VIEW public.best_icp_1618 AS
  355. SELECT *
  356. FROM
  357. (
  358. SELECT icp_id, COUNT(DISTINCT read_date) AS data_days
  359. FROM coup_prd.coupdatamaster
  360. WHERE read_date >= to_date('01/04/2016','dd/mm/yyyy')
  361. AND read_date < to_date('01/04/2018','dd/mm/yyyy')
  362. AND content_code = 'UN'
  363. GROUP BY icp_id
  364. ) AS cir
  365. WHERE data_days >= 720;
  366. -- Produces sample table for 1618 data
  367. CREATE TABLE public.icp_sample_1618 AS
  368. SELECT * FROM public.best_icp_1618
  369. ORDER BY random()
  370. LIMIT 1000;
  371. -- Produces 5k sample table for 2017 data
  372. CREATE TABLE public.icp_sample_5k AS
  373. SELECT * FROM public.best_icp
  374. ORDER BY random()
  375. LIMIT 5000;