Repository for Petra's work at ampli Jan-Feb 2019

queries.pgsql 8.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. -- Create "tall" table for april 2017 (from Jason)
  2. CREATE TABLE public.coup_tall_april AS
  3. SELECT a.icp_id
  4. , a.read_date
  5. , c.period
  6. , sum(c.read_kwh) as kwh_tot
  7. , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un
  8. , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn
  9. FROM coup_prd.coupdatamaster a,
  10. unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
  11. WHERE a.read_date >= to_date('01/04/2017','dd/mm/yyyy')
  12. and a.read_date < to_date('01/05/2017','dd/mm/yyyy')
  13. and a.content_code ~ ('UN|CN|EG')
  14. GROUP BY 1, 2, 3
  15. ORDER BY 1, 2, 3;
  16. -- "Head" of the data
  17. SELECT * FROM public.coup_tall_april limit 10;
  18. -- Count distinct ICPs for each date
  19. SELECT read_date, COUNT(DISTINCT icp_id) AS d_icp
  20. FROM public.coup_tall_april
  21. GROUP BY read_date;
  22. -- Calculate mininmums, maximums, averages for each date
  23. SELECT read_date, MIN(kwh_tot), AVG(kwh_tot), MAX(kwh_tot)
  24. FROM public.coup_tall_april
  25. GROUP BY read_date;
  26. -- Find negative values
  27. SELECT * FROM public.coup_tall_april WHERE kwh_tot < 0 OR kwh_un < 0 OR kwh_cn < 0;
  28. -- Count number of ICPs that end in "17"
  29. SELECT COUNT (DISTINCT icp_id) FROM public.coup_tall_april WHERE icp_id LIKE '%17';
  30. -- List first 10 ICPs that end with number 17, sorted
  31. SELECT DISTINCT icp_id FROM public.coup_tall_april WHERE icp_id LIKE '%17' ORDER BY icp_id LIMIT 10;
  32. -- Verify that all ICP IDs are of length 7
  33. SELECT MIN(CHAR_LENGTH(icp_id)) as minl, MAX(CHAR_LENGTH(icp_id)) as maxl FROM public.coup_tall_april;
  34. -- String conversion
  35. SELECT DISTINCT CAST(SUBSTRING(icp_id FROM 2 FOR 6) AS int) AS icp_val, icp_id FROM public.coup_tall_april
  36. -- Count nulls in cast (none)
  37. SELECT COUNT(*) - COUNT(CAST(SUBSTRING(icp_id FROM 2 FOR 6) AS int)) FROM public.coup_tall_april;
  38. -- Verify that there are only "period" values from 1 to 48
  39. SELECT DISTINCT period FROM public.coup_tall_april ORDER BY period;
  40. -- Look at the 2nd specifically (DST ends, 2017)
  41. SELECT * FROM public.coup_tall_april
  42. WHERE read_date = to_date('02/04/2017', 'dd/mm/yy')
  43. ORDER BY icp_id, period
  44. LIMIT 60;
  45. -- September interlude STARTS
  46. -- Create "tall" table for Septeber 24 2017 (mod from Jason, above) to look at DST starting
  47. CREATE TABLE public.coup_dst_2017 AS
  48. SELECT a.icp_id
  49. , a.read_date
  50. , c.period
  51. , sum(c.read_kwh) as kwh_tot
  52. , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un
  53. , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn
  54. FROM coup_prd.coupdatamaster a,
  55. unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
  56. WHERE a.read_date = to_date('24/09/2017','dd/mm/yyyy')
  57. and a.content_code ~ ('UN|CN|EG')
  58. GROUP BY 1, 2, 3
  59. ORDER BY 1, 2, 3;
  60. -- Look at September 24 2017 table
  61. SELECT * FROM public.coup_dst_2017
  62. ORDER BY icp_id, period
  63. LIMIT 60;
  64. -- Verify that there are only "period" values from 1 to 48
  65. SELECT COUNT(DISTINCT period) FROM public.coup_dst_2017;
  66. -- September interlude ENDS
  67. -- Timestamp conversion
  68. SELECT DISTINCT DATE_PART('year', read_date), DATE_PART('month', read_date),
  69. DATE_PART('day', read_date) FROM public.coup_tall_april ORDER BY 1, 2, 3;
  70. -- Creating "time" values
  71. SELECT *, (CONCAT(q.h, ':', q.m, ':00'))::time AS t FROM
  72. (
  73. SELECT DISTINCT period, period / 2 as h, period % 2 * 30 AS m
  74. FROM public.coup_tall_april
  75. ) AS q ORDER BY period;
  76. -- Creating timestamp column
  77. SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
  78. FROM public.coup_tall_april ORDER BY icp_id, read_time limit 50;
  79. -- Possible subset for further testing
  80. SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
  81. FROM public.coup_tall_april WHERE icp_id LIKE '%1117' ORDER BY icp_id, read_time;
  82. -- Calculate averages for each half-hour
  83. SELECT read_date, period, AVG(kwh_tot) AS average
  84. FROM public.coup_tall_april
  85. GROUP BY read_date, period
  86. ORDER BY read_date, period;
  87. -- Gerate timestamp list
  88. SELECT read_date
  89. FROM GENERATE_SERIES('2017-04-01'::timestamp, '2017-04-02'::timestamp, '30 minutes'::interval) read_date;
  90. -- Fraction of icp's in april with 1440 entries
  91. SELECT SUM(CASE WHEN isum.c = 1440 THEN 1 ELSE 0 END)::numeric / COUNT(*)::numeric As frac FROM
  92. (
  93. SELECT COUNT(*) AS c, icp_id FROM public.coup_tall_april
  94. GROUP BY icp_id
  95. ) AS isum;
  96. -- Get numbers of ICPs present over whole dataset
  97. SELECT COUNT(*) as c, icp_id, read_date FROM coup_prd.coupdatamaster
  98. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  99. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  100. AND content_code = 'UN'
  101. GROUP BY icp_id, read_date ORDER BY c DESC, read_date LIMIT 40;
  102. -- Get the range of dates present in the dataset
  103. SELECT MIN(read_date) as min_date, MAX(read_date) as max_date FROM coup_prd.coupdatamaster;
  104. -- Because some people have multiple meters, for a variety of reasons, this query does not work
  105. SELECT SUM(CASE WHEN cir.c > 365 THEN 1 ELSE 0 END), COUNT(*)
  106. FROM
  107. (
  108. SELECT COUNT(*) as c, icp_id FROM coup_prd.coupdatamaster
  109. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  110. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  111. AND content_code = 'UN'
  112. GROUP BY icp_id ORDER BY c DESC
  113. ) AS cir;
  114. -- Find the number of ICPs with less than 360 days of data in 2017
  115. SELECT SUM(CASE WHEN cir.c < 360 THEN 1 ELSE 0 END), COUNT(*)
  116. FROM
  117. (
  118. SELECT icp_id, COUNT(DISTINCT read_date) AS c FROM coup_prd.coupdatamaster
  119. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  120. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  121. AND content_code = 'UN'
  122. GROUP BY icp_id ORDER BY c DESC
  123. ) AS cir;
  124. -- Produces a table of all the icp's with at least 360 days of data, along with a column of days of data
  125. SELECT *
  126. FROM
  127. (
  128. SELECT icp_id, COUNT(DISTINCT read_date) AS data_days
  129. FROM coup_prd.coupdatamaster
  130. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  131. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  132. AND content_code = 'UN'
  133. GROUP BY icp_id
  134. ) AS cir
  135. WHERE data_days >= 360;
  136. -- Gerate timestamp list for 2017
  137. SELECT read_date
  138. FROM GENERATE_SERIES('2017-01-01 00:30:00'::timestamp, '2018-01-01 00:00:00'::timestamp,
  139. '30 minutes'::interval) read_date;
  140. -- --------------------------------
  141. -- Processed dataset for April 2017
  142. -- --------------------------------
  143. -- All combinations of read_time and icp_id for April
  144. SELECT read_time, icp_id
  145. FROM
  146. (
  147. SELECT read_time
  148. FROM GENERATE_SERIES('2017-04-01 00:30:00'::timestamp, '2017-05-01 00:00:00'::timestamp,
  149. '30 minutes'::interval) read_time
  150. ) AS tsdata CROSS JOIN
  151. (
  152. SELECT *
  153. FROM
  154. (
  155. SELECT icp_id, COUNT(DISTINCT read_date) AS data_days
  156. FROM coup_prd.coupdatamaster
  157. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  158. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  159. AND content_code = 'UN'
  160. GROUP BY icp_id
  161. ) AS cir
  162. WHERE data_days >= 360
  163. ) AS qual_icp;
  164. -- Creating timestamp column on April dataset
  165. SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
  166. FROM public.coup_tall_april;
  167. -- Combined query (for april only)
  168. SELECT comb.icp_id, comb.read_time, COALESCE(kwh_tot, 0) AS kwh_tot
  169. FROM
  170. (
  171. SELECT read_time, icp_id
  172. FROM
  173. (
  174. SELECT read_time
  175. FROM GENERATE_SERIES('2017-04-01 00:30:00'::timestamp, '2017-05-01 00:00:00'::timestamp,
  176. '30 minutes'::interval) read_time
  177. ) AS tsdata CROSS JOIN
  178. (
  179. SELECT *
  180. FROM
  181. (
  182. SELECT icp_id, COUNT(DISTINCT read_date) AS data_days
  183. FROM coup_prd.coupdatamaster
  184. WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
  185. AND read_date < to_date('01/01/2018','dd/mm/yyyy')
  186. AND content_code = 'UN'
  187. GROUP BY icp_id
  188. ) AS cir
  189. WHERE data_days >= 360
  190. ) AS qual_icp
  191. ) AS comb
  192. LEFT JOIN
  193. (
  194. SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
  195. FROM public.coup_tall_april
  196. ) AS tall_timestamp
  197. ON comb.read_time = tall_timestamp.read_time AND comb.icp_id = tall_timestamp.icp_id;