-- Create "tall" table for april 2017 (from Jason) CREATE TABLE public.coup_tall_april AS SELECT a.icp_id , a.read_date , c.period , sum(c.read_kwh) as kwh_tot , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn FROM coup_prd.coupdatamaster a, unnest(a.read_array) WITH ORDINALITY c(read_kwh, period) WHERE a.read_date >= to_date('01/04/2017','dd/mm/yyyy') and a.read_date < to_date('01/05/2017','dd/mm/yyyy') and a.content_code ~ ('UN|CN|EG') GROUP BY 1, 2, 3 ORDER BY 1, 2, 3; -- "Head" of the data SELECT * FROM public.coup_tall_april limit 10; -- Count distinct ICPs for each date SELECT read_date, COUNT(DISTINCT icp_id) AS d_icp FROM public.coup_tall_april GROUP BY read_date; -- Calculate mininmums, maximums, averages for each date SELECT read_date, MIN(kwh_tot), AVG(kwh_tot), MAX(kwh_tot) FROM public.coup_tall_april GROUP BY read_date; -- Find negative values SELECT * FROM public.coup_tall_april WHERE kwh_tot < 0 OR kwh_un < 0 OR kwh_cn < 0; -- Count number of ICPs that end in "17" SELECT COUNT (DISTINCT icp_id) FROM public.coup_tall_april WHERE icp_id LIKE '%17'; -- List first 10 ICPs that end with number 17, sorted SELECT DISTINCT icp_id FROM public.coup_tall_april WHERE icp_id LIKE '%17' ORDER BY icp_id LIMIT 10; -- Verify that all ICP IDs are of length 7 SELECT MIN(CHAR_LENGTH(icp_id)) as minl, MAX(CHAR_LENGTH(icp_id)) as maxl FROM public.coup_tall_april; -- String conversion SELECT DISTINCT CAST(SUBSTRING(icp_id FROM 2 FOR 6) AS int) AS icp_val, icp_id FROM public.coup_tall_april -- Count nulls in cast (none) SELECT COUNT(*) - COUNT(CAST(SUBSTRING(icp_id FROM 2 FOR 6) AS int)) FROM public.coup_tall_april; -- Verify that there are only "period" values from 1 to 48 SELECT DISTINCT period FROM public.coup_tall_april ORDER BY period; -- Look at the 2nd specifically (DST ends, 2017) SELECT * FROM public.coup_tall_april WHERE read_date = to_date('02/04/2017', 'dd/mm/yy') ORDER BY icp_id, period LIMIT 60; -- September interlude STARTS -- Create "tall" table for Septeber 24 2017 (mod from Jason, above) to look at DST starting CREATE TABLE public.coup_dst_2017 AS SELECT a.icp_id , a.read_date , c.period , sum(c.read_kwh) as kwh_tot , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn FROM coup_prd.coupdatamaster a, unnest(a.read_array) WITH ORDINALITY c(read_kwh, period) WHERE a.read_date = to_date('24/09/2017','dd/mm/yyyy') and a.content_code ~ ('UN|CN|EG') GROUP BY 1, 2, 3 ORDER BY 1, 2, 3; -- Look at September 24 2017 table SELECT * FROM public.coup_dst_2017 ORDER BY icp_id, period LIMIT 60; -- Verify that there are only "period" values from 1 to 48 SELECT COUNT(DISTINCT period) FROM public.coup_dst_2017; -- September interlude ENDS -- Timestamp conversion SELECT DISTINCT DATE_PART('year', read_date), DATE_PART('month', read_date), DATE_PART('day', read_date) FROM public.coup_tall_april ORDER BY 1, 2, 3; -- Creating "time" values SELECT *, (CONCAT(q.h, ':', q.m, ':00'))::time AS t FROM ( SELECT DISTINCT period, period / 2 as h, period % 2 * 30 AS m FROM public.coup_tall_april ) AS q ORDER BY period; -- Creating timestamp column SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time FROM public.coup_tall_april ORDER BY icp_id, read_time limit 50; -- Possible subset for further testing SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time FROM public.coup_tall_april WHERE icp_id LIKE '%1117' ORDER BY icp_id, read_time; -- Calculate averages for each half-hour SELECT read_date, period, AVG(kwh_tot) AS average FROM public.coup_tall_april GROUP BY read_date, period ORDER BY read_date, period; -- Gerate timestamp list SELECT read_date FROM GENERATE_SERIES('2017-04-01'::timestamp, '2017-04-02'::timestamp, '30 minutes'::interval) read_date; -- Fraction of icp's in april with 1440 entries SELECT SUM(CASE WHEN isum.c = 1440 THEN 1 ELSE 0 END)::numeric / COUNT(*)::numeric As frac FROM ( SELECT COUNT(*) AS c, icp_id FROM public.coup_tall_april GROUP BY icp_id ) AS isum; -- Get numbers of ICPs present over whole dataset SELECT COUNT(*) as c, icp_id, read_date FROM coup_prd.coupdatamaster WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy') AND read_date < to_date('01/01/2018','dd/mm/yyyy') AND content_code = 'UN' GROUP BY icp_id, read_date ORDER BY c DESC, read_date LIMIT 40; -- Get the range of dates present in the dataset SELECT MIN(read_date) as min_date, MAX(read_date) as max_date FROM coup_prd.coupdatamaster; -- Because some people have multiple meters, for a variety of reasons, this query does not work SELECT SUM(CASE WHEN cir.c > 365 THEN 1 ELSE 0 END), COUNT(*) FROM ( SELECT COUNT(*) as c, icp_id FROM coup_prd.coupdatamaster WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy') AND read_date < to_date('01/01/2018','dd/mm/yyyy') AND content_code = 'UN' GROUP BY icp_id ORDER BY c DESC ) AS cir; -- Find the number of ICPs with less than 360 days of data in 2017 SELECT SUM(CASE WHEN cir.c < 360 THEN 1 ELSE 0 END), COUNT(*) FROM ( SELECT icp_id, COUNT(DISTINCT read_date) AS c FROM coup_prd.coupdatamaster WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy') AND read_date < to_date('01/01/2018','dd/mm/yyyy') AND content_code = 'UN' GROUP BY icp_id ORDER BY c DESC ) AS cir; -- Produces a table of all the icp's with at least 360 days of data, along with a column of days of data SELECT * FROM ( SELECT icp_id, COUNT(DISTINCT read_date) AS data_days FROM coup_prd.coupdatamaster WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy') AND read_date < to_date('01/01/2018','dd/mm/yyyy') AND content_code = 'UN' GROUP BY icp_id ) AS cir WHERE data_days >= 360; -- Gerate timestamp list for 2017 SELECT read_date FROM GENERATE_SERIES('2017-01-01 00:30:00'::timestamp, '2018-01-01 00:00:00'::timestamp, '30 minutes'::interval) read_date; -- -------------------------------- -- Processed dataset for April 2017 -- -------------------------------- -- All combinations of read_time and icp_id for April SELECT read_time, icp_id FROM ( SELECT read_time FROM GENERATE_SERIES('2017-04-01 00:30:00'::timestamp, '2017-05-01 00:00:00'::timestamp, '30 minutes'::interval) read_time ) AS tsdata CROSS JOIN ( SELECT * FROM ( SELECT icp_id, COUNT(DISTINCT read_date) AS data_days FROM coup_prd.coupdatamaster WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy') AND read_date < to_date('01/01/2018','dd/mm/yyyy') AND content_code = 'UN' GROUP BY icp_id ) AS cir WHERE data_days >= 360 ) AS qual_icp; -- Creating timestamp column on April dataset SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time FROM public.coup_tall_april; -- Combined query (for april only) SELECT comb.icp_id, comb.read_time, COALESCE(kwh_tot, 0) AS kwh_tot FROM ( SELECT read_time, icp_id FROM ( SELECT read_time FROM GENERATE_SERIES('2017-04-01 00:30:00'::timestamp, '2017-05-01 00:00:00'::timestamp, '30 minutes'::interval) read_time ) AS tsdata CROSS JOIN ( SELECT * FROM ( SELECT icp_id, COUNT(DISTINCT read_date) AS data_days FROM coup_prd.coupdatamaster WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy') AND read_date < to_date('01/01/2018','dd/mm/yyyy') AND content_code = 'UN' GROUP BY icp_id ) AS cir WHERE data_days >= 360 ) AS qual_icp ) AS comb LEFT JOIN ( SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time FROM public.coup_tall_april ) AS tall_timestamp ON comb.read_time = tall_timestamp.read_time AND comb.icp_id = tall_timestamp.icp_id; -- Combined first week testing query -- SELECT comb.icp_id, COUNT(*), COUNT(kwh_tot) SELECT comb.icp_id, comb.read_time, COALESCE(kwh_tot, 0) AS kwh_tot FROM ( SELECT read_time, icp_id FROM ( SELECT read_time FROM GENERATE_SERIES('2017-04-01 00:30:00'::timestamp, '2017-08-01 00:00:00'::timestamp, '30 minutes'::interval) read_time ) AS tsdata CROSS JOIN ( SELECT * FROM ( SELECT icp_id, COUNT(DISTINCT read_date) AS data_days FROM coup_prd.coupdatamaster WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy') AND read_date < to_date('01/01/2018','dd/mm/yyyy') AND content_code = 'UN' AND icp_id LIKE '%17' GROUP BY icp_id ) AS cir WHERE data_days >= 360 ) AS qual_icp ) AS comb LEFT JOIN ( SELECT *, read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time FROM public.coup_tall_april ) AS tall_timestamp ON comb.read_time = tall_timestamp.read_time AND comb.icp_id = tall_timestamp.icp_id GROUP BY comb.icp_id; -- Move to July -- Create "tall" table for July 2017 (mod from above) CREATE TABLE public.coup_tall_july AS SELECT a.icp_id , a.read_date , c.period , sum(c.read_kwh) as kwh_tot , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn FROM coup_prd.coupdatamaster a, unnest(a.read_array) WITH ORDINALITY c(read_kwh, period) WHERE a.read_date >= to_date('01/07/2017','dd/mm/yyyy') and a.read_date < to_date('01/08/2017','dd/mm/yyyy') and a.content_code ~ ('UN|CN|EG') GROUP BY 1, 2, 3 ORDER BY 1, 2, 3; -- Move to January -- Create "tall" table for Jan 2017 (mod from above) CREATE TABLE public.coup_tall_jan AS SELECT a.icp_id , a.read_date , c.period , sum(c.read_kwh) as kwh_tot , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn FROM coup_prd.coupdatamaster a, unnest(a.read_array) WITH ORDINALITY c(read_kwh, period) WHERE a.read_date >= to_date('01/01/2017','dd/mm/yyyy') and a.read_date < to_date('01/02/2017','dd/mm/yyyy') and a.content_code ~ ('UN|CN|EG') GROUP BY 1, 2, 3 ORDER BY 1, 2, 3; -- Really big query (jan) SELECT comb.icp_id, comb.read_time, COALESCE(kwh_tot, 0) AS kwh_tot FROM ( SELECT read_time, icp_id FROM ( SELECT read_time FROM GENERATE_SERIES('2017-01-01 00:30:00'::timestamp, '2017-02-01 00:00:00'::timestamp, '30 minutes'::interval) read_time ) AS tsdata CROSS JOIN ( SELECT * FROM ( SELECT icp_id, COUNT(DISTINCT read_date) AS data_days FROM coup_prd.coupdatamaster WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy') AND read_date < to_date('01/01/2018','dd/mm/yyyy') AND content_code = 'UN' AND icp_id LIKE '%%19' GROUP BY icp_id ) AS cir WHERE data_days >= 360 ) AS qual_icp ) AS comb LEFT JOIN ( SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time FROM ( SELECT a.icp_id , a.read_date , c.period , sum(c.read_kwh) as kwh_tot , sum(case when a.content_code = 'UN' then c.read_kwh else 0 end) as kwh_un , sum(case when a.content_code in ('CN','EG') then c.read_kwh else 0 end) as kwh_cn FROM coup_prd.coupdatamaster a, unnest(a.read_array) WITH ORDINALITY c(read_kwh, period) WHERE a.read_date >= to_date('01/01/2017','dd/mm/yyyy') and a.read_date < to_date('01/02/2017','dd/mm/yyyy') and a.content_code ~ ('UN|CN|EG') GROUP BY 1, 2, 3 ) AS coup_tall ) AS tall_timestamp ON comb.read_time = tall_timestamp.read_time AND comb.icp_id = tall_timestamp.icp_id; -- Produces a view of all the icp's with at least 360 days of data, along with a column of days of data CREATE VIEW public.best_icp AS SELECT * FROM ( SELECT icp_id, COUNT(DISTINCT read_date) AS data_days FROM coup_prd.coupdatamaster WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy') AND read_date < to_date('01/01/2018','dd/mm/yyyy') AND content_code = 'UN' GROUP BY icp_id ) AS cir WHERE data_days >= 360; -- Produces sample table CREATE TABLE public.icp_sample AS SELECT * FROM public.best_icp ORDER BY random() LIMIT 1000; -- range of date values SELECT MIN(read_date) AS mindate, MAX(read_date) AS maxdate FROM coup_prd.coupdatamaster; -- View for best data in 18 month set CREATE VIEW public.best_icp_18m AS SELECT * FROM ( SELECT icp_id, COUNT(DISTINCT read_date) AS data_days FROM coup_prd.coupdatamaster WHERE read_date >= to_date('01/07/2016','dd/mm/yyyy') AND read_date < to_date('01/01/2018','dd/mm/yyyy') AND content_code = 'UN' GROUP BY icp_id ) AS cir WHERE data_days >= 540; -- Produces sample table for 18m data CREATE TABLE public.icp_sample_18m AS SELECT * FROM public.best_icp_18m ORDER BY random() LIMIT 1000; -- Number of ICPs with data for each day SELECT read_date, COUNT(DISTINCT icp_id) as icps FROM coup_prd.coupdatamaster GROUP BY read_date ORDER BY read_date; -- View for best data in 2016-18 set CREATE VIEW public.best_icp_1618 AS SELECT * FROM ( SELECT icp_id, COUNT(DISTINCT read_date) AS data_days FROM coup_prd.coupdatamaster WHERE read_date >= to_date('01/04/2016','dd/mm/yyyy') AND read_date < to_date('01/04/2018','dd/mm/yyyy') AND content_code = 'UN' GROUP BY icp_id ) AS cir WHERE data_days >= 720; -- Produces sample table for 1618 data CREATE TABLE public.icp_sample_1618 AS SELECT * FROM public.best_icp_1618 ORDER BY random() LIMIT 1000; -- Produces 5k sample table for 2017 data CREATE TABLE public.icp_sample_5k AS SELECT * FROM public.best_icp ORDER BY random() LIMIT 5000;