-- Exploratory SQL (PostgreSQL) for half-hourly electricity-meter (ICP) read data.
-- Create "tall" table for April 2017 (from Jason).
-- Unnests the 48-slot half-hourly read_array into one row per
-- (icp_id, read_date, period), summing kWh across meter rows.
CREATE TABLE public.coup_tall_april AS
SELECT a.icp_id
     , a.read_date
     , c.period
     , SUM(c.read_kwh) AS kwh_tot
     , SUM(CASE WHEN a.content_code = 'UN' THEN c.read_kwh ELSE 0 END) AS kwh_un
     , SUM(CASE WHEN a.content_code IN ('CN', 'EG') THEN c.read_kwh ELSE 0 END) AS kwh_cn
FROM coup_prd.coupdatamaster a
CROSS JOIN LATERAL unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
-- Half-open range: all of April 2017.
WHERE a.read_date >= to_date('01/04/2017', 'dd/mm/yyyy')
  AND a.read_date < to_date('01/05/2017', 'dd/mm/yyyy')
  -- BUG FIX: was content_code ~ 'UN|CN|EG' — an unanchored regex that also
  -- matches any code merely *containing* these letters.  IN () pins the exact
  -- codes and agrees with the equality tests in the CASE expressions above.
  AND a.content_code IN ('UN', 'CN', 'EG')
GROUP BY a.icp_id, a.read_date, c.period
ORDER BY a.icp_id, a.read_date, c.period;
-
-- Peek at the first few rows of the April table.
SELECT *
FROM public.coup_tall_april
LIMIT 10;

-- Distinct ICP count for each read date.
SELECT read_date,
       COUNT(DISTINCT icp_id) AS d_icp
FROM public.coup_tall_april
GROUP BY read_date;

-- Daily minimum / average / maximum of total kWh.
SELECT read_date,
       MIN(kwh_tot),
       AVG(kwh_tot),
       MAX(kwh_tot)
FROM public.coup_tall_april
GROUP BY read_date;

-- Any negative readings?
SELECT *
FROM public.coup_tall_april
WHERE kwh_tot < 0
   OR kwh_un < 0
   OR kwh_cn < 0;

-- How many distinct ICPs end in "17"?
SELECT COUNT(DISTINCT icp_id)
FROM public.coup_tall_april
WHERE icp_id LIKE '%17';

-- First 10 ICPs ending in "17", sorted.
SELECT DISTINCT icp_id
FROM public.coup_tall_april
WHERE icp_id LIKE '%17'
ORDER BY icp_id
LIMIT 10;

-- Sanity check: every ICP id should be exactly 7 characters long.
SELECT MIN(CHAR_LENGTH(icp_id)) AS minl,
       MAX(CHAR_LENGTH(icp_id)) AS maxl
FROM public.coup_tall_april;
-
-- String conversion: numeric part of the ICP id (characters 2..7;
-- ids were verified above to be length 7).
-- BUG FIX: the original statement had no terminating semicolon, so the
-- following SELECT was parsed as a continuation and the script failed.
SELECT DISTINCT CAST(SUBSTRING(icp_id FROM 2 FOR 6) AS int) AS icp_val, icp_id
FROM public.coup_tall_april;

-- Count nulls produced by the cast (none expected):
-- COUNT(expr) skips NULLs, so the difference is the NULL count.
SELECT COUNT(*) - COUNT(CAST(SUBSTRING(icp_id FROM 2 FOR 6) AS int))
FROM public.coup_tall_april;
-
-- Verify that only "period" values 1 to 48 occur.
SELECT DISTINCT period
FROM public.coup_tall_april
ORDER BY period;

-- Look at April 2nd specifically (DST ends, 2017).
-- BUG FIX: the format mask was 'dd/mm/yy'; with a two-digit YY field
-- Postgres reads '20' as the year (=> 2020-04-02) and the query silently
-- returns no rows.  Use the four-digit 'yyyy' mask.
SELECT *
FROM public.coup_tall_april
WHERE read_date = to_date('02/04/2017', 'dd/mm/yyyy')
ORDER BY icp_id, period
LIMIT 60;
-
-- September interlude STARTS
-- Create "tall" table for September 24 2017 (mod from Jason, above)
-- to look at the day DST starts.
CREATE TABLE public.coup_dst_2017 AS
SELECT a.icp_id
     , a.read_date
     , c.period
     , SUM(c.read_kwh) AS kwh_tot
     , SUM(CASE WHEN a.content_code = 'UN' THEN c.read_kwh ELSE 0 END) AS kwh_un
     , SUM(CASE WHEN a.content_code IN ('CN', 'EG') THEN c.read_kwh ELSE 0 END) AS kwh_cn
FROM coup_prd.coupdatamaster a
CROSS JOIN LATERAL unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
WHERE a.read_date = to_date('24/09/2017', 'dd/mm/yyyy')
  -- BUG FIX: was an unanchored regex (~ 'UN|CN|EG') that also matches
  -- codes merely containing these letters; IN () matches exactly.
  AND a.content_code IN ('UN', 'CN', 'EG')
GROUP BY a.icp_id, a.read_date, c.period
ORDER BY a.icp_id, a.read_date, c.period;

-- Look at the September 24 2017 table.
SELECT *
FROM public.coup_dst_2017
ORDER BY icp_id, period
LIMIT 60;

-- How many distinct "period" values are present (48 expected)?
SELECT COUNT(DISTINCT period) FROM public.coup_dst_2017;
-- September interlude ENDS
-
-- Which year/month/day combinations are present?
SELECT DISTINCT
       DATE_PART('year', read_date),
       DATE_PART('month', read_date),
       DATE_PART('day', read_date)
FROM public.coup_tall_april
ORDER BY 1, 2, 3;

-- Build a time-of-day value for each half-hour period:
-- period / 2 is integer division (hours); odd periods add 30 minutes.
SELECT q.*,
       (CONCAT(q.h, ':', q.m, ':00'))::time AS t
FROM (
    SELECT DISTINCT period,
           period / 2 AS h,
           period % 2 * 30 AS m
    FROM public.coup_tall_april
) AS q
ORDER BY q.period;

-- Add a full timestamp column (read_date plus the period's time offset).
SELECT *,
       read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
FROM public.coup_tall_april
ORDER BY icp_id, read_time
LIMIT 50;

-- Possible subset for further testing: ICPs ending in "1117".
SELECT *,
       read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
FROM public.coup_tall_april
WHERE icp_id LIKE '%1117'
ORDER BY icp_id, read_time;

-- Average total kWh for each half-hour slot.
SELECT read_date,
       period,
       AVG(kwh_tot) AS average
FROM public.coup_tall_april
GROUP BY read_date, period
ORDER BY read_date, period;
-
-- Generate a half-hourly timestamp list (one day).
SELECT read_date
FROM GENERATE_SERIES('2017-04-01'::timestamp,
                     '2017-04-02'::timestamp,
                     '30 minutes'::interval) read_date;

-- Fraction of April ICPs with a complete month of rows
-- (30 days x 48 half-hours = 1440).
SELECT SUM(CASE WHEN isum.c = 1440 THEN 1 ELSE 0 END)::numeric
       / COUNT(*)::numeric AS frac
FROM (
    SELECT icp_id,
           COUNT(*) AS c
    FROM public.coup_tall_april
    GROUP BY icp_id
) AS isum;
-
-- Rows per (icp_id, read_date) over all of 2017: counts above 1 show
-- ICPs reported more than once per day.
SELECT COUNT(*) AS c,
       icp_id,
       read_date
FROM coup_prd.coupdatamaster
WHERE read_date >= to_date('01/01/2017', 'dd/mm/yyyy')
  AND read_date < to_date('01/01/2018', 'dd/mm/yyyy')
  AND content_code = 'UN'
GROUP BY icp_id, read_date
ORDER BY c DESC, read_date
LIMIT 40;

-- Range of dates present in the dataset.
SELECT MIN(read_date) AS min_date,
       MAX(read_date) AS max_date
FROM coup_prd.coupdatamaster;

-- Because some people have multiple meters (for a variety of reasons),
-- raw row counts can exceed 365, so this query does not work as a
-- "full year of data" test.
SELECT SUM(CASE WHEN cir.c > 365 THEN 1 ELSE 0 END),
       COUNT(*)
FROM (
    SELECT icp_id,
           COUNT(*) AS c
    FROM coup_prd.coupdatamaster
    WHERE read_date >= to_date('01/01/2017', 'dd/mm/yyyy')
      AND read_date < to_date('01/01/2018', 'dd/mm/yyyy')
      AND content_code = 'UN'
    GROUP BY icp_id
    ORDER BY c DESC
) AS cir;

-- Number of ICPs with fewer than 360 distinct days of data in 2017.
SELECT SUM(CASE WHEN cir.c < 360 THEN 1 ELSE 0 END),
       COUNT(*)
FROM (
    SELECT icp_id,
           COUNT(DISTINCT read_date) AS c
    FROM coup_prd.coupdatamaster
    WHERE read_date >= to_date('01/01/2017', 'dd/mm/yyyy')
      AND read_date < to_date('01/01/2018', 'dd/mm/yyyy')
      AND content_code = 'UN'
    GROUP BY icp_id
    ORDER BY c DESC
) AS cir;
-
-- All ICPs with at least 360 distinct days of 2017 data,
-- together with their day counts.
SELECT cir.icp_id,
       cir.data_days
FROM (
    SELECT icp_id,
           COUNT(DISTINCT read_date) AS data_days
    FROM coup_prd.coupdatamaster
    WHERE read_date >= to_date('01/01/2017', 'dd/mm/yyyy')
      AND read_date < to_date('01/01/2018', 'dd/mm/yyyy')
      AND content_code = 'UN'
    GROUP BY icp_id
) AS cir
WHERE cir.data_days >= 360;

-- Generate a half-hourly timestamp list covering all of 2017.
SELECT read_date
FROM GENERATE_SERIES('2017-01-01 00:30:00'::timestamp,
                     '2018-01-01 00:00:00'::timestamp,
                     '30 minutes'::interval) read_date;
-
-
-- --------------------------------
-- Processed dataset for April 2017
-- --------------------------------

-- Every combination of half-hour timestamp and qualifying ICP for April.
SELECT read_time,
       icp_id
FROM (
    -- Half-hour grid for April (series starts at 00:30, matching period 1).
    SELECT read_time
    FROM GENERATE_SERIES('2017-04-01 00:30:00'::timestamp,
                         '2017-05-01 00:00:00'::timestamp,
                         '30 minutes'::interval) read_time
) AS tsdata
CROSS JOIN (
    -- ICPs with at least 360 distinct days of 2017 data.
    SELECT *
    FROM (
        SELECT icp_id,
               COUNT(DISTINCT read_date) AS data_days
        FROM coup_prd.coupdatamaster
        WHERE read_date >= to_date('01/01/2017', 'dd/mm/yyyy')
          AND read_date < to_date('01/01/2018', 'dd/mm/yyyy')
          AND content_code = 'UN'
        GROUP BY icp_id
    ) AS cir
    WHERE data_days >= 360
) AS qual_icp;

-- Timestamp column on the April dataset.
SELECT *,
       read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
FROM public.coup_tall_april;
-
-- Combined query (April only): full timestamp x qualifying-ICP grid,
-- left-joined to the actual readings, with missing readings filled as 0.
WITH tsdata AS (
    -- Half-hour grid for April 2017.
    SELECT read_time
    FROM GENERATE_SERIES('2017-04-01 00:30:00'::timestamp,
                         '2017-05-01 00:00:00'::timestamp,
                         '30 minutes'::interval) read_time
),
qual_icp AS (
    -- ICPs with at least 360 distinct days of 2017 data.
    SELECT *
    FROM (
        SELECT icp_id,
               COUNT(DISTINCT read_date) AS data_days
        FROM coup_prd.coupdatamaster
        WHERE read_date >= to_date('01/01/2017', 'dd/mm/yyyy')
          AND read_date < to_date('01/01/2018', 'dd/mm/yyyy')
          AND content_code = 'UN'
        GROUP BY icp_id
    ) AS cir
    WHERE data_days >= 360
),
comb AS (
    -- Every (timestamp, ICP) combination.
    SELECT read_time, icp_id
    FROM tsdata CROSS JOIN qual_icp
),
tall_timestamp AS (
    -- April readings with a reconstructed timestamp.
    SELECT *,
           read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
    FROM public.coup_tall_april
)
SELECT comb.icp_id,
       comb.read_time,
       COALESCE(kwh_tot, 0) AS kwh_tot
FROM comb
LEFT JOIN tall_timestamp
       ON comb.read_time = tall_timestamp.read_time
      AND comb.icp_id = tall_timestamp.icp_id;
-
-- Combined testing query over the Apr-Jul timestamp grid, restricted to
-- ICPs ending in "17".
-- NOTE(review): the grid spans 2017-04-01..2017-08-01 but it is joined
-- only against coup_tall_april, so May-July rows all come out 0 — confirm
-- this is intended for the test.
-- BUG FIX: the original ended with GROUP BY comb.icp_id while selecting
-- the non-aggregated columns comb.read_time and kwh_tot — invalid SQL
-- (a leftover from a commented-out COUNT(*) variant).  The GROUP BY and
-- the stale commented-out SELECT have been removed.
SELECT comb.icp_id,
       comb.read_time,
       COALESCE(kwh_tot, 0) AS kwh_tot
FROM (
    SELECT read_time, icp_id
    FROM (
        SELECT read_time
        FROM GENERATE_SERIES('2017-04-01 00:30:00'::timestamp,
                             '2017-08-01 00:00:00'::timestamp,
                             '30 minutes'::interval) read_time
    ) AS tsdata
    CROSS JOIN (
        SELECT *
        FROM (
            SELECT icp_id,
                   COUNT(DISTINCT read_date) AS data_days
            FROM coup_prd.coupdatamaster
            WHERE read_date >= to_date('01/01/2017', 'dd/mm/yyyy')
              AND read_date < to_date('01/01/2018', 'dd/mm/yyyy')
              AND content_code = 'UN'
              AND icp_id LIKE '%17'
            GROUP BY icp_id
        ) AS cir
        WHERE data_days >= 360
    ) AS qual_icp
) AS comb
LEFT JOIN (
    SELECT *,
           read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
    FROM public.coup_tall_april
) AS tall_timestamp
    ON comb.read_time = tall_timestamp.read_time
   AND comb.icp_id = tall_timestamp.icp_id;
-
-- Move to July

-- Create "tall" table for July 2017 (mod from above).
CREATE TABLE public.coup_tall_july AS
SELECT a.icp_id
     , a.read_date
     , c.period
     , SUM(c.read_kwh) AS kwh_tot
     , SUM(CASE WHEN a.content_code = 'UN' THEN c.read_kwh ELSE 0 END) AS kwh_un
     , SUM(CASE WHEN a.content_code IN ('CN', 'EG') THEN c.read_kwh ELSE 0 END) AS kwh_cn
FROM coup_prd.coupdatamaster a
CROSS JOIN LATERAL unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
WHERE a.read_date >= to_date('01/07/2017', 'dd/mm/yyyy')
  AND a.read_date < to_date('01/08/2017', 'dd/mm/yyyy')
  -- BUG FIX: was an unanchored regex (~ 'UN|CN|EG'); IN () matches exactly.
  AND a.content_code IN ('UN', 'CN', 'EG')
GROUP BY a.icp_id, a.read_date, c.period
ORDER BY a.icp_id, a.read_date, c.period;
-
-- Move to January

-- Create "tall" table for January 2017 (mod from above).
CREATE TABLE public.coup_tall_jan AS
SELECT a.icp_id
     , a.read_date
     , c.period
     , SUM(c.read_kwh) AS kwh_tot
     , SUM(CASE WHEN a.content_code = 'UN' THEN c.read_kwh ELSE 0 END) AS kwh_un
     , SUM(CASE WHEN a.content_code IN ('CN', 'EG') THEN c.read_kwh ELSE 0 END) AS kwh_cn
FROM coup_prd.coupdatamaster a
CROSS JOIN LATERAL unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
WHERE a.read_date >= to_date('01/01/2017', 'dd/mm/yyyy')
  AND a.read_date < to_date('01/02/2017', 'dd/mm/yyyy')
  -- BUG FIX: was an unanchored regex (~ 'UN|CN|EG'); IN () matches exactly.
  AND a.content_code IN ('UN', 'CN', 'EG')
GROUP BY a.icp_id, a.read_date, c.period
ORDER BY a.icp_id, a.read_date, c.period;
-
-- Really big query (January): timestamp x qualifying-ICP grid left-joined
-- to an inline January "tall" aggregation, zero-filling missing readings.
-- BUG FIXES:
--   * 'LIKE ''%%19''' and 'period %% 2' kept doubled percent signs (escape
--     sequences left over from a templating layer); '%%' is not a SQL
--     operator and the query fails — both reduced to a single '%'.
--   * content_code ~ 'UN|CN|EG' (unanchored regex) replaced with IN ().
SELECT comb.icp_id,
       comb.read_time,
       COALESCE(kwh_tot, 0) AS kwh_tot
FROM (
    SELECT read_time, icp_id
    FROM (
        SELECT read_time
        FROM GENERATE_SERIES('2017-01-01 00:30:00'::timestamp,
                             '2017-02-01 00:00:00'::timestamp,
                             '30 minutes'::interval) read_time
    ) AS tsdata
    CROSS JOIN (
        SELECT *
        FROM (
            SELECT icp_id,
                   COUNT(DISTINCT read_date) AS data_days
            FROM coup_prd.coupdatamaster
            WHERE read_date >= to_date('01/01/2017', 'dd/mm/yyyy')
              AND read_date < to_date('01/01/2018', 'dd/mm/yyyy')
              AND content_code = 'UN'
              AND icp_id LIKE '%19'
            GROUP BY icp_id
        ) AS cir
        WHERE data_days >= 360
    ) AS qual_icp
) AS comb
LEFT JOIN (
    SELECT *,
           read_date + CONCAT(period / 2, ':', period % 2 * 30, ':00')::time AS read_time
    FROM (
        -- Inline January equivalent of public.coup_tall_jan.
        SELECT a.icp_id
             , a.read_date
             , c.period
             , SUM(c.read_kwh) AS kwh_tot
             , SUM(CASE WHEN a.content_code = 'UN' THEN c.read_kwh ELSE 0 END) AS kwh_un
             , SUM(CASE WHEN a.content_code IN ('CN', 'EG') THEN c.read_kwh ELSE 0 END) AS kwh_cn
        FROM coup_prd.coupdatamaster a
        CROSS JOIN LATERAL unnest(a.read_array) WITH ORDINALITY c(read_kwh, period)
        WHERE a.read_date >= to_date('01/01/2017', 'dd/mm/yyyy')
          AND a.read_date < to_date('01/02/2017', 'dd/mm/yyyy')
          AND a.content_code IN ('UN', 'CN', 'EG')
        GROUP BY a.icp_id, a.read_date, c.period
    ) AS coup_tall
) AS tall_timestamp
    ON comb.read_time = tall_timestamp.read_time
   AND comb.icp_id = tall_timestamp.icp_id;
-
-- View of all ICPs with at least 360 distinct days of 2017 data,
-- along with a column of days of data.
CREATE VIEW public.best_icp AS
SELECT cir.icp_id,
       cir.data_days
FROM (
    SELECT icp_id,
           COUNT(DISTINCT read_date) AS data_days
    FROM coup_prd.coupdatamaster
    WHERE read_date >= to_date('01/01/2017', 'dd/mm/yyyy')
      AND read_date < to_date('01/01/2018', 'dd/mm/yyyy')
      AND content_code = 'UN'
    GROUP BY icp_id
) AS cir
WHERE cir.data_days >= 360;

-- Random sample of 1000 qualifying ICPs.
CREATE TABLE public.icp_sample AS
SELECT *
FROM public.best_icp
ORDER BY random()
LIMIT 1000;

-- Range of date values in the master table.
SELECT MIN(read_date) AS mindate,
       MAX(read_date) AS maxdate
FROM coup_prd.coupdatamaster;
-
-- View for the best data in the 18-month set (Jul 2016 - Dec 2017,
-- requiring at least 540 distinct days).
CREATE VIEW public.best_icp_18m AS
SELECT cir.icp_id,
       cir.data_days
FROM (
    SELECT icp_id,
           COUNT(DISTINCT read_date) AS data_days
    FROM coup_prd.coupdatamaster
    WHERE read_date >= to_date('01/07/2016', 'dd/mm/yyyy')
      AND read_date < to_date('01/01/2018', 'dd/mm/yyyy')
      AND content_code = 'UN'
    GROUP BY icp_id
) AS cir
WHERE cir.data_days >= 540;

-- Random sample of 1000 ICPs from the 18-month qualifiers.
CREATE TABLE public.icp_sample_18m AS
SELECT *
FROM public.best_icp_18m
ORDER BY random()
LIMIT 1000;

-- Number of ICPs with data on each day.
SELECT read_date,
       COUNT(DISTINCT icp_id) AS icps
FROM coup_prd.coupdatamaster
GROUP BY read_date
ORDER BY read_date;
-
-- View for the best data in the 2016-18 set (Apr 2016 - Mar 2018,
-- requiring at least 720 distinct days).
CREATE VIEW public.best_icp_1618 AS
SELECT cir.icp_id,
       cir.data_days
FROM (
    SELECT icp_id,
           COUNT(DISTINCT read_date) AS data_days
    FROM coup_prd.coupdatamaster
    WHERE read_date >= to_date('01/04/2016', 'dd/mm/yyyy')
      AND read_date < to_date('01/04/2018', 'dd/mm/yyyy')
      AND content_code = 'UN'
    GROUP BY icp_id
) AS cir
WHERE cir.data_days >= 720;

-- Random sample of 1000 ICPs from the 2016-18 qualifiers.
CREATE TABLE public.icp_sample_1618 AS
SELECT *
FROM public.best_icp_1618
ORDER BY random()
LIMIT 1000;

-- Random sample of 5000 ICPs from the 2017 qualifiers.
CREATE TABLE public.icp_sample_5k AS
SELECT *
FROM public.best_icp
ORDER BY random()
LIMIT 5000;
|