Browse Source

Further optimisation

Petra Lamborn 5 years ago
parent
commit
036f012d25
4 changed files with 28 additions and 20 deletions
  1. 1
    1
      py/clustering.py
  2. 1
    1
      py/downkwh.py
  3. 6
    18
      py/util.py
  4. 20
    0
      sql/queries.pgsql

+ 1
- 1
py/clustering.py View File

@@ -36,7 +36,7 @@ from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
36 36
 # plt.show()
37 37
 
38 38
 numclusts = 7
39
-df = p.read_pickle('../data/2017-1s.pkl')
39
+df = p.read_pickle('../data/2017-sample.pkl')
40 40
 dforig = df
41 41
 
42 42
 print(df.info())

+ 1
- 1
py/downkwh.py View File

@@ -44,4 +44,4 @@ kwhdata = getkwh('2017-01-01', '2018-01-01', '2017-01-01 00:30:00', '2018-01-01
44 44
 print(kwhdata.info())
45 45
 
46 46
 print("Pickling")
47
-kwhdata.to_pickle("../data/2017-1s.pkl")
47
+kwhdata.to_pickle("../data/2017-sample.pkl")

+ 6
- 18
py/util.py View File

@@ -77,21 +77,7 @@ def getkwh(datestart, dateend, timestart, timeend, subset):
77 77
             SELECT read_time 
78 78
             FROM GENERATE_SERIES(%(tsstart)s::timestamp, %(tsend)s::timestamp, 
79 79
                 '30 minutes'::interval) read_time
80
-        ) AS tsdata CROSS JOIN
81
-        (
82
-            SELECT *
83
-            FROM
84
-            (
85
-                SELECT icp_id, COUNT(DISTINCT read_date) AS data_days 
86
-                FROM coup_prd.coupdatamaster
87
-                WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
88
-                    AND read_date <  to_date('01/01/2018','dd/mm/yyyy')
89
-                    AND content_code = 'UN'
90
-                    AND icp_id LIKE %(subset)s
91
-                GROUP BY icp_id
92
-            ) AS cir 
93
-            WHERE data_days >= 360
94
-        ) AS qual_icp
80
+        ) AS tsdata CROSS JOIN public.icp_sample
95 81
     ) AS comb
96 82
     LEFT JOIN
97 83
     (
@@ -108,7 +94,9 @@ def getkwh(datestart, dateend, timestart, timeend, subset):
108 94
             WHERE   a.read_date >= to_date(%(datestart)s,'yyyy-mm-dd')
109 95
              and   a.read_date <  to_date(%(dateend)s,'yyyy-mm-dd')
110 96
              and   a.content_code  ~ ('UN|CN|EG')
111
-             AND   a.icp_id LIKE %(subset)s
97
+             AND   a.icp_id IN (
98
+                SELECT icp_id FROM public.icp_sample
99
+             )
112 100
             GROUP BY 1, 2, 3
113 101
         ) AS coup_tall
114 102
     ) AS tall_timestamp 
@@ -118,8 +106,8 @@ def getkwh(datestart, dateend, timestart, timeend, subset):
118 106
         'datestart': datestart,
119 107
         'dateend': dateend,
120 108
         'tsstart': timestart,
121
-        'tsend': timeend,
122
-        'subset': subset
109
+        'tsend': timeend
110
+        # 'subset': subset
123 111
     }
124 112
     print("Getting data with parameters:")
125 113
     print(pdict)

+ 20
- 0
sql/queries.pgsql View File

@@ -344,3 +344,23 @@ LEFT JOIN
344 344
     ) AS coup_tall
345 345
 ) AS tall_timestamp 
346 346
 ON comb.read_time = tall_timestamp.read_time AND comb.icp_id = tall_timestamp.icp_id;
347
+
348
+-- Produces a view of all ICPs with at least 360 days of data in 2017, along with a column giving each ICP's number of days of data
349
+CREATE VIEW public.best_icp AS
350
+SELECT *
351
+FROM
352
+(
353
+    SELECT icp_id, COUNT(DISTINCT read_date) AS data_days 
354
+    FROM coup_prd.coupdatamaster
355
+    WHERE read_date >= to_date('01/01/2017','dd/mm/yyyy')
356
+        AND read_date <  to_date('01/01/2018','dd/mm/yyyy')
357
+        AND content_code = 'UN'
358
+    GROUP BY icp_id
359
+) AS cir 
360
+WHERE data_days >= 360;
361
+
362
+-- Produces a sample table of 1000 randomly chosen ICPs from the best_icp view
363
+CREATE TABLE public.icp_sample AS
364
+SELECT * FROM public.best_icp
365
+ORDER BY random()
366
+LIMIT 1000;