|
@@ -1,77 +1,56 @@
|
1
|
1
|
# An algorithm for assigning a dataset to pre-existing clusters
|
2
|
2
|
import pandas as p
|
|
3
|
+import numpy as np
|
3
|
4
|
from pprint import pprint
|
4
|
|
-
|
5
|
|
-# Pre-existing aggregated clusters
|
6
|
|
-clusfile = '../data/9-clusters.agg.pkl'
|
7
|
|
-
|
8
|
|
-# A new dataset
|
9
|
|
-ndsfile = '../data/2016-17-sample.pkl'
|
10
|
|
-
|
11
|
|
-# Table of assigned clusters
|
12
|
|
-aclusfile = '../data/1617-asgn-table.pkl'
|
13
|
|
-
|
14
|
|
-# Aggregated dataset
|
15
|
|
-aggfile = '../data/1617-agg.pkl'
|
16
|
|
-
|
17
|
|
-
|
18
|
|
-clusdf = p.read_pickle(clusfile)
|
19
|
|
-clusdf = clusdf.pivot(index = 'read_time', columns = 'cluster', values = 'kwh_tot_mean')
|
20
|
|
-del clusdf.columns.name
|
21
|
|
-print(clusdf.info())
|
22
|
|
-
|
23
|
|
-
|
24
|
|
-
|
25
|
|
-newdf = p.read_pickle(ndsfile).pivot(index = 'read_time',
|
26
|
|
- columns = 'icp_id',
|
27
|
|
- values = 'kwh_tot').loc[clusdf.index, :]
|
28
|
|
-print(newdf)
|
29
|
|
-print(newdf.info())
|
30
|
|
-
|
31
|
|
-clusdict = {}
|
32
|
|
-
|
33
|
|
-clusters = list(clusdf.columns)
|
34
|
|
-icps = list(newdf.columns)
|
35
|
|
-
|
36
|
|
-print(clusters)
|
37
|
|
-
|
38
|
|
-for i in icps:
|
39
|
|
- bestc = -1
|
40
|
|
- s = newdf.loc[:, i]
|
41
|
|
- if (s.min() != s.max()):
|
42
|
|
- bestr = -1
|
43
|
|
- for c in clusters:
|
44
|
|
- thisr = s.corr(clusdf.loc[:, c], method = 'pearson')
|
45
|
|
- if thisr > bestr:
|
46
|
|
- bestr = thisr
|
47
|
|
- bestc = c
|
48
|
|
- print('Assigning ICP {} to cluster {} with correlation {}.'.format(i, bestc, bestr))
|
49
|
|
- else:
|
50
|
|
- print('ICP {} has constant value; assigning to cluster -1'.format(i))
|
51
|
|
- clusdict[i] = bestc
|
52
|
|
-
|
53
|
|
-newclusdf = p.DataFrame.from_dict(clusdict, orient = 'index', columns = ['cluster'])
|
54
|
|
-newclusdf.index.name = 'icp_id'
|
55
|
|
-newclusdf = newclusdf.reset_index()
|
56
|
|
-print(newclusdf)
|
57
|
|
-newclusdf.to_pickle(aclusfile)
|
58
|
|
-
|
59
|
|
-
|
60
|
|
-newdf = p.melt(newdf.reset_index(), 'read_time', var_name = 'icp_id', value_name = 'kwh')
|
61
|
|
-
|
62
|
|
-print(newdf.info())
|
63
|
|
-print(newclusdf.info())
|
64
|
|
-
|
65
|
|
-anndf = newdf.set_index('icp_id').join(newclusdf.set_index('icp_id')).reset_index()
|
66
|
|
-print(anndf)
|
67
|
|
-
|
68
|
|
-qlow = lambda x: x.quantile(0.250)
|
69
|
|
-qhigh = lambda x: x.quantile(0.750)
|
70
|
|
-newagg = anndf.groupby(['read_time', 'cluster']).agg({
|
71
|
|
- 'kwh': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
|
72
|
|
-})
|
73
|
|
-newagg.columns = ['_tot_'.join(x) for x in newagg.columns.ravel()]
|
74
|
|
-newagg = newagg.reset_index()
|
75
|
|
-
|
76
|
|
-print(newagg)
|
77
|
|
-newagg.to_pickle(aggfile)
|
|
5
|
+from argparse import ArgumentParser
|
|
6
|
+from tqdm import tqdm
|
|
7
|
+
|
|
8
|
def AssignClusters(df, agg, threshold = -1):
    """Assign each ICP (column of df) to the best-correlated existing cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide-format demand data: indexed by read_time, one column per icp_id.
        NOTE(review): assumes the caller has already pivoted the input and
        that its index covers agg's read_time values — confirm against callers.
    agg : pandas.DataFrame
        Long-format aggregated cluster profiles with 'read_time', 'cluster'
        and 'kwh_tot_mean' columns.
    threshold : float, optional
        Minimum Pearson correlation an ICP's best match must exceed to be
        assigned; anything at or below it gets cluster -1.  Default -1.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'icp_id' and 'cluster'.
    """
    # One column per cluster, indexed by read_time.
    agg = agg.pivot(index = 'read_time', columns = 'cluster', values = 'kwh_tot_mean')
    # `del agg.columns.name` raises AttributeError on modern pandas;
    # assigning None clears the columns-axis label portably.
    agg.columns.name = None
    # Restrict the new data to the timestamps the cluster profiles cover.
    df = df.loc[agg.index, :]

    clusdict = {}

    clusters = list(agg.columns)
    icps = list(df.columns)

    for i in tqdm(icps):
        bestc = -1
        # Initialise bestr unconditionally: a constant-valued series skips the
        # correlation loop, and the threshold test below would otherwise read
        # an undefined (or stale, from a previous ICP) bestr.  With bestr = -1
        # a constant series falls through to cluster -1 as intended.
        bestr = -1
        s = df.loc[:, i]
        if s.min() != s.max():  # Pearson r is undefined for a constant series
            for c in clusters:
                thisr = s.corr(agg.loc[:, c], method = 'pearson')
                if thisr > bestr:
                    bestr = thisr
                    bestc = c
        # Below-threshold (or constant / all-NaN-correlation) ICPs get -1.
        clusdict[i] = bestc if bestr > threshold else -1

    newclusdf = p.DataFrame.from_dict(clusdict, orient = 'index', columns = ['cluster'])
    newclusdf.index.name = 'icp_id'
    return newclusdf.reset_index()
|
|
37
|
+
|
|
38
|
def main():
    """CLI entry point: load the input and aggregate pickles, assign
    clusters, and write the assignment table to the output pickle."""
    parser = ArgumentParser(description='Assign clusters found from one dataset to the values of another')
    parser.add_argument("-i", "--input", required = True, metavar="PATH", dest="input", help = "input pickle path")
    parser.add_argument("-a", "--agg", required = True, metavar="PATH", dest="agg", help = "Aggregated dataset to take cluster information from")
    parser.add_argument("-c", "--clusters", required = True, metavar="PATH", dest="clusfile", help = "output cluster pickle path")
    parser.add_argument("-t", "--threshold", type = np.float32, default = -1, metavar = "NUM", dest = "threshold", help = "Set threshold for clustering; default = -1, range from -1 to 1")
    args = parser.parse_args()

    # Pearson correlations lie in [-1, 1); reject thresholds outside that range.
    if not (-1 <= args.threshold < 1):
        parser.error("-t/--threshold must be at least -1 and less than 1, is {}".format(args.threshold))

    assignments = AssignClusters(p.read_pickle(args.input),
                                 p.read_pickle(args.agg),
                                 args.threshold)
    assignments.to_pickle(args.clusfile)


if __name__ == "__main__":
    main()
|