Browse Source

Cluster Assign algorithm

Petra Lamborn 5 years ago
parent
commit
8256a93941
2 changed files with 72 additions and 74 deletions
  1. 19
    0
      README.md
  2. 53
    74
      py/clusAssign.py

+ 19
- 0
README.md View File

@@ -154,3 +154,22 @@ python pickletocsv.py -i ../data/test1kagg.pkl | less
154 154
 ```
155 155
 
156 156
 Reads file at `../data/test1kagg.pkl` and views it in the UNIX pager `less`.
157
+
158
+### `clusAssign.py`
159
+
160
+Assigns clusters found from one dataset to the values of another. **Note**: this algorithm can assign some ICPs to cluster -1, which means that it failed to assign to a cluster. **Further note**: this method requires both datasets to be on the same timespan.
161
+
162
+* `-i PATH`: The path for the file which contains the dataset.
163
+* `-a PATH`: The path for aggregated dataset to compare to this one.
164
+* `-c PATH`: Path for the output.
165
+* `-t NUM`: Correlation threshold for clustering: if best correlation is less than this number the icp will be assigned to cluster -1. Default -1, can be any value greater than or equal to -1 and less than 1.
166
+
167
+Example:
168
+
169
+```bash
170
+python downkwh.py -o ../data/test1kb.pkl -t public.icp_sample_18m
171
+python clusAssign.py -i ../data/test1kb.pkl -c ../data/test1kbclustertable.pkl -a ../data/test1kagg.pkl -t 0.1
172
+python agg.py -i ../data/test1kb.pkl -c ../data/test1kbclustertable.pkl -o ../data/test1kbagg.pkl
173
+```
174
+
175
+Downloads new dataset from the `public.icp_sample_18m` sample and saves it to `../data/test1kb.pkl`. Then assigns clusters to this from the `../data/test1kagg.pkl` dataset with threshold 0.1, saving into `../data/test1kbclustertable.pkl`. Then aggregates this dataset and saves in `..data/test1kbagg.pkl`.

+ 53
- 74
py/clusAssign.py View File

@@ -1,77 +1,56 @@
1 1
 # An algorithm for assigning a dataset to pre-existing clusters
2 2
 import pandas as p
3
+import numpy as np
3 4
 from pprint import pprint
4
-
5
-# Pre-existing aggregated clusters
6
-clusfile = '../data/9-clusters.agg.pkl'
7
-
8
-# A new dataset
9
-ndsfile = '../data/2016-17-sample.pkl'
10
-
11
-# Table of assigned clusters
12
-aclusfile = '../data/1617-asgn-table.pkl'
13
-
14
-# Aggregated dataset
15
-aggfile = '../data/1617-agg.pkl'
16
-
17
-
18
-clusdf = p.read_pickle(clusfile)
19
-clusdf = clusdf.pivot(index = 'read_time', columns = 'cluster', values = 'kwh_tot_mean')
20
-del clusdf.columns.name
21
-print(clusdf.info())
22
-
23
-
24
-
25
-newdf = p.read_pickle(ndsfile).pivot(index = 'read_time', 
26
-                                     columns = 'icp_id', 
27
-                                     values = 'kwh_tot').loc[clusdf.index, :]
28
-print(newdf)
29
-print(newdf.info())
30
-
31
-clusdict = {}
32
-
33
-clusters = list(clusdf.columns)
34
-icps = list(newdf.columns)
35
-
36
-print(clusters)
37
-
38
-for i in icps:
39
-    bestc = -1
40
-    s = newdf.loc[:, i]
41
-    if (s.min() != s.max()):
42
-        bestr = -1
43
-        for c in clusters:
44
-            thisr = s.corr(clusdf.loc[:, c], method = 'pearson')
45
-            if thisr > bestr:
46
-                bestr = thisr
47
-                bestc = c
48
-        print('Assigning ICP {} to cluster {} with correlation {}.'.format(i, bestc, bestr))
49
-    else:
50
-        print('ICP {} has constant value; assigning to cluster -1'.format(i))
51
-    clusdict[i] = bestc
52
-
53
-newclusdf = p.DataFrame.from_dict(clusdict, orient = 'index', columns = ['cluster'])
54
-newclusdf.index.name = 'icp_id'
55
-newclusdf = newclusdf.reset_index()
56
-print(newclusdf)
57
-newclusdf.to_pickle(aclusfile)
58
-
59
-
60
-newdf = p.melt(newdf.reset_index(), 'read_time', var_name = 'icp_id', value_name = 'kwh')
61
-
62
-print(newdf.info())
63
-print(newclusdf.info())
64
-
65
-anndf = newdf.set_index('icp_id').join(newclusdf.set_index('icp_id')).reset_index()
66
-print(anndf)
67
-
68
-qlow  = lambda x: x.quantile(0.250)
69
-qhigh = lambda x: x.quantile(0.750)
70
-newagg = anndf.groupby(['read_time', 'cluster']).agg({
71
-        'kwh': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
72
-})
73
-newagg.columns = ['_tot_'.join(x) for x in newagg.columns.ravel()]
74
-newagg = newagg.reset_index()
75
-
76
-print(newagg)
77
-newagg.to_pickle(aggfile)
5
+from argparse import ArgumentParser
6
+from tqdm import tqdm
7
+
8
+def AssignClusters(df, agg, threshold = -1):
9
+    agg = agg.pivot(index = 'read_time', columns = 'cluster', values = 'kwh_tot_mean')
10
+    del agg.columns.name
11
+    df = df.loc[agg.index, :]
12
+
13
+    clusdict = {}
14
+
15
+    clusters = list(agg.columns)
16
+    icps = list(df.columns)
17
+
18
+    for i in tqdm(icps):
19
+        bestc = -1
20
+        s = df.loc[:, i]
21
+        if (s.min() != s.max()):
22
+            bestr = -1
23
+            for c in clusters:
24
+                thisr = s.corr(agg.loc[:, c], method = 'pearson')
25
+                if thisr > bestr:
26
+                    bestr = thisr
27
+                    bestc = c
28
+        if bestr > threshold:
29
+            clusdict[i] = bestc
30
+        else:
31
+            clusdict[i] = -1
32
+
33
+    newclusdf = p.DataFrame.from_dict(clusdict, orient = 'index', columns = ['cluster'])
34
+    newclusdf.index.name = 'icp_id'
35
+    newclusdf = newclusdf.reset_index()
36
+    return newclusdf
37
+
38
+def main():
39
+    parser = ArgumentParser(description='Assign clusters found from one dataset to the values of another')
40
+    parser.add_argument("-i", "--input",  dest="input",      help = "input pickle path",  metavar="PATH", required = True)
41
+    parser.add_argument("-a", "--agg", dest="agg",     help = "Aggregated dataset to take cluster information from", metavar="PATH", required = True)
42
+    parser.add_argument("-c", "--clusters", dest="clusfile", help = "output cluster pickle path", metavar="PATH", required = True)
43
+    parser.add_argument("-t", "--threshold", dest = "threshold", help = "Set threshold for clustering; default = -1, range from -1 to 1", default = -1, metavar = "NUM", type = np.float32)
44
+    args = parser.parse_args()
45
+    if args.threshold < -1 or args.threshold >= 1:
46
+        parser.error("-t/--threshold must be at least -1 and less than 1, is {}".format(args.threshold))
47
+
48
+    idf = p.read_pickle(args.input)
49
+    adf = p.read_pickle(args.agg)
50
+
51
+    cdf = AssignClusters(idf, adf, args.threshold)
52
+    cdf.to_pickle(args.clusfile)
53
+
54
+
55
+if __name__ == "__main__":
56
+    main()