5 years ago · 8256a93941
--- a/README.md
+++ b/README.md
@@ -154,3 +154,22 @@ python pickletocsv.py -i ../data/test1kagg.pkl | less
 
				 ```
			
 
				 
			
 
				 Reads file at `../data/test1kagg.pkl` and views it in the UNIX pager `less`.
			
 
				+
			
 
				+### `clusAssign.py`
			
 
				+
			
 
				+Assigns clusters found from one dataset to the values of another. **Note**: this algorithm can assign some ICPs to cluster -1, which means that it failed to assign to a cluster. **Further note**: this method requires both datasets to be on the same timespan.
			
 
				+
			
 
				+* `-i PATH`: The path for the file which contains the dataset.
			
 
				+* `-a PATH`: The path for aggregated dataset to compare to this one.
			
 
				+* `-c PATH`: Path for the output.
			
 
				+* `-t NUM`: Correlation threshold for clustering: if an ICP's best correlation is not greater than this number, the ICP is assigned to cluster -1. Default is -1; the value may be anything greater than or equal to -1 and less than 1.
			
 
				+
			
 
				+Example:
			
 
				+
			
 
				+```bash
			
 
				+python downkwh.py -o ../data/test1kb.pkl -t public.icp_sample_18m
			
 
				+python clusAssign.py -i ../data/test1kb.pkl -c ../data/test1kbclustertable.pkl -a ../data/test1kagg.pkl -t 0.1
			
 
				+python agg.py -i ../data/test1kb.pkl -c ../data/test1kbclustertable.pkl -o ../data/test1kbagg.pkl
			
 
				+```
			
 
				+
			
 
				+Downloads a new dataset from the `public.icp_sample_18m` sample and saves it to `../data/test1kb.pkl`. Then assigns clusters to it from the `../data/test1kagg.pkl` dataset with threshold 0.1, saving the result into `../data/test1kbclustertable.pkl`. Finally, aggregates this dataset and saves it in `../data/test1kbagg.pkl`.
			
--- a/py/clusAssign.py
+++ b/py/clusAssign.py
@@ -1,77 +1,56 @@
 
				 # An algorithm for assigning a dataset to pre-existing clusters
			
 
				 import pandas as p
			
 
				+import numpy as np
			
 
				 from pprint import pprint
			
 
				-
			
 
				-# Pre-existing aggregated clusters
			
 
				-clusfile = '../data/9-clusters.agg.pkl'
			
 
				-
			
 
				-# A new dataset
			
 
				-ndsfile = '../data/2016-17-sample.pkl'
			
 
				-
			
 
				-# Table of assigned clusters
			
 
				-aclusfile = '../data/1617-asgn-table.pkl'
			
 
				-
			
 
				-# Aggregated dataset
			
 
				-aggfile = '../data/1617-agg.pkl'
			
 
				-
			
 
				-
			
 
				-clusdf = p.read_pickle(clusfile)
			
 
				-clusdf = clusdf.pivot(index = 'read_time', columns = 'cluster', values = 'kwh_tot_mean')
			
 
				-del clusdf.columns.name
			
 
				-print(clusdf.info())
			
 
				-
			
 
				-
			
 
				-
			
 
				-newdf = p.read_pickle(ndsfile).pivot(index = 'read_time', 
			
 
				-                                     columns = 'icp_id', 
			
 
				-                                     values = 'kwh_tot').loc[clusdf.index, :]
			
 
				-print(newdf)
			
 
				-print(newdf.info())
			
 
				-
			
 
				-clusdict = {}
			
 
				-
			
 
				-clusters = list(clusdf.columns)
			
 
				-icps = list(newdf.columns)
			
 
				-
			
 
				-print(clusters)
			
 
				-
			
 
				-for i in icps:
			
 
				-    bestc = -1
			
 
				-    s = newdf.loc[:, i]
			
 
				-    if (s.min() != s.max()):
			
 
				-        bestr = -1
			
 
				-        for c in clusters:
			
 
				-            thisr = s.corr(clusdf.loc[:, c], method = 'pearson')
			
 
				-            if thisr > bestr:
			
 
				-                bestr = thisr
			
 
				-                bestc = c
			
 
				-        print('Assigning ICP {} to cluster {} with correlation {}.'.format(i, bestc, bestr))
			
 
				-    else:
			
 
				-        print('ICP {} has constant value; assigning to cluster -1'.format(i))
			
 
				-    clusdict[i] = bestc
			
 
				-
			
 
				-newclusdf = p.DataFrame.from_dict(clusdict, orient = 'index', columns = ['cluster'])
			
 
				-newclusdf.index.name = 'icp_id'
			
 
				-newclusdf = newclusdf.reset_index()
			
 
				-print(newclusdf)
			
 
				-newclusdf.to_pickle(aclusfile)
			
 
				-
			
 
				-
			
 
				-newdf = p.melt(newdf.reset_index(), 'read_time', var_name = 'icp_id', value_name = 'kwh')
			
 
				-
			
 
				-print(newdf.info())
			
 
				-print(newclusdf.info())
			
 
				-
			
 
				-anndf = newdf.set_index('icp_id').join(newclusdf.set_index('icp_id')).reset_index()
			
 
				-print(anndf)
			
 
				-
			
 
				-qlow  = lambda x: x.quantile(0.250)
			
 
				-qhigh = lambda x: x.quantile(0.750)
			
 
				-newagg = anndf.groupby(['read_time', 'cluster']).agg({
			
 
				-        'kwh': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
			
 
				-})
			
 
				-newagg.columns = ['_tot_'.join(x) for x in newagg.columns.ravel()]
			
 
				-newagg = newagg.reset_index()
			
 
				-
			
 
				-print(newagg)
			
 
				-newagg.to_pickle(aggfile)
			
 
				+from argparse import ArgumentParser
			
 
				+from tqdm import tqdm
			
 
				+
			
 
				+def AssignClusters(df, agg, threshold = -1):
			
 
				+    agg = agg.pivot(index = 'read_time', columns = 'cluster', values = 'kwh_tot_mean')
			
 
				+    del agg.columns.name
			
 
				+    df = df.loc[agg.index, :]
			
 
				+
			
 
				+    clusdict = {}
			
 
				+
			
 
				+    clusters = list(agg.columns)
			
 
				+    icps = list(df.columns)
			
 
				+
			
 
				+    for i in tqdm(icps):
			
 
				+        bestc = -1
			
 
				+        s = df.loc[:, i]
			
 
				+        if (s.min() != s.max()):
			
 
				+            bestr = -1
			
 
				+            for c in clusters:
			
 
				+                thisr = s.corr(agg.loc[:, c], method = 'pearson')
			
 
				+                if thisr > bestr:
			
 
				+                    bestr = thisr
			
 
				+                    bestc = c
			
 
				+        if bestr > threshold:
			
 
				+            clusdict[i] = bestc
			
 
				+        else:
			
 
				+            clusdict[i] = -1
			
 
				+
			
 
				+    newclusdf = p.DataFrame.from_dict(clusdict, orient = 'index', columns = ['cluster'])
			
 
				+    newclusdf.index.name = 'icp_id'
			
 
				+    newclusdf = newclusdf.reset_index()
			
 
				+    return newclusdf
			
 
				+
			
 
				+def main():
			
 
				+    parser = ArgumentParser(description='Assign clusters found from one dataset to the values of another')
			
 
				+    parser.add_argument("-i", "--input",  dest="input",      help = "input pickle path",  metavar="PATH", required = True)
			
 
				+    parser.add_argument("-a", "--agg", dest="agg",     help = "Aggregated dataset to take cluster information from", metavar="PATH", required = True)
			
 
				+    parser.add_argument("-c", "--clusters", dest="clusfile", help = "output cluster pickle path", metavar="PATH", required = True)
			
 
				+    parser.add_argument("-t", "--threshold", dest = "threshold", help = "Set threshold for clustering; default = -1, range from -1 to 1", default = -1, metavar = "NUM", type = np.float32)
			
 
				+    args = parser.parse_args()
			
 
				+    if args.threshold < -1 or args.threshold >= 1:
			
 
				+        parser.error("-t/--threshold must be at least -1 and less than 1, is {}".format(args.threshold))
			
 
				+
			
 
				+    idf = p.read_pickle(args.input)
			
 
				+    adf = p.read_pickle(args.agg)
			
 
				+
			
 
				+    cdf = AssignClusters(idf, adf, args.threshold)
			
 
				+    cdf.to_pickle(args.clusfile)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()