5 years ago · 99e9aca478
--- a/README.md
+++ b/README.md
@@ -128,6 +128,7 @@ Aggregates data based on clusters. **Note**: columns `CI_low` and `CI_high` do n
 
				 * `-i PATH`: The path for the python "pickle" file to retrieve the original data from (i.e. the data downloaded from `downkwh.py`).
			
 
				 * `-c PATH`: The path for the python "pickle" file to retrieve the cluster data from.
			
 
				 * `-o PATH`: The path for the python "pickle" file to store the result in.
			
 
				+* `-d`: Drop cluster '-1', which represents the miscellaneous/unclustered pseudo-cluster produced by `clusAssign.py`.
			
 
				 * `-p`: If the dataframe is not in wide form, pivots it into wide form first. Note: untested and unused, representing past behaviour.
			
 
				 
			
 
				 Example:
			
@@ -147,7 +148,7 @@ Helper function to transform a pickle into a csv file, for easier importing into
 
				 * `-r`: Include row names/index labels in csv. This may be essential for proper exporting of some datasets
			
 
				 * `-v`: Output extra information, including dimensions of dataset.
			
 
				 
			
 
				-Example:
			
 
				+Examples:
			
 
				 
			
 
				 ```bash
			
 
				 python pickletocsv.py ../data/test1kagg.pkl | less
			
@@ -155,6 +156,12 @@ python pickletocsv.py ../data/test1kagg.pkl | less
 
				 
			
 
				 Reads file at `../data/test1kagg.pkl` and views it in the UNIX pager `less`.
			
 
				 
			
 
				+```bash
			
 
				+python pickletocsv.py ../data/test1kagg.pkl - | tabview -
			
 
				+```
			
 
				+
			
 
				+Reads the same file and views it using the `tabview` Python module (included in `requirements.txt`). The first `-` tells `pickletocsv.py` to write to `stdout` and the second tells `tabview` to read from `stdin`, allowing the two to be piped together.
			
 
				+
			
 
				 ### `clusAssign.py`
			
 
				 
			
 
				 Assigns clusters found from one dataset to the values of another. **Note**: this algorithm can assign some ICPs to cluster -1, which means that it failed to assign to a cluster. **Further note**: this method requires both datasets to be on the same timespan.
			
@@ -169,7 +176,7 @@ Example:
 
				 ```bash
			
 
				 python downkwh.py -o ../data/test1kb.pkl -t public.icp_sample_18m
			
 
				 python clusAssign.py -i ../data/test1kb.pkl -c ../data/test1kbclustertable.pkl -a ../data/test1kagg.pkl -t 0.1
			
 
				-python agg.py -i ../data/test1kb.pkl -c ../data/test1kbclustertable.pkl -o ../data/test1kbagg.pkl
			
 
				+python agg.py -i ../data/test1kb.pkl -c ../data/test1kbclustertable.pkl -o ../data/test1kbagg.pkl -d
			
 
				 ```
			
 
				 
			
 
				-Downloads new dataset from the `public.icp_sample_18m` sample and saves it to `../data/test1kb.pkl`. Then assigns clusters to this from the `../data/test1kagg.pkl` dataset with threshold 0.1, saving into `../data/test1kbclustertable.pkl`. Then aggregates this dataset and saves in `../data/test1kbagg.pkl`.
			
 
				+Downloads a new dataset from the `public.icp_sample_18m` sample and saves it to `../data/test1kb.pkl`. Then assigns clusters to this from the `../data/test1kagg.pkl` dataset with threshold 0.1, saving into `../data/test1kbclustertable.pkl`. Then aggregates this dataset, dropping the misc/'-1' cluster (`-d`), and saves the result in `../data/test1kbagg.pkl`.
			
--- a/py/agg.py
+++ b/py/agg.py
@@ -3,10 +3,12 @@ from argparse import ArgumentParser
 
				 import pandas as p
			
 
				 from tqdm import tqdm
			
 
				 
			
 
				-def aggregator(widedf, clusdf):
			
 
				+def aggregator(widedf, clusdf, drop_misc = False):
			
 
				     """Aggregate a (wide-form) dataframe by the cluster mappings in a second dataframe
			
 
				     """
			
 
				-    clusters = clusdf['cluster'].unique()
			
 
				+    clusters = list(clusdf['cluster'].unique())
			
 
				+    if drop_misc and -1 in clusters:
			
 
				+        clusters.remove(-1)
			
 
				     clusters.sort()
			
 
				     dflis = []
			
 
				     qlow  = lambda x: x.quantile(0.250)
			
@@ -35,6 +37,7 @@ def main():
 
				     parser.add_argument("-i", "--input",  dest="input",      help = "input pickle path",  metavar="PATH", required = True)
			
 
				     parser.add_argument("-c", "--clusters", dest="clusfile", help = "cluster pickle path", metavar="PATH", required = True)
			
 
				     parser.add_argument("-o", "--output", dest="output",     help = "output pickle path", metavar="PATH", required = True)
			
 
				+    parser.add_argument("-d", "--drop-misc", dest="drop_misc", help = "drop 'misc' (-1) pseudocluster", action = "store_true")
			
 
				     parser.add_argument("-p", "--pivot", dest = "istall",    help = "input dataframe is in tall format and must be pivoted", action ="store_true")
			
 
				     args = parser.parse_args()
			
 
				     wd = p.read_pickle(args.input)
			
@@ -42,7 +45,7 @@ def main():
 
				     if (args.istall):
			
 
				         wd = wd.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
			
 
				 
			
 
				-    agged = aggregator(wd, cd)
			
 
				+    agged = aggregator(wd, cd, args.drop_misc)
			
 
				     agged.to_pickle(args.output)
			
 
				 
			
 
				 
			
--- a/py/requirements.txt
+++ b/py/requirements.txt
@@ -0,0 +1,19 @@
 
				+brewer2mpl==1.4.1
			
 
				+cycler==0.10.0
			
 
				+ggplot==0.11.5
			
 
				+kiwisolver==1.0.1
			
 
				+matplotlib==3.0.2
			
 
				+numpy==1.15.4
			
 
				+pandas==0.23.4
			
 
				+patsy==0.5.1
			
 
				+pkg-resources==0.0.0
			
 
				+psycopg2-binary==2.7.6.1
			
 
				+pyparsing==2.3.0
			
 
				+python-dateutil==2.7.5
			
 
				+pytz==2018.9
			
 
				+scipy==1.2.0
			
 
				+seaborn==0.9.0
			
 
				+six==1.12.0
			
 
				+statsmodels==0.9.0
			
 
				+tabview==1.4.3
			
 
				+tqdm==4.30.0