12345678910111213141516171819202122232425262728293031323334353637383940414243 |
# Takes the correlation part out of the clustering file so it can be run on its own
- from argparse import ArgumentParser
- import numpy as np
- import pandas as p
- from tqdm import tqdm
- from itertools import combinations
- from math import factorial as f
-
def tqcorr(df):
    """Build a pairwise correlation-distance matrix for a wide-form dataframe.

    Each column of ``df`` is treated as one series. The distance between two
    columns is ``1 - df[c1].corr(df[c2])`` (pandas ``Series.corr`` with its
    default method), so perfectly correlated columns get a distance of 0.
    Two tqdm progress bars are shown: one while the diagonal is filled and
    one while the column pairs are processed.

    Args:
        df: wide-form pandas DataFrame whose columns are the series to compare.

    Returns:
        A square pandas DataFrame whose index and columns are both
        ``df.columns``. It is created with dtype ``float16``, holds 0 on the
        diagonal and the same distance at ``[c1, c2]`` and ``[c2, c1]``.
        A frame with fewer than two columns yields a matrix with no
        off-diagonal entries instead of raising.
    """
    cols = df.columns
    ncols = len(cols)
    cdf = p.DataFrame(index=cols, columns=cols, dtype=np.float16)
    # Distance of every column to itself is zero.
    for c in tqdm(cols):
        cdf.loc[c, c] = 0
    # Number of unordered column pairs (n choose 2), used only as the progress
    # bar total. Computed arithmetically: the factorial form raises ValueError
    # for fewer than two columns and builds huge integers for wide frames.
    ncomb = ncols * (ncols - 1) // 2
    for c1, c2 in tqdm(combinations(cols, 2), total=ncomb):
        dv = 1 - df[c1].corr(df[c2])
        # The matrix is symmetric, so each pair is computed once and mirrored.
        cdf.loc[c1, c2] = dv
        cdf.loc[c2, c1] = dv
    return cdf
-
-
def createCorr(source, output, piv):
    """Read a pickled dataframe, compute its correlation matrix and pickle it.

    Args:
        source: path of the pickled pandas DataFrame to load.
        output: path the resulting matrix is pickled to.
        piv: when truthy, the loaded frame is first pivoted to wide form
            using 'read_time' as index, 'icp_id' as columns and 'kwh_tot'
            as values.

    Columns whose maximum equals their minimum are dropped before the
    remaining frame is handed to tqcorr().
    """
    frame = p.read_pickle(source)
    if piv:
        frame = frame.pivot(index='read_time', columns='icp_id', values='kwh_tot')
    # Keep only the columns whose values are not all identical.
    varying = frame.max() != frame.min()
    frame = frame[frame.columns[varying]]
    tqcorr(frame).to_pickle(output)
-
-
if __name__ == "__main__":
    # Command-line entry point: parse the paths and the pivot flag, then
    # hand them to createCorr().
    cli = ArgumentParser(
        description='Create distance correlation matrix from pickled wideform pandas dataframe'
    )
    cli.add_argument(
        "-i", "--input",
        dest="input",
        metavar="PATH",
        default="../data/2017-5k-wide.pkl",
        help="input pickle path; default: ../data/2017-5k-wide.pkl",
    )
    cli.add_argument(
        "-o", "--output",
        dest="output",
        metavar="PATH",
        default="../data/5kdcorrmatrix.pkl",
        help="output pickle path; default: ../data/5kdcorrmatrix.pkl",
    )
    # The flag is stored as `istall`: set when the input is in tall format.
    cli.add_argument(
        "-p", "--pivot",
        dest="istall",
        action="store_true",
        help="input dataframe is in tall format and must be pivoted",
    )
    opts = cli.parse_args()
    createCorr(opts.input, opts.output, opts.istall)
|