# petra / ampli
							from util import getQuery, pickleQuery
import numpy as np
import pandas as p
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from tqdm import tqdm
from itertools import combinations
from math import factorial as f

def tqcorr(df):
    """Build a pairwise correlation-distance matrix, showing tqdm progress.

    For every unordered pair of columns ``(c1, c2)`` of ``df`` the distance
    ``1 - df[c1].corr(df[c2])`` (``Series.corr`` with its default method) is
    stored at both ``[c1, c2]`` and ``[c2, c1]``; the diagonal is set to 0.
    ``DataFrame.info()`` is emitted after allocation, after the diagonal
    fill and after the pairwise fill as a coarse progress/memory report.

    Args:
        df: DataFrame whose columns are the series to compare.
            NOTE(review): assumes column labels are unique, so that
            ``df[c]`` is a single Series -- confirm against the caller.

    Returns:
        A square DataFrame created with dtype ``float16``, with
        ``df.columns`` as both index and columns. A frame with fewer than
        two columns yields only the zero diagonal (or an empty frame)
        instead of raising.
    """
    cols = df.columns
    ncols = len(cols)
    cdf = p.DataFrame(index=cols, columns=cols, dtype=np.float16)
    # DataFrame.info() prints its report itself and returns None, so it is
    # called bare; wrapping it in print() only added a stray "None" line.
    cdf.info()
    for c in tqdm(cols):
        cdf.loc[c, c] = 0
    cdf.info()
    # Number of unordered pairs, n * (n - 1) / 2, used only as the progress
    # bar total. The closed form avoids computing huge factorials and does
    # not raise for ncols < 2 (factorial rejects negative arguments).
    ncomb = ncols * (ncols - 1) // 2
    for c1, c2 in tqdm(combinations(cols, 2), total=ncomb):
        # Cast explicitly so the stored value already has the frame's
        # declared dtype rather than relying on narrowing during assignment.
        dv = np.float16(1 - df[c1].corr(df[c2]))
        cdf.loc[c1, c2] = dv
        cdf.loc[c2, c1] = dv
    cdf.info()
    return cdf


# Register tqdm's pandas integration (adds the progress_* methods to pandas
# objects).
tqdm.pandas()

# Input: pickled wide DataFrame read below.
Sourcedata =   '../data/2017-5k-wide.pkl'
# The three paths below and `numclusts` are only referenced by the
# commented-out clustering code further down in this file.
lableddata =   '../data/9-clusters-5k.pkl'
aggdata =      '../data/9-clusters-5k-agg.pkl'
clustertable = '../data/9-clusters-5k-table.pkl'

numclusts = 9
# NOTE: unpickling can execute arbitrary code; only load trusted files.
df = p.read_pickle(Sourcedata)
# dforig = df

# print(df)

# DataFrame.info() prints its own report and returns None, so it is called
# bare; print(df.info()) only appended a stray "None" line.
df.info()
# print(df.icp_id.nunique())
# print(df.read_time.nunique())
# print(df.groupby('icp_id').read_time.nunique().nunique())
# df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
# print(df.info())
# Keep only columns whose maximum differs from their minimum, i.e. drop
# constant columns before computing correlations.
df = df[df.columns[df.max() != df.min()]]
df.info()
cmat = tqcorr(df)
print(cmat)
cmat.info()

# Persist the full distance matrix for later use.
cmat.to_pickle('../data/fulldcorrmatrix.pkl')

# lmat = squareform(1 - cmat)

# lobj = linkage(lmat, method = 'ward')
# print(lobj)
# print(cophenet(lobj, lmat))


# clabs = [x + 1 for x in range(numclusts)]
# cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))

# clusts = fcluster(lobj, numclusts, criterion='maxclust')
# print(clusts)
# print(cmat.index.values)
# clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
# print(clustdf)
# clustdf.to_pickle(clustertable)
# mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
# print(mdf)
# print(mdf.info())
# qlow  = lambda x: x.quantile(0.250)
# qhigh = lambda x: x.quantile(0.750)
# print(mdf.cluster.describe())
# mdagg = mdf.groupby(['read_time', 'cluster']).agg({
#         'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
# }, q = 0.025)
# mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
# mdagg = mdagg.reset_index()
# print(mdagg)
# print(mdagg.info())
# print(mdagg.describe())
# # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
# print("Saving")
# mdf.to_pickle(lableddata)
# mdagg.to_pickle(aggdata)
# print("saved")

# # Algorithm via 
# # <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
# ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
# link_cols = {}
# for i, i12 in enumerate(lobj[:,:2].astype(int)):
#   c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
#     for x in i12)
#   link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'

# plt.figure(figsize = (25, 10))
# plt.title('ICP Clustering Dendrogram')
# plt.xlabel('ICP ID/(Number of ICPs)')
# plt.ylabel('distance')
# dendrogram(
#     lobj,
#     labels = cmat.index.values,
#     leaf_rotation=90,
#     leaf_font_size=8,
#     # show_leaf_counts = True,
#     # truncate_mode = 'lastp',
#     # p = 50,
#     # show_contracted = True,
#     link_color_func = lambda x: link_cols[x],
#     color_threshold = None
# )
# # plt.show()
# plt.savefig("../img/sample-9-dendro.png")

# sns.set()

# f, axes = plt.subplots(3,3)

# for i, c in enumerate(clabs):
#     fds = mdagg[mdagg.cluster == c]
#     sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
#     axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
# # plt.show()
# plt.savefig("../img/sample-9-panedtrends.png")