from util import getQuery, pickleQuery
import numpy as np
import pandas as p
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from tqdm import tqdm
from itertools import combinations
from math import factorial as f

def tqcorr(df):
    """Build a pairwise correlation-distance matrix, showing tqdm progress.

    For every unordered pair of columns ``(c1, c2)`` of ``df`` the value
    ``1 - df[c1].corr(df[c2])`` is computed (``Series.corr`` with its default
    arguments) and written to both ``[c1, c2]`` and ``[c2, c1]``.  The
    diagonal is set to 0.

    Args:
        df: DataFrame whose columns are the series to compare.

    Returns:
        A square DataFrame indexed and columned by ``df.columns``.  It is
        allocated as ``float16``.
        NOTE(review): whether pandas keeps that dtype once float64 results
        are assigned depends on the pandas version; the ``info()`` reports
        printed below show the dtype at each stage.

    Side effects:
        Writes three ``DataFrame.info()`` reports to stdout (after
        allocation, after the diagonal fill, and after the pairwise fill).
    """
    cols = df.columns
    ncols = len(cols)
    cdf = p.DataFrame(index=cols, columns=cols, dtype=np.float16)
    # info() prints its report itself and returns None, so it is called
    # directly rather than wrapped in print() (which would also emit "None").
    cdf.info()

    # Distance of a column to itself.
    for c in tqdm(cols):
        cdf.loc[c, c] = 0
    cdf.info()

    # Number of unordered pairs, n choose 2, used only as the progress-bar
    # total.  Plain arithmetic avoids evaluating huge factorials and stays
    # valid (0) for frames with fewer than two columns, where
    # factorial(ncols - 2) would raise ValueError.
    ncomb = ncols * (ncols - 1) // 2
    for c1, c2 in tqdm(combinations(cols, 2), total=ncomb):
        dv = 1 - df[c1].corr(df[c2])
        # The matrix is symmetric: fill both triangles.
        cdf.loc[c1, c2] = dv
        cdf.loc[c2, c1] = dv
    cdf.info()
    return cdf


# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()

# Input: pickled DataFrame read below.
Sourcedata =   '../data/2017-5k-wide.pkl'
# The next three paths and numclusts are referenced only by the
# commented-out clustering code in the visible part of this file.
lableddata =   '../data/9-clusters-5k.pkl'
aggdata =      '../data/9-clusters-5k-agg.pkl'
clustertable = '../data/9-clusters-5k-table.pkl'

numclusts = 9
# NOTE: unpickling can execute arbitrary code; only load trusted files.
df = p.read_pickle(Sourcedata)
# dforig = df

# print(df)

# DataFrame.info() prints its own report and returns None, so call it
# directly instead of print(df.info()), which also emitted a stray "None".
df.info()
# Disabled long-to-wide pivot and sanity checks.
# NOTE(review): presumably the pickled frame is already in wide form
# (one column per icp_id, indexed by read_time) - confirm.
# print(df.icp_id.nunique())
# print(df.read_time.nunique())
# print(df.groupby('icp_id').read_time.nunique().nunique())
# df = df.pivot(index = 'read_time', columns = 'icp_id', values = 'kwh_tot')
# print(df.info())

# Keep only columns whose max differs from their min, i.e. drop constant
# columns, whose correlation with anything is undefined (NaN).
# NOTE(review): NaN != NaN is True, so an all-NaN column passes this
# filter - verify the source data has none.
df = df[df.columns[df.max() != df.min()]]
df.info()

# Pairwise (1 - correlation) matrix over the remaining columns.
cmat = tqcorr(df)
print(cmat)
cmat.info()

cmat.to_pickle('../data/fulldcorrmatrix.pkl')

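# NOTE(review): cmat as returned by tqcorr() already holds 1 - corr with a
# zero diagonal, so if this is re-enabled it likely needs squareform(cmat)
# rather than squareform(1 - cmat) - confirm before use.
# lmat = squareform(1 - cmat)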

# lobj = linkage(lmat, method = 'ward')
# print(lobj)
# print(cophenet(lobj, lmat))


# clabs = [x + 1 for x in range(numclusts)]
# cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))

# clusts = fcluster(lobj, numclusts, criterion='maxclust')
# print(clusts)
# print(cmat.index.values)
# clustdf = p.DataFrame({'icp_id' : cmat.index.values, 'cluster' : clusts})
# print(clustdf)
# clustdf.to_pickle(clustertable)
# mdf = p.merge(clustdf, dforig, on = 'icp_id', how = 'left')
# print(mdf)
# print(mdf.info())
# qlow  = lambda x: x.quantile(0.250)
# qhigh = lambda x: x.quantile(0.750)
# print(mdf.cluster.describe())
# mdagg = mdf.groupby(['read_time', 'cluster']).agg({
#         'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)]
# }, q = 0.025)
# mdagg.columns = ['_'.join(x) for x in mdagg.columns.ravel()]
# mdagg = mdagg.reset_index()
# print(mdagg)
# print(mdagg.info())
# print(mdagg.describe())
# # mdf.to_csv('~/windows/Documents/clusters-ward.csv')
# print("Saving")
# mdf.to_pickle(lableddata)
# mdagg.to_pickle(aggdata)
# print("saved")

# # Algorithm via 
# # <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
# ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
# link_cols = {}
# for i, i12 in enumerate(lobj[:,:2].astype(int)):
#   c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
#     for x in i12)
#   link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'

# plt.figure(figsize = (25, 10))
# plt.title('ICP Clustering Dendrogram')
# plt.xlabel('ICP ID/(Number of ICPs)')
# plt.ylabel('distance')
# dendrogram(
#     lobj,
#     labels = cmat.index.values,
#     leaf_rotation=90,
#     leaf_font_size=8,
#     # show_leaf_counts = True,
#     # truncate_mode = 'lastp',
#     # p = 50,
#     # show_contracted = True,
#     link_color_func = lambda x: link_cols[x],
#     color_threshold = None
# )
# # plt.show()
# plt.savefig("../img/sample-9-dendro.png")

# sns.set()

# f, axes = plt.subplots(3,3)

# for i, c in enumerate(clabs):
#     fds = mdagg[mdagg.cluster == c]
#     sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
#     axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
# # plt.show()
# plt.savefig("../img/sample-9-panedtrends.png")