# petra/ampli

							from util import getQuery, pickleQuery
import numpy as np
import pandas as p
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster


# query = """
# SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
# FROM public.coup_tall_april WHERE icp_id LIKE (%s) AND read_date = to_date(%s, 'dd/mm/yyyy') 
# ORDER BY icp_id, read_time;
# """
# 
# qparams = ['%%1117', '20/04/2017']

# query = """
# SELECT read_date, period, AVG(kwh_tot) AS average
# FROM public.coup_tall_april
# GROUP BY read_date, period
# ORDER BY read_date, period;
# """
# 
# qparams = []
# 
# df = getQuery(query, qparams)
# 
# print(df.info())
# 
# sns.set()
# 
# #sns.lineplot(x = 'read_time', y = 'kwh_tot', hue = 'icp_id', data = df)
# sns.lineplot(x = 'period', y = 'average', hue = 'read_date', data = df)
# 
# plt.show()

# Number of flat clusters the hierarchy is cut into further down the script.
numclusts = 9

# Long-format readings; the code below relies on the columns icp_id,
# read_time and kwh_tot.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
df = p.read_pickle('../data/2016-17-sample.pkl')
dforig = df  # keep the long-format frame; df is rebound to a pivot below

# print(df)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly instead of being wrapped in print(), which only added a
# stray "None" line after each report.
df.info()
print(df.icp_id.nunique())
print(df.read_time.nunique())
# print(df.groupby('icp_id').read_time.nunique().nunique())

# Wide format: one row per read_time, one column per icp_id.
df = df.pivot(index='read_time', columns='icp_id', values='kwh_tot')
df.info()

# Keep only ICPs whose maximum differs from their minimum, i.e. drop constant
# series (presumably because their correlation is undefined -- TODO confirm).
df = df[df.columns[df.max() != df.min()]]
df.info()

# Pairwise correlation between ICP columns.
cmat = df.corr()
cmat.info()

# Use 1 - correlation as the dissimilarity and condense the square matrix
# into the vector form that linkage() accepts.
lmat = squareform(1 - cmat)

# NOTE(review): SciPy documents the 'ward' method as correctly defined only
# for Euclidean distances; 1 - correlation is not Euclidean -- confirm that
# this combination is intended.
lobj = linkage(lmat, method='ward')
print(lobj)
# cophenet() returns the cophenetic correlation coefficient together with the
# condensed cophenetic distance matrix.
print(cophenet(lobj, lmat))


# Cluster labels run from 1 to numclusts; give each one a fixed hex colour.
clabs = list(range(1, numclusts + 1))
cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))

# Cut the hierarchy into at most numclusts flat clusters.
clusts = fcluster(lobj, numclusts, criterion='maxclust')
print(clusts)
print(cmat.index.values)
# Pair each ICP (in correlation-matrix order) with its cluster label.
clustdf = p.DataFrame({'icp_id': cmat.index.values, 'cluster': clusts})
print(clustdf)

# Attach the cluster label to every original long-format reading.
mdf = p.merge(clustdf, dforig, on='icp_id', how='left')
print(mdf)
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() only added a stray "None" line.
mdf.info()


def qlow(x):
    """Return the lower quartile (25th percentile) of *x*."""
    return x.quantile(0.250)


def qhigh(x):
    """Return the upper quartile (75th percentile) of *x*."""
    return x.quantile(0.750)


print(mdf.cluster.describe())
# Per read_time and cluster: median, mean and the two quartiles of kwh_tot.
# The quartile columns keep the names CI_low / CI_high although they are the
# 25th and 75th percentiles. The former `q = 0.025` keyword was dropped: the
# quantile levels are fixed inside qlow / qhigh, so it never had any effect
# on the result.
mdagg = mdf.groupby(['read_time', 'cluster']).agg({
    'kwh_tot': ['median', 'mean', ('CI_low', qlow), ('CI_high', qhigh)],
})
# Flatten the (column, statistic) MultiIndex into names like kwh_tot_mean.
mdagg.columns = ['_'.join(x) for x in mdagg.columns]
mdagg = mdagg.reset_index()
print(mdagg)
mdagg.info()
print(mdagg.describe())
# mdf.to_csv('~/windows/Documents/clusters-ward.csv')
print("Saving")
# File names carry the cluster count so that runs with a different numclusts
# do not overwrite each other (identical to the old names while it is 9).
mdf.to_pickle('../data/{}-clusters-1617.pkl'.format(numclusts))
mdagg.to_pickle('../data/{}-clusters-1617.agg.pkl'.format(numclusts))
print("saved")

# Colour each dendrogram link by the cluster of the nodes it joins.
# Algorithm via
# <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
ldict = {
    icp_id: cpal[cluster]
    for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)
}
n_merges = len(lobj)
link_cols = {}
for step, children in enumerate(lobj[:, :2].astype(int)):
    child_cols = []
    for node in children:
        if node > n_merges:
            # Index above the merge count: a link coloured in an earlier step.
            child_cols.append(link_cols[node])
        else:
            # Otherwise an original observation: use its ICP's cluster colour.
            child_cols.append(ldict[clustdf.icp_id[node]])
    left_col, right_col = child_cols
    # A link keeps the shared colour of its children; mixed links are black.
    if left_col == right_col:
        link_cols[step + 1 + n_merges] = left_col
    else:
        link_cols[step + 1 + n_merges] = '#000000'

def _link_colour(link_id):
    """Return the precomputed colour for dendrogram link *link_id*."""
    return link_cols[link_id]


# Draw the full dendrogram with one labelled leaf per ICP.
plt.figure(figsize=(25, 10))
plt.ylabel('distance')
plt.xlabel('ICP ID/(Number of ICPs)')
plt.title('ICP Clustering Dendrogram')
dendrogram(
    lobj,
    labels=cmat.index.values,
    leaf_rotation=90,
    leaf_font_size=8,
    # Options for a truncated view, currently disabled:
    # show_leaf_counts = True,
    # truncate_mode = 'lastp',
    # p = 50,
    # show_contracted = True,
    link_color_func=_link_colour,
    color_threshold=None,
)
plt.show()

# sns.set()
# 
# f, axes = plt.subplots(3,3)
# 
# for i, c in enumerate(clabs):
#     fds = mdagg[mdagg.cluster == c]
#     sns.lineplot(x = 'read_time', y = 'kwh_tot_mean', color = cpal[c], ax = axes[i//3][i%3], data = fds)
#     axes[i//3][i%3].fill_between(fds.read_time.dt.to_pydatetime(), fds.kwh_tot_CI_low, fds.kwh_tot_CI_high, alpha = 0.1, color = cpal[c])
# plt.show()