# petra / ampli
import matplotlib.pyplot as plt
import numpy as np
import pandas as p
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from scipy.spatial.distance import squareform

from util import getQuery, pickleQuery


# query = """
# SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
# FROM public.coup_tall_april WHERE icp_id LIKE (%s) AND read_date = to_date(%s, 'dd/mm/yyyy') 
# ORDER BY icp_id, read_time;
# """
# 
# qparams = ['%%1117', '20/04/2017']

# query = """
# SELECT read_date, period, AVG(kwh_tot) AS average
# FROM public.coup_tall_april
# GROUP BY read_date, period
# ORDER BY read_date, period;
# """
# 
# qparams = []
# 
# df = getQuery(query, qparams)
# 
# print(df.info())
# 
# sns.set()
# 
# #sns.lineplot(x = 'read_time', y = 'kwh_tot', hue = 'icp_id', data = df)
# sns.lineplot(x = 'period', y = 'average', hue = 'read_date', data = df)
# 
# plt.show()

# Load the long-format readings and build the ICP-by-ICP correlation matrix.
# Only three columns are used by this script: icp_id, read_time and kwh_tot.
df = p.read_pickle('../data/jan19s.pkl')
# Keep a handle on the long-format frame for the later merge with cluster
# labels.  `df` is rebound (never mutated) below, so a plain reference is enough.
dforig = df

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
print(df.icp_id.nunique())     # number of distinct ICPs
print(df.read_time.nunique())  # number of distinct read times
# A result of 1 means every ICP has the same number of distinct read times.
print(df.groupby('icp_id').read_time.nunique().nunique())

# Wide format: one row per read_time, one column per icp_id, cells are kwh_tot.
# pivot() raises if a (read_time, icp_id) pair occurs more than once.
df = df.pivot(index='read_time', columns='icp_id', values='kwh_tot')
# Drop ICPs whose reading never changes: a constant series has zero variance,
# so its correlation with any other series is undefined.
df = df.loc[:, df.max() != df.min()]
df.info()

# Pairwise correlation between ICP columns (pandas default method).
cmat = df.corr()
cmat.info()

# Hierarchical (Ward) clustering of the ICPs from their pairwise correlations.
#
# Ward linkage is only correctly defined for Euclidean distances, and the plain
# correlation distance 1 - r is not Euclidean.  sqrt(2 * (1 - r)) is: it is the
# Euclidean distance between two series after each has been centred and scaled
# to unit length (exactly so when the correlations come from complete, aligned
# series), and it keeps the same ordering of pairs as 1 - r.
corr = cmat.values
if np.isnan(corr).any():
    # Without this check squareform() rejects the matrix with a misleading
    # symmetry error, because NaN never compares equal to itself.
    raise ValueError('correlation matrix contains NaN; cannot build distances')

# The clip guards against 1 - r dipping just below zero through rounding,
# which would turn into NaN under the square root.
dmat = np.sqrt(2.0 * np.clip(1.0 - corr, 0.0, None))
# squareform() validates that the diagonal is exactly zero.
np.fill_diagonal(dmat, 0.0)
# Condensed (upper-triangle) vector form, as expected by linkage().
lmat = squareform(dmat)

lobj = linkage(lmat, method='ward')
print(lobj)
# cophenet() returns (cophenetic correlation coefficient, cophenetic distances).
print(cophenet(lobj, lmat))

#plt.figure(figsize = (25, 10))
#plt.title('ICP Clustering Dendrogram')
#plt.xlabel('ICP ID/(Number of ICPs)')
#plt.ylabel('distance')
#dendrogram(
#    lobj,
#    labels = cmat.index.values,
#    leaf_rotation=90,
#    leaf_font_size=8,
#    #show_leaf_counts = True,
#    #truncate_mode = 'lastp',
#    #p = 50,
#    #show_contracted = True,
#    color_threshold = 1.9
#)
#plt.show()

# Cut the linkage tree into flat clusters, attach the labels to every reading
# and save the labelled readings to CSV.
N_CLUSTERS = 6  # maximum number of flat clusters requested from fcluster()

clusts = fcluster(lobj, N_CLUSTERS, criterion='maxclust')
print(clusts)
print(cmat.index.values)
# fcluster() numbers clusters from 1; map 1, 2, 3, ... to 'A', 'B', 'C', ...
# The labels line up with cmat's index because the distances were built from
# cmat in that order.
clustdf = p.DataFrame({
    'icp_id': cmat.index.values,
    'cluster': [chr(ord('A') + label - 1) for label in clusts],
})
print(clustdf)
# Left join from the cluster table, so only ICPs that were actually clustered
# contribute rows to the merged frame.
mdf = p.merge(clustdf, dforig, on='icp_id', how='left')
print(mdf)
# info() prints its summary itself and returns None; print() would add "None".
mdf.info()
print(mdf.cluster.describe())

mdf.to_csv('~/windows/Documents/clusters-ward.csv')

# Plot kwh_tot against read_time, drawing one line per cluster label.
sns.set()
sns.lineplot(
    data=mdf,
    x='read_time',
    y='kwh_tot',
    hue='cluster',
)
plt.show()