"""Cluster ICPs hierarchically by the correlation of their kwh_tot series.

Steps performed at import/run time:
  1. Load the pickled readings table.
  2. Pivot to one column per icp_id, indexed by read_time.
  3. Drop ICPs whose series never changes (their correlation is undefined).
  4. Build a ``1 - correlation`` distance, run Ward linkage, cut into flat
     clusters and label them 'A', 'B', ...
  5. Join the labels back onto the readings, export to CSV, and plot kwh_tot
     against read_time per cluster.
"""
from util import getQuery, pickleQuery
import pandas as p
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster


# --- Earlier exploratory queries, kept for reference -----------------------

# query = """
# SELECT *, read_date + CONCAT(period / 2, ':', period %% 2 * 30, ':00')::time AS read_time
# FROM public.coup_tall_april WHERE icp_id LIKE (%s) AND read_date = to_date(%s, 'dd/mm/yyyy')
# ORDER BY icp_id, read_time;
# """
#
# qparams = ['%%1117', '20/04/2017']

# query = """
# SELECT read_date, period, AVG(kwh_tot) AS average
# FROM public.coup_tall_april
# GROUP BY read_date, period
# ORDER BY read_date, period;
# """
#
# qparams = []
#
# df = getQuery(query, qparams)
#
# print(df.info())
#
# sns.set()
#
# #sns.lineplot(x = 'read_time', y = 'kwh_tot', hue = 'icp_id', data = df)
# sns.lineplot(x = 'period', y = 'average', hue = 'read_date', data = df)
#
# plt.show()

# --- Load ------------------------------------------------------------------

# Both names refer to the same long-format frame; `df` is rebound (never
# mutated) below, so `dforig` keeps pointing at the untouched readings.
dforig = df = p.read_pickle('../data/jan19s.pkl')

# Sanity output. DataFrame.info() prints its report itself and returns None,
# so each print(...info()) also emits a trailing "None" line.
print(df.info())
print(df['icp_id'].nunique())
print(df['read_time'].nunique())
# Number of distinct per-ICP reading counts (1 means every ICP has the same
# number of distinct read_time values).
print(df.groupby('icp_id')['read_time'].nunique().nunique())

# --- Reshape: rows = read_time, columns = icp_id, cells = kwh_tot -----------

df = df.pivot(index='read_time', columns='icp_id', values='kwh_tot')

# Keep only columns whose max differs from their min; a constant series has
# zero variance and would yield NaN correlations.
varies = df.max() != df.min()
df = df[df.columns[varies]]
print(df.info())

# --- Correlation -> distance -> linkage -------------------------------------

cmat = df.corr()
print(cmat.info())

# squareform converts the square distance matrix to the condensed vector form
# that linkage() expects.
lmat = squareform(1 - cmat)

# NOTE(review): SciPy documents method='ward' as correctly defined only for
# Euclidean distances; 1 - correlation is not Euclidean. Confirm this choice
# is intentional.
lobj = linkage(lmat, method='ward')
print(lobj)
# Cophenetic correlation (and cophenetic distances) for the fitted tree.
print(cophenet(lobj, lmat))

# Optional dendrogram of the full tree (disabled).
#plt.figure(figsize = (25, 10))
#plt.title('ICP Clustering Dendrogram')
#plt.xlabel('ICP ID/(Number of ICPs)')
#plt.ylabel('distance')
#dendrogram(
# lobj,
# labels = cmat.index.values,
# leaf_rotation=90,
# leaf_font_size=8,
# #show_leaf_counts = True,
# #truncate_mode = 'lastp',
# #p = 50,
# #show_contracted = True,
# color_threshold = 1.9
#)
#plt.show()

# --- Flat clusters ----------------------------------------------------------

# 'maxclust' cuts the tree so that no more than 6 flat clusters are formed;
# the returned ids start at 1.
clusts = fcluster(lobj, t=6, criterion='maxclust')
print(clusts)
print(cmat.index.values)

# Map cluster id 1 -> 'A', 2 -> 'B', ...
letter_base = ord('A') - 1
clustdf = p.DataFrame({
    'icp_id': cmat.index.values,
    'cluster': [chr(letter_base + cid) for cid in clusts],
})
print(clustdf)

# Left join from the label table: only ICPs that survived the constant-series
# filter (and therefore received a label) appear in the result.
mdf = clustdf.merge(dforig, on='icp_id', how='left')
print(mdf)
print(mdf.info())
print(mdf['cluster'].describe())

# --- Export and plot --------------------------------------------------------

mdf.to_csv('~/windows/Documents/clusters-ward.csv')

sns.set()

sns.lineplot(x='read_time', y='kwh_tot', hue='cluster', data=mdf)
plt.show()