|
@@ -1,5 +1,6 @@
|
1
|
1
|
# An algorithm for assigning a dataset to pre-existing clusters
|
2
|
2
|
import pandas as p
|
|
3
|
+from pprint import pprint
|
3
|
4
|
|
4
|
5
|
# Pre-existing aggregated clusters
|
5
|
6
|
clusfile = '../data/9-clusters.agg.pkl'
|
|
@@ -20,3 +21,37 @@ newdf = p.read_pickle(ndsfile).pivot(index = 'read_time',
|
20
|
21
|
values = 'kwh_tot').loc[clusdf.index, :]
|
21
|
22
|
print(newdf)
|
22
|
23
|
print(newdf.info())
|
|
24
|
+
|
|
25
|
# Map each ICP (a column of newdf) to the label of the pre-existing cluster
# (a column of clusdf) whose profile it correlates with most strongly.
# A value of -1 marks an ICP that could not be assigned to any cluster.
clusdict = {}

clusters = list(clusdf.columns)
icps = list(newdf.columns)

print(clusters)

for i in icps:
    bestc = -1
    s = newdf.loc[:, i]
    if s.min() != s.max():
        # Pearson r of this ICP against every cluster column in one call.
        # Undefined (NaN) correlations are dropped so they can never win;
        # an all-NaN ICP column also lands here because NaN != NaN.
        rs = clusdf.corrwith(s, method='pearson').dropna()
        if rs.empty:
            print('ICP {} has no valid correlation; assigning to cluster -1'.format(i))
        else:
            # idxmax returns the first label holding the maximum, so ties
            # resolve to the earliest cluster in column order.
            bestc = rs.idxmax()
            bestr = rs.max()
            print('Assigning ICP {} to cluster {} with correlation {}.'.format(i, bestc, bestr))
    else:
        # Correlation is undefined for a zero-variance series.
        print('ICP {} has constant value; assigning to cluster -1'.format(i))
    clusdict[i] = bestc
|
|
46
|
+
|
|
47
|
# Build a two-column lookup table (icp_id, cluster) from the assignments.
# TODO: make sure the cluster column ends up with an integer dtype.
newclusdf = p.DataFrame.from_dict(clusdict, orient='index', columns=['cluster'])
newclusdf.index.name = 'icp_id'
newclusdf = newclusdf.reset_index()

# Reshape the wide readings table to long form: one row per
# (read_time, icp_id) pair with the reading in 'kwh'.
newdf = p.melt(newdf.reset_index(), 'read_time', var_name='icp_id', value_name='kwh')

# Annotate every reading with its ICP's cluster. DataFrame.join matches the
# caller's index against the *index* of the other frame, so the lookup table
# has to be keyed by icp_id too. Joining against its default positional
# index (left behind by reset_index) never lined rows up by ICP.
anndf = newdf.set_index('icp_id').join(newclusdf.set_index('icp_id'))
print(anndf)
|