Browse Source

Documentation for clustering scripts

Petra Lamborn 5 years ago
parent
commit
68fbbcc788
4 changed files with 70 additions and 10 deletions
  1. 53
    0
      README.md
  2. 3
    3
      py/agg.py
  3. 12
    5
      py/clustering.py
  4. 2
    2
      py/dcorr.py

+ 53
- 0
README.md View File

@@ -84,3 +84,56 @@ Example:
84 84
 python downweather.py -o ../data/weathertest.pkl
85 85
 ```
86 86
 Downloads data from the default period into `../data/weathertest.pkl`.
87
+
88
+### `dcorr.py`
89
+
90
+Create a distance correlation matrix from a pickled pandas dataframe. Note that this takes some time, but has a progress bar.
91
+
92
+* `-i PATH`: The path for the python "pickle" file to retrieve the original data from.
93
+* `-o PATH`: The path for the python "pickle" file to store the result in.
94
+* `-p`: If the dataframe is not in wide form (i.e. it is in tall form), pivots it into wide form first. Note: untested and unused, representing past behaviour.
95
+
96
+Example:
97
+
98
+```bash
99
+python dcorr.py -i ../data/test1k.pkl -o ../data/test1kdcorr.pkl
100
+```
101
+
102
+Calculates the correlation matrix from the dataset previously downloaded to `../data/test1k.pkl` and stores it in `../data/test1kdcorr.pkl`.
103
+
104
+### `clustering.py`
105
+
106
+Algorithm for hierarchical clustering. **Note**: different datasets and parameters may make similar clusters in a different order.
107
+
108
+* `-i PATH`: The path for the python "pickle" file to retrieve the original data from.
109
+* `-o PATH`: The path for the python "pickle" file to store the result in.
110
+* `--method METHOD`: Clustering method for hierarchical clustering; default is 'ward'. See [the scipy docs](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html) for more information and other options.
111
+* `--clusters NUM`: The number of clusters; default is 9.
112
+* `-d`: Include dendrogram; requires `-t/--tree`.
113
+* `-t PATH`: The path to save the dendrogram image to; required when `-d` is given.
114
+* `-v`: Output some extra progress information as it goes; mostly useful for debugging.
115
+
116
+Example:
117
+
118
+```bash
119
+python clustering.py -i ../data/test1kdcorr.pkl -o ../data/test1kclustable.pkl -d -t ../img/test1kdendro.png
120
+```
121
+
122
+Clusters data from `../data/test1kdcorr.pkl`; saves the table of clusters/icps at `../data/test1kclustable.pkl`, and saves the dendrogram at `../img/test1kdendro.png`.
123
+
124
+### `agg.py`
125
+
126
+Aggregates data based on clusters. **Note**: columns `CI_low` and `CI_high` do not represent formal statistical confidence intervals, merely lower and upper quartiles; for indicative purposes only. 
127
+
128
+* `-i PATH`: The path for the python "pickle" file to retrieve the original data from (i.e. the data downloaded from `downkwh.py`).
129
+* `-c PATH`: The path for the python "pickle" file to retrieve the cluster data from.
130
+* `-o PATH`: The path for the python "pickle" file to store the result in.
131
+* `-p`: If the dataframe is not in wide form (i.e. it is in tall form), pivots it into wide form first. Note: untested and unused, representing past behaviour.
132
+
133
+Example:
134
+
135
+```bash
136
+python agg.py -i ../data/test1k.pkl -c ../data/test1kclustable.pkl -o ../data/test1kagg.pkl
137
+```
138
+
139
+Aggregates data from `../data/test1k.pkl` using the cluster information in `../data/test1kclustable.pkl`, and saves the result in `../data/test1kagg.pkl`.

+ 3
- 3
py/agg.py View File

@@ -32,9 +32,9 @@ def aggregator(widedf, clusdf):
32 32
 
33 33
 def main():
34 34
     parser = ArgumentParser(description='Aggregate dataframe by specified clusters')
35
-    parser.add_argument("-i", "--input",  dest="input",      help = "input pickle path; default: ../data/2017-5k-wide.pkl",  metavar="PATH", default = "../data/2017-5k-wide.pkl")
36
-    parser.add_argument("-c", "--clusters", dest="clusfile", help = "cluster pickle path; default: ../data/5kclustable.pkl", metavar="PATH", default = "../data/5kclustable.pkl")
37
-    parser.add_argument("-o", "--output", dest="output",     help = "output pickle path; default: ../data/5k-ag.pkl", metavar="PATH", default = "../data/5k-ag.pkl")
35
+    parser.add_argument("-i", "--input",  dest="input",      help = "input pickle path",  metavar="PATH", required = True)
36
+    parser.add_argument("-c", "--clusters", dest="clusfile", help = "cluster pickle path", metavar="PATH", required = True)
37
+    parser.add_argument("-o", "--output", dest="output",     help = "output pickle path", metavar="PATH", required = True)
38 38
     parser.add_argument("-p", "--pivot", dest = "istall",    help = "input dataframe is in tall format and must be pivoted", action ="store_true")
39 39
     args = parser.parse_args()
40 40
     wd = p.read_pickle(args.input)

+ 12
- 5
py/clustering.py View File

@@ -55,21 +55,28 @@ def dendro(lobj, clustdf, numclusts, icps, fname):
55 55
 
56 56
 def main():
57 57
     parser = ArgumentParser(description='Cluster from pre-existing distance correlation matrix in pickled dataframe')
58
-    parser.add_argument("-i", "--input",  dest="input",  help = "input pickle path; default: ../data/5kdcorrmatrix.pkl",  metavar="PATH", default = "../data/5kdcorrmatrix.pkl")
59
-    parser.add_argument("-o", "--output", dest="output", help = "output pickle path; default: ../data/5kdclustable.pkl",  metavar="PATH", default = "../data/5kdclustable.pkl")
58
+    parser.add_argument("-i", "--input",  dest="input",  help = "input pickle path",  metavar="PATH", required = True)
59
+    parser.add_argument("-o", "--output", dest="output", help = "output pickle path",  metavar="PATH", required = True)
60 60
     parser.add_argument("--method", dest="method", help = "clustering method; default 'ward'", metavar = "METHOD", default = "ward")
61 61
     parser.add_argument("--clusters",  dest="numclusters",  help = "number of clusters; default: 9", metavar = "NUM", default = 9, type = int)
62 62
     parser.add_argument("-d", "--dendrogram", dest = "incdendro", help = "draw dendrogram", action ="store_true")
63
-    parser.add_argument("-t", "--tree", dest="treepath", help="Filename for dendrogram (if -d), default: ../img/59-9-dendro.png", metavar="PATH", default = "../img/5k-9-dendro.png")
63
+    parser.add_argument("-t", "--tree", dest="treepath", help="Filename for dendrogram (if -d)", metavar="PATH")
64
+    parser.add_argument("-v", "--verbose", dest = "verbose", action ="store_true")
64 65
     args = parser.parse_args()
65 66
 
66
-    print("Clustering")
67
+    if args.incdendro and args.treepath is None:
68
+        parser.error("-d/--dendrogram requires -t/--tree PATH")
69
+
70
+    if args.verbose:
71
+        print("Clustering")
72
+
67 73
     sourcep = p.read_pickle(args.input)
68 74
     l, c = cluster(sourcep, args.method, args.numclusters)
69 75
     c.to_pickle(args.output)
70 76
 
71
-    print("Drawing dendrogram")
72 77
     if args.incdendro:
78
+        if args.verbose:
79
+            print("Drawing dendrogram")
73 80
         icps = sourcep.index.values
74 81
         dendro(l, c, args.numclusters, icps, args.treepath)
75 82
 

+ 2
- 2
py/dcorr.py View File

@@ -36,8 +36,8 @@ def createCorr(source, output, piv):
36 36
 
37 37
 if __name__ == "__main__":
38 38
     parser = ArgumentParser(description='Create distance correlation matrix from pickled wideform pandas dataframe')
39
-    parser.add_argument("-i", "--input",  dest="input",   help = "input pickle path; default: ../data/2017-5k-wide.pkl",  metavar="PATH", default = "../data/2017-5k-wide.pkl")
40
-    parser.add_argument("-o", "--output", dest="output",  help = "output pickle path; default: ../data/5kdcorrmatrix.pkl", metavar="PATH", default = "../data/5kdcorrmatrix.pkl")
39
+    parser.add_argument("-i", "--input",  dest="input",   help = "input pickle path",  metavar="PATH", required = True)
40
+    parser.add_argument("-o", "--output", dest="output",  help = "output pickle path", metavar="PATH", required = True)
41 41
     parser.add_argument("-p", "--pivot", dest = "istall", help = "input dataframe is in tall format and must be pivoted", action ="store_true")
42 42
     args = parser.parse_args()
43 43
     createCorr(args.input, args.output, args.istall)