Repository for Petra's work at ampli Jan-Feb 2019

clustering.py 3.3KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. from pprint import pprint
  2. from argparse import ArgumentParser
  3. from util import getQuery, pickleQuery
  4. import numpy as np
  5. import pandas as p
  6. import matplotlib
  7. matplotlib.use('agg')
  8. import matplotlib.pyplot as plt
  9. import seaborn as sns
  10. from scipy.spatial.distance import squareform
  11. from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
  12. from tqdm import tqdm
  13. from itertools import combinations
  14. from math import factorial as f
  15. def cluster(dcmat, method, nclusters):
  16. """Cluster provided correlation dataframe
  17. """
  18. lmat = squareform(dcmat)
  19. lobj = linkage(lmat, method = method)
  20. clabs = [x + 1 for x in range(nclusters)]
  21. clusts = fcluster(lobj, nclusters, criterion='maxclust')
  22. clustdf = p.DataFrame({'icp_id' : dcmat.index.values, 'cluster' : clusts})
  23. return lobj, clustdf
  24. def dendro(lobj, clustdf, numclusts, icps, fname):
  25. clabs = [x + 1 for x in range(numclusts)]
  26. cpal = dict(zip(clabs, sns.color_palette("colorblind", numclusts).as_hex()))
  27. # Algorithm via
  28. # <https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func>
  29. ldict = {icp_id:cpal[cluster] for icp_id, cluster in zip(clustdf.icp_id, clustdf.cluster)}
  30. link_cols = {}
  31. for i, i12 in enumerate(lobj[:,:2].astype(int)):
  32. c1, c2 = (link_cols[x] if x > len(lobj) else ldict[clustdf.icp_id[x]]
  33. for x in i12)
  34. link_cols[i+1+len(lobj)] = c1 if c1 == c2 else '#000000'
  35. plt.figure(figsize = (25, 10))
  36. plt.title('ICP Clustering Dendrogram')
  37. plt.xlabel('ICP ID/(Number of ICPs)')
  38. plt.ylabel('distance')
  39. dendrogram(
  40. lobj,
  41. labels = icps,
  42. leaf_rotation=90,
  43. leaf_font_size=8,
  44. link_color_func = lambda x: link_cols[x],
  45. color_threshold = None
  46. )
  47. plt.savefig(fname)
  48. def main():
  49. parser = ArgumentParser(description='Cluster from pre-existing distance correlation matrix in pickled dataframe')
  50. parser.add_argument("-i", "--input", dest="input", help = "input pickle path", metavar="PATH", required = True)
  51. parser.add_argument("-o", "--output", dest="output", help = "output pickle path", metavar="PATH", required = True)
  52. parser.add_argument("--method", dest="method", help = "clustering method; default 'ward'", metavar = "METHOD", default = "ward")
  53. parser.add_argument("--clusters", dest="numclusters", help = "number of clusters; default: 9", metavar = "NUM", default = 9, type = int)
  54. parser.add_argument("-d", "--dendrogram", dest = "incdendro", help = "draw dendrogram", action ="store_true")
  55. parser.add_argument("-t", "--tree", dest="treepath", help="Filename for dendrogram (if -d)", metavar="PATH")
  56. parser.add_argument("-v", "--verbose", dest = "verbose", action ="store_true")
  57. args = parser.parse_args()
  58. if args.incdendro and args.treepath is None:
  59. parser.error("-d/--dendrogram requires -t/--tree PATH")
  60. if args.verbose:
  61. print("Clustering")
  62. sourcep = p.read_pickle(args.input)
  63. l, c = cluster(sourcep, args.method, args.numclusters)
  64. c.to_pickle(args.output)
  65. if args.incdendro:
  66. if args.verbose:
  67. print("Drawing dendrogram")
  68. icps = sourcep.index.values
  69. dendro(l, c, args.numclusters, icps, args.treepath)
  70. if __name__ == "__main__":
  71. main()