forked from WuLC/MachineLearningAlgorithm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHierarchicalClustering.py
160 lines (124 loc) · 5.02 KB
/
HierarchicalClustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# -*- coding: utf-8 -*-
# @Author: WuLC
# @Date: 2017-02-12 15:41:09
# @Last Modified by: WuLC
# @Last Modified time: 2017-02-15 22:08:57
from GetData import read_data
from math import sqrt
from PIL import Image,ImageDraw
class hcluster:
    """A node in the hierarchical-clustering tree.

    Leaf nodes wrap one input data vector and keep their non-negative
    input index as ``id``; merged (internal) nodes get a negative ``id``
    and record the distance between their two subtrees.
    """
    def __init__(self, id, vector, distance=0, left=None, right=None):
        """Build a cluster node.

        Args:
            id (int): unique id of the node (negative for merged clusters)
            vector (list): data vector represented by this cluster
            distance (float, optional): distance between the left and right
                subtrees at the merge that created this node; 0 for leaves
            left (hcluster, optional): root of the left subtree
            right (hcluster, optional): root of the right subtree
        """
        self.id = id
        self.vector = vector
        self.distance = distance
        self.left = left
        self.right = right


def pearson(v1, v2):
    """Pearson-correlation distance between two equal-length vectors.

    Returns ``1 - r`` where r is the Pearson correlation coefficient, so
    perfectly correlated vectors are at distance 0 and perfectly
    anti-correlated ones at distance 2.  Returns 0.0 when either vector
    has zero variance (correlation undefined).

    Args:
        v1 (list): first vector
        v2 (list): second vector, same length as v1

    Returns:
        float: distance in [0, 2]
    """
    n = len(v1)
    sum1, sum2 = sum(v1), sum(v2)
    sum1_sq = sum(x * x for x in v1)
    sum2_sq = sum(x * x for x in v2)
    p_sum = sum(x * y for x, y in zip(v1, v2))
    num = p_sum - (sum1 * sum2 / float(n))
    den = sqrt((sum1_sq - sum1 ** 2 / float(n)) * (sum2_sq - sum2 ** 2 / float(n)))
    if den == 0:
        return 0.0
    return 1.0 - num / den


def hierarchicalClustering(blog_data, distance=pearson):
    """Agglomerative (bottom-up) hierarchical clustering of the data.

    Repeatedly merges the two closest clusters (averaging their vectors)
    until a single root cluster remains.

    Args:
        blog_data (list[list]): one data vector per blog
        distance (callable, optional): distance(v1, v2) -> float used to
            compare cluster vectors; defaults to the Pearson distance

    Returns:
        hcluster: the root of the clustering tree
    """
    # start with one singleton cluster per input row
    clusters = [hcluster(id=i, vector=blog_data[i]) for i in range(len(blog_data))]
    # negative ids mark clusters produced by a merge
    clust_id = -1
    # cache of pairwise distances keyed by (id_i, id_j) to avoid recomputation
    distances = {}
    while len(clusters) > 1:
        closest_pair = (0, 1)
        closest_distance = distance(clusters[0].vector, clusters[1].vector)
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                key = (clusters[i].id, clusters[j].id)
                if key not in distances:
                    distances[key] = distance(clusters[i].vector, clusters[j].vector)
                d = distances[key]
                if d < closest_distance:
                    closest_distance = d
                    closest_pair = (i, j)
        left, right = clusters[closest_pair[0]], clusters[closest_pair[1]]
        # the merged cluster's vector is the element-wise average of its children
        merged_vector = [(a + b) / 2.0 for a, b in zip(left.vector, right.vector)]
        new_cluster = hcluster(id=clust_id, vector=merged_vector,
                               distance=closest_distance, left=left, right=right)
        # must delete the higher index first so the lower index stays valid
        del clusters[closest_pair[1]]
        del clusters[closest_pair[0]]
        clusters.append(new_cluster)
        clust_id -= 1
    return clusters[0]
def print_cluster(cluster, blog_names, n):
    """Print the clustering tree as an indented text outline.

    Merged (negative-id) clusters are shown as '-'; leaves show the
    blog name looked up by the leaf's id.

    Args:
        cluster (hcluster): root of the (sub)tree to print
        blog_names (list): blog names indexed by leaf cluster id
        n (int): indentation depth of the current hierarchy level

    Returns:
        None
    """
    indent = ' ' * n
    if cluster.id < 0:
        # a negative id marks a merged cluster: print a dash and recurse
        print(indent + '-')
        print_cluster(cluster.left, blog_names, n + 1)
        print_cluster(cluster.right, blog_names, n + 1)
    else:
        # leaf node: its id indexes directly into blog_names
        print(indent + blog_names[cluster.id])
def get_height(cluster):
    """Return the number of leaves under this cluster.

    Used as the drawing height (in 20-px rows) of the subtree.
    """
    # a leaf occupies exactly one row
    if cluster.left is None and cluster.right is None:
        return 1
    # an internal node spans all the rows of both of its subtrees
    return get_height(cluster.left) + get_height(cluster.right)
def get_depth(cluster):
    """Return the total merge distance along the deepest path.

    Used to scale horizontal line lengths when drawing the tree.
    """
    # an endpoint (leaf) contributes no distance
    if cluster.left is None and cluster.right is None:
        return 0
    # an internal node adds its own distance to the deeper of its branches
    deeper_branch = max(get_depth(cluster.left), get_depth(cluster.right))
    return deeper_branch + cluster.distance
def draw_node(draw, cluster, x, y, scaling, blog_names):
    """Recursively render one cluster node onto a PIL drawing surface.

    Internal nodes are drawn as a red bracket (one vertical connector and
    two horizontal stems whose length is proportional to the merge
    distance); leaves are drawn as their blog-name label.

    Args:
        draw: PIL ImageDraw object to render onto
        cluster (hcluster): subtree root to draw
        x, y: pixel position of this node's connector
        scaling (float): pixels per unit of merge distance
        blog_names (list): blog names indexed by leaf cluster id
    """
    if cluster.id >= 0:
        # leaf: draw only the item label, slightly offset from the stem
        draw.text((x + 5, y - 7), blog_names[cluster.id], (0, 0, 0))
        return
    # each leaf row is 20 px tall; size the bracket from the subtree heights
    left_h = get_height(cluster.left) * 20
    right_h = get_height(cluster.right) * 20
    top = y - (left_h + right_h) / 2
    bottom = y + (left_h + right_h) / 2
    # stem length encodes the merge distance
    stem = cluster.distance * scaling
    red = (255, 0, 0)
    left_y = top + left_h / 2
    right_y = bottom - right_h / 2
    # vertical connector between the two children
    draw.line((x, left_y, x, right_y), fill=red)
    # horizontal stem out to the left child
    draw.line((x, left_y, x + stem, left_y), fill=red)
    # horizontal stem out to the right child
    draw.line((x, right_y, x + stem, right_y), fill=red)
    # recurse into both children at the ends of their stems
    draw_node(draw, cluster.left, x + stem, left_y, scaling, blog_names)
    draw_node(draw, cluster.right, x + stem, right_y, scaling, blog_names)
def draw_cluster(cluster, blog_names, jpeg_path):
    """Render the whole clustering tree to a JPEG image file.

    Args:
        cluster (hcluster): root of the clustering tree
        blog_names (list): blog names indexed by leaf cluster id
        jpeg_path (str): destination path for the JPEG output
    """
    # one 20-px row per leaf; width is fixed
    height = get_height(cluster) * 20
    width = 1200
    depth = get_depth(cluster)
    # scale distances to the fixed width, leaving 150 px for labels
    scaling = float(width - 150) / depth
    # white canvas to draw on
    img = Image.new('RGB', (width, height), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    # short red stub leading from the left edge into the root node
    draw.line((0, height / 2, 10, height / 2), fill=(255, 0, 0))
    draw_node(draw, cluster, 10, height / 2, scaling, blog_names)
    img.save(jpeg_path, 'JPEG')
if __name__ == '__main__':
    # load the blog data matrix, cluster it, and render the tree to a JPEG
    _, names, data = read_data('Clustering_data/data')
    tree_root = hierarchicalClustering(data)
    draw_cluster(tree_root, names, 'Clustering_data/clusters.jpg')