# -*- coding: utf-8 -*-
# @Author: WuLC
# @Date: 2017-02-12 15:41:09
# @Last Modified by: WuLC
# @Last Modified time: 2017-02-12 23:02:25
from GetData import read_data
from math import sqrt
from PIL import Image, ImageDraw

class hcluster:
    """Describe a cluster as a node in a binary tree"""
    def __init__(self, id, vector, distance=0, left=None, right=None):
        """structure to describe a cluster as a node in a tree

        Args:
            id (int): unique id of the node
            vector (list): feature vector of the node
            distance (float, optional): distance between the left and right
                subtrees of the node if they exist, 0 for leaf nodes
            left (hcluster, optional): root of the left subtree
            right (hcluster, optional): root of the right subtree
        """
        self.id = id
        self.vector = vector
        self.distance = distance
        self.left = left
        self.right = right

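# A quick illustration of the node structure (hypothetical values, not from
# the dataset): a leaf wraps one data vector, while a merged node records the
# two clusters it joined and how far apart they were, e.g.
#   leaf_a = hcluster(id=0, vector=[1, 2, 3])
#   leaf_b = hcluster(id=1, vector=[1, 2, 4])
#   merged = hcluster(id=-1, vector=[1.0, 2.0, 3.5], distance=0.01,
#                     left=leaf_a, right=leaf_b)
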
def pearson(v1, v2):
    """use the Pearson coefficient to calculate the distance between two vectors

    Args:
        v1 (list): values of vector1
        v2 (list): values of vector2

    Returns:
        (float): 1 - Pearson coefficient, the smaller, the more similar
    """
    # Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)
    # Sums of the squares
    sum1Sq = sum(pow(v, 2) for v in v1)
    sum2Sq = sum(pow(v, 2) for v in v2)
    # Sum of the products
    pSum = sum(v1[i] * v2[i] for i in range(len(v1)))
    # Calculate r (Pearson score); use float division to avoid truncation
    num = pSum - (sum1 * sum2 / float(len(v1)))
    den = sqrt((sum1Sq - pow(sum1, 2) / float(len(v1))) * (sum2Sq - pow(sum2, 2) / float(len(v1))))
    if den == 0:
        return 0
    return 1.0 - num / den

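# Sanity check for the metric (hypothetical vectors): perfectly correlated
# vectors give a distance of ~0, perfectly anti-correlated ones ~2, so the
# smaller the value, the more similar the vectors.
#   pearson([1, 2, 3], [2, 4, 6])   # ~0.0
#   pearson([1, 2, 3], [3, 2, 1])   # ~2.0
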
def hierarchicalClustering(blog_data, distance=pearson):
    """hierarchical (agglomerative) clustering of data

    Args:
        blog_data (list[list]): data of the blogs, each inner list of numbers
            is the feature vector of one blog
        distance (callable, optional): metric used to judge the distance
            between two vectors

    Returns:
        (hcluster): the root of the clustering tree
    """
    # initialize clusters, each node starts as a cluster of its own
    clusters = [hcluster(id=i, vector=blog_data[i]) for i in range(len(blog_data))]
    # use negative numbers to represent clusters with more than one node
    clust_id = -1
    # cache calculated distances, keyed by pairs of cluster ids
    distances = {}
    while len(clusters) > 1:
        closest_pair = (0, 1)
        closest_distance = distance(clusters[0].vector, clusters[1].vector)
        # find the two closest clusters
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                if (clusters[i].id, clusters[j].id) not in distances:
                    distances[(clusters[i].id, clusters[j].id)] = distance(clusters[i].vector, clusters[j].vector)
                d = distances[(clusters[i].id, clusters[j].id)]
                if closest_distance > d:
                    closest_distance = d
                    closest_pair = (i, j)
        # merge them; the new vector is the average of the two old ones
        merged_vector = [(clusters[closest_pair[0]].vector[i] + clusters[closest_pair[1]].vector[i]) / 2.0
                         for i in range(len(clusters[closest_pair[0]].vector))]
        new_cluster = hcluster(id=clust_id, vector=merged_vector, distance=closest_distance,
                               left=clusters[closest_pair[0]], right=clusters[closest_pair[1]])
        # must delete elements from the higher index to the lower index
        del clusters[closest_pair[1]]
        del clusters[closest_pair[0]]
        clusters.append(new_cluster)
        clust_id -= 1
    return clusters[0]

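# Minimal usage sketch (made-up vectors, not the blog dataset): the returned
# root's left/right attributes walk down the merge tree, and its distance
# attribute records the distance of the final merge.
#   root = hierarchicalClustering([[1, 2, 3], [1, 2, 4], [10, 0, 1]])
#   root.id < 0  # True: the root is always a merged cluster for >1 input
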
def print_cluster(cluster, blog_names, n):
    """print the clustering tree in a rough, indented way

    Args:
        cluster (hcluster): root of the clustering tree
        blog_names (list): names of the blogs, indexed by cluster id
        n (int): indentation level of the current hierarchy

    Returns:
        None
    """
    print(' ' * n, end='')
    if cluster.id < 0:
        # a negative id marks a merged (internal) cluster
        print('-')
        print_cluster(cluster.left, blog_names, n + 1)
        print_cluster(cluster.right, blog_names, n + 1)
    else:
        print(blog_names[cluster.id])

def getheight(cluster):
    # a leaf occupies a single row
    if cluster.left is None and cluster.right is None:
        return 1
    # otherwise the height is the sum of the heights of each branch
    return getheight(cluster.left) + getheight(cluster.right)

def getdepth(cluster):
    # the distance of an endpoint is 0.0
    if cluster.left is None and cluster.right is None:
        return 0
    # the distance of a branch is the greater of its two sides
    # plus its own distance
    return max(getdepth(cluster.left), getdepth(cluster.right)) + cluster.distance

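# Both helpers drive the dendrogram geometry below: for a tree over three
# leaves, getheight returns 3 (one row per leaf), while getdepth sums the
# merge distances along the deepest root-to-leaf path.
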
def drawnode(draw, cluster, x, y, scaling, blog_names):
    if cluster.id < 0:
        h1 = getheight(cluster.left) * 20
        h2 = getheight(cluster.right) * 20
        top = y - (h1 + h2) / 2
        bottom = y + (h1 + h2) / 2
        # Line length is proportional to the merge distance
        ll = cluster.distance * scaling
        # Vertical line from this cluster to children
        draw.line((x, top + h1 / 2, x, bottom - h2 / 2), fill=(255, 0, 0))
        # Horizontal line to left item
        draw.line((x, top + h1 / 2, x + ll, top + h1 / 2), fill=(255, 0, 0))
        # Horizontal line to right item
        draw.line((x, bottom - h2 / 2, x + ll, bottom - h2 / 2), fill=(255, 0, 0))
        # Recursively draw the left and right nodes
        drawnode(draw, cluster.left, x + ll, top + h1 / 2, scaling, blog_names)
        drawnode(draw, cluster.right, x + ll, bottom - h2 / 2, scaling, blog_names)
    else:
        # If this is an endpoint, draw the item label
        draw.text((x + 5, y - 7), blog_names[cluster.id], (0, 0, 0))

def draw_cluster(cluster, blog_names, jpeg_path):
    # height depends on the number of leaves, width is fixed
    h = getheight(cluster) * 20
    w = 1200
    depth = getdepth(cluster)
    # width is fixed, so scale distances accordingly
    scaling = float(w - 150) / depth
    # Create a new image with a white background
    img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    draw.line((0, h / 2, 10, h / 2), fill=(255, 0, 0))
    # Draw the root node
    drawnode(draw, cluster, 10, h / 2, scaling, blog_names)
    img.save(jpeg_path, 'JPEG')

if __name__ == '__main__':
    col_names, blog_names, blog_data = read_data('Clustering_data/data')
    cluster = hierarchicalClustering(blog_data)
    draw_cluster(cluster, blog_names, 'Clustering_data/clusters.jpg')
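    # to inspect the tree textually instead of (or before) drawing it:
    # print_cluster(cluster, blog_names, 0)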