
Commit fa5d8fd

Make the results of KMeans-Clustering visible
1 parent 9ff9042 commit fa5d8fd

3 files changed, 163 insertions(+), 95 deletions(-)
Binary file (274 KB) not shown: presumably the generated Clustering_data/mds2d.jpg

python/Clustering/HierarchicalClustering.py

+1 −26
@@ -2,7 +2,7 @@
 # @Author: WuLC
 # @Date: 2017-02-12 15:41:09
 # @Last Modified by: WuLC
-# @Last Modified time: 2017-02-12 23:02:25
+# @Last Modified time: 2017-02-14 23:05:08

 from GetData import read_data
 from math import sqrt
@@ -27,31 +27,6 @@ def __init__(self, id, vector, distance=0, left = None, right = None):
         self.right = right


-def pearson(v1,v2):
-    """use pearson coeffcient to caculate the distance between two vectors
-
-    Args:
-        v1 (list): values of vector1
-        v2 (list): values of vector2
-
-    Returns:
-        (flaot):1 - pearson coeffcient, the smaller, the more similar
-    """
-    # Simple sums
-    sum1=sum(v1)
-    sum2=sum(v2)
-    # Sums of the squares
-    sum1Sq=sum([pow(v,2) for v in v1])
-    sum2Sq=sum([pow(v,2) for v in v2])
-    # Sum of the products
-    pSum=sum([v1[i]*v2[i] for i in xrange(len(v1))])
-    # Calculate r (Pearson score)
-    num=pSum-(sum1*sum2/len(v1))
-    den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
-    if den==0: return 0
-    return 1.0-num/den
-
 def hierarchicalClustering(blog_data, distance = pearson):
     """hierarchical clustering of data
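For reference, the pearson helper removed here (it now lives only in KMeansClustering.py, below) returns one minus the Pearson correlation coefficient. With n = len(v1), the code computes

\[
r = \frac{\sum_i v_{1i} v_{2i} - \tfrac{1}{n}\sum_i v_{1i}\sum_i v_{2i}}
         {\sqrt{\Bigl(\sum_i v_{1i}^2 - \tfrac{1}{n}\bigl(\sum_i v_{1i}\bigr)^2\Bigr)\Bigl(\sum_i v_{2i}^2 - \tfrac{1}{n}\bigl(\sum_i v_{2i}\bigr)^2\Bigr)}},
\qquad d(v_1, v_2) = 1 - r,
\]

so highly correlated vectors get a distance near 0 and anti-correlated ones a distance near 2; the den == 0 case (a constant vector) is short-circuited to a distance of 0.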

python/Clustering/KMeansClustering.py

+162 −69
@@ -2,90 +2,183 @@
 # @Author: WuLC
 # @Date: 2017-02-13 09:03:42
 # @Last Modified by: WuLC
-# @Last Modified time: 2017-02-13 10:17:27
+# @Last Modified time: 2017-02-15 20:54:58


 # Clustering with KMeans algorithm

 import random
 from math import sqrt
+from PIL import Image, ImageDraw
 from GetData import read_data

 def pearson(v1,v2):
-    """use pearson coeffcient to caculate the distance between two vectors
-
-    Args:
-        v1 (list): values of vector1
-        v2 (list): values of vector2
-
-    Returns:
-        (flaot):1 - pearson coeffcient, the smaller, the more similar
-    """
-    # Simple sums
-    sum1=sum(v1)
-    sum2=sum(v2)
-    # Sums of the squares
-    sum1Sq=sum([pow(v,2) for v in v1])
-    sum2Sq=sum([pow(v,2) for v in v2])
-    # Sum of the products
-    pSum=sum([v1[i]*v2[i] for i in xrange(len(v1))])
-    # Calculate r (Pearson score)
-    num=pSum-(sum1*sum2/len(v1))
-    den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
-    if den==0: return 0
-    return 1.0-num/den
+    """use the Pearson coefficient to calculate the distance between two vectors
+
+    Args:
+        v1 (list): values of vector1
+        v2 (list): values of vector2
+
+    Returns:
+        (float): 1 - Pearson coefficient; the smaller, the more similar
+    """
+    # Simple sums
+    sum1=sum(v1)
+    sum2=sum(v2)
+    # Sums of the squares
+    sum1Sq=sum([pow(v,2) for v in v1])
+    sum2Sq=sum([pow(v,2) for v in v2])
+    # Sum of the products
+    pSum=sum([v1[i]*v2[i] for i in xrange(len(v1))])
+    # Calculate r (Pearson score); float() avoids Python 2 integer truncation
+    num=pSum-(sum1*sum2/float(len(v1)))
+    den=sqrt((sum1Sq-pow(sum1,2)/float(len(v1)))*(sum2Sq-pow(sum2,2)/float(len(v1))))
+    if den==0: return 0
+    return 1.0-num/den
+

 def kMeans(blog_data, distance = pearson, k = 5):
-    m, n = len(blog_data), len(blog_data[0])
-    max_value = [0 for i in xrange(n)]
-    min_value = [0 for i in xrange(n)]
-    for i in xrange(m):
-        for j in xrange(n):
-            max_value[j] = max(max_value[j], blog_data[i][j])
-            min_value[j] = min(min_value[j], blog_data[i][j])
+    m, n = len(blog_data), len(blog_data[0])
+    max_value = [0 for i in xrange(n)]
+    min_value = [0 for i in xrange(n)]
+    for i in xrange(m):
+        for j in xrange(n):
+            max_value[j] = max(max_value[j], blog_data[i][j])
+            min_value[j] = min(min_value[j], blog_data[i][j])

     # initial random clusters
-    clusters = []
-    for i in xrange(k):
-        clusters.append([min_value[j] + random.random()*(max_value[j] - min_value[j]) for j in xrange(n)])
-
-    count = 0
-    previous_cluster_nodes = None
-    while True:
-        count += 1
-        print 'iteration count %s'%count
-        curr_cluster_nodes = [[] for i in xrange(k)]
-        for i in xrange(m):
-            closest_distance = distance(blog_data[i], clusters[0])
-            cluster = 0
-            for j in xrange(1, k):
-                d = distance(blog_data[i], clusters[j])
-                if closest_distance > d:
-                    closest_distance = d
-                    cluster = j
-            curr_cluster_nodes[cluster].append(i)
-
-        if curr_cluster_nodes == previous_cluster_nodes:
-            break
-
-        previous_cluster_nodes = curr_cluster_nodes
-        # modify the core of each cluster
-        for i in xrange(k):
-            tmp = [0 for _ in xrange(n)]
-            for node in curr_cluster_nodes[i]:
-                for j in xrange(n):
-                    tmp[j] += blog_data[node][j]
-            clusters[i] = [float(tmp[j])/len(curr_cluster_nodes) for j in xrange(n)]
-    return clusters, curr_cluster_nodes
+    clusters = []
+    for i in xrange(k):
+        clusters.append([min_value[j] + random.random()*(max_value[j] - min_value[j]) for j in xrange(n)])
+
+    count = 0
+    previous_cluster_nodes = None
+    while True:
+        count += 1
+        print 'iteration count %s'%count
+        curr_cluster_nodes = [[] for i in xrange(k)]
+        for i in xrange(m):
+            closest_distance = distance(blog_data[i], clusters[0])
+            cluster = 0
+            for j in xrange(1, k):
+                d = distance(blog_data[i], clusters[j])
+                if closest_distance > d:
+                    closest_distance = d
+                    cluster = j
+            curr_cluster_nodes[cluster].append(i)
+
+        if curr_cluster_nodes == previous_cluster_nodes:
+            break
+
+        previous_cluster_nodes = curr_cluster_nodes
+        # move each center to the mean of its own members
+        for i in xrange(k):
+            if not curr_cluster_nodes[i]: continue # keep the old center of an empty cluster
+            tmp = [0 for _ in xrange(n)]
+            for node in curr_cluster_nodes[i]:
+                for j in xrange(n):
+                    tmp[j] += blog_data[node][j]
+            clusters[i] = [float(tmp[j])/len(curr_cluster_nodes[i]) for j in xrange(n)]
+    return clusters, curr_cluster_nodes
+
+
+def scale_down(blog_data,distance=pearson,rate=0.01):
+    """transform multi-dimensional data into two-dimensional form
+
+    Args:
+        blog_data (list[list[]]): blog data in the form of a two-dimensional matrix
+        distance (callable, optional): standard used to calculate the similarity between two vectors
+        rate (float, optional): rate at which to move the positions of the nodes
+
+    Returns:
+        list[list[]]: positions of the nodes in a two-dimensional coordinate system
+    """
+    n=len(blog_data)
+
+    # The real distances between every pair of items
+    real_list=[[distance(blog_data[i],blog_data[j]) for j in xrange(n)]
+               for i in xrange(n)]
+
+    # Randomly initialize the starting points of the locations in 2D
+    loc=[[random.random(), random.random()] for i in xrange(n)]
+    fake_list=[[0.0 for j in xrange(n)] for i in xrange(n)]
+
+    lasterror=None
+    for m in range(0,1000):
+        # Find projected distances
+        for i in range(n):
+            for j in range(n):
+                fake_list[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2)
+                                          for x in xrange(len(loc[i]))]))
+
+        # Move points
+        grad=[[0.0,0.0] for i in range(n)]
+
+        totalerror=0
+        for k in range(n):
+            for j in range(n):
+                if j==k or real_list[j][k] == 0: continue # avoid the case when real_list[j][k] == 0.0
+                # The error is the percent difference between the distances
+                error_term=(fake_list[j][k]-real_list[j][k])/real_list[j][k]
+
+                # Each point needs to be moved away from or towards the other
+                # point in proportion to how much error it has
+                grad[k][0] += ((loc[k][0]-loc[j][0])/fake_list[j][k])*error_term
+                grad[k][1] += ((loc[k][1]-loc[j][1])/fake_list[j][k])*error_term
+
+                # Keep track of the total error
+                totalerror+=abs(error_term)
+        # print 'curr error {0}'.format(totalerror)
+
+        # If the answer got worse by moving the points, we are done
+        if lasterror and lasterror<totalerror: break
+        lasterror=totalerror
+
+        # Move each of the points by the learning rate times the gradient
+        for k in range(n):
+            loc[k][0] -= rate*grad[k][0]
+            loc[k][1] -= rate*grad[k][1]
+
+    return loc
+
+
+def draw_clusters(blog_data, clusters, cluster_nodes, blog_names, jpeg_path = 'Clustering_data/mds2d.jpg'):
+    """draw the result of KMeans clustering
+
+    Args:
+        blog_data (list[list]): blog data that has been transformed into two-dimensional form
+        clusters (list[list]): centers of clusters that have been transformed into two-dimensional form
+        cluster_nodes (list[list]): nodes of each cluster
+        blog_names (list[str]): blog name corresponding to each node
+        jpeg_path (str, optional): path where the image is to be stored
+
+    Returns:
+        None
+    """
+    img=Image.new('RGB',(2000,2000),(255,255,255))
+    draw=ImageDraw.Draw(img)
+    for i in xrange(len(clusters)):
+        for node in cluster_nodes[i]:
+            c_x,c_y = (clusters[i][0] + 0.5)*1000, (clusters[i][1] + 0.5)*1000
+            x, y = (blog_data[node][0]+0.5)*1000, (blog_data[node][1]+0.5)*1000
+            draw.line((c_x, c_y, x, y),fill=(255,0,0))
+            draw.text((x,y),blog_names[node],(0,0,0))
+    img.save(jpeg_path, 'JPEG')

 if __name__ == '__main__':
-    col_names, blog_names, blog_data = read_data('Clustering_data/data')
-    clusters, cluster_nodes = kMeans(blog_data)
-    for i in xrange(len(cluster_nodes)):
-        print '=============cluster %s==========='%i
-        for node in cluster_nodes[i]:
-            print blog_names[node]
-
+    cluster_num = 4
+    col_names, blog_names, blog_data = read_data('Clustering_data/data')
+    clusters, cluster_nodes = kMeans(blog_data, k = cluster_num)
+    for i in xrange(len(cluster_nodes)):
+        print '=============cluster %s==========='%i
+        for node in cluster_nodes[i]:
+            print blog_names[node]
+
+    scaled_data = scale_down(blog_data + clusters)
+    scaled_blog_data = scaled_data[:len(blog_data)]
+    scaled_clusters = scaled_data[len(blog_data):]
+    draw_clusters(scaled_blog_data, scaled_clusters, cluster_nodes, blog_names)
+
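The loop in kMeans above is Lloyd's algorithm with the Pearson distance in place of the usual squared Euclidean one: each iteration assigns every row to its closest center and then moves each center to the mean of its members,

\[
c(i) \leftarrow \operatorname*{arg\,min}_{1 \le j \le k} d(x_i, \mu_j),
\qquad
\mu_j \leftarrow \frac{1}{|C_j|} \sum_{i \in C_j} x_i,
\]

terminating once the assignment is identical to that of the previous iteration.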

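A note on what scale_down does: it is multidimensional scaling by gradient descent. Writing \(d_{jk}\) for the real (Pearson) distance between items \(j\) and \(k\), and \(\hat{d}_{jk}\) for the distance between their current 2-D positions \(\mathbf{x}_j, \mathbf{x}_k\), each of the up to 1000 iterations computes

\[
e_{jk} = \frac{\hat{d}_{jk} - d_{jk}}{d_{jk}},
\qquad
\nabla_k = \sum_{j \neq k} \frac{\mathbf{x}_k - \mathbf{x}_j}{\hat{d}_{jk}}\, e_{jk},
\qquad
\mathbf{x}_k \leftarrow \mathbf{x}_k - \mathrm{rate} \cdot \nabla_k,
\]

and it stops early once the total error \(\sum_{j,k} |e_{jk}|\) stops decreasing.

To sanity-check the new kMeans -> scale_down -> draw_clusters path end to end without the blog dataset, something like the following can be run from python/Clustering (a minimal sketch, not part of this commit: the synthetic data, seed, and output filename are invented for illustration, and it is Python 2 to match the xrange/print syntax above):

# -*- coding: utf-8 -*-
# Hypothetical smoke test for the kMeans -> scale_down -> draw_clusters pipeline.
import os
import random

from KMeansClustering import kMeans, scale_down, draw_clusters

random.seed(7)
# Pearson distance ignores scale, so the two groups are made to differ in
# shape (rising vs. falling profiles) rather than in magnitude.
data = ([[x * random.uniform(0.8, 1.2) for x in (1, 2, 3, 4, 5)] for _ in range(10)] +
        [[x * random.uniform(0.8, 1.2) for x in (5, 4, 3, 2, 1)] for _ in range(10)])
names = ['point_%d' % i for i in range(len(data))]

clusters, cluster_nodes = kMeans(data, k=2)

# Project points and cluster centers into 2-D together, as __main__ does above.
scaled = scale_down(data + clusters)
if not os.path.isdir('Clustering_data'):
    os.makedirs('Clustering_data')
draw_clusters(scaled[:len(data)], scaled[len(data):], cluster_nodes, names,
              jpeg_path='Clustering_data/synthetic_clusters.jpg')

The saved image should show two bundles of red lines, one fanning out from each recovered center.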
