# @Author: WuLC
# @Date: 2017-02-13 09:03:42
# @Last Modified by: WuLC
# @Last Modified time: 2017-02-15 20:54:58


# Clustering with KMeans algorithm

import random
from math import sqrt
from PIL import Image, ImageDraw
from GetData import read_data

def pearson(v1, v2):
    """Use the Pearson coefficient to calculate the distance between two vectors.

    Args:
        v1 (list): values of vector1
        v2 (list): values of vector2

    Returns:
        (float): 1 - Pearson coefficient; the smaller, the more similar
    """
    # Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)
    # Sums of the squares
    sum1Sq = sum([pow(v, 2) for v in v1])
    sum2Sq = sum([pow(v, 2) for v in v2])
    # Sum of the products
    pSum = sum([v1[i]*v2[i] for i in xrange(len(v1))])
    # Calculate r (Pearson score); use float() so that integer inputs
    # do not fall into Python 2 integer division
    num = pSum - (sum1*sum2/float(len(v1)))
    den = sqrt((sum1Sq - pow(sum1, 2)/float(len(v1))) * (sum2Sq - pow(sum2, 2)/float(len(v1))))
    if den == 0: return 0
    return 1.0 - num/den
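
# Quick sanity check (hypothetical values, not part of the data set):
# perfectly correlated vectors have Pearson r = 1, so their distance is 0.0,
# e.g. pearson([3, 4, 5], [6, 8, 10]) == 0.0, while uncorrelated vectors give
# a distance near 1.0 and negatively correlated ones approach 2.0.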


def kMeans(blog_data, distance=pearson, k=5):
    """Cluster the rows of blog_data into k groups with the KMeans algorithm.

    Args:
        blog_data (list[list]): data in the form of a two-dimensional matrix
        distance (callable, optional): function used to measure the distance between two vectors
        k (int, optional): number of clusters

    Returns:
        (list[list], list[list]): the center of each cluster, and the row indices belonging to each cluster
    """
    m, n = len(blog_data), len(blog_data[0])
    # find the range of each dimension, starting from the first row
    # rather than from 0 so that all-positive data is handled correctly
    max_value = list(blog_data[0])
    min_value = list(blog_data[0])
    for i in xrange(1, m):
        for j in xrange(n):
            max_value[j] = max(max_value[j], blog_data[i][j])
            min_value[j] = min(min_value[j], blog_data[i][j])

    # initial random clusters
    clusters = []
    for i in xrange(k):
        clusters.append([min_value[j] + random.random()*(max_value[j] - min_value[j]) for j in xrange(n)])

    count = 0
    previous_cluster_nodes = None
    while True:
        count += 1
        print 'iteration count %s' % count
        # assign each row to its closest cluster center
        curr_cluster_nodes = [[] for i in xrange(k)]
        for i in xrange(m):
            closest_distance = distance(blog_data[i], clusters[0])
            cluster = 0
            for j in xrange(1, k):
                d = distance(blog_data[i], clusters[j])
                if closest_distance > d:
                    closest_distance = d
                    cluster = j
            curr_cluster_nodes[cluster].append(i)

        # stop once the assignment no longer changes
        if curr_cluster_nodes == previous_cluster_nodes:
            break

        previous_cluster_nodes = curr_cluster_nodes
        # move the center of each cluster to the mean of its members;
        # divide by the size of the cluster, not the number of clusters,
        # and skip empty clusters to avoid division by zero
        for i in xrange(k):
            if not curr_cluster_nodes[i]:
                continue
            tmp = [0 for _ in xrange(n)]
            for node in curr_cluster_nodes[i]:
                for j in xrange(n):
                    tmp[j] += blog_data[node][j]
            clusters[i] = [float(tmp[j])/len(curr_cluster_nodes[i]) for j in xrange(n)]
    return clusters, curr_cluster_nodes
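
# Usage sketch (hypothetical data): two well-separated groups of rows are
# usually recovered with k = 2, though the random initial centers mean the
# result and the iteration count can differ between runs:
#   data = [[1, 2, 1], [2, 2, 1], [9, 8, 9], [8, 9, 9]]
#   centers, members = kMeans(data, k=2)
#   # members is a list of k lists of row indices, e.g. [[0, 1], [2, 3]]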


def scale_down(blog_data, distance=pearson, rate=0.01):
    """Transform data from multi-dimensional to two-dimensional.

    Args:
        blog_data (list[list]): blog data in the form of a two-dimensional matrix
        distance (callable, optional): standard to calculate similarity between two vectors
        rate (float, optional): learning rate used to move the positions of the nodes

    Returns:
        list[list]: positions of the nodes in a two-dimensional coordinate system
    """
    n = len(blog_data)

    # The real distances between every pair of items
    real_list = [[distance(blog_data[i], blog_data[j]) for j in xrange(n)]
                 for i in xrange(n)]

    # Randomly initialize the starting points of the locations in 2D
    loc = [[random.random(), random.random()] for i in xrange(n)]
    fake_list = [[0.0 for j in xrange(n)] for i in xrange(n)]

    lasterror = None
    for m in range(0, 1000):
        # Find projected distances
        for i in range(n):
            for j in range(n):
                fake_list[i][j] = sqrt(sum([pow(loc[i][x]-loc[j][x], 2)
                                            for x in xrange(len(loc[i]))]))

        # Move points
        grad = [[0.0, 0.0] for i in range(n)]

        totalerror = 0
        for k in range(n):
            for j in range(n):
                if j == k or real_list[j][k] == 0: continue  # avoid the case when real_list[j][k] == 0.0
                # The error is the percent difference between the distances
                error_term = (fake_list[j][k]-real_list[j][k])/real_list[j][k]

                # Each point needs to be moved away from or towards the other
                # point in proportion to how much error it has
                grad[k][0] += ((loc[k][0]-loc[j][0])/fake_list[j][k])*error_term
                grad[k][1] += ((loc[k][1]-loc[j][1])/fake_list[j][k])*error_term

                # Keep track of the total error
                totalerror += abs(error_term)
        # print 'curr error {0}'.format(totalerror)

        # If the answer got worse by moving the points, we are done
        if lasterror and lasterror < totalerror: break
        lasterror = totalerror

        # Move each of the points by the learning rate times the gradient
        for k in range(n):
            loc[k][0] -= rate*grad[k][0]
            loc[k][1] -= rate*grad[k][1]

    return loc
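
# This is multidimensional scaling (MDS) by gradient descent: every 2D point
# is nudged so that the pairwise 2D distances approximate the pairwise Pearson
# distances, stopping after 1000 steps or once the total error stops improving.
# Usage sketch (hypothetical data):
#   loc = scale_down([[1, 2, 3], [3, 2, 1], [1, 3, 2]])
#   # loc is a list of [x, y] pairs, one per input row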


def draw_clusters(blog_data, clusters, cluster_nodes, blog_names, jpeg_path='Clustering_data/mds2d.jpg'):
    """Draw the result of KMeans clustering.

    Args:
        blog_data (list[list]): blog data that has been transformed into two-dimensional form
        clusters (list[list]): centers of the clusters that have been transformed into two-dimensional form
        cluster_nodes (list[list]): nodes of each cluster
        blog_names (list[str]): blog name corresponding to each node
        jpeg_path (str, optional): path where the image will be stored

    Returns:
        None
    """
    img = Image.new('RGB', (2000, 2000), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    for i in xrange(len(clusters)):
        for node in cluster_nodes[i]:
            # connect each node to the center of its cluster with a red line
            c_x, c_y = (clusters[i][0] + 0.5)*1000, (clusters[i][1] + 0.5)*1000
            x, y = (blog_data[node][0] + 0.5)*1000, (blog_data[node][1] + 0.5)*1000
            draw.line((c_x, c_y, x, y), fill=(255, 0, 0))
            draw.text((x, y), blog_names[node], (0, 0, 0))
    img.save(jpeg_path, 'JPEG')
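
# Note: the (value + 0.5) * 1000 mapping assumes the scaled coordinates fall
# roughly in [-0.5, 1.5]; points outside that range would land off the
# 2000x2000 canvas.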


if __name__ == '__main__':
    cluster_num = 4
    col_names, blog_names, blog_data = read_data('Clustering_data/data')
    clusters, cluster_nodes = kMeans(blog_data, k=cluster_num)
    for i in xrange(len(cluster_nodes)):
        print '=============cluster %s===========' % i
        for node in cluster_nodes[i]:
            print blog_names[node]

    # scale the blogs and the cluster centers together so that
    # they share the same two-dimensional embedding
    scaled_data = scale_down(blog_data + clusters)
    scaled_blog_data = scaled_data[:len(blog_data)]
    scaled_clusters = scaled_data[len(blog_data):]
    draw_clusters(scaled_blog_data, scaled_clusters, cluster_nodes, blog_names)