1
+ # -*- coding: utf-8 -*-
2
+ # @Author: WuLC
3
+ # @Date: 2017-02-12 15:41:09
4
+ # @Last Modified by: WuLC
5
+ # @Last Modified time: 2017-02-12 23:02:25
6
+
7
+ from GetData import read_data
8
+ from math import sqrt
9
+ from PIL import Image ,ImageDraw
10
+
11
class hcluster:
    """One node of the binary tree produced by hierarchical clustering.

    Leaf nodes stand for a single input row; inner nodes stand for the
    merge of their ``left`` and ``right`` subtrees.
    """

    def __init__(self, id, vector, distance=0, left=None, right=None):
        """Store the description of a single tree node.

        Args:
            id (int): unique identifier of the node
            vector (list): values associated with the node
            distance (int, optional): distance between the left and right
                subtrees of this node; stays 0 for leaf nodes
            left (None, optional): root of the left subtree
            right (None, optional): root of the right subtree
        """
        self.id, self.vector = id, vector
        self.distance = distance
        self.left, self.right = left, right
28
+
29
+
30
def pearson(v1, v2):
    """Use the Pearson coefficient to compute the distance between two vectors.

    Args:
        v1 (list): values of vector1
        v2 (list): values of vector2, expected to be as long as ``v1``

    Returns:
        (float): 1 - Pearson coefficient; the smaller, the more similar.
            The integer 0 is returned when the coefficient is undefined
            (empty input, or at least one vector without any variance).
    """
    count = len(v1)
    if count == 0:
        # Nothing to correlate; reuse the "undefined coefficient" result
        # instead of dividing by zero below.
        return 0
    # Divide by a float so integer inputs are not truncated by Python 2's
    # integer division.
    n = float(count)

    # Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)
    # Sums of the squares
    sum1_sq = sum(v ** 2 for v in v1)
    sum2_sq = sum(v ** 2 for v in v2)
    # Sum of the products; v2 is indexed on purpose so that a v2 shorter
    # than v1 still raises IndexError instead of being silently truncated.
    p_sum = sum(a * v2[i] for i, a in enumerate(v1))

    # Calculate r (Pearson score)
    num = p_sum - (sum1 * sum2 / n)
    spread = (sum1_sq - sum1 ** 2 / n) * (sum2_sq - sum2 ** 2 / n)
    if spread <= 0:
        # Zero spread means the coefficient is undefined; a tiny negative
        # value can only come from floating-point rounding and would make
        # sqrt() raise, so it is treated the same way.
        return 0
    return 1.0 - num / sqrt(spread)
53
+
54
+
55
def hierarchicalClustering(blog_data, distance=pearson):
    """Hierarchical (agglomerative) clustering of data.

    Starting with one cluster per row, the two closest clusters are merged
    repeatedly until a single cluster, the root of the tree, is left.

    Args:
        blog_data (list[list]): data of each blog; a list of integers
            represents the data of one blog
        distance (callable, optional): function taking two vectors and
            returning their distance (smaller means more similar)

    Returns:
        (hcluster): the root of the clustering tree, or None when
            ``blog_data`` is empty
    """
    if len(blog_data) == 0:
        # No rows means there is no tree to build.
        return None

    # Initially every row is a cluster of its own, identified by its index.
    clusters = [hcluster(id=i, vector=row) for i, row in enumerate(blog_data)]
    # Negative ids identify clusters made of more than one node.
    clust_id = -1
    # Cache of already computed distances, keyed by the pair of cluster ids.
    distances = {}

    while len(clusters) > 1:
        # (0, 1) is the first pair visited below, so it wins ties exactly as
        # if it had been used to seed the search.
        similar_pairs = (0, 1)
        closest_distance = None

        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                key = (clusters[i].id, clusters[j].id)
                if key not in distances:
                    distances[key] = distance(clusters[i].vector, clusters[j].vector)
                d = distances[key]
                if closest_distance is None or closest_distance > d:
                    closest_distance = d
                    similar_pairs = (i, j)

        first = clusters[similar_pairs[0]]
        second = clusters[similar_pairs[1]]
        # The merged cluster is represented by the element-wise average.
        merged_vector = [(value + second.vector[k]) / 2.0
                         for k, value in enumerate(first.vector)]

        new_cluster = hcluster(id=clust_id, vector=merged_vector,
                               distance=closest_distance,
                               left=first, right=second)

        # Delete the higher index first so the lower index stays valid.
        del clusters[similar_pairs[1]]
        del clusters[similar_pairs[0]]

        clusters.append(new_cluster)
        clust_id -= 1
    return clusters[0]
98
+
99
+
100
def print_cluster(cluster, blog_names, n):
    """Print the clustering tree in a rough, indented way.

    Every node takes one line: ``n`` spaces, one separating space, then
    either ``-`` for a merged cluster or the blog name for a leaf. Children
    are printed one level deeper.

    Args:
        cluster (hcluster): root of the (sub)tree to print
        blog_names (list): name of the blogs, indexed by leaf cluster id
        n (int): indentation of the current hierarchy level

    Returns:
        None
    """
    # A single parenthesised argument is valid for both the Python 2 print
    # statement and the Python 3 print function. The explicit space in the
    # format reproduces the separator that ``print ' ' * n,`` emitted.
    if cluster.id < 0:
        # Negative ids mark merged clusters: print a marker, then recurse.
        print('%s %s' % (' ' * n, '-'))
        print_cluster(cluster.left, blog_names, n + 1)
        print_cluster(cluster.right, blog_names, n + 1)
    else:
        print('%s %s' % (' ' * n, blog_names[cluster.id]))
118
+
119
+
120
def getheight(cluster):
    """Return the height of a (sub)tree, measured in leaf nodes.

    A leaf counts as 1; any other node is as high as the sum of the heights
    of its two branches, so the result is the number of leaves below
    ``cluster``.

    Args:
        cluster (hcluster): root of the (sub)tree

    Returns:
        (int): number of leaf nodes
    """
    leaves = 0
    pending = [cluster]
    while pending:
        node = pending.pop()
        if node.left is None and node.right is None:
            leaves += 1
        else:
            # Both branches contribute to the total.
            pending.append(node.left)
            pending.append(node.right)
    return leaves
125
+
126
+
127
def getdepth(cluster):
    """Return the largest accumulated distance from a node down to a leaf.

    Args:
        cluster (hcluster): root of the (sub)tree

    Returns:
        0 for a leaf; otherwise the deeper of the two branches plus the
        node's own ``distance``
    """
    is_leaf = cluster.left is None and cluster.right is None
    if is_leaf:
        # An endpoint adds no distance.
        return 0

    left_depth = getdepth(cluster.left)
    right_depth = getdepth(cluster.right)
    # A branch is as deep as its deeper side plus its own merge distance.
    return max(left_depth, right_depth) + cluster.distance
134
+
135
+
136
def drawnode(draw, cluster, x, y, scaling, blog_names):
    """Draw a node of the clustering tree and, recursively, its subtree.

    Args:
        draw: drawing object providing ``line`` and ``text``
        cluster (hcluster): node to draw
        x: horizontal position of the node
        y: vertical position of the node
        scaling: factor turning a cluster distance into a line length
        blog_names (list): labels of the leaves, indexed by cluster id

    Returns:
        None
    """
    if cluster.id >= 0:
        # An endpoint only gets its label.
        draw.text((x + 5, y - 7), blog_names[cluster.id], (0, 0, 0))
        return

    red = (255, 0, 0)
    # Each leaf takes 20 units of vertical space.
    h1 = getheight(cluster.left) * 20
    h2 = getheight(cluster.right) * 20
    # Floor division keeps the integer arithmetic of the Python 2 ``/``.
    half_span = (h1 + h2) // 2
    top = y - half_span
    bottom = y + half_span
    left_y = top + h1 // 2
    right_y = bottom - h2 // 2
    # The children sit further right the larger the merge distance is.
    child_x = x + cluster.distance * scaling

    # Vertical line from this cluster to its children
    draw.line((x, left_y, x, right_y), fill=red)
    # Horizontal line to the left item
    draw.line((x, left_y, child_x, left_y), fill=red)
    # Horizontal line to the right item
    draw.line((x, right_y, child_x, right_y), fill=red)

    # Draw both subtrees starting at the ends of the horizontal lines.
    drawnode(draw, cluster.left, child_x, left_y, scaling, blog_names)
    drawnode(draw, cluster.right, child_x, right_y, scaling, blog_names)
159
+
160
+
161
def draw_cluster(cluster, blog_names, jpeg_path):
    """Render the clustering tree as a dendrogram and save it as a JPEG.

    Args:
        cluster (hcluster): root of the clustering tree
        blog_names (list): labels of the leaves, indexed by cluster id
        jpeg_path (str): path of the image file to write

    Returns:
        None
    """
    # Height grows with the number of leaves (20 units each); width is fixed.
    h = getheight(cluster) * 20
    w = 1200
    depth = getdepth(cluster)

    # Width is fixed, so scale distances accordingly. A depth of 0 (a single
    # leaf, or only zero-distance merges) leaves nothing to scale, so the
    # division is skipped rather than raising ZeroDivisionError.
    scaling = float(w - 150) / depth if depth else 0.0

    # Create a new image with a white background
    img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)

    # Floor division keeps the integer result of the Python 2 ``/``.
    middle = h // 2
    # Short stub leading into the root node
    draw.line((0, middle, 10, middle), fill=(255, 0, 0))

    # Draw the first node
    drawnode(draw, cluster, 10, middle, scaling, blog_names)
    img.save(jpeg_path, 'JPEG')
179
+
180
+
181
if __name__ == '__main__':
    # Load the data, build the clustering tree and render it as an image.
    # The first value returned by read_data is not used by this script.
    col_names, blog_names, blog_data = read_data('Clustering_data/data')
    draw_cluster(hierarchicalClustering(blog_data), blog_names,
                 'Clustering_data/clusters.jpg')
185
+
0 commit comments