Skip to content

Commit b36df93

Browse files
committed
Complete hierarchical clustering, draw the cluster in the form of a tree with PIL
1 parent c501ebd commit b36df93

File tree

6 files changed

+309
-2
lines changed

6 files changed

+309
-2
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.pyc
150 KB
Loading

python/Clustering/Clustering_data/data

+96
Large diffs are not rendered by default.

python/Clustering.py python/Clustering/GetData.py

+27-2
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,13 @@
44
# @Author: WuLC
55
# @Date: 2016-12-11 22:04:08
66
# @Last modified by: WuLC
7-
# @Last Modified time: 2016-12-12 22:50:55
7+
# @Last Modified time: 2017-02-12 17:28:07
88
9+
# @Referer: chapter 3 of the book 《programming-collective-intelligence》
910

10-
# @Referer: chaper 3 of the book 《programming-collective-intelligence》
11+
############################################################################
12+
# get data from feedlist and store it in the data file for experiment
13+
###########################################################################
1114

1215
import os
1316
import io
@@ -100,6 +103,28 @@ def get_content_from_feedlist(feed_list, data_file):
100103
wf.write('\n'.decode('utf8'))
101104

102105

106+
def read_data(data_file):
    """Read the formatted (tab-separated) data file back into memory.

    The first non-blank line is a header: a label column followed by one
    column per word. Every following line is a blog name followed by one
    numeric count per word column.

    Args:
        data_file (str): path of the formatted data file

    Returns:
        tuple: (col_names, blog_names, blog_data) where
            col_names (list[str]): word columns from the header line
            blog_names (list[str]): one blog name per data row
            blog_data (list[list[float]]): numeric counts per row
    """
    col_names = None
    blog_names = []
    blog_data = []
    with io.open(data_file, mode='r', encoding='utf8') as rf:
        for line in rf:
            if not line.strip():
                # Skip blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.strip().split('\t')
            if col_names is None:
                # First line is the header; drop the leading label column.
                col_names = fields[1:]
            else:
                blog_names.append(fields[0])
                blog_data.append([float(x) for x in fields[1:]])
    return col_names, blog_names, blog_data
127+
103128
if __name__ == '__main__':
104129
feed_list = 'Clustering_data/feedlist.txt'
105130
data_file = 'Clustering_data/data'
+185
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
# -*- coding: utf-8 -*-
2+
# @Author: WuLC
3+
# @Date: 2017-02-12 15:41:09
4+
# @Last Modified by: WuLC
5+
# @Last Modified time: 2017-02-12 23:02:25
6+
7+
from GetData import read_data
8+
from math import sqrt
9+
from PIL import Image,ImageDraw
10+
11+
class hcluster:
    """A node in the hierarchical-clustering tree.

    Leaf nodes wrap a single data row; internal (merged) nodes carry a
    negative id and point at their two child clusters.
    """

    def __init__(self, id, vector, distance=0, left=None, right=None):
        """Build one cluster node.

        Args:
            id (int): unique id of the node (negative for merged clusters)
            vector (list): value of the node
            distance (int, optional): distance between the left and right
                subtrees of the node if they exist; 0 for leaf nodes
            left (hcluster, optional): root of the left subtree
            right (hcluster, optional): root of the right subtree
        """
        self.id, self.vector = id, vector
        self.distance = distance
        self.left, self.right = left, right
29+
30+
def pearson(v1, v2):
    """Distance between two equal-length vectors via the Pearson
    correlation coefficient.

    Fixes over the original: ``xrange`` is Python-2-only, and
    ``sum1*sum2/len(v1)`` silently truncated under integer division for
    all-int inputs; true division is used throughout.

    Args:
        v1 (list): values of vector 1
        v2 (list): values of vector 2

    Returns:
        float: 1 - pearson coefficient; the smaller, the more similar
            (0.0 for perfectly correlated, 2.0 for perfectly
            anti-correlated). Returns 0 when either vector has zero
            variance (denominator is 0).
    """
    n = float(len(v1))
    # Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)
    # Sums of the squares
    sum1_sq = sum(v * v for v in v1)
    sum2_sq = sum(v * v for v in v2)
    # Sum of the products
    p_sum = sum(a * b for a, b in zip(v1, v2))
    # Pearson score r = covariance / (std1 * std2), in summation form.
    num = p_sum - (sum1 * sum2 / n)
    den = sqrt((sum1_sq - sum1 ** 2 / n) * (sum2_sq - sum2 ** 2 / n))
    if den == 0:
        return 0
    return 1.0 - num / den
53+
54+
55+
def hierarchicalClustering(blog_data, distance=pearson):
    """Agglomerative (bottom-up) hierarchical clustering.

    Repeatedly merges the two closest clusters until a single one
    remains. ``xrange`` (Python-2-only) is replaced with ``range``, and
    empty input no longer raises IndexError.

    Args:
        blog_data (list[list]): one numeric vector per item
        distance (callable, optional): f(v1, v2) -> float judging how
            far apart two vectors are; defaults to pearson

    Returns:
        hcluster: root of the clustering tree, or None for empty input
    """
    if not blog_data:
        # Original crashed with IndexError on clusters[0]; fail soft.
        return None
    # Start with one singleton cluster per row; leaves keep their row index as id.
    clusters = [hcluster(id=i, vector=blog_data[i]) for i in range(len(blog_data))]
    # Merged clusters get negative ids so they never collide with leaf ids.
    clust_id = -1
    # Cache pairwise distances keyed by (id, id); ids never change, so
    # cached entries stay valid across merges.
    distances = {}

    while len(clusters) > 1:
        similar_pair = (0, 1)
        closest_distance = distance(clusters[0].vector, clusters[1].vector)

        # Scan every unordered pair for the closest two clusters.
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                key = (clusters[i].id, clusters[j].id)
                if key not in distances:
                    distances[key] = distance(clusters[i].vector, clusters[j].vector)
                d = distances[key]
                if d < closest_distance:
                    closest_distance = d
                    similar_pair = (i, j)

        lo, hi = similar_pair
        # The merged cluster's vector is the element-wise average of its children.
        merged_vector = [(a + b) / 2.0
                         for a, b in zip(clusters[lo].vector, clusters[hi].vector)]
        new_cluster = hcluster(id=clust_id, vector=merged_vector,
                               distance=closest_distance,
                               left=clusters[lo], right=clusters[hi])

        # Must delete the higher index first so the lower index stays valid.
        del clusters[hi]
        del clusters[lo]
        clusters.append(new_cluster)
        clust_id -= 1
    return clusters[0]
98+
99+
100+
def print_cluster(cluster, blog_names, n):
    """Print the clustering tree as rough indented text.

    Internal nodes (negative ids) print as '-'; leaves print their blog
    name. Each hierarchy level adds one extra space of indentation.
    The original used Python-2 print statements, which are syntax
    errors on Python 3; output shape is preserved (the trailing-comma
    soft space becomes ``end=' '``).

    Args:
        cluster (hcluster): root of the clustering tree
        blog_names (list): blog names, indexed by leaf cluster id
        n (int): indentation of the current hierarchy level

    Returns:
        None
    """
    # Indent, then the item follows on the same line.
    print(' ' * n, end=' ')
    if cluster.id < 0:
        # Negative id marks an internal (merged) cluster.
        print('-')
        print_cluster(cluster.left, blog_names, n + 1)
        print_cluster(cluster.right, blog_names, n + 1)
    else:
        print(blog_names[cluster.id])
118+
119+
120+
def getheight(cluster):
    """Number of leaves under *cluster* (its height when drawn)."""
    # A leaf occupies exactly one row of the drawing.
    if cluster.left is None and cluster.right is None:
        return 1
    # An internal node is as tall as its two branches stacked together.
    return getheight(cluster.left) + getheight(cluster.right)
125+
126+
127+
def getdepth(cluster):
    """Total merge distance from *cluster* down to its deepest leaf.

    A leaf contributes 0; an internal node adds its own merge distance
    on top of the deeper of its two branches.
    """
    if cluster.left is None and cluster.right is None:
        return 0
    deeper = max(getdepth(cluster.left), getdepth(cluster.right))
    return deeper + cluster.distance
134+
135+
136+
def drawnode(draw, cluster, x, y, scaling, blog_names):
    """Recursively draw *cluster* onto *draw* at (x, y).

    Internal nodes become a red bracket: a vertical line joining two
    horizontal lines whose length encodes the merge distance. Leaves
    are drawn as their blog-name label.
    """
    if cluster.id >= 0:
        # Endpoint: just draw the item label next to the branch end.
        draw.text((x + 5, y - 7), blog_names[cluster.id], (0, 0, 0))
        return

    # Each leaf gets 20 vertical pixels; a subtree's vertical slot is
    # proportional to its leaf count.
    h1 = getheight(cluster.left) * 20
    h2 = getheight(cluster.right) * 20
    top = y - (h1 + h2) / 2
    bottom = y + (h1 + h2) / 2
    # Horizontal line length encodes the merge distance.
    ll = cluster.distance * scaling

    # Vertical line joining the two children.
    draw.line((x, top + h1 / 2, x, bottom - h2 / 2), fill=(255, 0, 0))
    # Horizontal line out to the left child...
    draw.line((x, top + h1 / 2, x + ll, top + h1 / 2), fill=(255, 0, 0))
    # ...and out to the right child.
    draw.line((x, bottom - h2 / 2, x + ll, bottom - h2 / 2), fill=(255, 0, 0))

    # Recurse into both children at the far ends of the horizontal lines.
    drawnode(draw, cluster.left, x + ll, top + h1 / 2, scaling, blog_names)
    drawnode(draw, cluster.right, x + ll, bottom - h2 / 2, scaling, blog_names)
159+
160+
161+
def draw_cluster(cluster, blog_names, jpeg_path):
    """Render the clustering tree to a JPEG file at *jpeg_path*.

    Width is fixed at 1200px; height grows with the number of leaves
    (20px per leaf). Horizontal distances are scaled so the deepest
    branch fits in the fixed width.
    """
    # 20 vertical pixels per leaf; width is fixed.
    h = getheight(cluster) * 20
    w = 1200
    depth = getdepth(cluster)

    # Width is fixed, so scale distances accordingly (150px is left
    # over for the leaf labels).
    scaling = float(w - 150) / depth

    # White canvas plus a short red stem leading into the root node.
    img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    draw.line((0, h / 2, 10, h / 2), fill=(255, 0, 0))

    # Draw the tree from the root and write the result out as JPEG.
    drawnode(draw, cluster, 10, (h / 2), scaling, blog_names)
    img.save(jpeg_path, 'JPEG')
179+
180+
181+
if __name__ == '__main__':
    # Load the formatted feed data, cluster it, and render the tree.
    col_names, blog_names, blog_data = read_data('Clustering_data/data')
    root = hierarchicalClustering(blog_data)
    draw_cluster(root, blog_names, 'Clustering_data/clusters.jpg')
185+

0 commit comments

Comments
 (0)