Skip to content

Commit b36df93

Browse files
committed
Complete hierarchical clustering, draw the cluster in the form of a tree with PIL
1 parent c501ebd commit b36df93

File tree

6 files changed

+309
-2
lines changed

6 files changed

+309
-2
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.pyc
150 KB
Loading

python/Clustering/Clustering_data/data

+96
Large diffs are not rendered by default.

python/Clustering.py python/Clustering/GetData.py

+27-2
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,13 @@
44
# @Author: WuLC
55
# @Date: 2016-12-11 22:04:08
66
# @Last modified by: WuLC
7-
# @Last Modified time: 2016-12-12 22:50:55
7+
# @Last Modified time: 2017-02-12 17:28:07
88
9+
# @Referer: chapter 3 of the book 《programming-collective-intelligence》
910

10-
# @Referer: chaper 3 of the book 《programming-collective-intelligence》
11+
############################################################################
12+
# get data from feedlist and store it in the data file for experiment
13+
###########################################################################
1114

1215
import os
1316
import io
@@ -100,6 +103,28 @@ def get_content_from_feedlist(feed_list, data_file):
100103
wf.write('\n'.decode('utf8'))
101104

102105

106+
def read_data(data_file):
    """Read the formatted (tab-separated) data file back into memory.

    The first non-blank line is a header: a label column followed by one
    column per word. Every following line is a blog name followed by one
    numeric count per word column.

    Args:
        data_file (str): path of the formatted data file

    Returns:
        tuple: (col_names, blog_names, blog_data) where
            col_names (list[str]): word columns from the header line
            blog_names (list[str]): one blog name per data row
            blog_data (list[list[float]]): numeric counts per row
    """
    col_names = None
    blog_names = []
    blog_data = []
    with io.open(data_file, mode='r', encoding='utf8') as rf:
        for line in rf:
            if not line.strip():
                # Skip blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.strip().split('\t')
            if col_names is None:
                # First line is the header; drop the leading label column.
                col_names = fields[1:]
            else:
                blog_names.append(fields[0])
                blog_data.append([float(x) for x in fields[1:]])
    return col_names, blog_names, blog_data
127+
103128
if __name__ == '__main__':
104129
feed_list = 'Clustering_data/feedlist.txt'
105130
data_file = 'Clustering_data/data'
+185
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
# -*- coding: utf-8 -*-
2+
# @Author: WuLC
3+
# @Date: 2017-02-12 15:41:09
4+
# @Last Modified by: WuLC
5+
# @Last Modified time: 2017-02-12 23:02:25
6+
7+
from GetData import read_data
8+
from math import sqrt
9+
from PIL import Image,ImageDraw
10+
11+
class hcluster:
    """A node in the hierarchical-clustering tree.

    Leaf nodes wrap a single data row; internal (merged) nodes carry a
    negative id and point at their two child clusters.
    """

    def __init__(self, id, vector, distance=0, left=None, right=None):
        """Build one cluster node.

        Args:
            id (int): unique id of the node (negative for merged clusters)
            vector (list): value of the node
            distance (int, optional): distance between the left and right
                subtrees of the node if they exist; 0 for leaf nodes
            left (hcluster, optional): root of the left subtree
            right (hcluster, optional): root of the right subtree
        """
        self.id, self.vector = id, vector
        self.distance = distance
        self.left, self.right = left, right
29+
30+
def pearson(v1, v2):
    """Distance between two equal-length vectors via the Pearson
    correlation coefficient.

    Fixes over the original: ``xrange`` is Python-2-only, and
    ``sum1*sum2/len(v1)`` silently truncated under integer division for
    all-int inputs; true division is used throughout.

    Args:
        v1 (list): values of vector 1
        v2 (list): values of vector 2

    Returns:
        float: 1 - pearson coefficient; the smaller, the more similar
            (0.0 for perfectly correlated, 2.0 for perfectly
            anti-correlated). Returns 0 when either vector has zero
            variance (denominator is 0).
    """
    n = float(len(v1))
    # Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)
    # Sums of the squares
    sum1_sq = sum(v * v for v in v1)
    sum2_sq = sum(v * v for v in v2)
    # Sum of the products
    p_sum = sum(a * b for a, b in zip(v1, v2))
    # Pearson score r = covariance / (std1 * std2), in summation form.
    num = p_sum - (sum1 * sum2 / n)
    den = sqrt((sum1_sq - sum1 ** 2 / n) * (sum2_sq - sum2 ** 2 / n))
    if den == 0:
        return 0
    return 1.0 - num / den
53+
54+
55+
def hierarchicalClustering(blog_data, distance=pearson):
    """Agglomerative (bottom-up) hierarchical clustering.

    Repeatedly merges the two closest clusters until a single one
    remains. ``xrange`` (Python-2-only) is replaced with ``range``, and
    empty input no longer raises IndexError.

    Args:
        blog_data (list[list]): one numeric vector per item
        distance (callable, optional): f(v1, v2) -> float judging how
            far apart two vectors are; defaults to pearson

    Returns:
        hcluster: root of the clustering tree, or None for empty input
    """
    if not blog_data:
        # Original crashed with IndexError on clusters[0]; fail soft.
        return None
    # Start with one singleton cluster per row; leaves keep their row index as id.
    clusters = [hcluster(id=i, vector=blog_data[i]) for i in range(len(blog_data))]
    # Merged clusters get negative ids so they never collide with leaf ids.
    clust_id = -1
    # Cache pairwise distances keyed by (id, id); ids never change, so
    # cached entries stay valid across merges.
    distances = {}

    while len(clusters) > 1:
        similar_pair = (0, 1)
        closest_distance = distance(clusters[0].vector, clusters[1].vector)

        # Scan every unordered pair for the closest two clusters.
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                key = (clusters[i].id, clusters[j].id)
                if key not in distances:
                    distances[key] = distance(clusters[i].vector, clusters[j].vector)
                d = distances[key]
                if d < closest_distance:
                    closest_distance = d
                    similar_pair = (i, j)

        lo, hi = similar_pair
        # The merged cluster's vector is the element-wise average of its children.
        merged_vector = [(a + b) / 2.0
                         for a, b in zip(clusters[lo].vector, clusters[hi].vector)]
        new_cluster = hcluster(id=clust_id, vector=merged_vector,
                               distance=closest_distance,
                               left=clusters[lo], right=clusters[hi])

        # Must delete the higher index first so the lower index stays valid.
        del clusters[hi]
        del clusters[lo]
        clusters.append(new_cluster)
        clust_id -= 1
    return clusters[0]
98+
99+
100+
def print_cluster(cluster, blog_names, n):
    """Print the clustering tree as rough indented text.

    Internal nodes (negative ids) print as '-'; leaves print their blog
    name. Each hierarchy level adds one extra space of indentation.
    The original used Python-2 print statements, which are syntax
    errors on Python 3; output shape is preserved (the trailing-comma
    soft space becomes ``end=' '``).

    Args:
        cluster (hcluster): root of the clustering tree
        blog_names (list): blog names, indexed by leaf cluster id
        n (int): indentation of the current hierarchy level

    Returns:
        None
    """
    # Indent, then the item follows on the same line.
    print(' ' * n, end=' ')
    if cluster.id < 0:
        # Negative id marks an internal (merged) cluster.
        print('-')
        print_cluster(cluster.left, blog_names, n + 1)
        print_cluster(cluster.right, blog_names, n + 1)
    else:
        print(blog_names[cluster.id])
118+
119+
120+
def getheight(cluster):
    """Number of leaves under *cluster* (its height when drawn)."""
    # A leaf occupies exactly one row of the drawing.
    if cluster.left is None and cluster.right is None:
        return 1
    # An internal node is as tall as its two branches stacked together.
    return getheight(cluster.left) + getheight(cluster.right)
125+
126+
127+
def getdepth(cluster):
    """Total merge distance from *cluster* down to its deepest leaf.

    A leaf contributes 0; an internal node adds its own merge distance
    on top of the deeper of its two branches.
    """
    if cluster.left is None and cluster.right is None:
        return 0
    deeper = max(getdepth(cluster.left), getdepth(cluster.right))
    return deeper + cluster.distance
134+
135+
136+
def drawnode(draw, cluster, x, y, scaling, blog_names):
    """Recursively draw *cluster* onto *draw* at (x, y).

    Internal nodes become a red bracket: a vertical line joining two
    horizontal lines whose length encodes the merge distance. Leaves
    are drawn as their blog-name label.
    """
    if cluster.id >= 0:
        # Endpoint: just draw the item label next to the branch end.
        draw.text((x + 5, y - 7), blog_names[cluster.id], (0, 0, 0))
        return

    # Each leaf gets 20 vertical pixels; a subtree's vertical slot is
    # proportional to its leaf count.
    h1 = getheight(cluster.left) * 20
    h2 = getheight(cluster.right) * 20
    top = y - (h1 + h2) / 2
    bottom = y + (h1 + h2) / 2
    # Horizontal line length encodes the merge distance.
    ll = cluster.distance * scaling

    # Vertical line joining the two children.
    draw.line((x, top + h1 / 2, x, bottom - h2 / 2), fill=(255, 0, 0))
    # Horizontal line out to the left child...
    draw.line((x, top + h1 / 2, x + ll, top + h1 / 2), fill=(255, 0, 0))
    # ...and out to the right child.
    draw.line((x, bottom - h2 / 2, x + ll, bottom - h2 / 2), fill=(255, 0, 0))

    # Recurse into both children at the far ends of the horizontal lines.
    drawnode(draw, cluster.left, x + ll, top + h1 / 2, scaling, blog_names)
    drawnode(draw, cluster.right, x + ll, bottom - h2 / 2, scaling, blog_names)
159+
160+
161+
def draw_cluster(cluster, blog_names, jpeg_path):
    """Render the clustering tree to a JPEG file at *jpeg_path*.

    Width is fixed at 1200px; height grows with the number of leaves
    (20px per leaf). Horizontal distances are scaled so the deepest
    branch fits in the fixed width.
    """
    # 20 vertical pixels per leaf; width is fixed.
    h = getheight(cluster) * 20
    w = 1200
    depth = getdepth(cluster)

    # Width is fixed, so scale distances accordingly (150px is left
    # over for the leaf labels).
    scaling = float(w - 150) / depth

    # White canvas plus a short red stem leading into the root node.
    img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    draw.line((0, h / 2, 10, h / 2), fill=(255, 0, 0))

    # Draw the tree from the root and write the result out as JPEG.
    drawnode(draw, cluster, 10, (h / 2), scaling, blog_names)
    img.save(jpeg_path, 'JPEG')
179+
180+
181+
if __name__ == '__main__':
    # Load the formatted feed data, cluster it, and render the tree.
    col_names, blog_names, blog_data = read_data('Clustering_data/data')
    root = hierarchicalClustering(blog_data)
    draw_cluster(root, blog_names, 'Clustering_data/clusters.jpg')
185+

0 commit comments

Comments
 (0)