Skip to content

Commit

Permalink
KMeans, bisecting KMeans
Browse files Browse the repository at this point in the history
Python implement , based on numpy and matplotlib
  • Loading branch information
wepe committed Aug 20, 2015
1 parent ef4337a commit 0786c3b
Show file tree
Hide file tree
Showing 4 changed files with 266 additions and 9 deletions.
46 changes: 46 additions & 0 deletions KMeans/data.pkl

Large diffs are not rendered by default.

170 changes: 170 additions & 0 deletions KMeans/kmeans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
#coding=utf-8

'''
@author: wepon, http://2hwp.com
Reference:
Book: <<Machine Learning in Action>>
Software: sklearn.cluster.KMeans
'''
import numpy as np

class KMeans(object):
    """K-means clustering implemented with NumPy.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (k). Overridden by the number of rows of
        ``initCent`` when an array of initial centroids is supplied.
    initCent : 'random' or array
        Centroid initialisation. ``'random'`` (the default) draws every
        centroid uniformly inside the bounding box of the training data; an
        object exposing ``__array__`` is used as the initial centroids, one
        centroid per row.
    max_iter : int
        Maximum number of assign/update iterations performed by ``fit``.

    Attributes
    ----------
    centroids : ndarray of shape (n_clusters, n_features), or None
        Current centroids (``None`` until known).
    clusterAssment : ndarray of shape (m, 2), or None
        Column 0 is the index of the cluster each sample was assigned to,
        column 1 the squared distance between the sample and that centroid.
    labels : ndarray of shape (m,), or None
        Column 0 of ``clusterAssment`` (stored as floats).
    sse : float or None
        Sum of column 1 of ``clusterAssment``.
    """

    def __init__(self, n_clusters=5, initCent='random', max_iter=300):
        if hasattr(initCent, '__array__'):
            # np.array copies: fit() overwrites the centroids in place and the
            # caller may have passed a view of the training data (X[50:60]).
            # The builtin float replaces np.float, removed in NumPy 1.24.
            self.centroids = np.array(initCent, dtype=float)
            n_clusters = self.centroids.shape[0]
        else:
            self.centroids = None

        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.initCent = initCent
        self.clusterAssment = None
        self.labels = None
        self.sse = None

    def _distEclud(self, vecA, vecB):
        """Return the Euclidean distance between two points."""
        return np.linalg.norm(vecA - vecB)

    def _randCent(self, X, k):
        """Return k random centroids lying inside the bounding box of X."""
        n = X.shape[1]  # number of features
        mins = X.min(axis=0)
        ranges = (X.max(axis=0) - mins).astype(float)
        # rand(n, k).T consumes the random stream feature by feature, exactly
        # like drawing k values per column in a loop, so seeded runs give the
        # same centroids as the loop-based formulation.
        return mins + ranges * np.random.rand(n, k).T

    def _checkArray(self, X):
        """Convert X to a 2-D ndarray or raise a descriptive error."""
        if not isinstance(X, np.ndarray):
            try:
                X = np.asarray(X)
            except (TypeError, ValueError):
                raise TypeError("numpy.ndarray required for X")
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (samples x features)")
        return X

    def _distances(self, X):
        """Return the (m, k) matrix of distances from samples to centroids."""
        k = self.centroids.shape[0]
        dists = np.empty((X.shape[0], k))
        for j in range(k):
            dists[:, j] = np.linalg.norm(X - self.centroids[j], axis=1)
        return dists

    def fit(self, X):
        """Cluster the samples in X (one sample per row).

        Fills ``centroids``, ``clusterAssment``, ``labels`` and ``sse``.
        Stops as soon as no sample changes cluster, or after ``max_iter``
        iterations. With ``max_iter == 0`` nothing is assigned: labels are
        -1 and the squared errors are NaN.

        Raises
        ------
        TypeError
            If X cannot be converted to an ndarray.
        ValueError
            If X is not 2-dimensional, or no initial centroids are available.
        """
        X = self._checkArray(X)
        m = X.shape[0]  # number of samples

        # Only compare with the string when initCent is not an array:
        # ndarray == 'random' is an element-wise/ambiguous comparison.
        useRandom = (not hasattr(self.initCent, '__array__')
                     and self.initCent == 'random')
        if useRandom:
            self.centroids = self._randCent(X, self.n_clusters)
        if self.centroids is None:
            raise ValueError("initCent must be 'random' or an array of centroids")

        # Start from a well-defined "unassigned" state (label -1) instead of
        # uninitialised memory, so the convergence test below is reliable.
        self.clusterAssment = np.empty((m, 2))
        self.clusterAssment[:, 0] = -1
        self.clusterAssment[:, 1] = np.nan

        rows = np.arange(m)
        for _ in range(self.max_iter):
            # Assignment step: nearest centroid for every sample
            # (argmin keeps the lowest index on ties).
            dists = self._distances(X)
            nearest = np.argmin(dists, axis=1)
            clusterChanged = bool(np.any(self.clusterAssment[:, 0] != nearest))
            self.clusterAssment[:, 0] = nearest
            self.clusterAssment[:, 1] = dists[rows, nearest] ** 2

            if not clusterChanged:  # converged: no sample switched cluster
                break

            # Update step: move every centroid to the mean of its samples.
            for i in range(self.centroids.shape[0]):
                ptsInClust = X[nearest == i]
                # An empty cluster keeps its previous centroid; the mean of
                # zero points would be NaN and poison later iterations.
                if ptsInClust.shape[0] > 0:
                    self.centroids[i, :] = ptsInClust.mean(axis=0)

        self.labels = self.clusterAssment[:, 0]
        self.sse = np.sum(self.clusterAssment[:, 1])

    def predict(self, X):
        """Return, for every row of X, the index of the nearest centroid.

        The result is a float ndarray of shape (m,).

        Raises
        ------
        ValueError
            If no centroids are available yet, or X is not 2-dimensional.
        TypeError
            If X cannot be converted to an ndarray.
        """
        if self.centroids is None:
            raise ValueError("no centroids available: call fit() first")
        X = self._checkArray(X)
        return np.argmin(self._distances(X), axis=1).astype(float)


class biKMeans(object):
    """Bisecting k-means: repeatedly split one cluster in two with KMeans.

    Starting from a single cluster holding every sample, each round tries a
    2-means split of every existing cluster and keeps the split that yields
    the lowest total sum of squared errors, until ``n_clusters`` clusters
    exist or no cluster can be split any further.

    Parameters
    ----------
    n_clusters : int
        Requested number of clusters.

    Attributes
    ----------
    centroids : ndarray of shape (k, n_features), or None before fit
        Final centroids; k can be smaller than ``n_clusters`` when the data
        cannot be split further.
    clusterAssment : ndarray of shape (m, 2), or None before fit
        Column 0 is the cluster index of each sample, column 1 its squared
        distance to that cluster's centroid.
    labels : ndarray of shape (m,), or None before fit
        Column 0 of ``clusterAssment``.
    sse : float or None
        Sum of column 1 of ``clusterAssment``.
    """

    # A randomly initialised 2-means run can leave one side empty; retry a
    # few times before declaring a cluster unsplittable.
    _SPLIT_ATTEMPTS = 5

    def __init__(self, n_clusters=5):
        self.n_clusters = n_clusters
        self.centroids = None
        self.clusterAssment = None
        self.labels = None
        self.sse = None

    def _distEclud(self, vecA, vecB):
        """Return the Euclidean distance between two points."""
        return np.linalg.norm(vecA - vecB)

    def _checkArray(self, X):
        """Convert X to a 2-D ndarray or raise a descriptive error."""
        if not isinstance(X, np.ndarray):
            try:
                X = np.asarray(X)
            except (TypeError, ValueError):
                raise TypeError("numpy.ndarray required for X")
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (samples x features)")
        return X

    def _trySplit(self, pts):
        """Split pts in two with KMeans; return (centroids, assignment) or None.

        None is returned when the points cannot be separated into two
        non-empty groups with finite centroids.
        """
        if pts.shape[0] < 2:
            return None
        for _ in range(self._SPLIT_ATTEMPTS):
            clf = KMeans(n_clusters=2)
            clf.fit(pts)
            splitLabels = clf.clusterAssment[:, 0]
            bothUsed = (splitLabels == 0).any() and (splitLabels == 1).any()
            if bothUsed and not np.isnan(clf.centroids).any():
                return clf.centroids, clf.clusterAssment.copy()
        return None

    def fit(self, X):
        """Cluster the samples in X (one sample per row).

        Fills ``centroids``, ``clusterAssment``, ``labels`` and ``sse``.

        Raises
        ------
        TypeError
            If X cannot be converted to an ndarray.
        ValueError
            If X is not 2-dimensional.
        """
        X = self._checkArray(X)
        m = X.shape[0]
        self.clusterAssment = np.zeros((m, 2))

        # Initial state: one cluster (index 0) centred on the global mean.
        centroid0 = np.mean(X, axis=0)
        centList = [centroid0.tolist()]
        self.clusterAssment[:, 1] = np.linalg.norm(X - centroid0, axis=1) ** 2

        while len(centList) < self.n_clusters:
            lowestSSE = np.inf
            best = None  # (cluster index, new centroids, new assignment)
            # Try splitting every cluster; keep the split with the lowest
            # total SSE (split part + untouched remainder).
            for i in range(len(centList)):
                inCluster = self.clusterAssment[:, 0] == i
                split = self._trySplit(X[inCluster, :])
                if split is None:
                    continue
                centroidMat, splitClustAss = split
                sseSplit = np.sum(splitClustAss[:, 1])
                sseNotSplit = np.sum(self.clusterAssment[~inCluster, 1])
                if (sseSplit + sseNotSplit) < lowestSSE:
                    lowestSSE = sseSplit + sseNotSplit
                    best = (i, centroidMat, splitClustAss)

            if best is None:
                # No cluster can be split any further (too few or identical
                # points): stop with fewer clusters than requested.
                break

            bestCentToSplit, bestNewCents, bestClustAss = best
            # One half keeps the index of the parent cluster, the other gets
            # the next free index. Masks are taken before relabelling so the
            # two assignments cannot interfere with each other.
            firstHalf = bestClustAss[:, 0] == 0
            secondHalf = bestClustAss[:, 0] == 1
            bestClustAss[secondHalf, 0] = len(centList)
            bestClustAss[firstHalf, 0] = bestCentToSplit
            # Boolean indexing preserved the row order, so the rows of
            # bestClustAss line up with the parent cluster's rows.
            parentRows = self.clusterAssment[:, 0] == bestCentToSplit
            centList[bestCentToSplit] = bestNewCents[0, :].tolist()
            centList.append(bestNewCents[1, :].tolist())
            self.clusterAssment[parentRows, :] = bestClustAss

        self.labels = self.clusterAssment[:, 0]
        self.sse = np.sum(self.clusterAssment[:, 1])
        self.centroids = np.asarray(centList)

    def predict(self, X):
        """Return, for every row of X, the index of the nearest centroid.

        The result is a float ndarray of shape (m,).

        Raises
        ------
        ValueError
            If fit() has not been called, or X is not 2-dimensional.
        TypeError
            If X cannot be converted to an ndarray.
        """
        if self.centroids is None:
            raise ValueError("no centroids available: call fit() first")
        X = self._checkArray(X)

        # Use the number of centroids actually found, which can be smaller
        # than n_clusters when fit() stopped early.
        k = self.centroids.shape[0]
        dists = np.empty((X.shape[0], k))
        for j in range(k):
            dists[:, j] = np.linalg.norm(X - self.centroids[j], axis=1)
        # argmin keeps the lowest index on ties.
        return np.argmin(dists, axis=1).astype(float)
37 changes: 37 additions & 0 deletions KMeans/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#coding=utf-8
import cPickle
import matplotlib.pyplot as plt
import numpy as np
from kmeans import KMeans,biKMeans

if __name__ == "__main__":
    # Load the data: X holds the sample coordinates (columns 0 and 1 are
    # plotted), y the per-sample values drawn as integer text markers.
    # NOTE(review): unpickling executes arbitrary code -- only load a
    # data.pkl that comes from a trusted source.
    # Binary mode is the documented mode for pickle files, and the with-block
    # closes the file handle.
    with open('data.pkl', 'rb') as f:
        X, y = cPickle.load(f)

    n_clusters = 10
    # One colour per cluster.
    colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868']

    # Draw one figure per run, for 1, 2, 3, ... iterations. Start at 1: with
    # max_iter=0 no assignment step runs, so there is nothing meaningful to plot.
    for max_iter in range(1, 7):
        # Initial centroids are samples 50..59. Pass a copy: X[50:60] is a
        # view of X, and a fit that updates its centroids in place would
        # otherwise overwrite the data set itself.
        initCent = X[50:60].copy()

        # Train the model.
        clf = KMeans(n_clusters, initCent, max_iter)
        clf.fit(X)
        cents = clf.centroids
        labels = clf.labels
        sse = clf.sse

        # Plot the clustering result, one colour per cluster.
        for i in range(n_clusters):
            index = np.nonzero(labels == i)[0]
            x0 = X[index, 0]
            x1 = X[index, 1]
            y_i = y[index]
            for j in range(len(x0)):
                plt.text(x0[j], x1[j], str(int(y_i[j])), color=colors[i],
                         fontdict={'weight': 'bold', 'size': 9})
            # Mark the centroid of the cluster with a cross.
            plt.scatter(cents[i, 0], cents[i, 1], marker='x',
                        color=colors[i], linewidths=12)
        plt.title("SSE={:.2f}".format(sse))
        plt.axis([-30, 30, -30, 30])
        #plt.savefig("{}.png".format(max_iter))
        #plt.close()
        plt.show()
22 changes: 13 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
MachineLearning
====================

This project contains some machine learning algorithm demos. Maybe the code will also be useful to you.

这个仓库包含一些常用的机器学习算法的实现代码,代码中也会有一些具体的小应用。

此外,每个算法我都会写一篇文章来介绍它们,同时详细地解读代码。文章发表在我的CSDN专栏以及个人网站上:

CSDN:[wepon的专栏](http://blog.csdn.net/u012162613)
这个仓库包含一些常用的机器学习算法的实现代码,代码中也会有一些具体的小应用。

个人网站:[Wepon's blog](http://2hwp.com)
此外,每个算法我都会写一篇文章来介绍它们,同时详细地解读代码。文章发表在我的CSDN专栏以及个人网站上。欢迎所有的机器学习爱好者参与进来,并请保证文章和代码高质量。


##目录介绍
Expand Down Expand Up @@ -58,9 +54,17 @@ CSDN:[wepon的专栏](http://blog.csdn.net/u012162613)

- **DecisionTree**

Python、Numpy、Matplotlib实现的ID3、C4.5,其中C4.5有待完善,后续加入CART。文章待总结
Python、Numpy、Matplotlib实现的ID3、C4.5,其中C4.5有待完善,后续加入CART。文章待总结。[代码](https://github.com/wepe/MachineLearning/tree/master/DecisionTree)

- **KMeans**

介绍了聚类分析中最常用的KMeans算法(及二分KMeans算法),基于NumPy的算法实现,以及基于Matplotlib的聚类过程可视化。[文章链接]()

##Contributor

- [wepon](https://github.com/wepe)
- [Gogary](https://github.com/enjoyhot)

##Contributing

欢迎加入本项目,任何机器学习/深度学习的demo都可以push进来,并且最好有相应的博文介绍代码
欢迎加入本项目,任何机器学习/深度学习的demo都可以push进来,辅以详细的博文介绍

0 comments on commit 0786c3b

Please sign in to comment.