Skip to content

Commit

Permalink
pca
Browse files Browse the repository at this point in the history
  • Loading branch information
吴晓军 committed Apr 19, 2017
1 parent 5d23b46 commit 3592b45
Show file tree
Hide file tree
Showing 8 changed files with 319 additions and 0 deletions.
Binary file added pca/data/bird_small.mat
Binary file not shown.
Binary file added pca/data/ex7data1.mat
Binary file not shown.
Binary file added pca/data/ex7data2.mat
Binary file not shown.
Binary file added pca/data/ex7faces.mat
Binary file not shown.
164 changes: 164 additions & 0 deletions pca/kmeans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# coding: utf-8
# svm/kmeans.py
import numpy as np

def loadDataSet(filename):
"""
读取数据集
Args:
filename: 文件名
Returns:
dataMat: 数据样本矩阵
"""
dataMat = []
fr = open(filename)
for line in fr.readlines():
curLine = line.strip().split('\t')
# 通过map函数批量转换
fitLine = map(float, curLine)
dataMat.append(fitLine)
return dataMat

def distEclud(vecA, vecB):
"""
计算两向量的欧氏距离
Args:
vecA: 向量A
vecB: 向量B
Returns:
欧式距离
"""
return np.sqrt(np.sum(np.power(vecA - vecB, 2)))

def randCent(dataSet, k):
"""
随机生成k个聚类中心
Args:
dataSet: 数据集
k: 簇数目
Returns:
centroids: 聚类中心矩阵
"""
_, n = dataSet.shape
centroids = np.mat(np.zeros((k, n)))
for j in range(n):
# 随机聚类中心落在数据集的边界之内
minJ = np.min(dataSet[:, j])
maxJ = np.max(dataSet[:, j])
rangeJ = float(maxJ - minJ)
centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
return centroids

def kMeans(dataSet, k, maxIter = 5):
"""
K-Means
Args:
dataSet: 数据集
k: 聚类数
Returns:
centroids: 聚类中心
clusterAssment: 点分配结果
"""
# 随机初始化聚类中心
centroids = randCent(dataSet, k)
m, n = np.shape(dataSet)
# 点分配结果: 第一列指明样本所在的簇,第二列指明该样本到聚类中心的距离
clusterAssment = np.mat(np.zeros((m, 2)))
# 标识聚类中心是否仍在改变
clusterChanged = True
# 直至聚类中心不再变化
iterCount = 0
while clusterChanged and iterCount < maxIter:
iterCount += 1
clusterChanged = False
# 分配样本到簇
for i in range(m):
# 计算第i个样本到各个聚类中心的距离
minIndex = 0
minDist = np.inf
for j in range(k):
dist = distEclud(dataSet[i, :], centroids[j, :])
if(dist < minDist):
minIndex = j
minDist = dist
# 判断cluster是否改变
if(clusterAssment[i, 0] != minIndex):
clusterChanged = True
clusterAssment[i, :] = minIndex, minDist**2
# 刷新聚类中心: 移动聚类中心到所在簇的均值位置
for cent in range(k):
# 通过数组过滤获得簇中的点
ptsInCluster = dataSet[np.nonzero(
clusterAssment[:, 0].A == cent)[0]]
if ptsInCluster.shape[0] > 0:
# 计算均值并移动
centroids[cent, :] = np.mean(ptsInCluster, axis=0)
return centroids, clusterAssment

def biKmeans(dataSet, k):
"""
二分kmeans算法
Args:
dataSet: 数据集
k: 聚类数
Returns:
centroids: 聚类中心
clusterAssment: 点分配结果
"""
m, n = np.shape(dataSet)
# 起始时,只有一个簇,该簇的聚类中心为所有样本的平均位置
centroid0 = np.mean(dataSet, axis=0).tolist()[0]
# 设置一个列表保存当前的聚类中心
currentCentroids = [centroid0]
# 点分配结果: 第一列指明样本所在的簇,第二列指明该样本到聚类中心的距离
clusterAssment = np.mat(np.zeros((m, 2)))
# 初始化点分配结果,默认将所有样本先分配到初始簇
for j in range(m):
clusterAssment[j, 1] = distEclud(dataSet[j, :], np.mat(centroid0))**2
# 直到簇的数目达标
while len(currentCentroids) < k:
# 当前最小的代价
lowestError = np.inf
# 对于每一个簇
for j in range(len(currentCentroids)):
# 获得该簇的样本
ptsInCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == j)[0], :]
# 在该簇上进行2-means聚类
# 注意,得到的centroids,其聚类编号含0,1
centroids, clusterAss = kMeans(ptsInCluster, 2)
# 获得划分后的误差之和
splitedError = np.sum(clusterAss[:, 1])
# 获得其他簇的样本
ptsNoInCluster = dataSet[np.nonzero(
clusterAssment[:, 0].A != j)[0]]
# 获得剩余数据集的误差
nonSplitedError = np.sum(ptsNoInCluster[:, 1])
# 比较,判断此次划分是否划算
if (splitedError + nonSplitedError) < lowestError:
# 如果划算,刷新总误差
lowestError = splitedError + nonSplitedError
# 记录当前的应当划分的簇
needToSplit = j
# 新获得的簇以及点分配结果
newCentroids = centroids.A
newClusterAss = clusterAss.copy()
# 更新簇的分配结果
# 第0簇应当修正为被划分的簇
newClusterAss[np.nonzero(newClusterAss[:, 0].A == 0)[
0], 0] = needToSplit
# 第1簇应当修正为最新一簇
newClusterAss[np.nonzero(newClusterAss[:, 0].A == 1)[
0], 0] = len(currentCentroids)
# 被划分的簇需要更新
currentCentroids[needToSplit] = newCentroids[0, :]
# 加入新的划分后的簇
currentCentroids.append(newCentroids[1, :])
# 刷新点分配结果
clusterAssment[np.nonzero(
clusterAssment[:, 0].A == needToSplit
)[0], :] = newClusterAss
return np.mat(currentCentroids), clusterAssment
54 changes: 54 additions & 0 deletions pca/pca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# coding: utf8
# pca/pca.py

import numpy as np

def normalize(X):
"""数据标准化处理
Args:
X 样本
Returns:
XNorm 标准化后的样本
"""
XNorm = X.copy()
m,n = XNorm.shape
mean = np.mean(XNorm, axis=0)
std = np.std(XNorm, axis=0)
XNorm = (XNorm - mean) / std
return XNorm

def PCA(X, k = 1):
"""PCA
Args:
X 样本
k 目的维度
Returns:
XNorm 标准化后的样本
Z 降维后的新样本
U U
UReduce UReduce
S S
V V
"""
m, n = X.shape
# 数据归一化
XNorm = normalize(X)
# 计算协方差矩阵
Coef = XNorm.T * XNorm/m
# 奇异值分解
U, S, V = np.linalg.svd(Coef)
# 取出前 k 个向量
UReduce = U[:, 0:k]
Z = XNorm * UReduce
return XNorm, Z, U, UReduce, S, V

def recover(UReduce, Z):
"""数据恢复
Args:
UReduce UReduce
Z 降维后的样本
"""
return Z * UReduce.T
44 changes: 44 additions & 0 deletions pca/test_pca4performance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# coding: utf8
# pca/test_pca4visualization.py

import pca
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat

def display(images, width, height):
"""展示图片
Args:
images 图像样本
width 图像宽
height 图像高
"""
m, n = images.shape
rows = int(np.floor(np.sqrt(m)))
cols = int(np.ceil(m / rows))
# 图像拼接
dstImage = images.copy()
dstImage = np.zeros((rows * height, cols * width))
for i in range(rows):
for j in range(cols):
idx = cols * i + j
image = images[idx].reshape(height, width)
dstImage[i * height:i * height + height,
j * width: j * width + width] = image
plt.imshow(dstImage.T, cmap='gray')
plt.axis('off')
plt.show()

data = loadmat('data/ex7faces.mat')
X = np.mat(data['X'],dtype=np.float32)
m, n = X.shape

# 展示原图
display(X[0:100, :], 32, 32)

XNorm, Z, U, UReduce, S, V = pca.PCA(X, k=100)
XRec = pca.recover(UReduce, Z)

# 显示修复后的图,可以看出,PCA 损失了一部分细节
display(XRec[0:100, :], 32, 32)
57 changes: 57 additions & 0 deletions pca/test_pca4visualization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# coding: utf8
# pca/test_pca4visualization.py

import numpy as np
import kmeans
import pca
from scipy.io import loadmat
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors

def getCmap(count):
color_norm = colors.Normalize(vmin=0, vmax=count-1)
scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv')
def map_index_to_rgb_color(index):
return scalar_map.to_rgba(index)
return map_index_to_rgb_color

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

data = loadmat('data/bird_small.mat')
A = data['A']

A = A / 255.0;

height, width, channels = A.shape
X = np.mat(A.reshape(height * width, channels))

m, n = X.shape

clusterNum = 16
cmap = getCmap(clusterNum)
centroids, clusterAssment = kmeans.kMeans(X, clusterNum)
# 随机选择 1000 个样本绘制
sampleSize = 1000
sampleIndexs = np.random.choice(m, sampleSize)
clusters = clusterAssment[sampleIndexs]
samples = X[sampleIndexs]

# 三维下观察
for i in range(sampleSize):
x, y, z = samples[i,:].A[0]
center = clusters[i, 0]
color = cmap(center)
ax.scatter([x], [y], [z], color=color, marker='o')
plt.show()

# 二维下观察
reducedSamples = pca.PCA(samples, k=2)[1]
for i in range(sampleSize):
x, y = reducedSamples[i,:].A[0]
center = clusters[i, 0]
color = cmap(center)
plt.scatter([x], [y], color=color, marker='o')
plt.show()

0 comments on commit 3592b45

Please sign in to comment.