pca

AlexT1 · Apr 19, 2017 · 3592b45 · 3592b45
1 parent 5d23b46
commit 3592b45
Show file tree

Hide file tree

Showing 8 changed files with 319 additions and 0 deletions.
diff --git a/pca/data/bird_small.mat b/pca/data/bird_small.mat
diff --git a/pca/data/ex7data1.mat b/pca/data/ex7data1.mat
diff --git a/pca/data/ex7data2.mat b/pca/data/ex7data2.mat
diff --git a/pca/data/ex7faces.mat b/pca/data/ex7faces.mat
diff --git a/pca/kmeans.py b/pca/kmeans.py
@@ -0,0 +1,164 @@
+# coding: utf-8
+# svm/kmeans.py
+import numpy as np
+
+def loadDataSet(filename):
+    """
+    读取数据集
+
+    Args:
+        filename: 文件名
+    Returns:
+        dataMat: 数据样本矩阵
+    """
+    dataMat = []
+    fr = open(filename)
+    for line in fr.readlines():
+        curLine = line.strip().split('\t')
+        # 通过map函数批量转换
+        fitLine = map(float, curLine)
+        dataMat.append(fitLine)
+    return dataMat
+
+def distEclud(vecA, vecB):
+    """
+    计算两向量的欧氏距离
+
+    Args:
+        vecA: 向量A
+        vecB: 向量B
+    Returns:
+        欧式距离
+    """
+    return np.sqrt(np.sum(np.power(vecA - vecB, 2)))
+
+def randCent(dataSet, k):
+    """
+    随机生成k个聚类中心
+
+    Args:
+        dataSet: 数据集
+        k: 簇数目
+    Returns:
+        centroids: 聚类中心矩阵
+    """
+    _, n = dataSet.shape
+    centroids = np.mat(np.zeros((k, n)))
+    for j in range(n):
+        # 随机聚类中心落在数据集的边界之内
+        minJ = np.min(dataSet[:, j])
+        maxJ = np.max(dataSet[:, j])
+        rangeJ = float(maxJ - minJ)
+        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
+    return centroids
+
+def kMeans(dataSet, k, maxIter = 5):
+    """
+    K-Means
+
+    Args:
+        dataSet: 数据集
+        k: 聚类数
+    Returns:
+        centroids: 聚类中心
+        clusterAssment: 点分配结果
+    """
+    # 随机初始化聚类中心
+    centroids = randCent(dataSet, k)
+    m, n = np.shape(dataSet)
+    # 点分配结果： 第一列指明样本所在的簇，第二列指明该样本到聚类中心的距离
+    clusterAssment = np.mat(np.zeros((m, 2)))
+    # 标识聚类中心是否仍在改变
+    clusterChanged = True
+    # 直至聚类中心不再变化
+    iterCount = 0
+    while clusterChanged and iterCount < maxIter:
+        iterCount += 1
+        clusterChanged = False
+        # 分配样本到簇
+        for i in range(m):
+            # 计算第i个样本到各个聚类中心的距离
+            minIndex = 0
+            minDist = np.inf
+            for j in range(k):
+                dist = distEclud(dataSet[i, :],  centroids[j, :])
+                if(dist < minDist):
+                    minIndex = j
+                    minDist = dist
+            # 判断cluster是否改变
+            if(clusterAssment[i, 0] != minIndex):
+                clusterChanged = True
+            clusterAssment[i, :] = minIndex, minDist**2
+        # 刷新聚类中心: 移动聚类中心到所在簇的均值位置
+        for cent in range(k):
+            # 通过数组过滤获得簇中的点
+            ptsInCluster = dataSet[np.nonzero(
+                clusterAssment[:, 0].A == cent)[0]]
+            if ptsInCluster.shape[0] > 0:
+                # 计算均值并移动
+                centroids[cent, :] = np.mean(ptsInCluster, axis=0)
+    return centroids, clusterAssment
+
+def biKmeans(dataSet, k):
+    """
+    二分kmeans算法
+    Args:
+        dataSet: 数据集
+        k: 聚类数
+    Returns:
+        centroids: 聚类中心
+        clusterAssment: 点分配结果
+    """
+    m, n = np.shape(dataSet)
+    # 起始时，只有一个簇，该簇的聚类中心为所有样本的平均位置
+    centroid0 = np.mean(dataSet, axis=0).tolist()[0]
+    # 设置一个列表保存当前的聚类中心
+    currentCentroids = [centroid0]
+    # 点分配结果： 第一列指明样本所在的簇，第二列指明该样本到聚类中心的距离
+    clusterAssment = np.mat(np.zeros((m, 2)))
+    # 初始化点分配结果，默认将所有样本先分配到初始簇
+    for j in range(m):
+        clusterAssment[j, 1] = distEclud(dataSet[j, :], np.mat(centroid0))**2
+    # 直到簇的数目达标
+    while len(currentCentroids) < k:
+        # 当前最小的代价
+        lowestError = np.inf
+        # 对于每一个簇
+        for j in range(len(currentCentroids)):
+            # 获得该簇的样本
+            ptsInCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == j)[0], :]
+            # 在该簇上进行2-means聚类
+            # 注意，得到的centroids，其聚类编号含0，1
+            centroids, clusterAss = kMeans(ptsInCluster, 2)
+            # 获得划分后的误差之和
+            splitedError = np.sum(clusterAss[:, 1])
+            # 获得其他簇的样本
+            ptsNoInCluster = dataSet[np.nonzero(
+                clusterAssment[:, 0].A != j)[0]]
+            # 获得剩余数据集的误差
+            nonSplitedError = np.sum(ptsNoInCluster[:, 1])
+            # 比较，判断此次划分是否划算
+            if (splitedError + nonSplitedError) < lowestError:
+                # 如果划算，刷新总误差
+                lowestError = splitedError + nonSplitedError
+                # 记录当前的应当划分的簇
+                needToSplit = j
+                # 新获得的簇以及点分配结果
+                newCentroids = centroids.A
+                newClusterAss = clusterAss.copy()
+        # 更新簇的分配结果
+        # 第0簇应当修正为被划分的簇
+        newClusterAss[np.nonzero(newClusterAss[:, 0].A == 0)[
+            0], 0] = needToSplit
+        # 第1簇应当修正为最新一簇
+        newClusterAss[np.nonzero(newClusterAss[:, 0].A == 1)[
+            0], 0] = len(currentCentroids)
+        # 被划分的簇需要更新
+        currentCentroids[needToSplit] = newCentroids[0, :]
+        # 加入新的划分后的簇
+        currentCentroids.append(newCentroids[1, :])
+        # 刷新点分配结果
+        clusterAssment[np.nonzero(
+            clusterAssment[:, 0].A == needToSplit
+        )[0], :] = newClusterAss
+    return np.mat(currentCentroids), clusterAssment
diff --git a/pca/pca.py b/pca/pca.py
@@ -0,0 +1,54 @@
+# coding: utf8
+# pca/pca.py
+
+import numpy as np
+
+def normalize(X):
+    """数据标准化处理
+
+    Args:
+        X 样本
+    Returns:
+        XNorm 标准化后的样本
+    """
+    XNorm = X.copy()
+    m,n = XNorm.shape
+    mean = np.mean(XNorm, axis=0)
+    std = np.std(XNorm, axis=0)
+    XNorm = (XNorm - mean) / std
+    return XNorm
+
+def PCA(X, k = 1):
+    """PCA
+
+    Args:
+        X 样本
+        k 目的维度
+    Returns:
+        XNorm 标准化后的样本
+        Z 降维后的新样本
+        U U
+        UReduce UReduce
+        S S
+        V V
+    """
+    m, n = X.shape
+    # 数据归一化
+    XNorm = normalize(X)
+    # 计算协方差矩阵
+    Coef = XNorm.T * XNorm/m
+    # 奇异值分解
+    U, S, V = np.linalg.svd(Coef)
+    # 取出前 k 个向量
+    UReduce = U[:, 0:k]
+    Z = XNorm * UReduce
+    return XNorm, Z, U, UReduce, S, V
+
+def recover(UReduce, Z):
+    """数据恢复
+
+    Args:
+        UReduce UReduce
+        Z 降维后的样本
+    """
+    return Z * UReduce.T
diff --git a/pca/test_pca4performance.py b/pca/test_pca4performance.py
@@ -0,0 +1,44 @@
+# coding: utf8
+# pca/test_pca4visualization.py
+
+import pca
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.io import loadmat
+
+def display(images, width, height):
+    """展示图片
+
+    Args:
+        images 图像样本
+        width 图像宽
+        height 图像高
+    """
+    m, n = images.shape
+    rows = int(np.floor(np.sqrt(m)))
+    cols = int(np.ceil(m / rows))
+    # 图像拼接
+    dstImage = images.copy()
+    dstImage = np.zeros((rows * height, cols * width))
+    for i in range(rows):
+        for j in range(cols):
+            idx = cols * i + j
+            image = images[idx].reshape(height, width)
+            dstImage[i * height:i * height + height,
+                     j * width: j * width + width] = image
+    plt.imshow(dstImage.T, cmap='gray')
+    plt.axis('off')
+    plt.show()
+
+data = loadmat('data/ex7faces.mat')
+X = np.mat(data['X'],dtype=np.float32)
+m, n = X.shape
+
+# 展示原图
+display(X[0:100, :], 32, 32)
+
+XNorm, Z, U, UReduce, S, V = pca.PCA(X, k=100)
+XRec = pca.recover(UReduce, Z)
+
+# 显示修复后的图，可以看出，PCA 损失了一部分细节
+display(XRec[0:100, :], 32, 32)
diff --git a/pca/test_pca4visualization.py b/pca/test_pca4visualization.py
@@ -0,0 +1,57 @@
+# coding: utf8
+# pca/test_pca4visualization.py
+
+import numpy as np
+import kmeans
+import pca
+from scipy.io import loadmat
+from mpl_toolkits.mplot3d import Axes3D
+import matplotlib.pyplot as plt
+import matplotlib.cm as cmx
+import matplotlib.colors as colors
+
+def getCmap(count):
+    color_norm  = colors.Normalize(vmin=0, vmax=count-1)
+    scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv')
+    def map_index_to_rgb_color(index):
+        return scalar_map.to_rgba(index)
+    return map_index_to_rgb_color
+
+fig = plt.figure()
+ax = fig.add_subplot(111, projection='3d')
+
+data = loadmat('data/bird_small.mat')
+A = data['A']
+
+A = A / 255.0;
+
+height, width, channels = A.shape
+X = np.mat(A.reshape(height * width, channels))
+
+m, n = X.shape
+
+clusterNum = 16
+cmap = getCmap(clusterNum)
+centroids, clusterAssment = kmeans.kMeans(X, clusterNum)
+# 随机选择 1000 个样本绘制
+sampleSize = 1000
+sampleIndexs = np.random.choice(m, sampleSize)
+clusters = clusterAssment[sampleIndexs]
+samples = X[sampleIndexs]
+
+# 三维下观察
+for i in range(sampleSize):
+    x, y, z = samples[i,:].A[0]
+    center = clusters[i, 0]
+    color = cmap(center)
+    ax.scatter([x], [y], [z], color=color, marker='o')
+plt.show()
+
+# 二维下观察
+reducedSamples = pca.PCA(samples, k=2)[1]
+for i in range(sampleSize):
+    x, y = reducedSamples[i,:].A[0]
+    center = clusters[i, 0]
+    color = cmap(center)
+    plt.scatter([x], [y], color=color, marker='o')
+plt.show()