PCA实例及代码

在模型学习的过程中,训练集的维度过高会增加训练时间,并使得到的模型结构庞大,因此需要在尽量避免信息丢失的前提下减少特征数量。将特征数量从几百上千降低到几十的过程就是数据降维。
主成分分析(Principal Component Analysis, PCA)是数据降维的一种方法,实现方法一般有两种:一种用特征值分解去实现,一种用奇异值分解去实现。

特征值分解:
推导详情见http://blog.codinglabs.org/articles/pca-tutorial.html
算法步骤(设有m条n维数据):
1.将原始数据按列组成n行m列矩阵X
2.将X的每一行(代表一个属性字段)进行零均值化,即减去这一行的均值
3.求出协方差矩阵 $C=\frac{1}{m}XX^{\mathsf{T}}$
4.求出协方差矩阵的特征值及对应的特征向量
5.将特征向量按对应特征值大小从上到下按行排列成矩阵,取前k行组成矩阵P
6.Y=PX即为降维到k维后的数据

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat

def normalize(data):
    """Standardize ``data`` column-wise to zero mean and unit variance.

    Mean and standard deviation are taken along axis 0, i.e. one pair of
    statistics per column.

    Args:
        data: array or ``np.matrix`` whose columns are to be standardized.

    Returns:
        ``(data - mean) / std``. Columns whose standard deviation is zero
        come back as all zeros instead of NaN/inf.
    """
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    # A constant column has std == 0; divide by 1 there so the centred
    # (all-zero) values stay finite.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std
        
def PCA(data, k):
    """Project ``data`` onto its first ``k`` principal directions.

    The input is standardized with ``normalize``, the n-by-n matrix
    ``X.T X / m`` of the standardized data is formed, and that matrix is
    decomposed with SVD.

    Args:
        data: m-by-n array or ``np.matrix`` (m rows, n columns).
        k: number of leading columns of ``U`` to keep.

    Returns:
        The m-by-k product of the standardized data and the first ``k``
        columns of ``U``.
    """
    m = data.shape[0]
    data = normalize(data)
    # np.dot is a true matrix product for ndarray and np.matrix alike,
    # whereas ``*`` only means matrix multiplication for np.matrix.
    covariance = np.dot(data.T, data) / m
    # Singular value decomposition of the covariance matrix.
    U, _, _ = np.linalg.svd(covariance)
    # Keep the first k columns.
    UReduce = U[:, 0:k]
    return np.dot(data, UReduce)

if __name__ == '__main__':
    # Small demo: reduce a 2x5 matrix to a single component.
    # np.asmatrix is used because the np.mat alias was removed in NumPy 2.0.
    data = np.asmatrix([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
    data_pca = PCA(data, 1)
    # Single-argument print calls behave the same on Python 2 and Python 3;
    # the original ``print a, b`` statement is a SyntaxError on Python 3.
    print(data)
    print(data_pca)

应用实例

import numpy as np
import matplotlib.pyplot as plt

def normalize(data):
    """Return ``data`` with every column shifted to mean 0 and scaled to std 1.

    Statistics are computed along axis 0 (one mean/std per column).

    Args:
        data: array or ``np.matrix`` to standardize.

    Returns:
        The standardized data. A column with zero standard deviation is
        returned as all zeros rather than NaN/inf.
    """
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    # Avoid 0/0 on constant columns: use a divisor of 1 where std is 0.
    divisor = np.where(std == 0, 1.0, std)
    return (data - mean) / divisor

def PCA(data, k):
    """Run PCA on ``data`` and return the projection plus the SVD factors.

    The input is standardized with ``normalize``, the n-by-n matrix
    ``X.T X / m`` of the standardized data is formed, and that matrix is
    decomposed with ``np.linalg.svd``.

    Args:
        data: m-by-n array or ``np.matrix`` (m rows, n columns).
        k: number of leading columns of ``U`` to keep.

    Returns:
        A tuple ``(XNorm, Z, U, UReduce, S, V)`` where ``XNorm`` is the
        standardized data, ``Z`` is ``XNorm`` multiplied by ``UReduce``,
        ``UReduce`` is the first ``k`` columns of ``U``, and ``U``, ``S``,
        ``V`` are the three values returned by ``np.linalg.svd``.
    """
    m = data.shape[0]
    # Keep the standardized data under its own name: it is part of the
    # return value (the original returned an undefined ``XNorm``).
    XNorm = normalize(data)
    # np.dot is a true matrix product for ndarray and np.matrix alike,
    # whereas ``*`` only means matrix multiplication for np.matrix.
    covariance = np.dot(XNorm.T, XNorm) / m
    U, S, V = np.linalg.svd(covariance)
    UReduce = U[:, 0:k]
    Z = np.dot(XNorm, UReduce)
    return XNorm, Z, U, UReduce, S, V

# Data recovery
def recover(UReduce, Z):
    """Map projected data back by multiplying with ``UReduce`` transposed.

    Only the matrix product ``Z . UReduce.T`` is computed; no mean or
    scale is added back, so the result is in whatever space the data was
    in when it was projected.

    Args:
        UReduce: n-by-k array or ``np.matrix``.
        Z: m-by-k array or ``np.matrix``.

    Returns:
        The m-by-n product ``Z . UReduce.T``.
    """
    # np.dot is a matrix product for both ndarray and np.matrix; ``*`` is
    # element-wise for plain ndarrays.
    return np.dot(Z, UReduce.T)

import pca
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat

# Tile images into one mosaic and show it
def display(images, width, height):
    """Show the rows of ``images`` as a grid of grayscale tiles.

    Each row of ``images`` is reshaped to ``(height, width)`` and placed in
    a grid with ``floor(sqrt(m))`` rows, filled row by row. The assembled
    mosaic is transposed before being passed to ``plt.imshow``.

    Args:
        images: m-by-(width*height) array or ``np.matrix``, one flattened
            image per row.
        width: tile width in pixels.
        height: tile height in pixels.

    Raises:
        ValueError: if ``images`` has no rows.
    """
    m = images.shape[0]
    if m == 0:
        raise ValueError("display() needs at least one image")
    rows = int(np.floor(np.sqrt(m)))
    # Force float division so the ceil is effective on Python 2 as well;
    # this guarantees rows * cols >= m and no image is dropped.
    cols = int(np.ceil(m / float(rows)))
    # Stitch the tiles together; cells beyond the last image stay black.
    dstImage = np.zeros((rows * height, cols * width))
    # Iterate over the images that exist rather than over every grid
    # cell, so a partially filled last row cannot index past the end.
    for idx in range(m):
        i, j = divmod(idx, cols)
        # np.asarray makes the reshape behave the same for np.matrix rows.
        image = np.asarray(images[idx]).reshape(height, width)
        dstImage[i * height:(i + 1) * height,
                 j * width:(j + 1) * width] = image
    plt.imshow(dstImage.T, cmap='gray')
    plt.axis('off')
    plt.show()

if __name__ == '__main__':
    data = loadmat('ex7faces.mat')
    # np.asmatrix is used because the np.mat alias was removed in NumPy 2.0.
    X = np.asmatrix(data['X'], dtype=np.float32)
    # Show the original images.
    display(X[0:100, :], 32, 32)
    # NOTE(review): this snippet does ``import pca`` and never defines PCA
    # or recover itself, so they are referenced through that module.
    # Assumes the preceding snippet is saved as pca.py - confirm.
    XNorm, Z, U, UReduce, S, V = pca.PCA(X, k=100)
    XRec = pca.recover(UReduce, Z)
    # Show the reconstructed images; PCA loses some of the detail.
    display(XRec[0:100, :], 32, 32)

你可能感兴趣的:(PCA实例及代码)