# Dimensionality reduction and compression, part 1: principal component analysis (PCA) examples

import pandas as pd
from sklearn.decomposition import PCA

# Source spreadsheet and destination spreadsheet for the reduced data.
inputfile = './data.xls'
outputfile = './reduced_data.xls'

# Load the table; header=None means the first row is treated as data.
data = pd.read_excel(inputfile, header=None)
print(data)

# Fit a PCA that keeps every component so the full spectrum can be inspected.
pca = PCA().fit(data)

# Report, one per line: the principal axes (eigenvectors), the variance
# explained by each axis (eigenvalues), and each axis' share of the variance.
print(
    pca.components_,
    pca.explained_variance_,
    pca.explained_variance_ratio_,
    sep='\n',
)

# Reduce the data to three principal components and save the projection.
pca = PCA(n_components=3).fit(data)
low_d = pca.transform(data)
pd.DataFrame(low_d).to_excel(excel_writer=outputfile)

# Map the reduced data back to the original feature space.
# Note that this rebinds `data` to the reconstruction.
data = pca.inverse_transform(low_d)
print(data)
#导入模块
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import  Axes3D
from sklearn.datasets.samples_generator import make_blobs
from sklearn.decomposition import PCA

# NOTE(review): the `sklearn.datasets.samples_generator` module imported above
# was removed in newer scikit-learn releases; `from sklearn.datasets import
# make_blobs` is the supported import path.

# Generate 10000 three-feature samples around four cluster centres,
# with a fixed seed so the run is reproducible.
X,y = make_blobs(
    n_samples=10000,
    n_features=3,
    centers=[[3,3,3],[0,0,0],[1,1,1],[2,2,2]],
    cluster_std=[0.2,0.1,0.2,0.2],
    random_state=9,
)

# Show the raw data in 3-D.
fig = plt.figure()
# Create the 3-D axes through the figure so they are actually attached to it;
# a bare Axes3D(fig, ...) is not added to the figure by recent matplotlib.
ax = fig.add_axes([0,0,1,1],projection='3d',elev=30,azim=10)
# Use the axes' own scatter: pyplot's scatter(x, y, s, ...) would take the
# third column as marker sizes instead of z coordinates.
ax.scatter(X[:,0],X[:,1],X[:,2],marker='o')
plt.show()

# Run PCA keeping all three components to see how the variance is distributed.
pca = PCA(n_components=3)
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)

# Reduce to two components and plot the projected samples.
pca = PCA(n_components=2)
pca.fit(X)
X_new = pca.transform(X)
plt.scatter(X_new[:,0],X_new[:,1],marker='o')
plt.show()
from numpy import *

def eigValPct(eigVals,percentage):
    """Return how many of the largest eigenvalues reach a share of the total.

    Args:
        eigVals: sequence of eigenvalues (any order).
        percentage: fraction of the eigenvalue sum that must be covered,
            e.g. 0.9 for 90%.

    Returns:
        The smallest count ``k`` such that the ``k`` largest eigenvalues sum
        to at least ``percentage`` of the total. If the running sum never
        reaches the threshold (``percentage`` above 1, or rounding error at
        exactly 1.0), all eigenvalues are needed and ``len(eigVals)`` is
        returned; an empty input therefore yields 0.
    """
    sortArray = sort(eigVals)[::-1]         # largest eigenvalue first
    threshold = sum(sortArray)*percentage   # amount of variance to cover
    tempSum = 0
    for num,eigVal in enumerate(sortArray,start=1):
        tempSum += eigVal
        if tempSum >= threshold:
            return num
    # Threshold never reached: keep every component instead of returning None,
    # which would break callers that do arithmetic on the result.
    return len(sortArray)


def pca(dataMat,percentage=0.9):
    """Reduce ``dataMat`` with PCA, keeping enough components for ``percentage``.

    Args:
        dataMat: 2-D numeric array or matrix with one sample per row and one
            feature per column (the mean is taken over axis 0 and the
            covariance is computed with ``rowvar=0``).
        percentage: fraction of the total variance the kept components must
            explain; defaults to 0.9.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the mean-centred data projected
        onto the kept eigenvectors, and the data reconstructed from that
        projection in the original feature space (mean added back).
    """
    meanVals = mean(dataMat,axis=0)                 # per-column mean
    meanRemoved = dataMat-meanVals                  # centre the data
    covMat = cov(meanRemoved,rowvar=0)              # feature covariance matrix
    # The covariance matrix is symmetric, so use eigh: it always returns real
    # eigenvalues, whereas eig can return complex ones that cannot be compared
    # against the variance threshold. asmatrix replaces the removed `mat`
    # alias and keeps the matrix-product semantics of `*` below.
    # The sign of each eigenvector is arbitrary and may differ between solvers.
    eigVals,eigVects = linalg.eigh(asmatrix(covMat))
    k = eigValPct(eigVals,percentage)               # number of components needed
    eigVallnd = argsort(eigVals)                    # indices, ascending eigenvalue
    eigVallnd = eigVallnd[:-(k+1):-1]               # last k indices, largest first
    redEigVects = eigVects[:,eigVallnd]             # eigenvectors of the k largest eigenvalues
    lowDDataMat = meanRemoved*redEigVects           # project onto the kept components
    reconMat = (lowDDataMat*redEigVects.T)+meanVals # reconstruct in the original space
    return lowDDataMat,reconMat

# You may also be interested in: (#, Math=Statistics, Mining2=)