Data Preprocessing: Dimensionality Reduction with PCA (Principal Component Analysis)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# decomposition: sklearn's module for matrix-decomposition methods
# The transform changes the features: the output is no longer the original data
# PCA reduces the number of features, so a smaller representation can stand in for the original, larger one

from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.model_selection import train_test_split

import warnings 
warnings.filterwarnings('ignore')
X,y=datasets.load_iris(return_X_y=True)  # passing True positionally is deprecated

# Dimensionality reduction
pca=PCA(n_components=0.98,whiten=False)
# n_components as a float: keep components in order of importance (largest explained
# variance first, then the second largest, and so on) until their cumulative share
# reaches that fraction; as an integer: keep exactly that many components.
# whiten: whether to rescale each kept component to unit variance.


X_pca=pca.fit_transform(X)
X_pca[:5]  # X_pca is a NumPy array, so .head() doesn't exist; slicing shows the first rows.
           # The values mix positive and negative, so they no longer carry the original physical meaning
array([[-2.68412563,  0.31939725, -0.02791483],
       [-2.71414169, -0.17700123, -0.21046427],
       [-2.88899057, -0.14494943,  0.01790026],
       [-2.74534286, -0.31829898,  0.03155937],
       [-2.72871654,  0.32675451,  0.09007924]])
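A quick check of what n_components=0.98 actually selected (a small sketch reusing the fitted pca; n_components_ and explained_variance_ratio_ are standard sklearn attributes):

pca.n_components_  # 3: three components were kept for iris
pca.explained_variance_ratio_  # each kept component's share of total variance, largest first
pca.explained_variance_ratio_.sum()  # cumulative share; >= 0.98 by construction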

How PCA works, step by step

# 1. Center the data: subtract each feature's (column's) mean
B=X-X.mean(axis=0)
B[:5]
array([[-0.74333333,  0.44266667, -2.358     , -0.99933333],
       [-0.94333333, -0.05733333, -2.358     , -0.99933333],
       [-1.14333333,  0.14266667, -2.458     , -0.99933333],
       [-1.24333333,  0.04266667, -2.258     , -0.99933333],
       [-0.84333333,  0.54266667, -2.358     , -0.99933333]])
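Sanity check (a minimal sketch): after centering, every column mean of B should be zero up to floating-point error.

np.allclose(B.mean(axis=0),0)  # True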
# 2. Compute the covariance matrix of the centered data
V=np.cov(B,rowvar=False)  # rowvar=False: rows are samples, columns are features
V
array([[ 0.68569351, -0.042434  ,  1.27431544,  0.51627069],
       [-0.042434  ,  0.18997942, -0.32965638, -0.12163937],
       [ 1.27431544, -0.32965638,  3.11627785,  1.2956094 ],
       [ 0.51627069, -0.12163937,  1.2956094 ,  0.58100626]])
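Since B is already centered, the sample covariance is just B.T @ B / (n - 1); a small sketch confirming it matches np.cov:

n=B.shape[0]
np.allclose(B.T.dot(B)/(n-1),V)  # True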
# 3. Eigenvalues and eigenvectors of the covariance matrix
eigen,ev=np.linalg.eigh(V)  # eigh is for symmetric matrices; eigenvalues come back in ascending order
display(eigen,ev)  # eigenvalues, then eigenvectors (one per column)
array([0.02383509, 0.0782095 , 0.24267075, 4.22824171])

array([[ 0.31548719,  0.58202985,  0.65658877, -0.36138659],
       [-0.3197231 , -0.59791083,  0.73016143,  0.08452251],
       [-0.47983899, -0.07623608, -0.17337266, -0.85667061],
       [ 0.75365743, -0.54583143, -0.07548102, -0.3582892 ]])
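As a quick sketch, we can confirm these really are eigenpairs of V (note the eigenvalues came back in ascending order):

np.allclose(V.dot(ev),ev*eigen)  # True: V @ v_i equals eigen_i * v_i for every column v_i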
eigen/eigen.sum()  # importance: each eigenvalue's share of the total variance
array([0.00521218, 0.01710261, 0.05306648, 0.92461872])
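These shares are the same numbers sklearn reports as explained_variance_ratio_, just sorted the other way; a sketch reusing the fitted pca (which kept 3 components here):

np.allclose((eigen/eigen.sum())[::-1][:3],pca.explained_variance_ratio_)  # True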
# 4. Pick the number of components to keep (here 2): take the eigenvectors
#    paired with the two largest eigenvalues; for a percentage target, add up
#    the shares above, largest first, until the threshold is met
vector=ev[:,2:]  # eigh sorted ascending, so the last two columns belong to the top two eigenvalues
# 5. Matrix multiplication: project the centered data onto the chosen eigenvectors
PCA_result=B.dot(vector)
PCA_result[:10]
array([[ 0.31939725,  2.68412563],
       [-0.17700123,  2.71414169],
       [-0.14494943,  2.88899057],
       [-0.31829898,  2.74534286],
       [ 0.32675451,  2.72871654],
       [ 0.74133045,  2.28085963],
       [-0.08946138,  2.82053775],
       [ 0.16338496,  2.62614497],
       [-0.57831175,  2.88638273],
       [-0.11377425,  2.6727558 ]])
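This matches sklearn's first two components up to column order and sign: eigh listed the components smallest-eigenvalue first, and each eigenvector is only determined up to a sign flip. A minimal sketch of the comparison:

np.allclose(np.abs(PCA_result[:,::-1]),np.abs(X_pca[:,:2]))  # True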

Verifying that the reduced data is still accurate

# Reduced (PCA) data — the score depends on the split; an earlier run gave 0.87

lr=LogisticRegression()
X_train,X_test,y_train,y_test=train_test_split(X_pca,y,test_size=0.2,random_state=102)
lr.fit(X_train,y_train)
lr.score(X_test,y_test)
0.9
# Original data

lr=LogisticRegression()
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=102)
lr.fit(X_train,y_train)
lr.score(X_test,y_test)
0.9666666666666667
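A single 80/20 split is noisy (the 0.87 noted above presumably came from a different split), so a fairer comparison averages over several folds. A minimal sketch using sklearn's cross_val_score — not part of the original run:

from sklearn.model_selection import cross_val_score
cross_val_score(LogisticRegression(),X_pca,y,cv=5).mean()  # reduced data
cross_val_score(LogisticRegression(),X,y,cv=5).mean()  # original data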
