import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.model_selection import train_test_split
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Load the iris data as a (features, labels) pair.  `return_X_y` has to be
# passed by keyword: current scikit-learn releases reject it positionally.
X, y = datasets.load_iris(return_X_y=True)

# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction of the total variance (98% here).
pca = PCA(n_components=0.98, whiten=False)
X_pca = pca.fit_transform(X)

# fit_transform returns a NumPy array, which has no .head(); slice the first
# five rows instead.
X_pca[:5]
array([[-2.68412563, 0.31939725, -0.02791483],
[-2.71414169, -0.17700123, -0.21046427],
[-2.88899057, -0.14494943, 0.01790026],
[-2.74534286, -0.31829898, 0.03155937],
[-2.72871654, 0.32675451, 0.09007924],])
PCA原理
# Center the data: subtract every column's mean from each row of X.
B = X - np.mean(X, axis=0)
# Preview the first five centered rows.
B[:5]
array([[-0.74333333, 0.44266667, -2.358 , -0.99933333],
[-0.94333333, -0.05733333, -2.358 , -0.99933333],
[-1.14333333, 0.14266667, -2.458 , -0.99933333],
[-1.24333333, 0.04266667, -2.258 , -0.99933333],
[-0.84333333, 0.54266667, -2.358 , -0.99933333]])
# Covariance matrix of the centered data.  np.cov treats rows as variables by
# default, so the samples-by-features matrix is transposed before the call.
V = np.cov(B.T)
V
array([[ 0.68569351, -0.042434 , 1.27431544, 0.51627069],
[-0.042434 , 0.18997942, -0.32965638, -0.12163937],
[ 1.27431544, -0.32965638, 3.11627785, 1.2956094 ],
[ 0.51627069, -0.12163937, 1.2956094 , 0.58100626]])
# Eigen-decomposition of the symmetric covariance matrix.  eigh returns the
# eigenvalues in ascending order, with the matching eigenvectors stored as the
# columns of `ev`.  UPLO='L' is the library default, spelled out here.
eigen, ev = np.linalg.eigh(V, UPLO='L')
display(eigen, ev)
array([0.02383509, 0.0782095 , 0.24267075, 4.22824171])
array([[ 0.31548719, 0.58202985, 0.65658877, -0.36138659],
[-0.3197231 , -0.59791083, 0.73016143, 0.08452251],
[-0.47983899, -0.07623608, -0.17337266, -0.85667061],
[ 0.75365743, -0.54583143, -0.07548102, -0.3582892 ]])
# Share of the total variance carried by each eigenvalue.
eigen / np.sum(eigen)
array([0.00521218, 0.01710261, 0.05306648, 0.92461872])
# np.linalg.eigh sorts eigenvalues in ascending order, so reverse the columns
# to rank the eigenvectors by descending variance before keeping the top two.
# The leading principal component then comes first, which is the column order
# scikit-learn's PCA uses (the sign of each column may still differ).
vector = ev[:, ::-1][:, :2]
# Project the centered data onto the selected principal axes.
PCA_result = B.dot(vector)
PCA_result[:10]
array([[ 2.68412563, 0.31939725],
[ 2.71414169, -0.17700123],
[ 2.88899057, -0.14494943],
[ 2.74534286, -0.31829898],
[ 2.72871654, 0.32675451],
[ 2.28085963, 0.74133045],
[ 2.82053775, -0.08946138],
[ 2.62614497, 0.16338496],
[ 2.88638273, -0.57831175],
[ 2.6727558 , -0.11377425]])
验证降维后的数据:比较降维前后的分类准确率
# Classifier on the PCA-reduced features: hold out 20% of the samples with a
# fixed seed, fit a logistic regression, and show its score on the held-out
# part.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=102
)
lr = LogisticRegression()
lr.fit(X_train, y_train).score(X_test, y_test)
0.9
# Same experiment on the original, un-reduced features, using the identical
# split parameters so the two scores can be compared.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=102
)
lr = LogisticRegression()
lr.fit(X_train, y_train).score(X_test, y_test)
0.9666666666666667