这篇主要记录在sklearn中如何应用PCA,理论推导见(http://blog.csdn.net/huangyi_906/article/details/75578213)
官网中给出的介绍:
class sklearn.decomposition.PCA(n_components=None, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None)
其中参数:
属性:
方法:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# all_data stacks the train features (target 'y' dropped) on top of the test
# features so that both sets can be feature-transformed together.
# NOTE: it is built before the label encoding below, so it still holds the
# raw string values of the categorical columns.
train_df = pd.read_csv('train_b.csv')
test_df = pd.read_csv('test_b.csv')
all_data = pd.concat([train_df.drop('y', axis=1), test_df])

# Some columns hold strings; map them to integer codes with LabelEncoder.
# Each encoder is fitted on the train and test values together so that a
# category present in only one of the two files can still be transformed.
for c in train_df.columns:
    if train_df[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train_df[c].values) + list(test_df[c].values))
        train_df[c] = lbl.transform(list(train_df[c].values))
        test_df[c] = lbl.transform(list(test_df[c].values))

# A single pre-formatted argument prints the same text on Python 2 and 3.
print('{} {} {}'.format(train_df.shape, test_df.shape, all_data.shape))
(4209, 378) (4209, 377) (8418, 377)
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV

# Features are every column except the target 'y'.
x = train_df.drop('y', axis=1)
y = train_df.y
# Use 70% of the rows for training and keep the rest for evaluation;
# random_state pins the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=1)
不经过PCA降维,直接用Lasso模型预测,用r2分数看效果
# Baseline without PCA: LassoCV on the original features, choosing alpha by
# 3-fold cross-validation among 10 log-spaced candidates in [1e-3, 1].
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2, so this call needs an older release -- confirm the
# target version before running.
las = LassoCV(alphas=np.logspace(-3, 0, 10), cv=3, normalize=True)
las.fit(x_train, y_train)
y_hat = las.predict(x_test)
r2 = r2_score(y_test, y_hat)
# Parenthesised single argument: identical output on Python 2 and 3.
print(r2)
0.600606371001
数据经过PCA降维
# Project the features onto 100 principal components, then repeat the same
# split / LassoCV / R^2 evaluation as the baseline.
# NOTE(review): the PCA is fitted on every row of x *before* the split, so the
# held-out rows also shape the projection. Fitting on x_train only (or using a
# Pipeline) would avoid that; left unchanged here so the recorded scores
# still correspond to this code.
pca = PCA(n_components=100)
x_pca = pca.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(
    x_pca, y, train_size=0.7, random_state=1)
las = LassoCV(alphas=np.logspace(-3, 0, 10), cv=3, normalize=True)
las.fit(x_train, y_train)
y_hat = las.predict(x_test)
r2 = r2_score(y_test, y_hat)
# The estimator's own score() is also R^2, so both prints show the same value.
print(las.score(x_test, y_test))
print(r2)
0.585078356854
0.585078356854
# Attributes of the fitted PCA:
# - components_: the principal axes, one row per component,
#   shape (n_components, n_features)
# - explained_variance_ratio_: fraction of the total variance explained by
#   each component
# - n_components_: an integer, the number of components kept
print(pca.components_.shape)
print(pca.components_)
print(pca.explained_variance_ratio_)
print(pca.n_components_)
(100, 377)
[[ 9.99997229e-01 -7.03051423e-05 2.99568128e-04 ..., -1.56006146e-07
-1.37294911e-07 4.49824033e-07]
[ -6.84286561e-05 -9.47851214e-01 2.08504144e-01 ..., 7.39183496e-06
-9.91394396e-06 -2.72693141e-05]
[ 1.14415298e-04 2.36792722e-01 1.03927848e-04 ..., -9.05345363e-05
-2.25038995e-07 -5.86626778e-06]
...,
[ 1.61354771e-06 8.02087742e-04 2.94251412e-03 ..., 1.44147371e-02
-1.12809556e-03 2.32108319e-02]
[ 3.88195289e-07 -3.57689418e-04 1.86472048e-03 ..., 8.35456656e-03
7.63157915e-04 2.21159446e-02]
[ 4.75846061e-06 -3.18865700e-04 -1.91695177e-03 ..., 2.27787722e-02
8.69893184e-04 -9.42942931e-03]]
[ 9.99904441e-01 4.13130289e-05 2.19765863e-05 1.10193653e-05
8.26702521e-06 7.62663194e-06 1.42480683e-06 6.67174103e-07
3.88754081e-07 2.62818901e-07 2.16384043e-07 2.11922104e-07
1.82366088e-07 1.49327561e-07 1.32521195e-07 1.15136912e-07
9.27510747e-08 8.58563786e-08 8.01276864e-08 7.04005004e-08
6.34985956e-08 5.78088419e-08 5.57237058e-08 5.23667240e-08
4.59750995e-08 4.33226546e-08 3.87329541e-08 3.70671641e-08
3.38158788e-08 3.30567259e-08 3.18505129e-08 2.95970906e-08
2.81648193e-08 2.68042845e-08 2.53418096e-08 2.39548190e-08
2.22288938e-08 1.98302079e-08 1.92309281e-08 1.84941517e-08
1.75852712e-08 1.66850239e-08 1.59845385e-08 1.57456236e-08
1.56197617e-08 1.51236570e-08 1.36981729e-08 1.33881358e-08
1.28714627e-08 1.26702565e-08 1.23608559e-08 1.19063092e-08
1.15979570e-08 1.12774700e-08 1.06840169e-08 1.02724873e-08
9.65226045e-09 9.57709444e-09 9.21755466e-09 9.07374803e-09
8.83653670e-09 8.32320742e-09 8.10177890e-09 7.78543398e-09
7.68675342e-09 7.11795011e-09 6.97713683e-09 6.80653296e-09
6.70775259e-09 6.33472793e-09 6.17404430e-09 5.99154122e-09
5.90636490e-09 5.69580309e-09 5.52710835e-09 5.44626714e-09
5.25662572e-09 5.14883243e-09 5.05059409e-09 4.92476555e-09
4.83675619e-09 4.77177612e-09 4.56297210e-09 4.49822949e-09
4.34043398e-09 4.15862107e-09 4.14353784e-09 3.98119789e-09
3.92094430e-09 3.85990803e-09 3.80125941e-09 3.70113785e-09
3.61354437e-09 3.46643273e-09 3.31328766e-09 3.22858846e-09
3.21169764e-09 3.12481222e-09 3.01224773e-09 2.99584028e-09]
100
PCA要人为指定保留的特征数,这就有参数选择好坏的区别。下面用Pipeline结合GridSearchCV,找到分数最高的保留特征数。
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Split the original (un-projected) features again; the PCA step now lives
# inside the pipeline.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=1)
# Bundle the processing steps (PCA followed by LassoCV) into one estimator.
steps = [("pca", PCA()),
         ("las", LassoCV(alphas=np.logspace(-3, 0, 10), cv=3, normalize=True))]
pip = Pipeline(steps)
# Candidate component counts are 1, 11, ..., 361: every 10th value in
# [1, 370), 37 values in total.
gsea = GridSearchCV(pip,
                    param_grid={'pca__n_components': np.arange(1, 370, 10)},
                    cv=3)
gsea.fit(x_train, y_train)
# Parenthesised single arguments: identical output on Python 2 and 3.
print(gsea.score(x_test, y_test))
print(gsea.best_params_)
0.586531700728
{'pca__n_components': 141}
发现经过PCA降维之后,模型的预测效果按R2评价略有下降,分析原因:
- PCA降维的原理是“将协方差矩阵化为对角阵的过程。而协方差矩阵的值反映的是不同波段之间数据的相关性,即协方差。对角化后的协方差矩阵除了主对角线上的元素外,都是零。所以说,特征之间的相关性被去除了”。(引文中的“波段”来自遥感图像的语境,在这里对应数据的各个特征。)
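PCA作为一种无监督数据压缩算法,只保留方差最大的主方向,压缩时并不考虑因变量y,因此自变量和因变量间的关系有可能变得更复杂。另外,从上面打印的explained_variance_ratio_可以看到,第一主成分就占了约99.99%的方差,且它几乎只沿着第一个特征的方向,说明在未做标准化的情况下,PCA的结果很可能被单个数值范围很大的特征主导。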
PCA在特征之间有大相关性时效果通常不错。