This chapter covers methods for feature selection:
Best subset selection is an exhaustive-search strategy: with \(p\) predictors it must consider all \(2^p\) possible models, so it quickly becomes computationally infeasible as \(p\) grows.
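To get a feel for the scale: the Hitters data used below has 19 predictors, so the exhaustive search must examine \(2^{19} = 524288\) candidate models. A quick check in R:

p <- 19
sum(choose(p, 0:p))  # total number of subsets of 19 predictors: 524288 = 2^19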
Algorithm: for each size \(k = 0, 1, \dots, p\), fit every model containing exactly \(k\) predictors and keep the one with the smallest RSS; then choose among these \(p+1\) candidates. Since \(R^2\) (equivalently \(RSS\)) always improves as variables are added, the final choice must use cross-validation or a penalized criterion such as \(C_p\), BIC, or adjusted \(R^2\). All three criteria are covered in the textbook, though their derivations are beyond my current understanding.
The leaps package provides regsubsets(), whose interface mirrors lm(); it performs best subset selection by RSS.
library(ISLR)
library(leaps)
Hitters <- na.omit(Hitters)
regfit.full = regsubsets(Salary ~ ., Hitters)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " " " "*" "*" " " " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " " " "
# The nvmax argument sets the maximum subset size to search (the default is 8)
regfit.full = regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
reg.summary <- summary(regfit.full)
# The summary holds rsq, cp, adjr2 and bic for each size's best model; plot them
par(mfrow = c(2, 2))
plot(reg.summary$rsq, xlab = "Number of variables", ylab = "R^2", type = "l")
plot(reg.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
plot(reg.summary$cp, xlab = "Number of variables", ylab = "Cp", type = "l")
plot(reg.summary$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
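To read the optimum off each criterion numerically, which.max()/which.min() on the summary components works (a small addition to the code above):

which.max(reg.summary$adjr2)  # size with the largest adjusted R^2
which.min(reg.summary$cp)     # size with the smallest Cp
which.min(reg.summary$bic)    # size with the smallest BIC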
# coef() extracts the coefficient estimates, here for the best 6-variable model
coef(regfit.full, 6)
## (Intercept) AtBat Hits Walks CRBI
## 91.5117981 -1.8685892 7.6043976 3.6976468 0.6430169
## DivisionW PutOuts
## -122.9515338 0.2643076
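leaps also ships a plot() method for regsubsets objects that shows, for every model size, which variables enter the model, ordered by a chosen criterion:

plot(regfit.full, scale = "bic")    # top row: the model with the lowest BIC
plot(regfit.full, scale = "adjr2")  # top row: the model with the highest adjusted R^2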
Both forward and backward stepwise selection are greedy strategies. Forward selection starts from the null model and, at each step, adds the single variable that most improves the fit, yielding the chosen model at each size; backward selection starts from the full model and removes one variable at a time. Both use the same function as above, with the argument method = "forward" or method = "backward".
reg.forward = regsubsets(Salary ~ ., data = Hitters, method = "forward", nvmax = 19)
summary(reg.forward)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, method = "forward",
## nvmax = 19)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*"
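The backward run is not shown in these notes; here is a sketch of it, plus a comparison that makes the greediness visible (the models chosen by the three strategies at a given size need not coincide, e.g. at 7 variables):

reg.backward = regsubsets(Salary ~ ., data = Hitters, method = "backward", nvmax = 19)
# compare the 7-variable models chosen by each strategy
coef(regfit.full, 7)
coef(reg.forward, 7)
coef(reg.backward, 7)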
The validation-set and cross-validation approaches rest on far fewer assumptions and therefore apply more broadly, whereas the criteria above (\(C_p\), AIC, BIC) all require an estimate of the error variance \(\sigma^2\). Below, a validation-set split is used to choose the model size directly:
set.seed(1)
train = sample(c(TRUE, FALSE), nrow(Hitters), replace = TRUE)
test = !train
reg.best = regsubsets(Salary ~ ., data = Hitters[train, ], nvmax = 19)
test.mat = model.matrix(Salary ~ ., data = Hitters[test, ])  # design matrix for the test observations
Compute the test MSE of the best model of each size; since regsubsets has no built-in predict() method, the prediction is done by hand:
val.err = rep(0, 19)
for (i in 1:19) {
  coefi = coef(reg.best, id = i)                      # coefficients of the size-i model
  pred = test.mat[, names(coefi)] %*% coefi           # predictions on the test set
  val.err[i] = mean((Hitters$Salary[test] - pred)^2)  # test MSE
}
plot(val.err)
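which.min() reads off the winning size, and coef() gives its estimates:

which.min(val.err)  # size with the smallest validation MSE
coef(reg.best, id = which.min(val.err))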
# Write our own predict() method for regsubsets objects
predict.regsubsets <- function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])  # recover the formula from the original call
  mat = model.matrix(form, newdata)    # design matrix for the new data
  coefi = coef(object, id = id)        # coefficients of the size-id model
  return(mat[, names(coefi)] %*% coefi)
}
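Because the function is named predict.regsubsets, S3 dispatch lets us call plain predict() on any regsubsets object from here on; for instance (the size 10 here is just an illustration):

head(predict(reg.best, Hitters[test, ], id = 10))  # same numbers as the manual loop above at i = 10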
# Randomly assign each observation to one of k folds
k = 10
set.seed(1)
folds = sample(1:k, nrow(Hitters), replace = TRUE)
# Validation error for every fold j and every model size i
cv.err = matrix(NA, nrow = k, ncol = 19)
for (j in 1:k) {
  # fit on every fold except j, validate on fold j
  best.fit = regsubsets(Salary ~ ., data = Hitters[folds != j, ], nvmax = 19)
  for (i in 1:19) {
    pred = predict(best.fit, Hitters[folds == j, ], id = i)  # dispatches to predict.regsubsets()
    cv.err[j, i] = mean((Hitters$Salary[folds == j] - pred)^2)
  }
}
# Average over the k folds to get the CV error at each model size
mean.cv = apply(cv.err, 2, mean)
plot(mean.cv)
The cross-validation error is minimized at 11 variables.
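As usual, the final model is then refit on the full data set at the chosen size:

reg.best = regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
coef(reg.best, 11)  # coefficients of the 11-variable model chosen by CV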
Ridge regression adds an \(\ell_2\) penalty on every coefficient to the least-squares objective, shrinking all coefficients toward zero on a common scale.
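Concretely, the ridge estimates minimize
\[\sum_{i=1}^{n}\Big(y_i-\beta_0-\sum_{j=1}^{p}\beta_jx_{ij}\Big)^2+\lambda\sum_{j=1}^{p}\beta_j^2,\]
where the tuning parameter \(\lambda\ge0\) controls the amount of shrinkage and the intercept is not penalized.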
# Build the x matrix and y vector; note the NAs have already been dropped from Hitters
x = model.matrix(Salary ~ ., Hitters)[, -1]  # drop the intercept column
y = Hitters$Salary
library(glmnet)
grid = 10^seq(10, -2, length = 100)  # lambda grid, decreasing from 1e10 to 1e-2
ridge.mod = glmnet(x, y, alpha = 0, lambda = grid)  # alpha = 0 gives ridge; glmnet standardizes by default
coef(ridge.mod)[, 50]  # coefficients at lambda = ridge.mod$lambda[50]
## (Intercept) AtBat Hits HmRun Runs
## 407.356050200 0.036957182 0.138180344 0.524629976 0.230701523
## RBI Walks Years CAtBat CHits
## 0.239841459 0.289618741 1.107702929 0.003131815 0.011653637
## CHmRun CRuns CRBI CWalks LeagueN
## 0.087545670 0.023379882 0.024138320 0.025015421 0.085028114
## DivisionW PutOuts Assists Errors NewLeagueN
## -6.215440973 0.016482577 0.002612988 -0.020502690 0.301433531
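Larger \(\lambda\) means a smaller \(\ell_2\) norm of the fitted coefficients, which is easy to verify along the path (the grid is decreasing, so column 60 holds a smaller \(\lambda\) than column 50):

sqrt(sum(coef(ridge.mod)[-1, 50]^2))  # l2 norm at the larger lambda
sqrt(sum(coef(ridge.mod)[-1, 60]^2))  # noticeably larger at the smaller lambda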
predict() can also return coefficients at a new value of \(\lambda\) by interpolating along the fitted path:
predict(ridge.mod, s = 50, type = "coefficients")[1:20, ]
## (Intercept) AtBat Hits HmRun Runs
## 4.876610e+01 -3.580999e-01 1.969359e+00 -1.278248e+00 1.145892e+00
## RBI Walks Years CAtBat CHits
## 8.038292e-01 2.716186e+00 -6.218319e+00 5.447837e-03 1.064895e-01
## CHmRun CRuns CRBI CWalks LeagueN
## 6.244860e-01 2.214985e-01 2.186914e-01 -1.500245e-01 4.592589e+01
## DivisionW PutOuts Assists Errors NewLeagueN
## -1.182011e+02 2.502322e-01 1.215665e-01 -3.278600e+00 -9.496680e+00
# exact = T re-solves the model at this lambda instead of interpolating (newer
# glmnet versions additionally require re-supplying x and y for exact fits);
# the two results barely differ:
predict(ridge.mod, s = 50, type = "coefficients", exact = T)[1:20, ]
## (Intercept) AtBat Hits HmRun Runs
## 4.827241e+01 -3.531689e-01 1.951322e+00 -1.289788e+00 1.155633e+00
## RBI Walks Years CAtBat CHits
## 8.092550e-01 2.708568e+00 -6.210875e+00 5.999458e-03 1.070313e-01
## CHmRun CRuns CRBI CWalks LeagueN
## 6.291808e-01 2.177875e-01 2.155314e-01 -1.487633e-01 4.584577e+01
## DivisionW PutOuts Assists Errors NewLeagueN
## -1.182273e+02 2.501719e-01 1.208250e-01 -3.276435e+00 -9.408861e+00
predict(ridge.mod, s = 1e10, type = "coefficients")[1:20, ]  # a huge lambda shrinks essentially every coefficient to zero
## (Intercept) AtBat Hits HmRun Runs
## 5.359257e+02 5.443467e-08 1.974589e-07 7.956523e-07 3.339178e-07
## RBI Walks Years CAtBat CHits
## 3.527222e-07 4.151323e-07 1.697711e-06 4.673743e-09 1.720071e-08
## CHmRun CRuns CRBI CWalks LeagueN
## 1.297171e-07 3.450846e-08 3.561348e-08 3.767877e-08 -5.800263e-07
## DivisionW PutOuts Assists Errors NewLeagueN
## -7.807263e-06 2.180288e-08 3.561198e-09 -1.660460e-08 -1.152288e-07
Cross-validation chooses the best \(\lambda\): cv.glmnet() runs 10-fold cross-validation by default.
# Split the data into a training half and a test half
set.seed(1)
train = sample(1:nrow(x), nrow(x) / 2)
test = -train
y.test = y[test]
# Find the best lambda on the training data
set.seed(1)
ridge.mod = glmnet(x[train, ], y[train], alpha = 0, lambda = grid, thresh = 1e-12)
cv.out = cv.glmnet(x[train, ], y[train], alpha = 0)
plot(cv.out)
bestlambda = cv.out$lambda.min
# Test error at the chosen lambda
ridge.pred = predict(ridge.mod, s = bestlambda, newx = x[test, ])  # predictions on the test set
cost = mean((ridge.pred - y.test)^2)
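For scale, the test MSE of the intercept-only model (always predicting the training mean) is a useful baseline:

mean((mean(y[train]) - y.test)^2)  # null-model test MSE; ridge should beat this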
The chosen \(\lambda \approx 211\) performs well; plug it back into a ridge fit on the full data set and inspect the coefficients:
out = glmnet(x, y, alpha = 0, lambda = grid)
predict(out, type = "coefficients", s = bestlambda, exact = T)[1:20, ]
## (Intercept) AtBat Hits HmRun Runs
## 9.83874827 0.03162896 1.00814613 0.14006836 1.11329305
## RBI Walks Years CAtBat CHits
## 0.87343646 1.80387278 0.13518617 0.01115422 0.06491394
## CHmRun CRuns CRBI CWalks LeagueN
## 0.45165783 0.12891029 0.13724800 0.02913280 27.17039556
## DivisionW PutOuts Assists Errors NewLeagueN
## -91.62093693 0.19146155 0.04249938 -1.81130262 7.22446475
No coefficient is exactly zero, which confirms that ridge regression does not perform variable selection!
The lasso instead adds an \(\ell_1\) penalty to the least-squares objective, which likewise shrinks the coefficients. An intuitive picture in two dimensions: the \(\ell_1\) constraint region is a diamond while the RSS contours are ellipses, so the optimum tends to land on a corner of the diamond, i.e. on a coordinate axis, forcing some coefficients to exactly zero.
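Concretely, the lasso minimizes
\[\sum_{i=1}^{n}\Big(y_i-\beta_0-\sum_{j=1}^{p}\beta_jx_{ij}\Big)^2+\lambda\sum_{j=1}^{p}|\beta_j|,\]
identical to ridge except that the \(\ell_2\) penalty is replaced by an \(\ell_1\) penalty.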
The experiment mirrors the ridge analysis above, only with alpha = 1:
lasso.mod = glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
plot(lasso.mod)  # coefficient paths against the l1 norm
As the \(\ell_1\) norm of the coefficients shrinks, some of them drop to exactly zero. Next, choose the best \(\lambda\) by cross-validation:
set.seed(1)
cv.out = cv.glmnet(x[train, ], y[train], alpha = 1)
plot(cv.out)
bestlambda = cv.out$lambda.min
lasso.pred = predict(lasso.mod, s = bestlambda, newx = x[test, ])
mean((lasso.pred - y.test)^2)
## [1] 100743.4
# Coefficients of the final model, refit on the full data set
out = glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef = predict(out, type = "coefficients", s = bestlambda)[1:20, ]
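Printing only the nonzero entries shows which variables the lasso keeps:

lasso.coef[lasso.coef != 0]  # unlike ridge, many coefficients are exactly zero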