我的经验是,RF之前的PCA不是一个很好的优势,如果有的话。主成分回归(PCR)是例如何时,PCA协助在OLS线性回归之前调整训练特征,这对于稀疏数据集非常需要。由于RF本身已经在没有假设线性的情况下执行了良好/公平的正则化,所以它不一定是优势。也就是说,我发现自己两周前为R写了一个PCA-RF封装器。该代码包括由100个仅包含5个真正线性分量的特征组成的数据集的模拟数据集。在这种情况下,使用PCA进行预滤波实际上是一个小优点。代码是无缝实现的,因此每个RF参数都可以简单地传递给RF。加载矢量将保存在model_fit中以在预测期间使用。
@我希望能够知道哪些变量对我的模型更重要,但是如果我使用PCA,我只能说出哪些PC更重要。
简单的方法是在没有PCA的情况下运行并获取变量重要性,并期望为PCA-RF找到类似的东西。
令人厌倦的方式,用自己的可变重要性代码将PCA-RF包装成新的装袋方案。可以在50-100行左右完成。
为PCA-RF的烃源代码建议:
#wrap PCA around randomForest, forward any other arguments to randomForest
#define as new S3 model class
train_PCA_RF = function(x,y,ncomp=5,...) {
f.args=as.list(match.call()[-1])
pca_obj = princomp(x)
rf_obj = do.call(randomForest,c(alist(x=pca_obj$scores[,1:ncomp]),f.args[-1]))
out=mget(ls())
class(out) = "PCA_RF"
return(out)
}
#print method
print.PCA_RF = function(object) print(object$rf_obj)
#predict method
predict.PCA_RF = function(object,Xtest=NULL,...) {
print("predicting PCA_RF")
f.args=as.list(match.call()[-1])
if(is.null(f.args$Xtest)) stop("cannot predict without newdata parameter")
sXtest = predict(object$pca_obj,Xtest) #scale Xtest as Xtrain was scaled before
return(do.call(predict,c(alist(object = object$rf_obj, #class(x)="randomForest" invokes method predict.randomForest
newdata = sXtest), #newdata input, see help(predict.randomForest)
f.args[-1:-2]))) #any other parameters are passed to predict.randomForest
}
#testTrain predict #
make.component.data = function(
inter.component.variance = .9,
n.real.components = 5,
nVar.per.component = 20,
nObs=600,
noise.factor=.2,
hidden.function = function(x) apply(x,1,mean),
plot_PCA =T
){
Sigma=matrix(inter.component.variance,
ncol=nVar.per.component,
nrow=nVar.per.component)
diag(Sigma) = 1
x = do.call(cbind,replicate(n = n.real.components,
expr = {mvrnorm(n=nObs,
mu=rep(0,nVar.per.component),
Sigma=Sigma)},
simplify = FALSE)
)
if(plot_PCA) plot(prcomp(x,center=T,.scale=T))
y = hidden.function(x)
ynoised = y + rnorm(nObs,sd=sd(y)) * noise.factor
out = list(x=x,y=ynoised)
pars = ls()[!ls() %in% c("x","y","Sigma")]
attr(out,"pars") = mget(pars) #attach all pars as attributes
return(out)
}
一个运行的代码示例:
#start script------------------------------
#source above from separate script
#test
library(MASS)
library(randomForest)
Data = make.component.data(nObs=600)#plots PC variance
train = list(x=Data$x[ 1:300,],y=Data$y[1:300])
test = list(x=Data$x[301:600,],y=Data$y[301:600])
rf = randomForest (train$x, train$y,ntree =50) #regular RF
rf2 = train_PCA_RF(train$x, train$y,ntree= 50,ncomp=12)
rf
rf2
cat("rf, R^2:",cor(test$y,pred_rf )^2,"PCA_RF, R^2", cor(test$y,pred_rf2)^2)
pred_rf = predict(rf ,test$x)
pred_rf2 = predict(rf2,test$x)
cor(test$y,predict(rf ,test$x))^2
cor(test$y,predict(rf2,test$x))^2
pairs(list(trueY = test$y,
native_rf = pred_rf,
PCA_RF = pred_rf2)
)