# 机器学习专题:特征选择(R) — Machine learning topic: feature selection in R

# 载入数据 (Load data) ----

library(readr)

# Load sample traits and the expression matrix.
# NOTE(review): paths are relative -- confirm the working directory
# contains these CSVs before running.
sampleTraits <- read_csv("2-datTraits_95.csv", col_names = TRUE)
# Alternative input matrices kept for reference:
#datExpr1<-read_csv("6-DEGset-95_DEseq_miRBAse_batch_scale.csv", col_names = T) # for the wrapper (box) method
#datExpr2<-read_csv("5-DEGset_95_DEseq_miRBAse_batch.csv", col_names = T)      # for lasso
#datExpr3<-read_csv("6-DEGset-95_DEseq_miRBAse_batch_scale_final.csv", col_names = T) # for the wrapper (box) method
#datExpr4<-read_csv("6-DEGset_95_DEseq_miRBAse_batch_final.csv", col_names = T)      # for lasso
datExpr5 <- read_csv("6-DEGset_95_DEseq_miRBAse_batch_final_log2fc.csv", col_names = TRUE) # for lasso

datExpr <- datExpr5 # active input matrix **************************
dim(datExpr)

# Transpose the matrix: in datExpr the first column holds the miRNA
# identifiers and the remaining columns are samples. After t(), rows
# are samples and columns are miRNA features.
miR <- as.vector(unlist(datExpr[, 1]))
datExpr_t <- t(datExpr[, -1])
#datExpr_t[c(1:3),c(1:3)]
colnames(datExpr_t) <- miR
datExpr_t[c(1:3), c(1:3)]  # sanity peek: rows = samples, columns = features

# 1. 封装法 (Wrapper method) ----

# Wrapper method: Recursive Feature Elimination (RFE) with a random
# forest ranker, via caret.
library(caret)
#library(gam)

# Full feature matrix (samples x miRNAs). Selecting all columns via
# c(1:ncol) was redundant, so pass the matrix directly.
data.x <- datExpr_t
# Binary outcome as a factor; 'ASD' is the reference (first) level.
# Fix: the argument is `levels` -- the original `level` only worked
# through partial argument matching.
data.outcome <- factor(sampleTraits$Diagnosis, levels = c('ASD', 'CTL'))

set.seed(123)
filter1 <- rfe(x = data.x,
               y = data.outcome,
               sizes = seq(5, 16, 1), # candidate numbers of features to retain
               # functions options: rfFuncs (random forest), lmFuncs (linear
               # regression), nbFuncs (naive Bayes), treebagFuncs (bagged
               # trees), caretFuncs (custom caret model)
               rfeControl = rfeControl(functions = rfFuncs,
                                       method = 'cv', # cross-validated (10 fold)
                                       # NOTE(review): `repeats` only takes
                                       # effect with method = 'repeatedcv';
                                       # it is ignored here -- confirm intent.
                                       repeats = 5))
plot(filter1, type = c("g", "o"))
print(filter1)
predictors(filter1)

# 2. LASSO回归 (LASSO regression) ----

# LASSO: Least Absolute Shrinkage and Selection Operator.
library(glmnet)

# Cross-validated fit to choose lambda, scored by AUC.
set.seed(123)
filter2 <- cv.glmnet(x = data.x,
                     y = data.outcome,
                     family = 'binomial',   # binary outcome; 'gaussian' for continuous
                     nfolds = 5,            # default is 10
                     type.measure = "auc")  # alternatives: deviance, mse, mae, class
# Full regularization path (for the coefficient-vs-lambda plot).
filter3 <- glmnet(x = data.x,
                  y = data.outcome,
                  family = 'binomial')
plot(filter2)
plot(filter3, xvar = 'lambda', label = TRUE)

filter2$lambda.min # lambda with the best CV performance
filter2$lambda.1se # lambda of the simplest model within one SE of the best

# Coefficients at lambda.1se; keep only the non-zero entries.
filter2.coef.lambda.1se <- coef(filter2, s = filter2$lambda.1se)
filter2.coef.lambda.1se # selected indicators
filter2.1se.out <- filter2.coef.lambda.1se[which(filter2.coef.lambda.1se != 0), ]
filter2.1se.out <- round(filter2.1se.out, 4) # trim decimal places
filter2.1se.out
length(filter2.1se.out)

# 3. 随机森林法(袋外误差 OOB) (Random forest with out-of-bag error) ----

## Reference: https://blog.csdn.net/wishchin/article/details/52515516
# Install varSelRF on first use, then attach it. requireNamespace()
# replaces the suppressWarnings(require(...)) anti-pattern, and the
# duplicate library(varSelRF)/library(readr) calls were removed
# (readr is already attached at the top of the script).
if (!requireNamespace("varSelRF", quietly = TRUE)) {
  install.packages('varSelRF')
}
library(varSelRF)

# NOTE(review): a hard-coded, machine-specific setwd() mid-script is
# fragile; kept so that all subsequent relative writes land in the
# original output directory.
setwd('C:/Users/xllix/Documents/WORK/2019论文准备/1-论文初稿/2-results')

set.seed(123)
# Backward feature elimination driven by random-forest OOB error.
rf.vs1 <- varSelRF(data.x,
                   data.outcome,
                   c.sd = 1, mtryFactor = 1,
                   ntree = 5000, ntreeIterat = 2000, # defaults are 500
                   vars.drop.num = NULL, vars.drop.frac = 0.1, # drop 10% per iteration
                   whole.range = TRUE, recompute.var.imp = FALSE, verbose = FALSE,
                   returnFirstForest = TRUE, fitted.rf = NULL, keep.forest = FALSE)
rf.vs1

# Elimination history; show the row for the best model size.
select.history <- rf.vs1$selec.history
names(select.history)
select.history[select.history$Number.Variables == rf.vs1$best.model.nvars, ]

selected.vars <- rf.vs1$selected.vars # final selected features

plot(rf.vs1)
dev.off()
write.table(select.history, "7.4-for_svm_oob_log2fc2.txt", row.names = FALSE, quote = FALSE)

# 4. 特征汇总 (Feature summary) ----

# Collect the feature set chosen by each of the three methods.
box <- predictors(filter1)                           # RFE / wrapper method
lasso <- row.names(as.data.frame(filter2.1se.out))[-1] # drop the intercept row
oob <- selected.vars                                 # random-forest OOB

# Features agreed on by all three methods.
sect <- Reduce(intersect, list(box, oob, lasso))
length(sect) # 7

print('封装法/递归特征消除')
box
print('LASSO回归')
lasso
print('随机森林oob')
oob
print('取交集')
sect

# Persist each set; none of these need scaling.
write.table(box, "7.4-for_svm_box_log2fc.csv", row.names = FALSE)
write.table(lasso, "7.4-for_svm_lasso_log2fc_5fc.csv", row.names = FALSE)
write.table(oob, "7.4-for_svm_oob_log2fc.csv", row.names = FALSE)

write.csv(sect, "7.5-for_svm_sec_log2fc.csv", row.names = FALSE)

# 你可能感兴趣的:(机器学习专题:特征选择(R)) — blog "related posts" footer, kept as a comment