Machine learning in R with the caret package

Reference: https://blog.csdn.net/jiabiao1602/article/details/44975741

This is the code I used to analyse my near-infrared spectral data; the outcome is a two-class label. The spectra contain 128 wavelengths, i.e. 128 features.

###### Reading in the data ######

setwd("C:/Users/chengdehe/Desktop/近红外")

cheng<-read.csv("605_h4-1.csv",header = T) ### the data have already been normalized

cheng[(1:3),(1:7)]     # peek at the data layout
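For reference, the layout this script assumes: column 1 some ID field (my inference from the -(1:2) drop below), column 2 the class label (D or H), and columns 3 onward the 128 wavelengths. A quick structural check:

dim(cheng)          # expect 2 bookkeeping columns + 128 wavelength columns
table(cheng[ ,2])   # counts of the two classes, D and H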

########## Splitting predictors and response ##########

V<-cheng[ ,-(1:2)]  ### predictors

lable<-cheng[ ,2]   ### response

lable <- factor(lable , levels = c('D', 'H'), labels = c('D', 'H')) ## convert to factor

colnames(cheng)[2]<-"type"   ### rename the class column

########## Dimension reduction: drop near-zero-variance variables ##########

library(caret)

zerovar<-nearZeroVar(V)    # if this returns integer(0), no predictor has near-zero variance

#newdata1<-V[,-zerovar]    # run this if zerovar is non-empty

newdata1<-V                # run this if zerovar is empty
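A one-line sketch that covers both cases, so neither line above needs toggling by hand:

newdata1 <- if (length(zerovar) > 0) V[ ,-zerovar] else V   # drop flagged columns only when any were found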

############ Removing multicollinearity ############

descrCorr<-cor(newdata1)     # correlation matrix of the predictors

highCorr <- findCorrelation(descrCorr, 0.99)   

#drop variables with pairwise correlation above 0.99 (my spectra are so strongly correlated that a lower cutoff removes nearly every feature)

newdata2 <- newdata1[, -highCorr]

#findLinearCombos found no exact linear dependencies in my data, so the block below stays commented out

#comboInfo<- findLinearCombos(newdata2)

#comboInfo$remove        ### if this is empty, skip the next line

#newdata2<-newdata2[ ,-comboInfo$remove]
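One caveat worth guarding against: both findCorrelation() and findLinearCombos() can come back empty, and newdata1[, -integer(0)] silently drops every column. A defensive sketch over the same objects:

highCorr <- findCorrelation(descrCorr, cutoff = 0.99)
newdata2 <- if (length(highCorr) > 0) newdata1[ ,-highCorr] else newdata1
comboInfo <- findLinearCombos(newdata2)
if (!is.null(comboInfo$remove)) newdata2 <- newdata2[ ,-comboInfo$remove]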

############ Standardizing and imputing missing values ############

### running vs. skipping this step changes the results. A likely reason: preProcess() centers and scales by default, which matters for scale-sensitive models (pls, nnet, knn) but hardly for tree-based ones

#Process = preProcess(newdata2)

#newdata3 = predict(Process, newdata2)

newdata3<-newdata2
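If the step is wanted, a minimal sketch that makes the choices explicit (these method values are standard preProcess options; "knnImpute" is just one way to fill missing values):

Process = preProcess(newdata2, method = c("center", "scale", "knnImpute"))
newdata3 = predict(Process, newdata2)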

########### Splitting the data ############

inTrain = createDataPartition(lable, p = 3/4, list = FALSE)

trainx = newdata3[inTrain,]

testx = newdata3[-inTrain,]

trainy = lable[inTrain]

testy = lable[-inTrain]
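createDataPartition() draws a random stratified split, so every run produces a different partition; fixing a seed first makes it reproducible (the seed value is arbitrary):

set.seed(123)   # any fixed number works; it just pins down the random split
inTrain = createDataPartition(lable, p = 3/4, list = FALSE)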

########################################################################################

############### Random forest: backward feature selection with rfe #######

############### Goal: choose a suitable number of features

### cross-validation settings

ctrl= rfeControl(functions = rfFuncs, method = "cv",number= 5,repeats=30)

                      #    verbose = FALSE, returnResamp = "final")

                     ## note: rfFuncs is passed unquoted; functions = "rfFuncs" throws an error
                     ## repeats takes effect only with method = "repeatedcv"; with "cv" it is ignored

subsets <- c(25,30,35,40,45,50,60,65,70)   ### candidate subset sizes, e.g. c(10,30,50,70)

Profile <- rfe(trainx,trainy,sizes=subsets,rfeControl = ctrl)

#### print the results

print(Profile)          

#### plot

plot(Profile,main = "Random forest backward feature selection (rfe)")

#### the selected features

Profile$optVariables    
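If repeated resampling is wanted for the feature selection itself, a sketch (5-fold CV repeated 5 times; the repeat count is an arbitrary choice of mine):

ctrl2 = rfeControl(functions = rfFuncs, method = "repeatedcv", number = 5, repeats = 5)
Profile2 = rfe(trainx, trainy, sizes = subsets, rfeControl = ctrl2)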

###########################################################################################

############################# Model building and parameter tuning #########

### split into training and test samples again (this draws a fresh random split; reuse the seed above if the partitions should match)

newdata4=newdata3[,Profile$optVariables ]

inTrain = createDataPartition(lable, p = 3/4, list = FALSE)

trainx = newdata4[inTrain,]

testx = newdata4[-inTrain,]

trainy = lable[inTrain]

testy = lable[-inTrain]

#### define how models are trained

fitControl = trainControl(method = "repeatedcv", number =10,

                          repeats = 3)#,search="random",returnResamp = "all")

                       # number = 10 means 10-fold cross-validation; repeats is the number of repetitions

                       # with search = "random", tuneLength gives the number of random parameter combinations
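For a two-class problem it can pay to tune on ROC rather than accuracy; a sketch of a control object that enables this (twoClassSummary needs class probabilities, and the factor levels must be valid R names, which 'D'/'H' are):

fitControl2 = trainControl(method = "repeatedcv", number = 10, repeats = 3,
                           classProbs = TRUE, summaryFunction = twoClassSummary)
# then e.g.: train(trainx, trainy, method = "rf", metric = "ROC", trControl = fitControl2)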

################# "gbm"---提升树以分类树和回归树为基础########

gbmFit1 = train(trainx,trainy,method = "gbm",

                              trControl = fitControl,tuneLength = 10,verbose = F)

################# "treebag"---袋装决策树########

gbmFit2 = train(trainx,trainy,method = "treebag",

                trControl = fitControl,tuneLength = 10,verbose = F)

                                       ### no tunable parameters

################# "pls"---线性回归########

gbmFit3 = train(trainx,trainy,method = "pls",

                trControl = fitControl,tuneLength = 10,verbose = F)

################# "nnet"---神经网络########

gbmFit4 = train(trainx,trainy,method = "nnet",

                trControl = fitControl,tuneLength = 10,verbose = F)

################# "svmLinear3"---支持向量机########

######## library("kernlab")     #install.packages("kernlab")

######## gbmFit5= train(trainx, trainy,method = "svmLinear",

######                             trControl = fitControl,

######                              tuneLength = 10,verbose = F) 

###此方法在提取判别概率时,总出错---extractProb。

############### "rf"---随机森林######

gbmFit5= train(trainx, trainy,method = "rf",trControl = fitControl

               ,tuneLength = 10)#,verbose = F)

### "knn"------K邻近############

gbmFit6= train(trainx, trainy,method = "knn",trControl = fitControl

               ,tuneLength = 10)

                                #### class labels must not be "0"/"1" (factor levels should be valid R names); knn also takes no verbose argument, so it is left out

############### "naive_bayes"---朴素贝叶斯######

library(naivebayes)

gbmFit7= train(trainx, trainy,method = "naive_bayes",trControl = fitControl

                              ,tuneLength = 10)#,verbose = F)

###############################################################################

#### collect the fitted models. tuneLength = 10 means 10 candidate values per tuning parameter (or, with search = "random", 10 random parameter combinations)

models = list( gbmFit1,gbmFit2,gbmFit3, gbmFit4,gbmFit5,gbmFit6,gbmFit7)

### plotting with plot(gbmFit3) adds little here, since random parameter search was not enabled
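caret can also compare the resampling performance of the fitted models directly; a sketch (the list names are arbitrary; for a strictly fair comparison, set the same seed before each train() call so all models see the same folds):

resamps = resamples(list(gbm = gbmFit1, treebag = gbmFit2, pls = gbmFit3,
                         nnet = gbmFit4, rf = gbmFit5, knn = gbmFit6, nb = gbmFit7))
summary(resamps)   # accuracy and kappa across resamples
bwplot(resamps)    # side-by-side boxplots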

#######################################################################

## extract predictions and probabilities, then draw ROC curves with the ROCR package

library(ROCR)

# predicted values

predValues = extractPrediction(models,testX = testx, testY = testy)

head(predValues)

# keep only the test-set rows

testValues = subset(predValues, dataType == "Test")

# class probabilities: this requires every model to support probability estimates

probValues = extractProb(models,testX = testx, testY = testy)

testProbs = subset(probValues, dataType == "Test")
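testProbs carries one probability column per class level (here D and H) alongside obs, pred, model and dataType; the ROC code further down relies on the D column:

head(testProbs)   # expect columns like: D, H, obs, pred, model, dataType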

### pull out each model's test-set results

Pred1 = subset(testValues, model == "gbm")

Pred2 = subset(testValues, model == "treebag")

Pred3 = subset(testValues, model == "pls")

Pred4 = subset(testValues, model == "nnet")

Pred5 = subset(testValues, model == "rf")

Pred6 = subset(testValues, model == "knn")

Pred7 = subset(testValues, model == "naive_bayes")

## compute the confusion matrices --- pred = predicted class, obs = observed class

gbm <- confusionMatrix(Pred1$pred, Pred1$obs)

treebag <- confusionMatrix(Pred2$pred, Pred2$obs)

pls <- confusionMatrix(Pred3$pred, Pred3$obs)

nnet <- confusionMatrix(Pred4$pred, Pred4$obs)

rf <- confusionMatrix(Pred5$pred, Pred5$obs)

knn <- confusionMatrix(Pred6$pred, Pred6$obs)

bayes <- confusionMatrix(Pred7$pred, Pred7$obs)
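For a quick side-by-side comparison, the overall accuracies can be pulled out of the confusionMatrix objects above:

cms = list(gbm = gbm, treebag = treebag, pls = pls, nnet = nnet,
           rf = rf, knn = knn, bayes = bayes)
sapply(cms, function(cm) round(cm$overall["Accuracy"], 3))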

## draw ROC curves with the ROCR package

prob1 = subset(testProbs, model == "gbm")

prob2 = subset(testProbs, model == "treebag")

prob3 = subset(testProbs, model == "pls")

prob4 = subset(testProbs, model == "nnet")

prob5 = subset(testProbs, model == "rf")

prob6 = subset(testProbs, model == "knn")

prob7 = subset(testProbs, model == "naive_bayes")


######## "gbm" ###########

prob1$lable=ifelse(prob1$obs=="D",2,1)

pred1<- prediction(prob1$D, prob1$lable)

perf1 = performance(pred1, measure="tpr", x.measure="fpr" )

plot( perf1 , main="gbm")

########## "treebag" ########

prob2$lable=ifelse(prob2$obs=="D",2,1)

pred2<- prediction(prob2$D, prob2$lable)

perf2 = performance(pred2, measure="tpr", x.measure="fpr" )

plot( perf2, main="treebag" )

########## "pls" #########

prob3$lable=ifelse(prob3$obs=="D",2,1)

pred3<- prediction(prob3$D, prob3$lable)

perf3 = performance(pred3, measure="tpr", x.measure="fpr" )

plot( perf3 ,main="pls" )

########## "nnet" ##########

prob4$lable=ifelse(prob4$obs=="D",2,1)

pred4<- prediction(prob4$D, prob4$lable)

perf4 = performance(pred4, measure="tpr", x.measure="fpr" )

plot( perf4 , main="nnet")

########## "svm" ############

prob5$lable=ifelse(prob5$obs=="D",2,1)

pred5<- prediction(prob5$D, prob5$lable)

perf5 = performance(pred5, measure="tpr", x.measure="fpr" )

plot( perf5 , main="rf")

########## "knn" ###########

prob6$lable=ifelse(prob6$obs=="D",2,1)

pred6<- prediction(prob6$D, prob6$lable)

perf6 = performance(pred6, measure="tpr", x.measure="fpr" )

plot( perf6 ,main="knn" )

########## "bayes" ##########

prob7$lable=ifelse(prob7$obs=="D",2,1)

pred7<- prediction(prob7$D, prob7$lable)

perf7 = performance(pred7, measure="tpr", x.measure="fpr" )

plot( perf7 ,main="bayes" )
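The seven blocks above differ only in the model name, so they can also be collapsed into a loop that overlays all curves and reports the AUC (a sketch using ROCR's "auc" measure; colours are arbitrary):

model_names <- c("gbm", "treebag", "pls", "nnet", "rf", "knn", "naive_bayes")
plot(0:1, 0:1, type = "n", xlab = "fpr", ylab = "tpr", main = "ROC, all models")
for (i in seq_along(model_names)) {
  p    <- subset(testProbs, model == model_names[i])
  pred <- prediction(p$D, ifelse(p$obs == "D", 2, 1))   # 2 = positive class "D"
  plot(performance(pred, "tpr", "fpr"), add = TRUE, col = i)
  cat(model_names[i], "AUC:", round(performance(pred, "auc")@y.values[[1]], 3), "\n")
}
legend("bottomright", legend = model_names, col = seq_along(model_names), lty = 1)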
