# 参考文章:https://blog.csdn.net/jiabiao1602/article/details/44975741
# 此代码主要用于我分析光谱数据,结果为二分类;此光谱数据共有128个波长,即128个特征。
# ---- Load the data ----
# The CSV is looked up relative to this hard-coded working directory.
setwd("C:/Users/chengdehe/Desktop/近红外")
# Author's note: the values in this file have already been normalised.
cheng <- read.csv("605_h4-1.csv", header = TRUE)
# Show the first 3 rows and 7 columns to check the data layout.
cheng[1:3, 1:7]

# ---- Split predictors and response ----
# Column 2 holds the class label; give it the header "type".
colnames(cheng)[2] <- "type"
# Predictors: everything except the first two columns.
V <- cheng[, -(1:2)]
# Response: column 2 as a factor with the two levels D and H.
lable <- factor(cheng[, 2], levels = c("D", "H"), labels = c("D", "H"))
# ---- Dimension reduction: drop near-zero-variance predictors ----
library(caret)
# Column indices flagged by nearZeroVar(); integer(0) means nothing was flagged.
zerovar <- nearZeroVar(V)
# Only drop columns when something was flagged: negative indexing with an
# empty vector (V[, -integer(0)]) would select zero columns instead of
# leaving V untouched.
newdata1 <- if (length(zerovar) > 0) V[, -zerovar, drop = FALSE] else V

# ---- Remove highly correlated predictors ----
# Pairwise correlations between the remaining predictors.
descrCorr <- cor(newdata1)
# Indices of predictors to drop at a correlation cutoff of 0.99.
# Author's note: the spectra are so strongly correlated that a smaller cutoff
# removes (almost) every predictor.
highCorr <- findCorrelation(descrCorr, 0.99)
# Same empty-index guard as above; drop = FALSE keeps a data frame even if
# only one predictor survives.
newdata2 <- if (length(highCorr) > 0) {
  newdata1[, -highCorr, drop = FALSE]
} else {
  newdata1
}

# Linear-combination check, disabled by the author (no exact collinearity was
# found in this data set). If re-enabled, guard against an empty
# comboInfo$remove in the same way as above.
# comboInfo <- findLinearCombos(newdata2)
# comboInfo$remove
# newdata2 <- newdata2[, -comboInfo$remove]
# ---- Standardisation / missing-value imputation (disabled) ----
# Author's note: results differed depending on whether this step was run, and
# the reason was not understood.
# NOTE(review): without a fixed seed the random split below also changes from
# run to run, which may account for part of that difference - confirm.
# Process <- preProcess(newdata2)
# newdata3 <- predict(Process, newdata2)
newdata3 <- newdata2

# ---- Train / test split ----
# Fix the RNG state so the partition (and everything resampled afterwards) is
# reproducible between runs. The seed value itself is arbitrary.
set.seed(1234)
# Stratified split on the class label: 3/4 of the rows go to training.
inTrain <- createDataPartition(lable, p = 3 / 4, list = FALSE)
trainx <- newdata3[inTrain, ]
testx <- newdata3[-inTrain, ]
trainy <- lable[inTrain]
testy <- lable[-inTrain]
########################################################################################
# ---- Random forest: recursive feature elimination (rfe) ----
# Goal: choose a suitable number of predictors.
# Resampling: 5-fold cross-validation repeated 30 times. `repeats` is only
# honoured with method = "repeatedcv"; with method = "cv" a single round of
# 5-fold CV is run and repeats = 30 is silently ignored.
# Other options the author experimented with: verbose = FALSE,
# returnResamp = "final".
# Note: rfFuncs must be passed as an object, not as the quoted string "rfFuncs".
ctrl <- rfeControl(
  functions = rfFuncs,
  method = "repeatedcv",
  number = 5,
  repeats = 30
)
# Candidate subset sizes to evaluate (alternative tried: c(10, 30, 50, 70)).
subsets <- c(25, 30, 35, 40, 45, 50, 60, 65, 70)
Profile <- rfe(trainx, trainy, sizes = subsets, rfeControl = ctrl)
# Print the resampling summary.
print(Profile)
# Plot performance against the number of predictors.
plot(Profile, main = "#随机森林#反向特征数筛选--rfe-30#")
# Names of the predictors in the selected subset.
Profile$optVariables
###########################################################################################
# ---- Modelling and tuning: data preparation ----
# Keep only the predictors selected by rfe; drop = FALSE keeps a data frame
# even when a single predictor was selected.
newdata4 <- newdata3[, Profile$optVariables, drop = FALSE]
# Reuse the partition that was in place during feature selection (inTrain)
# instead of drawing a new one. A fresh random split would move rows that rfe
# already saw into the test set, so the test-set metrics would be
# optimistically biased by that leakage.
trainx <- newdata4[inTrain, , drop = FALSE]
testx <- newdata4[-inTrain, , drop = FALSE]
trainy <- lable[inTrain]
testy <- lable[-inTrain]
# ---- Resampling scheme used when training the models ----
# number = 10 gives 10-fold cross-validation; repeats = 3 repeats it 3 times.
# Options the author left switched off: search = "random", returnResamp = "all".
# Author's note: with search = "random", tuneLength is the number of random
# parameter combinations that are tried.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)
# ---- Model fitting ----
# All fits share fitControl and tuneLength = 10. The object names gbmFit1..7
# are historical: only gbmFit1 is a gbm model.

# "gbm": boosted trees (author's note: built on classification/regression trees).
gbmFit1 <- train(
  x = trainx, y = trainy, method = "gbm",
  trControl = fitControl, tuneLength = 10, verbose = FALSE
)

# "treebag": bagged decision trees. Author's note: no tunable parameters.
gbmFit2 <- train(
  x = trainx, y = trainy, method = "treebag",
  trControl = fitControl, tuneLength = 10, verbose = FALSE
)

# "pls": partial least squares (the author labelled this "linear regression").
gbmFit3 <- train(
  x = trainx, y = trainy, method = "pls",
  trControl = fitControl, tuneLength = 10, verbose = FALSE
)

# "nnet": neural network.
gbmFit4 <- train(
  x = trainx, y = trainy, method = "nnet",
  trControl = fitControl, tuneLength = 10, verbose = FALSE
)

# "svmLinear": support vector machine - disabled by the author because
# extracting class probabilities with extractProb kept failing for it.
# library("kernlab")  # install.packages("kernlab")
# gbmFit5 <- train(trainx, trainy, method = "svmLinear",
#                  trControl = fitControl,
#                  tuneLength = 10, verbose = FALSE)

# "rf": random forest (verbose is not passed here).
gbmFit5 <- train(
  x = trainx, y = trainy, method = "rf",
  trControl = fitControl, tuneLength = 10
)

# "knn": k-nearest neighbours.
gbmFit6 <- train(
  x = trainx, y = trainy, method = "knn",
  trControl = fitControl, tuneLength = 10
)
# Author's notes: the class labels must not be "0"/"1"; passing verbose = F to
# this model was marked with "???" (apparently problematic - unconfirmed).

# "naive_bayes": naive Bayes (verbose is not passed here).
library(naivebayes)
gbmFit7 <- train(
  x = trainx, y = trainy, method = "naive_bayes",
  trControl = fitControl, tuneLength = 10
)
###############################################################################
# ---- Collect the fitted models and extract their predictions ----
# Author's note: tuneLength = 10 means 10 parameter combinations are evaluated
# within each model's tunable parameters.
models <- list(
  gbmFit1, gbmFit2, gbmFit3, gbmFit4,
  gbmFit5, gbmFit6, gbmFit7
)
# Author's note: plot(gbmFit3) is not informative here because no explicit
# tuning grid was defined.
#######################################################################
# ROCR is used further down to draw the ROC curves.
library(ROCR)
# Predicted classes from every model, including the held-out test rows.
predValues <- extractPrediction(models, testX = testx, testY = testy)
head(predValues)
# Keep only the rows that belong to the test set.
testValues <- subset(predValues, dataType == "Test")
# Predicted class probabilities. Author's note: this requires every model's
# prediction function to provide probability estimates.
probValues <- extractProb(models, testX = testx, testY = testy)
testProbs <- subset(probValues, dataType == "Test")
# ---- Test-set predictions per model ----
# One subset of testValues per model, selected by its `model` column.
Pred1 <- subset(testValues, model == "gbm")
Pred2 <- subset(testValues, model == "treebag")
Pred3 <- subset(testValues, model == "pls")
Pred4 <- subset(testValues, model == "nnet")
Pred5 <- subset(testValues, model == "rf")
Pred6 <- subset(testValues, model == "knn")
Pred7 <- subset(testValues, model == "naive_bayes")

# ---- Confusion matrices ----
# confusionMatrix(data, reference): `pred` is the predicted class and `obs`
# the observed class.
gbm     <- confusionMatrix(data = Pred1$pred, reference = Pred1$obs)
treebag <- confusionMatrix(data = Pred2$pred, reference = Pred2$obs)
pls     <- confusionMatrix(data = Pred3$pred, reference = Pred3$obs)
nnet    <- confusionMatrix(data = Pred4$pred, reference = Pred4$obs)
rf      <- confusionMatrix(data = Pred5$pred, reference = Pred5$obs)
knn     <- confusionMatrix(data = Pred6$pred, reference = Pred6$obs)
bayes   <- confusionMatrix(data = Pred7$pred, reference = Pred7$obs)
# ---- ROC curves on the test set (ROCR) ----
# Test-set class probabilities per model, selected by the `model` column.
prob1 <- subset(testProbs, model == "gbm")
prob2 <- subset(testProbs, model == "treebag")
prob3 <- subset(testProbs, model == "pls")
prob4 <- subset(testProbs, model == "nnet")
prob5 <- subset(testProbs, model == "rf")
prob6 <- subset(testProbs, model == "knn")
prob7 <- subset(testProbs, model == "naive_bayes")
library(ROCR)

# For every model the observed class is coded numerically (D = 2, H = 1) and
# paired with that model's predicted probability of class D.
# NOTE(review): this relies on ROCR treating the larger label as the positive
# class by default - see the prediction() documentation.
# Each prediction() call uses the labels of its OWN probability table; earlier
# versions reused prob1$lable for models 2-6, which only worked because every
# model happened to be scored on the same test rows in the same order.

# "gbm"
prob1$lable <- ifelse(prob1$obs == "D", 2, 1)
pred1 <- prediction(prob1$D, prob1$lable)
perf1 <- performance(pred1, measure = "tpr", x.measure = "fpr")
plot(perf1, main = "gbm")

# "treebag"
prob2$lable <- ifelse(prob2$obs == "D", 2, 1)
pred2 <- prediction(prob2$D, prob2$lable)
perf2 <- performance(pred2, measure = "tpr", x.measure = "fpr")
plot(perf2, main = "treebag")

# "pls"
prob3$lable <- ifelse(prob3$obs == "D", 2, 1)
pred3 <- prediction(prob3$D, prob3$lable)
perf3 <- performance(pred3, measure = "tpr", x.measure = "fpr")
plot(perf3, main = "pls")

# "nnet"
prob4$lable <- ifelse(prob4$obs == "D", 2, 1)
pred4 <- prediction(prob4$D, prob4$lable)
perf4 <- performance(pred4, measure = "tpr", x.measure = "fpr")
plot(perf4, main = "nnet")

# "rf" (this section was previously mislabelled "svm")
prob5$lable <- ifelse(prob5$obs == "D", 2, 1)
pred5 <- prediction(prob5$D, prob5$lable)
perf5 <- performance(pred5, measure = "tpr", x.measure = "fpr")
plot(perf5, main = "rf")

# "knn"
prob6$lable <- ifelse(prob6$obs == "D", 2, 1)
pred6 <- prediction(prob6$D, prob6$lable)
perf6 <- performance(pred6, measure = "tpr", x.measure = "fpr")
plot(perf6, main = "knn")

# "naive_bayes"
prob7$lable <- ifelse(prob7$obs == "D", 2, 1)
pred7 <- prediction(prob7$D, prob7$lable)
perf7 <- performance(pred7, measure = "tpr", x.measure = "fpr")
plot(perf7, main = "bayes")