相信能看到这篇文章的同僚们都已经明白 SMOTE 算法是什么,在此就不过多介绍了,直接看应用:
# Load the raw white wine data that the rest of the script pre-processes
# with the SMOTE algorithm.
whitewine <- read.csv(
  file = "C:/Users/Administrator/Desktop/实验数据集/whitewines.csv"
)
上面读取的是我自己电脑中存储的数据,请把路径换成你自己的数据文件,切勿直接复制粘贴。
# Piecewise relabelling by logical indexing: comparing a vector with a scalar
# yields a logical vector, which is then used to select the elements to set.
# The scores are kept as a plain vector; reshaping them into a 2-row matrix
# added nothing and pads the data when the number of rows is odd.
w <- whitewine$quality
summary(whitewine$quality)
length(w)
y <- numeric(length(w))
y[w > 4] <- 1   # quality above 4     -> class 1
y[w <= 4] <- 2  # quality 4 or below  -> class 2
y
# y is deliberately NOT written back to whitewine$quality. The ifelse() call
# that follows in this script recodes the raw scores itself; if it received
# labels that are already 1/2, `quality > 4` would be FALSE everywhere and
# every row would end up in class 2.
stopifnot(length(y) == nrow(whitewine))
当然,比上述写法更简单的是下面这种(两种写法只需运行其中一种):
# Recode the label so the data become a two-class, imbalanced problem:
# quality above 4 -> 1, everything else -> 2.
is_above_four <- whitewine$quality > 4
whitewine$quality <- ifelse(is_above_four, 1, 2)
summary(whitewine$quality)
quality_counts <- table(whitewine$quality)
quality_counts
prop.table(quality_counts)
# Quick look at the data. All attributes are numeric, so no dummy coding is
# needed; standardization may still be required later.
head(whitewine, n = 3)
# Split the data into training and test sets (method 1 of 2): shuffle the
# rows with a seeded random permutation, then cut by position.
set.seed(12345)
n_rows <- nrow(whitewine)  # derived from the data instead of a literal 4898
n_train <- 4000            # training-set size used throughout this script
stopifnot(n_rows > n_train)
whitewine_rand <- whitewine[order(runif(n_rows)), ]
summary(whitewine_rand$quality)
whitewine_rand_train <- whitewine_rand[seq_len(n_train), ]
whitewine_rand_test <- whitewine_rand[(n_train + 1):n_rows, ]
# Class balance of the two partitions.
prop.table(table(whitewine_rand_test$quality))
table(whitewine_rand_train$quality)
class(whitewine_rand_train$quality)
# Split method 2 of 2: caret's createDataPartition().
library(caret)
set.seed(12345)
# The label is passed as a factor so that sampling happens within each class.
# For a numeric y, createDataPartition() bins the values by percentiles; with
# a 1/2 label dominated by one value those bins can collapse into one, which
# leaves the rare class unstratified. `times` is also spelled out in full
# rather than relying on partial argument matching.
splitIndex <- createDataPartition(
  factor(whitewine$quality),
  times = 1,
  p = 0.817,
  list = FALSE
)
trainSplit <- whitewine[splitIndex, ]
testSplit <- whitewine[-splitIndex, ]
table(testSplit$quality)
prop.table(table(trainSplit$quality))
class(trainSplit$quality)
# Build a C5.0 model on the training set from the first split method.
whitewine_rand_train$quality <- factor(whitewine_rand_train$quality)
library(C50)
# Select the predictors by name rather than by dropping column 12, so the
# model does not silently include the label if the column order changes.
predictor_cols <- setdiff(names(whitewine_rand_train), "quality")
whitewine_model_a <- C5.0(
  whitewine_rand_train[predictor_cols],
  whitewine_rand_train$quality
)
summary(whitewine_model_a)
whitewine_pred_a <- predict(
  whitewine_model_a,
  whitewine_rand_test,
  type = "class"
)
library(gmodels)
# Confusion table of actual vs. predicted class. TRUE/FALSE are spelled out
# because T and F are ordinary variables that can be reassigned.
CrossTable(
  whitewine_rand_test$quality,
  whitewine_pred_a,
  prop.chisq = FALSE,
  prop.c = FALSE,
  prop.r = FALSE,
  dnn = c('actual quality', 'predict quality')
)
# Draw the ROC curve for the first model's test-set predictions.
library(pROC)
class(whitewine_pred_a)
whitewine_pred_a
# Convert the predicted class labels to numeric before handing them to roc().
whitewine_pred_a <- as.numeric(whitewine_pred_a)
auc <- roc(
  response = whitewine_rand_test$quality,
  predictor = whitewine_pred_a
)
print(auc)
plot(
  auc,
  ylim = c(0, 1),
  print.thres = TRUE,
  main = paste('AUC', round(auc$auc[[1]], 2))
)
# Horizontal reference lines at y = 1 (blue) and y = 0 (red).
abline(h = 1, col = "blue", lwd = 2)
abline(h = 0, col = "red", lwd = 2)
下面我将数据导出为 ARFF 格式,以便在 Weka 中运行:
# Export the relabelled data as ARFF for use in Weka. write.arff() is called
# through the 'foreign' namespace because this script never attaches a
# package that provides it; the bare call fails with "could not find function".
foreign::write.arff(whitewine, file = "C:/Users/Administrator/Desktop/实验数据集/whitewine_a.arff")
# Rebalance the training set with SMOTE; the data are not standardized here.
# NOTE(review): DMwR appears to have been archived on CRAN - confirm it is
# still installable in the target environment.
library(DMwR)
class(whitewine_rand_train$quality)  # the target variable must be a factor
table(whitewine_rand_train$quality)
whitewine_rand_train <- SMOTE(
  form = quality ~ .,
  data = whitewine_rand_train,
  perc.over = 600,
  perc.under = 100
)
prop.table(table(whitewine_rand_train$quality))
# Rebuild the C5.0 model on the SMOTE-processed training set (first split
# method) and evaluate it on the untouched test set.
whitewine_rand_train$quality <- factor(whitewine_rand_train$quality)
library(C50)
# Select the predictors by name rather than by dropping column 12, so the
# model does not silently include the label if the column order changes.
predictor_cols <- setdiff(names(whitewine_rand_train), "quality")
whitewine_model_b <- C5.0(
  whitewine_rand_train[predictor_cols],
  whitewine_rand_train$quality
)
summary(whitewine_model_b)
whitewine_pred_b <- predict(
  whitewine_model_b,
  whitewine_rand_test,
  type = "class"
)
library(gmodels)
# Confusion table of actual vs. predicted class. TRUE/FALSE are spelled out
# because T and F are ordinary variables that can be reassigned.
CrossTable(
  whitewine_rand_test$quality,
  whitewine_pred_b,
  prop.chisq = FALSE,
  prop.c = FALSE,
  prop.r = FALSE,
  dnn = c('actual quality', 'predict quality')
)
# Draw the ROC curve for model b, then export the SMOTE-processed training
# set and the unprocessed test set as ARFF files.
library(pROC)
class(whitewine_pred_b)
whitewine_pred_b
# Convert the predicted class labels to numeric before handing them to roc().
whitewine_pred_b <- as.numeric(whitewine_pred_b)
auc <- roc(whitewine_rand_test$quality, whitewine_pred_b)
print(auc)
plot(
  auc,
  ylim = c(0, 1),
  print.thres = TRUE,
  main = paste('AUC', round(auc$auc[[1]], 2))
)
abline(h = 1, col = "blue", lwd = 2)
abline(h = 0, col = "red", lwd = 2)
# write.arff() is called through the 'foreign' namespace because this script
# never attaches a package that provides it.
foreign::write.arff(whitewine_rand_train, file = "C:/Users/Administrator/Desktop/实验数据集/whitewine_train.arff")
# The test labels are already coded 1/2 and are left as they are. The former
# recode `ifelse(quality == 0, 1, 2)` matched no row (the label is never 0),
# so it turned every test label into 2 and left the later ROC step with a
# single-class response.
summary(whitewine_rand_test$quality)
foreign::write.arff(whitewine_rand_test, file = "C:/Users/Administrator/Desktop/实验数据集/whitewine_test.arff")
# Standardize the data first, then apply SMOTE, then build the model and
# predict.

#' Min-max rescale a numeric vector.
#'
#' @param x A numeric vector.
#' @param lower Value mapped to 0. Defaults to the minimum of `x`
#'   (ignoring `NA`).
#' @param upper Value mapped to 1. Defaults to the maximum of `x`
#'   (ignoring `NA`).
#' @return A numeric vector the same length as `x`. `NA`s stay `NA`. When
#'   `upper == lower` (for example a constant vector) all non-missing values
#'   are returned as 0 instead of `NaN`.
normalize <- function(x,
                      lower = min(x, na.rm = TRUE),
                      upper = max(x, na.rm = TRUE)) {
  # Nothing to scale; returning early also avoids min()/max() warnings.
  if (length(x) == 0 || all(is.na(x))) {
    return(as.numeric(x))
  }
  span <- upper - lower
  if (span == 0) {
    # Zero range: avoid dividing by zero.
    flat <- rep(0, length(x))
    flat[is.na(x)] <- NA
    return(flat)
  }
  (x - lower) / span
}
# Recover the un-resampled training rows. whitewine_rand_train was replaced
# by its SMOTE output earlier in this script, so normalizing it and calling
# SMOTE again would resample already-synthetic data. The original training
# rows are the shuffled rows that are not part of the test set.
is_test_row <- rownames(whitewine_rand) %in% rownames(whitewine_rand_test)
whitewine_train_raw <- whitewine_rand[!is_test_row, ]
# Predictors are selected by name rather than by the positions 1:11 / -12.
feature_cols <- setdiff(names(whitewine_train_raw), "quality")

# Min-max scale every predictor of the training set.
whitewine_rand_train_n <- lapply(whitewine_train_raw[feature_cols], normalize)
summary(whitewine_rand_train_n$alcohol)
whitewine_rand_train_n$quality <- whitewine_train_raw$quality
whitewine_rand_train_n <- as.data.frame(whitewine_rand_train_n)
summary(whitewine_rand_train_n$quality)

# Rebalance the standardized training data with SMOTE.
library(DMwR)
class(whitewine_rand_train_n$quality)  # the target variable must be a factor
whitewine_rand_train_n$quality <- factor(whitewine_rand_train_n$quality)
table(whitewine_rand_train_n$quality)
whitewine_rand_train_n <- SMOTE(
  quality ~ .,
  whitewine_rand_train_n,
  perc.over = 600,
  perc.under = 100
)
table(whitewine_rand_train_n$quality)
prop.table(table(whitewine_rand_train_n$quality))

# Build the model on the processed training set. The test set must be
# standardized as well.
whitewine_rand_train_n$quality <- factor(whitewine_rand_train_n$quality)
library(C50)
whitewine_model_c <- C5.0(
  whitewine_rand_train_n[feature_cols],
  whitewine_rand_train_n$quality
)
summary(whitewine_model_c)

# Scale the test set with the TRAINING minimum and maximum. Scaling it with
# its own range would put the two sets on different scales, so the split
# thresholds learned by the tree would not line up with the test values.
train_min <- vapply(
  whitewine_train_raw[feature_cols],
  function(col) min(as.numeric(col)),
  numeric(1)
)
train_max <- vapply(
  whitewine_train_raw[feature_cols],
  function(col) max(as.numeric(col)),
  numeric(1)
)
whitewine_rand_test_n <- Map(
  function(col, lo, hi) (col - lo) / (hi - lo),
  whitewine_rand_test[feature_cols],
  train_min,
  train_max
)
whitewine_rand_test_n$quality <- whitewine_rand_test$quality
whitewine_rand_test_n <- as.data.frame(whitewine_rand_test_n)
summary(whitewine_rand_test_n$quality)
whitewine_pred_c <- predict(
  whitewine_model_c,
  whitewine_rand_test_n,
  type = "class"
)
library(gmodels)
# Confusion table of actual vs. predicted class.
CrossTable(
  whitewine_rand_test_n$quality,
  whitewine_pred_c,
  prop.chisq = FALSE,
  prop.c = FALSE,
  prop.r = FALSE,
  dnn = c('actual quality', 'predict quality')
)
# Draw the ROC curve for model c, then export the standardized training and
# test sets as ARFF files.
library(pROC)
class(whitewine_pred_c)
whitewine_pred_c
# Convert the predicted class labels to numeric before handing them to roc().
whitewine_pred_c <- as.numeric(whitewine_pred_c)
auc <- roc(whitewine_rand_test_n$quality, whitewine_pred_c)
print(auc)
plot(
  auc,
  ylim = c(0, 1),
  print.thres = TRUE,
  main = paste('AUC', round(auc$auc[[1]], 2))
)
abline(h = 1, col = "blue", lwd = 2)
abline(h = 0, col = "red", lwd = 2)
# write.arff() is called through the 'foreign' namespace because this script
# never attaches a package that provides it.
foreign::write.arff(whitewine_rand_train_n, file = "C:/Users/Administrator/Desktop/实验数据集/whitewine_train_n.arff")
foreign::write.arff(whitewine_rand_test_n, file = "C:/Users/Administrator/Desktop/实验数据集/whitewine_test_n.arff")
# Build a model on the partitions from the second split method:
# bagged trees ("treebag") tuned with 5-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 5)
library(ipred)
library(plyr)
class(trainSplit$quality)
tbmodel <- train(
  quality ~ .,
  data = trainSplit,
  method = "treebag",
  trControl = ctrl
)
# Every column except the label is a predictor.
is_predictor <- names(trainSplit) != 'quality'
predictors <- names(trainSplit)[is_predictor]
pred <- predict(tbmodel$finalModel, testSplit[, predictors])
library(pROC)
auc <- roc(response = testSplit$quality, predictor = pred)
# The author's run reported an AUC of 0.8947; it may be unreliable because
# the data set is imbalanced.
print(auc)
plot(
  auc,
  ylim = c(0, 1),
  print.thres = TRUE,
  main = paste('AUC', round(auc$auc[[1]], 2))
)
abline(h = 1, col = "blue", lwd = 2)
abline(h = 0, col = "red", lwd = 2)
# Rebalance the second split's training set with SMOTE; the data are not
# standardized here.
library(DMwR)
class(trainSplit$quality)  # the target variable must be a factor
trainSplit$quality <- factor(trainSplit$quality)
table(trainSplit$quality)
trainSplit <- SMOTE(
  form = quality ~ .,
  data = trainSplit,
  perc.over = 600,
  perc.under = 100
)
prop.table(table(trainSplit$quality))
# Rebuild the bagged-tree model on the SMOTE-processed training set, then
# export both partitions as ARFF files for Weka.
ctrl <- trainControl(method = "cv", number = 5)
library(ipred)
library(plyr)
class(trainSplit$quality)
# Convert the factor label back to numbers through its labels. as.numeric()
# on a factor returns the internal level codes, which only match the labels
# by coincidence when the levels happen to be "1" and "2".
trainSplit$quality <- as.numeric(as.character(trainSplit$quality))
tbmodel <- train(
  quality ~ .,
  data = trainSplit,
  method = "treebag",
  trControl = ctrl
)
predictors <- names(trainSplit)[names(trainSplit) != 'quality']
pred <- predict(tbmodel$finalModel, testSplit[, predictors])
library(pROC)
auc <- roc(testSplit$quality, pred)
print(auc)
plot(
  auc,
  ylim = c(0, 1),
  print.thres = TRUE,
  main = paste('AUC', round(auc$auc[[1]], 2))
)
# write.arff() is called through the 'foreign' namespace because this script
# never attaches a package that provides it.
foreign::write.arff(trainSplit, file = "C:/Users/Administrator/Desktop/实验数据集/trainsplit1.arff")
foreign::write.arff(testSplit, file = "C:/Users/Administrator/Desktop/实验数据集/testsplit1.arff")
这是本人的第一篇博文,全是代码;有兴趣的读者可以联系我,我再详述整个过程。