Building a decision forest with stratified k-fold cross-validation (it does not feel much like a decision forest; it is closer in spirit to bagging)
library(RWeka)
library(partykit)
library(caret)
# class_col is the index of the column that stores the class labels
create_forest <- function(k_fold, data, class_col, n.fold){
  # number of folds
  k <- k_fold
  # list holding the stratified folds of the data
  dataSet <- list()
  # list holding the trees of the decision forest
  forest <- list()
  # hold-out accuracy of each tree
  acc <- c()
  # rename the class column to Species so the J48 formula below works
  colnames(data)[class_col] <- "Species"
  # species_name stores the class names
  species_name <- names(table(data[,class_col]))
  # class_num stores the number of classes
  class_num <- length(species_name)
  # species_num stores the sample count of each class
  species_num <- as.numeric(table(data[,class_col]))
  # fold_index holds the per-class fold boundaries used for stratified splitting
  fold_index <- matrix(0, nrow=class_num, ncol=(k+1))
  for(i in 1:class_num){
    base_num <- species_num[i]%/%k
    mod_num <- species_num[i]%%k
    fold_index[i, 1] <- 1
    for(j in 2:(k+1)){
      # spread the remainder over the first mod_num folds, one extra sample each
      if((j-1) <= mod_num){
        fold_index[i, j] <- fold_index[i, j-1] + base_num + 1
      } else{
        fold_index[i, j] <- fold_index[i, j-1] + base_num
      }
    }
  }
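  # Example: for iris (50 samples per class, k=5) every row of fold_index is
  # 1 11 21 31 41 51; for a hypothetical class of 52 samples it would be
  # 1 12 23 33 43 53, i.e. two folds of 11 samples and three folds of 10.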
  # stratified folding
  for(n in 1:n.fold){
    # shuffle the row indices of each class and store them in a list, one entry per class
    index <- list()
    for(i in 1:class_num){
      index[[i]] <- sample(which(data[,class_col]==species_name[i]),
                           length(which(data[,class_col]==species_name[i])),
                           replace=FALSE)
    }
    # assemble the k stratified folds into dataSet
    for(i in 1:k){
      # start from a one-row placeholder so rbind keeps the column types; drop it afterwards
      dataSet[[i]] <- data[1,]
      for(j in 1:class_num){
        dataSet[[i]] <- rbind(dataSet[[i]],
                              data[index[[j]][fold_index[j,i]:(fold_index[j,i+1]-1)],])
      }
      dataSet[[i]] <- dataSet[[i]][-1, ]
    }
    # fold i is the test set, the remaining folds form the training set; build one tree per fold
    for(i in 1:k){
      test <- dataSet[[i]]
      left <- dataSet[-i]
      # same one-row placeholder trick to initialise the training set
      train <- test[1, ]
      for(j in 1:length(left)){
        train <- rbind(train, left[[j]])
      }
      train <- train[-1, ]
      treeC4.5 <- J48(Species~., data=train)
      forest[[i+(n-1)*k]] <- treeC4.5
      acc[i+(n-1)*k] <- confusionMatrix(predict(treeC4.5, newdata=test),
                                        test$Species)$overall[1]
    }
  }
  # return the decision forest together with each tree's hold-out accuracy
  return(list(forest, acc))
}
# predict the class of samples with the constructed forest
# helper: return the name of the column with the highest score
myfun <- function(x){
  return(names(x)[which.max(x)])
}
mypre <- function(trees_acc, data, class_col){
  species <- names(table(data[,class_col]))
  forest <- trees_acc[[1]]
  acc <- trees_acc[[2]]
  nrows <- nrow(data)
  result <- c()
  # score matrix: one row per sample, one column per class; similar to the
  # class-probability output of a CART tree
  pre_score <- matrix(0, nrow=nrows, ncol=length(species))
  colnames(pre_score) <- species
  # let every tree predict all samples
  for(i in 1:length(forest)){
    pre <- predict(forest[[i]], newdata=data)
    # weight each vote by the tree's accuracy and update the score matrix
    for(j in 1:nrows){
      pre_score[j, as.character(pre[j])] <- pre_score[j, as.character(pre[j])] + acc[i]
    }
  }
  # the class with the highest score is the predicted class
  result <- apply(pre_score, 1, myfun)
  return(factor(result, levels=species))
}
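# Worked example of the weighted vote (illustrative numbers): if three trees with
# accuracies 0.96, 0.93 and 0.90 predict versicolor, versicolor and virginica for
# one row, that row's scores become setosa=0, versicolor=1.89, virginica=0.90,
# so versicolor is returned for it.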
# sanity check: score the forest on the original data set (optimistic, since every
# tree saw most of these rows during training)
trees_acc <- create_forest(5, iris, 5, 5)
result <- mypre(trees_acc, iris, 5)
confusionMatrix(result, iris$Species)
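# A fairer check, sketched here as an addition rather than part of the original
# recipe: hold out 20% of iris before building the forest and score mypre on the
# unseen rows. createDataPartition comes from caret; train_idx and the holdout_*
# names are introduced only for this example.
set.seed(1)
train_idx <- createDataPartition(iris$Species, p=0.8, list=FALSE)
holdout_trees <- create_forest(5, iris[train_idx, ], 5, 5)
holdout_pred <- mypre(holdout_trees, iris[-train_idx, ], 5)
confusionMatrix(holdout_pred, iris$Species[-train_idx])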