Building a decision forest with stratified k-fold cross-validation (it does not feel much like a decision forest; it is closer in spirit to bagging)
library(RWeka)
library(partykit)
library(caret)
# class_col is the index of the column that stores the class labels
create_forest <- function(k_fold, data, class_col, n.fold){
  # number of folds
  k <- k_fold
  # list holding the stratified folds of the data
  dataSet <- list()
  # list holding the trees of the decision forest
  forest <- list()
  # hold-out accuracy of each tree
  acc <- c()
  # rename the class column to Species so the J48 formula below works
  colnames(data)[class_col] <- "Species"
  # species_name stores the class names
  species_name <- names(table(data[,class_col]))
  # class_num stores the number of classes
  class_num <- length(species_name)
  # species_num stores the sample count of each class
  species_num <- as.numeric(table(data[,class_col]))
  # fold_index holds the per-class fold boundaries used for stratified splitting
  fold_index <- matrix(0, nrow=class_num, ncol=(k+1))
  for(i in 1:class_num){
    base_num <- species_num[i]%/%k
    mod_num <- species_num[i]%%k
    fold_index[i, 1] <- 1
    for(j in 2:(k+1)){
      # spread the remainder over the first mod_num folds, one extra sample each
      if((j-1) <= mod_num){
        fold_index[i, j] <- fold_index[i, j-1] + base_num + 1
      } else{
        fold_index[i, j] <- fold_index[i, j-1] + base_num
      }
    }
  }
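  # Example: for iris (50 samples per class, k=5) every row of fold_index is
  # 1 11 21 31 41 51; for a hypothetical class of 52 samples it would be
  # 1 12 23 33 43 53, i.e. two folds of 11 samples and three folds of 10.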
  # stratified folding
  for(n in 1:n.fold){
    # shuffle the row indices of each class and store them in a list, one entry per class
    index <- list()
    for(i in 1:class_num){
      index[[i]] <- sample(which(data[,class_col]==species_name[i]),
                           length(which(data[,class_col]==species_name[i])),
                           replace=FALSE)
    }
    # assemble the k stratified folds into dataSet
    for(i in 1:k){
      # start from a one-row placeholder so rbind keeps the column types; drop it afterwards
      dataSet[[i]] <- data[1,]
      for(j in 1:class_num){
        dataSet[[i]] <- rbind(dataSet[[i]],
                              data[index[[j]][fold_index[j,i]:(fold_index[j,i+1]-1)],])
      }
      dataSet[[i]] <- dataSet[[i]][-1, ]
    }
    # fold i is the test set, the remaining folds form the training set; build one tree per fold
    for(i in 1:k){
      test <- dataSet[[i]]
      left <- dataSet[-i]
      # same one-row placeholder trick to initialise the training set
      train <- test[1, ]
      for(j in 1:length(left)){
        train <- rbind(train, left[[j]])
      }
      train <- train[-1, ]
      treeC4.5 <- J48(Species~., data=train)
      forest[[i+(n-1)*k]] <- treeC4.5
      acc[i+(n-1)*k] <- confusionMatrix(predict(treeC4.5, newdata=test),
                                        test$Species)$overall[1]
    }
  }
  # return the decision forest together with each tree's hold-out accuracy
  return(list(forest, acc))
}
# predict the class of samples with the constructed forest
# helper: return the name of the column with the highest score
myfun <- function(x){
  return(names(x)[which.max(x)])
}
mypre <- function(trees_acc, data, class_col){
  species <- names(table(data[,class_col]))
  forest <- trees_acc[[1]]
  acc <- trees_acc[[2]]
  nrows <- nrow(data)
  result <- c()
  # score matrix: one row per sample, one column per class; similar to the
  # class-probability output of a CART tree
  pre_score <- matrix(0, nrow=nrows, ncol=length(species))
  colnames(pre_score) <- species
  # let every tree predict all samples
  for(i in 1:length(forest)){
    pre <- predict(forest[[i]], newdata=data)
    # weight each vote by the tree's accuracy and update the score matrix
    for(j in 1:nrows){
      pre_score[j, as.character(pre[j])] <- pre_score[j, as.character(pre[j])] + acc[i]
    }
  }
  # the class with the highest score is the predicted class
  result <- apply(pre_score, 1, myfun)
  return(factor(result, levels=species))
}
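# Worked example of the weighted vote (illustrative numbers): if three trees with
# accuracies 0.96, 0.93 and 0.90 predict versicolor, versicolor and virginica for
# one row, that row's scores become setosa=0, versicolor=1.89, virginica=0.90,
# so versicolor is returned for it.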
# sanity check: score the forest on the original data set (optimistic, since every
# tree saw most of these rows during training)
trees_acc <- create_forest(5, iris, 5, 5)
result <- mypre(trees_acc, iris, 5)
confusionMatrix(result, iris$Species)
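# A fairer check, sketched here as an addition rather than part of the original
# recipe: hold out 20% of iris before building the forest and score mypre on the
# unseen rows. createDataPartition comes from caret; train_idx and the holdout_*
# names are introduced only for this example.
set.seed(1)
train_idx <- createDataPartition(iris$Species, p=0.8, list=FALSE)
holdout_trees <- create_forest(5, iris[train_idx, ], 5, 5)
holdout_pred <- mypre(holdout_trees, iris[-train_idx, ], 5)
confusionMatrix(holdout_pred, iris$Species[-train_idx])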