# Install randomForest only when it is missing, so that re-running the script
# does not reinstall the package (and hit the network) every time.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
# Note: confusionMatrix() is provided by the caret package, which this script
# does not load.

# Read the data set; the first line of data.txt supplies the column names.
npdata <- read.table("data.txt", header = TRUE)

# Fix the RNG seed so the same random split is drawn on every run.
set.seed(123)

# Split into training and test sets: draw 70% of the row indices at random,
# without replacement. floor() makes the integer sample size explicit.
train_sub <- sample(nrow(npdata), floor(0.7 * nrow(npdata)))
train_data <- npdata[train_sub, ]
# Select the remaining rows by positive index: negative indexing with an empty
# train_sub would return zero rows instead of all rows.
test_data <- npdata[setdiff(seq_len(nrow(npdata)), train_sub), ]
# Attach the randomForest package used for model fitting and diagnostics.
library(randomForest)

# Fit the random forest: npl is modelled on every other column of the
# training set, using 50 trees and with variable importance recorded.
# The seed is reset so the fitted forest is the same on every run.
set.seed(100)
np.rf <- randomForest(
  npl ~ .,
  data = train_data,
  importance = TRUE,
  ntree = 50
)
# Check in-sample accuracy: predict on the training set and plot the
# predictions against the observed npl values.
np_predict <- predict(np.rf, train_data)
plot(
  train_data$npl, np_predict,
  main = "训练集", xlab = "npl", ylab = "Predict"
)
# Reference line y = x (intercept 0, slope 1): points lying on it are
# predicted exactly. abline(1, 1) would draw y = x + 1 instead.
abline(0, 1)

# Assess out-of-sample performance: same plot for the held-out test set.
# np_predict is overwritten and ends up holding the test-set predictions.
np_predict <- predict(np.rf, test_data)
plot(
  test_data$npl, np_predict,
  main = "预测集", xlab = "npl", ylab = "Predict"
)
abline(0, 1)
# Print the fitted model summary to inspect the goodness of fit.
print(np.rf)

# Show the variable importance measures, first as a table and then as a plot.
importance(np.rf)
varImpPlot(np.rf, main = "variable importance")

# Plot the forest's error against the number of trees, to judge how many
# trees are enough.
plot(np.rf)
# Cross-validated error as a function of the number of predictors, used to
# decide how many variables to keep (10-fold CV via rfcv()).
# Predictors are selected by name (every column except the response npl)
# instead of dropping column 1, so the response can never leak into the
# predictor set if npl is not the first column of the data.
predictor_names <- setdiff(names(train_data), "npl")
result <- rfcv(train_data[predictor_names], train_data$npl, cv.fold = 10)

# Cross-validation error for each number of variables tried.
result$error.cv

# Error against number of variables, with a log-scaled x axis.
with(result, plot(n.var, error.cv, log = "x", type = "o", lwd = 2))