关注公共号:小程在线
关注CSDN博客:程志伟的博客
R版本:3.6.1
adaboost包:提供Bagging函数和Adaboot函数
> setwd('G:\\R语言\\大三下半年\\数据挖掘:R语言实战\\')
> data=read.csv("G:\\R语言\\大三下半年\\数据挖掘:R语言实战\\数据挖掘:R语言实战(案例数据集)\\10 集成学习\\bank.csv",header=T,sep=";")
> dim(data)
[1] 4521 17
> head(data)
age job marital education default balance housing loan contact
1 30 unemployed married primary no 1787 no no cellular
2 33 services married secondary no 4789 yes yes cellular
3 35 management single tertiary no 1350 yes no cellular
4 30 management married tertiary no 1476 yes yes unknown
5 59 blue-collar married secondary no 0 yes no unknown
6 35 management single tertiary no 747 no no cellular
day month duration campaign pdays previous poutcome y
1 19 oct 79 1 -1 0 unknown no
2 11 may 220 1 339 4 failure no
3 16 apr 185 1 330 1 failure no
4 3 jun 199 4 -1 0 unknown no
5 5 may 226 1 -1 0 unknown no
6 23 feb 141 2 176 3 failure no
#summary对离散变量统计个数,对连续性变量统计最大、最小,百分数
> summary(data)
age job marital education
Min. :19.00 management :969 divorced: 528 primary : 678
1st Qu.:33.00 blue-collar:946 married :2797 secondary:2306
Median :39.00 technician :768 single :1196 tertiary :1350
Mean :41.17 admin. :478 unknown : 187
3rd Qu.:49.00 services :417
Max. :87.00 retired :230
(Other) :713
default balance housing loan contact
no :4445 Min. :-3313 no :1962 no :3830 cellular :2896
yes: 76 1st Qu.: 69 yes:2559 yes: 691 telephone: 301
Median : 444 unknown :1324
Mean : 1423
3rd Qu.: 1480
Max. :71188
day month duration campaign
Min. : 1.00 may :1398 Min. : 4 Min. : 1.000
1st Qu.: 9.00 jul : 706 1st Qu.: 104 1st Qu.: 1.000
Median :16.00 aug : 633 Median : 185 Median : 2.000
Mean :15.92 jun : 531 Mean : 264 Mean : 2.794
3rd Qu.:21.00 nov : 389 3rd Qu.: 329 3rd Qu.: 3.000
Max. :31.00 apr : 293 Max. :3025 Max. :50.000
(Other): 571
pdays previous poutcome y
Min. : -1.00 Min. : 0.0000 failure: 490 no :4000
1st Qu.: -1.00 1st Qu.: 0.0000 other : 197 yes: 521
Median : -1.00 Median : 0.0000 success: 129
Mean : 39.77 Mean : 0.5426 unknown:3705
3rd Qu.: -1.00 3rd Qu.: 0.0000
Max. :871.00 Max. :25.0000
随机抽取data四分之一的序号
> sub=sample(1:nrow(data),round(nrow(data)/4))
> length(sub)
[1] 1130
#不包含与sub的数据为训练集
> data_train=data[-sub,]
#将包含于sub的数据为测试集
> data_test=data[sub,]
> dim(data_train);dim(data_test)
[1] 3391 17
[1] 1130 17
> #########应用案例##################
> library(adabag)
> library(rpart)
#使用bagging函数建模
> bag=bagging(y~.,data_train,mfinal=5)
> names((bag))
[1] "formula" "trees" "votes" "prob" "class"
[6] "samples" "importance" "terms" "call"
#变量的重要性
> bag$importance
age balance campaign contact day default
1.8922873 0.0000000 0.4540416 0.6697403 2.4357156 0.7193998
duration education housing job loan marital
45.9418566 1.0869881 0.9077231 8.8893773 0.0000000 1.3477648
month pdays poutcome previous
11.9151942 4.0980214 19.6418900 0.0000000
> #模型的第二颗树
> bag$trees[2]
[[1]]
n= 3391
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 3391 410 no (0.87909171 0.12090829)
2) duration< 640.5 3112 265 no (0.91484576 0.08515424)
4) poutcome=failure,other,unknown 3029 212 no (0.93000990 0.06999010)
8) duration< 211.5 1882 41 no (0.97821467 0.02178533) *
9) duration>=211.5 1147 171 no (0.85091543 0.14908457)
18) pdays< 373 1136 161 no (0.85827465 0.14172535) *
19) pdays>=373 11 1 yes (0.09090909 0.90909091) *
5) poutcome=success 83 30 yes (0.36144578 0.63855422)
10) month=apr,aug,feb,may,nov,oct 51 22 no (0.56862745 0.43137255)
20) job=admin.,blue-collar,housemaid,retired,technician,unknown 39 11 no (0.71794872 0.28205128) *
21) job=management,services,student,unemployed 12 1 yes (0.08333333 0.91666667) *
11) month=jan,jul,jun,mar,sep 32 1 yes (0.03125000 0.96875000) *
3) duration>=640.5 279 134 yes (0.48028674 0.51971326)
6) contact=telephone,unknown 102 36 no (0.64705882 0.35294118)
12) marital=married 67 13 no (0.80597015 0.19402985) *
13) marital=divorced,single 35 12 yes (0.34285714 0.65714286)
26) education=secondary 13 4 no (0.69230769 0.30769231) *
27) education=primary,tertiary 22 3 yes (0.13636364 0.86363636) *
7) contact=cellular 177 68 yes (0.38418079 0.61581921)
14) month=apr,aug,dec,jul,nov 114 54 yes (0.47368421 0.52631579)
28) duration< 825 60 22 no (0.63333333 0.36666667)
56) education=secondary,tertiary,unknown 53 16 no (0.69811321 0.30188679)
112) duration>=658 46 10 no (0.78260870 0.21739130) *
113) duration< 658 7 1 yes (0.14285714 0.85714286) *
57) education=primary 7 1 yes (0.14285714 0.85714286) *
29) duration>=825 54 16 yes (0.29629630 0.70370370)
58) job=admin.,blue-collar,entrepreneur,housemaid,retired 15 4 no (0.73333333 0.26666667) *
59) job=management,self-employed,services,student,technician,unemployed 39 5 yes (0.12820513 0.87179487) *
15) month=feb,jan,jun,mar,may 63 14 yes (0.22222222 0.77777778) *
> #模型bag对样本的投票
> bag$votes[105:115,]
[,1] [,2]
[1,] 5 0
[2,] 5 0
[3,] 5 0
[4,] 5 0
[5,] 5 0
[6,] 5 0
[7,] 4 1
[8,] 5 0
[9,] 5 0
[10,] 5 0
[11,] 5 0
> #模型bag对样本的投票概率
> bag$prob[105:115,]
[,1] [,2]
[1,] 1.0 0.0
[2,] 1.0 0.0
[3,] 1.0 0.0
[4,] 1.0 0.0
[5,] 1.0 0.0
[6,] 1.0 0.0
[7,] 0.8 0.2
[8,] 1.0 0.0
[9,] 1.0 0.0
[10,] 1.0 0.0
[11,] 1.0 0.0
> #模型bag的预测类别
> bag$class[105:115]
[1] "no" "no" "no" "no" "no" "no" "no" "no" "no" "no" "no"
> #模型bag的模型抽样情况
> bag$samples[105:115]
[1] 2436 2 1135 171 2616 2252 2053 964 482 1193 19
> #模型bag的模型抽样情况
> bag$samples[105:115,]
[,1] [,2] [,3] [,4] [,5]
[1,] 2436 3309 1413 1983 1344
[2,] 2 1610 2143 771 841
[3,] 1135 2762 219 2554 3308
[4,] 171 3172 1870 2456 1060
[5,] 2616 652 1303 18 2491
[6,] 2252 174 2794 734 1309
[7,] 2053 2756 2685 2877 1652
[8,] 964 874 2267 1256 1977
[9,] 482 2337 989 1530 679
[10,] 1193 2274 1902 1813 2917
[11,] 19 2775 1956 419 2153
> #将树的深度降为3
> bag1=bagging(y~.,data_train,mfinal=5,control=rpart.control(maxdepth=3))
> #通过control参数控制基分类树的复杂度
> bag1$tree[2]#查看第二课子树的具体结构
[[1]]
n= 3391
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 3391 398 no (0.88263049 0.11736951)
2) duration< 647 3129 248 no (0.92074145 0.07925855)
4) poutcome=failure,other,unknown 3025 182 no (0.93983471 0.06016529) *
5) poutcome=success 104 38 yes (0.36538462 0.63461538)
10) duration< 118.5 11 0 no (1.00000000 0.00000000) *
11) duration>=118.5 93 27 yes (0.29032258 0.70967742) *
3) duration>=647 262 112 yes (0.42748092 0.57251908)
6) job=blue-collar,entrepreneur,housemaid,unemployed 88 36 no (0.59090909 0.40909091) *
7) job=admin.,management,retired,self-employed,services,student,technician 174 60 yes (0.34482759 0.65517241)
14) duration< 775 59 27 no (0.54237288 0.45762712) *
15) duration>=775 115 28 yes (0.24347826 0.75652174) *
> #使用bag模型对测试集进行预测
> pre_bag=predict(bag,data_test)
> names(pre_bag)
[1] "formula" "votes" "prob" "class" "confusion" "error"
> pre_bag$votes[1:10,]
[,1] [,2]
[1,] 5 0
[2,] 5 0
[3,] 5 0
[4,] 3 2
[5,] 5 0
[6,] 1 4
[7,] 5 0
[8,] 3 2
[9,] 5 0
[10,] 5 0
> pre_bag$class[1:10]
[1] "no" "no" "no" "no" "no" "yes" "no" "no" "no" "no"
> #测试集预测结果的混淆矩阵
> pre_bag$confusion
Observed Class
Predicted Class no yes
no 984 97
yes 20 29
> #测试集预测错误率
> pre_bag$error
[1] 0.1035398
> sub_minor=which(data_test$y=="yes")#取少数“yes”在测试集中的编号
> sub_major=which(data_test$y=="no")#去多数类“no”在测试集中的编号
> err_bag=sum(pre_bag$class!=data_test$y)/nrow(data_test)#计算总体错误率err_bag
> err_minor_bag=sum(pre_bag$class[sub_minor]!=data_test$y[sub_minor])/length(sub_minor)
> #计算少数类“yes“的错误率err——minor_bag
> err_major_bag=sum(pre_bag$class[sub_major]!=data_test$y[sub_major])/length(sub_major)
> #计算no的
> err_bag;err_minor_bag;err_major_bag
[1] 0.1035398
[1] 0.7698413
[1] 0.01992032
>#######Adaboost算法###########
> boo=boosting(y~.,data_train,mfinal=5) #Adaboost算法,建立模型
> pre_boo=predict(boo,data_test) #预测
> err_boo=sum(pre_boo$class!=data_test$y)/nrow(data_test)#计算总体错误率
> err_minor_boo=sum(pre_boo$class[sub_minor]!=data_test$y[sub_minor])/length(sub_minor)#少数类yes的错误率
> err_major_boo=sum(pre_boo$class[sub_major]!=data_test$y[sub_major])/length(sub_major)#多数类no的错误率
> err_boo;err_minor_boo;err_major_boo
[1] 0.1132743
[1] 0.7222222
[1] 0.03685259