得到监督模型后,我们可以改变输入值,比较模型输出对这些变化的敏感程度,以此评估不同特征对模型的重要性。
# Attach every package up front: caret (plus the lattice/ggplot2 packages it
# builds on) for training and variable importance, rpart for the tree model,
# and C50, which is where this script expects the churn data to come from.
library(lattice)
library(ggplot2)
library(caret)
library(rpart)
library(C50)

# Load the data only after C50 is attached, and only once: reloading it later
# would overwrite the column-filtered churnTrain built below.
# NOTE(review): newer C50 releases reportedly no longer bundle `churn` (it is
# said to live in the modeldata package as `mlc_churn`) - confirm against the
# installed version.
data(churn)
str(churnTrain)

# Drop the columns this analysis does not use as predictors.
drop_cols <- c("state", "area_code", "account_length")
churnTrain <- churnTrain[, !names(churnTrain) %in% drop_cols]

# Fix the RNG seed so the split below is reproducible.
set.seed(2)

# Draw a label (1 or 2) for every row with probabilities 0.7 / 0.3. The labels
# are sampled with replacement, so the split is approximately - not exactly -
# 70/30; each row still ends up in exactly one of the two sets.
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob = c(0.7, 0.3))
trainset <- churnTrain[ind == 1, ]
testset <- churnTrain[ind == 2, ]

# 10-fold cross-validation repeated 3 times.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Fit an rpart model on scaled predictors and report unscaled importance.
model <- train(
  churn ~ .,
  data = trainset,
  method = "rpart",
  preProcess = "scale",
  trControl = control
)
importance <- varImp(model, scale = FALSE)
importance
rpart variable importance
Overall
number_customer_service_calls 116.015
total_day_minutes 106.988
total_day_charge 100.648
international_planyes 86.789
voice_mail_planyes 25.974
total_eve_minutes 23.097
total_eve_charge 23.097
number_vmail_messages 19.885
total_intl_minutes 6.347
total_intl_calls 0.000
total_night_minutes 0.000
total_day_calls 0.000
total_night_calls 0.000
total_night_charge 0.000
total_eve_calls 0.000
total_intl_charge 0.000
利用plot函数绘制变量重要性图:
# Plot the variable-importance scores computed by varImp() above.
plot(importance)
rpart等一些分类算法包训练得到的模型对象中已经包含了变量重要性,我们可以直接输出该对象的相应字段来查看变量的重要性。
# Fit a plain rpart tree on the training set and read the importance scores
# that rpart stores on the fitted object.
model.rp <- rpart(churn ~ ., data = trainset)
model.rp[["variable.importance"]]
total_day_minutes total_day_charge number_customer_service_calls total_intl_minutes
111.645286 110.881583 58.486651 48.283228
total_intl_charge total_eve_charge total_eve_minutes international_plan
47.698379 47.166646 47.166646 42.194508
total_intl_calls number_vmail_messages voice_mail_plan total_night_calls
36.730344 19.884863 19.884863 7.195828
total_eve_calls total_night_charge total_night_minutes total_day_calls
3.553423 1.754547 1.754547 1.494986
从trainset中去掉非数值类型的属性,生成新的数据集new_train:
# Keep every column except the outcome and the two plan columns so that
# cor() can be applied. The mask is built from trainset's own column names:
# building it from another data frame only works while both frames happen to
# have identical columns.
non_numeric_cols <- c("churn", "international_plan", "voice_mail_plan")
new_train <- trainset[, !names(trainset) %in% non_numeric_cols]
计算每个属性之间的关联度
# Pairwise correlation matrix of the remaining columns.
cor_mat <- cor(new_train)
cor_mat
number_vmail_messages total_day_minutes total_day_calls total_day_charge total_eve_minutes
number_vmail_messages 1.000000e+00 -3.788346e-05 -0.015315725 -4.344686e-05 0.016058678
total_day_minutes -3.788346e-05 1.000000e+00 0.003940177 1.000000e+00 0.018136088
total_day_calls -1.531573e-02 3.940177e-03 1.000000000 3.942808e-03 -0.016774585
total_day_charge -4.344686e-05 1.000000e+00 0.003942808 1.000000e+00 0.018138428
total_eve_minutes 1.605868e-02 1.813609e-02 -0.016774585 1.813843e-02 1.000000000
total_eve_calls -1.715851e-02 2.421777e-02 0.001938560 2.422109e-02 -0.024822635
total_eve_charge 1.608561e-02 1.813039e-02 -0.016756722 1.813273e-02 0.999999775
total_night_minutes 1.536272e-02 7.287277e-03 0.019582169 7.286773e-03 -0.009225611
total_night_calls 7.575418e-03 2.334305e-02 -0.010994425 2.334289e-02 -0.001625934
total_night_charge 1.534769e-02 7.246376e-03 0.019590677 7.245871e-03 -0.009243068
total_intl_minutes 7.256768e-03 -1.623444e-02 0.019197349 -1.623804e-02 -0.012775313
total_intl_calls 1.513658e-02 1.428235e-02 0.003062639 1.428083e-02 -0.001403721
total_intl_charge 7.286473e-03 -1.620263e-02 0.019255703 -1.620623e-02 -0.012697993
number_customer_service_calls -2.210761e-02 -7.981699e-03 -0.017568292 -7.982226e-03 -0.012419994
total_eve_calls total_eve_charge total_night_minutes total_night_calls total_night_charge
number_vmail_messages -0.0171585053 0.016085609 0.015362721 0.007575418 0.015347687
total_day_minutes 0.0242177707 0.018130387 0.007287277 0.023343046 0.007246376
total_day_calls 0.0019385605 -0.016756722 0.019582169 -0.010994425 0.019590677
total_day_charge 0.0242210899 0.018132728 0.007286773 0.023342891 0.007245871
total_eve_minutes -0.0248226349 0.999999775 -0.009225611 -0.001625934 -0.009243068
total_eve_calls 1.0000000000 -0.024817028 -0.008842555 0.007155111 -0.008786733
total_eve_charge -0.0248170279 1.000000000 -0.009223415 -0.001612116 -0.009240862
total_night_minutes -0.0088425553 -0.009223415 1.000000000 0.026300284 0.999999233
total_night_calls 0.0071551108 -0.001612116 0.026300284 1.000000000 0.026261765
total_night_charge -0.0087867331 -0.009240862 0.999999233 0.026261765 1.000000000
total_intl_minutes 0.0008668991 -0.012791651 -0.005866862 0.002762074 -0.005879993
total_intl_calls 0.0079279293 -0.001397968 -0.012199350 0.015687833 -0.012174331
total_intl_charge 0.0008246073 -0.012714417 -0.005862733 0.002657183 -0.005875863
number_customer_service_calls 0.0063984603 -0.012415843 -0.001085209 -0.020231294 -0.001067571
total_intl_minutes total_intl_calls total_intl_charge number_customer_service_calls
number_vmail_messages 0.0072567683 0.015136582 0.0072864731 -0.022107609
total_day_minutes -0.0162344421 0.014282352 -0.0162026313 -0.007981699
total_day_calls 0.0191973487 0.003062639 0.0192557029 -0.017568292
total_day_charge -0.0162380367 0.014280828 -0.0162062318 -0.007982226
total_eve_minutes -0.0127753135 -0.001403721 -0.0126979931 -0.012419994
total_eve_calls 0.0008668991 0.007927929 0.0008246073 0.006398460
total_eve_charge -0.0127916505 -0.001397968 -0.0127144174 -0.012415843
total_night_minutes -0.0058668622 -0.012199350 -0.0058627326 -0.001085209
total_night_calls 0.0027620735 0.015687833 0.0026571833 -0.020231294
total_night_charge -0.0058799926 -0.012174331 -0.0058758631 -0.001067571
total_intl_minutes 1.0000000000 0.044054461 0.9999928823 -0.015930448
total_intl_calls 0.0440544614 1.000000000 0.0441335428 -0.018235001
total_intl_charge 0.9999928823 0.044133543 1.0000000000 -0.015940717
number_customer_service_calls -0.0159304482 -0.018235001 -0.0159407173 1.000000000
调用findCorrelation函数找到关联度超过0.75的属性
# Column indices that findCorrelation() flags at a cutoff of 0.75.
highly_correlations <- findCorrelation(x = cor_mat, cutoff = 0.75)
highly_correlations
[1] 11 4 5 8
输出这些高度关联的属性的名称:
# Translate the flagged column indices into column names.
colnames(new_train)[highly_correlations]
[1] "total_intl_minutes" "total_day_charge" "total_eve_minutes" "total_night_minutes"
为了得到每个属性的相关值,需要先去掉非数值类型的属性,然后通过相关性计算得到一个关联度矩阵,再将关联度阈值设定为0.75,找出高度相关的属性,包括total_intl_minutes,total_day_charge,total_eve_minutes,total_night_minutes。
将训练数据集trainset中名为international_plan的特征转化为intl_yes,intl_no:
# Expand international_plan into one indicator column per level ("- 1" drops
# the intercept), then give the two columns short names.
intl_plan <- model.matrix(
  ~ trainset$international_plan - 1,
  data = data.frame(trainset$international_plan)
)
colnames(intl_plan) <- c(
  "trainset$international_planno" = "intl_no",
  "trainset$international_planyes" = "intl_yes"
)
将训练数据集trainset中名为voice_mail_plan的特征转化成voice_yes与voice_no:
# Expand voice_mail_plan into one indicator column per level ("- 1" drops the
# intercept), then give the two columns short names.
voice_plan <- model.matrix(
  ~ trainset$voice_mail_plan - 1,
  data = data.frame(trainset$voice_mail_plan)
)
colnames(voice_plan) <- c(
  "trainset$voice_mail_planno" = "voice_no",
  "trainset$voice_mail_planyes" = "voice_yes"
)
去掉international_plan和voice_mail_plan这两个属性,再将训练数据集trainset与intl_plan、voice_plan两个指示变量矩阵合并。
# Remove the two original factor columns, then put the indicator columns in
# front of the remaining training columns.
trainset[["international_plan"]] <- NULL
trainset[["voice_mail_plan"]] <- NULL
trainset <- cbind(intl_plan, voice_plan, trainset)
同理对测试数据集做同样的处理:
# Apply the same encoding to the test set: build indicator columns for both
# plan factors, rename them, drop the original factor columns, and bind the
# indicators in front of the remaining test columns.
intl_plan <- model.matrix(
  ~ testset$international_plan - 1,
  data = data.frame(testset$international_plan)
)
colnames(intl_plan) <- c(
  "testset$international_planno" = "intl_no",
  "testset$international_planyes" = "intl_yes"
)

voice_plan <- model.matrix(
  ~ testset$voice_mail_plan - 1,
  data = data.frame(testset$voice_mail_plan)
)
colnames(voice_plan) <- c(
  "testset$voice_mail_planno" = "voice_no",
  "testset$voice_mail_planyes" = "voice_yes"
)

testset[["international_plan"]] <- NULL
testset[["voice_mail_plan"]] <- NULL
testset <- cbind(intl_plan, voice_plan, testset)
使用线性判别分析方法创建一个特征筛选算法:
# Feature-selection settings: caret's LDA helper functions, evaluated with
# cross-validation.
ldacontrol <- rfeControl(functions = ldaFuncs, method = "cv")
使用大小从1到18的特征子集,对训练数据集trainset进行反向(递归)特征筛选:
# Run recursive feature elimination over subset sizes 1 to 18, using every
# column except churn as a predictor and churn as the outcome.
rfe_x <- trainset[, !names(trainset) %in% c("churn")]
rfe_y <- trainset[, c("churn")]
ldaprofile <- rfe(rfe_x, rfe_y, sizes = 1:18, rfeControl = ldacontrol)
ldaprofile
Recursive feature selection
Outer resampling method: Cross-Validated (10 fold)
Resampling performance over subset size:
Variables Accuracy Kappa AccuracySD KappaSD Selected
1 0.8523 0.0000 0.001675 0.00000
2 0.8523 0.0000 0.001675 0.00000
3 0.8436 0.1400 0.011711 0.09055
4 0.8432 0.2076 0.010202 0.03927
5 0.8471 0.2321 0.016556 0.05733
6 0.8454 0.2308 0.015287 0.04411
7 0.8462 0.2369 0.014101 0.04268
8 0.8441 0.2220 0.016293 0.07222
9 0.8458 0.2284 0.016027 0.06877
10 0.8479 0.2377 0.017831 0.08265
11 0.8492 0.2481 0.018360 0.08050
12 0.8510 0.2542 0.016630 0.07754
13 0.8514 0.2577 0.017362 0.07950
14 0.8536 0.2695 0.016204 0.07610 *
15 0.8523 0.2693 0.016640 0.06845
16 0.8531 0.2713 0.016522 0.06908
17 0.8514 0.2624 0.016150 0.07040
18 0.8510 0.2612 0.015494 0.06877
The top 5 variables (out of 14):
total_day_charge, total_day_minutes, intl_no, intl_yes, number_customer_service_calls
绘制选择结果示意图:
# Plot the resampling profile (points joined by lines, with a grid), then list
# the variables kept in the selected subset.
plot(ldaprofile, type = c("o", "g"))
ldaprofile[["optVariables"]]
[1] "total_day_charge" "total_day_minutes" "intl_no" "intl_yes"
[5] "number_customer_service_calls" "total_eve_minutes" "total_eve_charge" "voice_yes"
[9] "total_intl_calls" "voice_no" "number_vmail_messages" "total_intl_charge"
[13] "total_intl_minutes" "total_night_minutes"
查看最终拟合得到的模型:
# Show the model stored in the `fit` element of the rfe result.
ldaprofile[["fit"]]
Call:
lda(x, y)
Prior probabilities of groups:
yes no
0.1477322 0.8522678
Group means:
total_day_charge total_day_minutes intl_no intl_yes number_customer_service_calls total_eve_minutes total_eve_charge voice_yes
yes 35.00143 205.8877 0.7046784 0.29532164 2.204678 213.7269 18.16702 0.1666667
no 29.62402 174.2555 0.9351242 0.06487582 1.441460 199.6197 16.96789 0.2954891
total_intl_calls voice_no number_vmail_messages total_intl_charge total_intl_minutes total_night_minutes
yes 4.134503 0.8333333 5.099415 2.899386 10.73684 205.4640
no 4.514445 0.7045109 8.674607 2.741343 10.15119 201.4184
Coefficients of linear discriminants:
LD1
total_day_charge 0.715025524
total_day_minutes -0.130486469
intl_no 1.129944662
intl_yes -1.129944662
number_customer_service_calls -0.421997335
total_eve_minutes 0.198406977
total_eve_charge -2.390372792
voice_yes 0.330463968
total_intl_calls 0.066240268
voice_no -0.330463968
number_vmail_messages -0.003529233
total_intl_charge 2.315069869
total_intl_minutes -0.693504606
total_night_minutes -0.002127471
最后,在测试数据集testset上进行预测,并用postResample计算预测性能(准确率与Kappa):
# Predict on the test-set predictors (all columns except churn) and compare
# the predictions with the observed churn values.
test_x <- testset[, !names(testset) %in% c("churn")]
test_pred <- predict(ldaprofile, test_x)
postResample(test_pred, testset[, c("churn")])
Accuracy Kappa
0.8605108 0.2672027
使用caret包完成特征的筛选,由于数据集包括了因子编码属性,首先调用model.matrix将这些因子转化成多个二元属性。
我们将重采样方法设置为交叉验证(cv),同时指定函数集ldaFuncs来完成线性判别分析,并调用递归特征筛选函数rfe来执行特征选择;rfe会在重采样的数据子集和筛选后的特征集基础上评估模型性能,并输出特征选择结果。
我们能够根据所得的模型基本信息来绘制变量个数与预测准确率之间的关系示意图,从图中可以知道模型中最合适的变量个数为14;最后我们计算得到模型在测试数据集上的预测准确度为0.86,kappa的检测结果为0.27。