Reference: http://chiffon.gitcafe.io/2015/05/20/newtry.html#topofpage
The data come from UCI (http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening): a credit-card screening dataset whose variable names and meanings are not disclosed (presumably to protect privacy). This post uses logit, GBM (Gradient Boosting Machines), knn and xgboost to classify the data and compares their accuracy. The expected ranking is: xgboost > GBM > logit > knn
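If the file has not already been downloaded, the same data can in principle be read directly from the UCI repository instead of from the local path used below; this is only a sketch and assumes crx.data is still served at that address.
# Sketch: read crx.data straight from UCI (assumes the URL is reachable)
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data"
dataset <- read.table(url, header=FALSE, sep=",", na.strings="?")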
dataset <- read.table("F:\\R\\Rworkspace\\信用卡数据/crx.data", header=F, sep=",", na.strings="?")
head(dataset)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1 b 30.83 0.000 u g w v 1.25 t t 1 f g 202 0 +
## 2 a 58.67 4.460 u g q h 3.04 t t 6 f g 43 560 +
## 3 a 24.50 0.500 u g q h 1.50 t f 0 f g 280 824 +
## 4 b 27.83 1.540 u g w v 3.75 t t 5 t g 100 3 +
## 5 b 20.17 5.625 u g w v 1.71 t f 0 f s 120 0 +
## 6 b 32.08 4.000 u g m v 2.50 t f 0 t g 360 0 +
# The data look like the above; next, check for missing values and the class of each column
sapply(dataset, function(x) sum(is.na(x)))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 12 12 0 6 6 9 9 0 0 0 0 0 0 13 0 0
sapply(dataset, class)
## V1 V2 V3 V4 V5 V6 V7
## "factor" "numeric" "numeric" "factor" "factor" "factor" "factor"
## V8 V9 V10 V11 V12 V13 V14
## "numeric" "factor" "factor" "integer" "factor" "factor" "integer"
## V15 V16
## "integer" "factor"
# Split the data into a 70% training set and a 30% test set; set.seed(123) makes the split reproducible.
dim(dataset)
## [1] 690 16
# na.omit returns the object with all rows containing NA removed
dataset <- na.omit(dataset)
dim(dataset)
## [1] 653 16
set.seed(123)
index <- sample(nrow(dataset), round(0.7*nrow(dataset)))
train <- dataset[index, ]
test <- dataset[-index, ]
dim(train)
## [1] 457 16
dim(test)
## [1] 196 16
Sometimes factor variables have to be converted into dummy variables, because some algorithms cannot use factor data directly, for example: knn, glmnet, svm, xgboost. Other methods can work with factor variables as they are, for example: logistic regression, rpart, GBM, randomForest.
dataset2 <- dataset
library(plyr)
# Convert a factor column into 0/1 dummy columns via model.matrix;
# non-factor columns are returned unchanged.
into_factor <- function(x) {
  if(class(x) == "factor") {
    n <- length(x)
    data.fac <- data.frame(x=x, y=1:n)
    output <- model.matrix(y~x, data.fac)[, -1]  # drop the intercept column
  } else {
    output <- x
  }
  output
}
into_factor(dataset$V4)[1:5, ]
## xu xy
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
dataset2 <- colwise(into_factor)(dataset2)
# Note: all NA values must be removed before this step, otherwise it fails with an error (not all length(rows) == 1 are TRUE)
dataset2 <- do.call(cbind, dataset2)
dataset2 <- as.data.frame(dataset2)
head(dataset2)
## V1 V2 V3 xu xy xgg xp xc xcc xd xe xff xi xj xk xm xq xr xw xx xdd
## 1 1 30.83 0.000 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 2 0 58.67 4.460 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 3 0 24.50 0.500 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 4 1 27.83 1.540 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 5 1 20.17 5.625 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 6 1 32.08 4.000 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## xff xh xj xn xo xv xz V8 V9 V10 V11 V12 xp xs V14 V15 V16
## 1 0 0 0 0 0 1 0 1.25 1 1 1 0 0 0 202 0 1
## 2 0 1 0 0 0 0 0 3.04 1 1 6 0 0 0 43 560 1
## 3 0 1 0 0 0 0 0 1.50 1 0 0 0 0 0 280 824 1
## 4 0 0 0 0 0 1 0 3.75 1 1 5 1 0 0 100 3 1
## 5 0 0 0 0 0 1 0 1.71 1 0 0 0 0 1 120 0 1
## 6 0 0 0 0 0 1 0 2.50 1 0 0 1 0 0 360 0 1
dim(dataset2)
## [1] 653 38
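As a side note, caret's dummyVars can produce the same kind of dummy coding in a single call; the following is only a sketch (the column names will differ slightly from into_factor's output).
# Sketch: dummy-code all factor predictors with caret::dummyVars
library(caret)
dv <- dummyVars(V16 ~ ., data = dataset, fullRank = TRUE)
dataset_dv <- as.data.frame(predict(dv, newdata = dataset))
dataset_dv$V16 <- dataset$V16
dim(dataset_dv)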
Build and evaluate a logistic regression model, using the glm function.
# 1) Fit the model
logit.model <- glm(V16~., data=train, family="binomial")
# 2) Simplify the model with stepwise AIC selection
logit.model <- step(logit.model)
## Start: AIC=307.61
## V16 ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + V11 +
## V12 + V13 + V14 + V15
##
##
## Step: AIC=307.57
## V16 ~ V1 + V2 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + V11 + V12 +
## V13 + V14 + V15
##
## Df Deviance AIC
## - V13 2 235.53 303.53
## - V1 2 235.78 303.78
## - V8 1 236.03 306.03
## - V7 8 250.45 306.45
## - V12 1 237.02 307.02
## - V4 2 239.02 307.02
## - V2 1 237.29 307.29
## <none> 235.57 307.57
## - V10 1 237.69 307.69
## - V3 1 237.91 307.91
## - V11 1 239.35 309.35
## - V14 1 239.88 309.88
## - V6 14 270.37 314.37
## - V15 2 249.63 317.63
## - V9 2 371.49 439.49
##
## Step: AIC=303.53
## V16 ~ V1 + V2 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + V11 + V12 +
## V14 + V15
##
## Df Deviance AIC
## - V12 1 235.59 301.59
## - V1 1 235.79 301.79
## - V8 1 235.94 301.94
## - V7 8 250.28 302.28
## - V4 2 239.11 303.11
## - V2 1 237.29 303.29
## - V10 1 237.43 303.43
## <none> 235.53 303.53
## - V3 1 237.93 303.93
## - V11 1 239.02 305.02
## - V14 1 239.99 305.99
## - V6 13 270.50 312.50
## - V15 1 249.74 315.74
## - V9 1 371.58 437.58
##
## Step: AIC=301.59
## V16 ~ V1 + V2 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + V11 + V14 +
## V15
##
## Df Deviance AIC
## - V1 1 235.87 299.87
## - V8 1 236.06 300.06
## - V7 8 250.51 300.51
## - V4 2 239.13 301.13
## - V2 1 237.40 301.40
## - V10 1 237.54 301.54
## <none> 235.59 301.59
## - V3 1 237.96 301.96
## - V11 1 239.08 303.08
## - V14 1 240.05 304.05
## - V6 13 270.53 310.53
## - V15 1 249.75 313.75
## - V9 1 371.58 435.58
##
## Step: AIC=299.87
## V16 ~ V2 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + V11 + V14 + V15
##
## Df Deviance AIC
## - V8 1 236.33 298.33
## - V7 8 250.56 298.56
## - V4 2 239.56 299.56
## - V2 1 237.56 299.56
## - V10 1 237.81 299.81
## <none> 235.87 299.87
## - V3 1 238.27 300.27
## - V11 1 239.36 301.36
## - V14 1 240.38 302.38
## - V6 13 270.53 308.53
## - V15 1 250.02 312.02
## - V9 1 371.87 433.87
##
## Step: AIC=298.33
## V16 ~ V2 + V3 + V4 + V6 + V7 + V9 + V10 + V11 + V14 + V15
##
## Df Deviance AIC
## - V7 8 250.84 296.84
## - V4 2 239.79 297.79
## - V10 1 238.29 298.29
## <none> 236.33 298.33
## - V3 1 238.61 298.61
## - V2 1 238.70 298.70
## - V11 1 240.12 300.12
## - V14 1 240.72 300.72
## - V6 13 271.43 307.43
## - V15 1 250.16 310.16
## - V9 1 380.26 440.26
##
## Step: AIC=296.84
## V16 ~ V2 + V3 + V4 + V6 + V9 + V10 + V11 + V14 + V15
##
## Df Deviance AIC
## - V2 1 252.14 296.14
## - V4 2 254.33 296.33
## - V10 1 252.49 296.49
## <none> 250.84 296.84
## - V3 1 254.44 298.44
## - V14 1 254.54 298.54
## - V11 1 254.57 298.57
## - V15 1 261.39 305.39
## - V6 13 289.21 309.21
## - V9 1 394.85 438.85
##
## Step: AIC=296.14
## V16 ~ V3 + V4 + V6 + V9 + V10 + V11 + V14 + V15
##
## Df Deviance AIC
## - V10 1 253.58 295.58
## - V4 2 256.00 296.00
## <none> 252.14 296.14
## - V3 1 255.26 297.26
## - V14 1 255.43 297.43
## - V11 1 256.66 298.66
## - V15 1 263.54 305.54
## - V6 13 289.22 307.22
## - V9 1 402.78 444.78
##
## Step: AIC=295.58
## V16 ~ V3 + V4 + V6 + V9 + V11 + V14 + V15
##
## Df Deviance AIC
## <none> 253.58 295.58
## - V4 2 257.68 295.68
## - V3 1 256.82 296.82
## - V14 1 257.00 297.00
## - V15 1 266.01 306.01
## - V11 1 267.64 307.64
## - V6 13 292.03 308.03
## - V9 1 409.96 449.96
# 3) Predict on the test set
logit.predict <- predict(logit.model, test, type="response")
logit.pred <- ifelse(logit.predict > 0.5, "+", "-")
table(logit.pred, test$V16)
##
## logit.pred - +
## - 91 13
## + 19 73
mean(logit.pred==test$V16)
## [1] 0.8367347
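Beyond raw accuracy, caret's confusionMatrix reports sensitivity, specificity and Kappa for the same predictions; a short sketch:
# Sketch: richer evaluation of the logit predictions with caret
library(caret)
confusionMatrix(factor(logit.pred, levels = levels(test$V16)),
                test$V16, positive = "+")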
Next, use GBM for prediction; caret with repeated cross-validation is used to select the best model.
# 1. Fit the boosting model
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
ctrl <- trainControl(method = "repeatedcv", number = 5, repeats=5)
set.seed(300)
model_gbm <- train(V16~., data=train, method="gbm", metric="Kappa", trControl=ctrl)
## Loading required package: gbm
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2753 nan 0.1000 0.0480
## 2 1.1983 nan 0.1000 0.0392
## ... (iterations 3-99 omitted)
## 100 0.3115 nan 0.1000 -0.0018
# 2. Predict
pred_gbm <- predict(model_gbm, test)
# 3. Evaluate the model
table(pred_gbm, test$V16)
##
## pred_gbm - +
## - 90 8
## + 20 78
mean(pred_gbm==test$V16)
## [1] 0.8571429
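By default train() only searches a small built-in grid for gbm; an explicit tuning grid can be supplied instead. The parameter values below are illustrative, not tuned.
# Sketch: explicit gbm tuning grid for caret::train
grid_gbm <- expand.grid(n.trees = c(100, 200, 300),
                        interaction.depth = 1:3,
                        shrinkage = 0.1,
                        n.minobsinnode = 10)
set.seed(300)
model_gbm2 <- train(V16~., data=train, method="gbm", metric="Kappa",
                    trControl=ctrl, tuneGrid=grid_gbm, verbose=FALSE)
model_gbm2$bestTune   # parameter combination with the best Kappa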
First try a knn model with no CV, no standardization and no data-type conversion; without converting the data types, the factor variables are dropped and only the numeric variables are kept.
## knn, without standardization
# 1. Fit the model
library(caret)
model_knn <- knn3(V16~., data=train, k=5)
# 2. Predict: type="prob" returns class probabilities (columns follow the factor levels of V16, so column 1 is "-")
pred_knn <- predict(model_knn, test, type="prob")
#str(pred_knn)
pred_knn1 <- ifelse(pred_knn[, 1] < 0.5, "+", "-")
table(pred_knn1, test$V16)
##
## pred_knn1 - +
## - 78 31
## + 32 55
mean(pred_knn1==test$V16)
## [1] 0.6785714
## knn, with standardization
# 1. Standardize the dataset
dataset2 <- cbind(colwise(scale)(dataset2[, -38]), V16=as.factor(dataset2$V16))
# 2. Re-create the training and test sets
set.seed(123)
library(caret)
index <- createDataPartition(dataset2$V16, p=0.7, list=F)
train_scale <- dataset2[index, ]
test_scale <- dataset2[-index, ]
# 3. Fit the model
model_scale <- knn3(V16~., data=train_scale, k=5)
# 4. Predict
pred_scale <- predict(model_scale, test_scale, type="class")
table(pred_scale, test_scale$V16)
##
## pred_scale 0 1
## 0 85 34
## 1 22 54
mean(pred_scale==test_scale$V16)
## [1] 0.7128205
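One caveat: scale was applied to the whole dataset2 before splitting, so the test rows influence the centering and scaling constants. A leak-free sketch using caret's preProcess is shown below, assuming hypothetical un-scaled splits train_raw and test_raw of the dummy-coded data.
# Sketch: fit the scaling on the training rows only (train_raw/test_raw are hypothetical un-scaled splits)
pp <- preProcess(train_raw[, -38], method = c("center", "scale"))
train_std <- cbind(predict(pp, train_raw[, -38]), V16 = train_raw$V16)
test_std  <- cbind(predict(pp, test_raw[, -38]),  V16 = test_raw$V16)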
## k-fold cross-validation for knn
# 1. Write a cross-validation function
library(class)
library(caret)
# n-fold CV for knn3: assign rows to n folds at random, fit on n-1 folds,
# measure accuracy on the held-out fold, and average over folds.
knn_cv <- function(data, n=5, k) {
  index <- sample(1:n, nrow(data), replace=TRUE)  # random fold assignment
  acc <- numeric(n)
  for(i in 1:n) {
    ind <- index == i
    train <- data[!ind, ]   # all folds except fold i
    test <- data[ind, ]     # fold i as the validation set
    model_knn <- knn3(V16~., data=train, k=k)
    pred <- predict(model_knn, test, type="class")
    acc[i] <- mean(pred == test$V16)
  }
  mean(acc)
}
# 2. Re-create the training and test sets
set.seed(123)
library(caret)
index <- createDataPartition(dataset2$V16, p=0.7, list=F)
train_cv <- dataset2[index, ]
test_cv <- dataset2[-index, ]
# 3. Try out the cross-validation function
knn_cv(train_cv, 3, 5)
## [1] 0.8730277
acc <- 0  # acc[1] stays 0, so k = 1 is effectively excluded from the search
for(i in 2:20) {
  acc[i] <- knn_cv(train_cv, 3, i)
  print(paste(i, knn_cv(train_cv, 3, i), sep="——》"))
}
## [1] "2——》0.890464698699993"
## [1] "3——》0.909093242809613"
## [1] "4——》0.883418367346939"
## [1] "5——》0.872213395351693"
## [1] "6——》0.867066796941293"
## [1] "7——》0.854495087053878"
## [1] "8——》0.848111602148951"
## [1] "9——》0.842036790410966"
## [1] "10——》0.840109115714773"
## [1] "11——》0.831505296371422"
## [1] "12——》0.833180115999597"
## [1] "13——》0.817368273676231"
## [1] "14——》0.834854658514452"
## [1] "15——》0.827973456326398"
## [1] "16——》0.827506897124673"
## [1] "17——》0.833968115218115"
## [1] "18——》0.832558368162111"
## [1] "19——》0.835863665139981"
## [1] "20——》0.832136554676483"
# The loop above is equivalent to:
#k <- 1:20
#set.seed(123)
#acc <- sapply(k, function(x) knn_cv(train_cv, 3, x))
# 4. Pick the k with the highest CV accuracy and refit
plot(1:20, acc, type="b")
k.final <- which.max(acc)
k.final
## [1] 3
model <- knn3(V16~., data=train_cv, k=k.final)
# 5. Predict
pred <- predict(model, test_cv, type="class")
table(pred, test_cv$V16)
##
## pred 0 1
## 0 89 37
## 1 18 51
mean(pred==test_cv$V16)
## [1] 0.7179487
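The same tuning can also be done with caret's train, which handles the fold assignment and repetition internally; a sketch:
# Sketch: tune k with repeated CV via caret instead of the hand-rolled knn_cv
set.seed(123)
model_knn_cv <- train(V16~., data=train_cv, method="knn",
                      tuneGrid=data.frame(k=1:20),
                      trControl=trainControl(method="repeatedcv", number=5, repeats=5))
model_knn_cv$bestTune
mean(predict(model_knn_cv, test_cv) == test_cv$V16)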
Finally, fit an xgboost model on the dummy-coded data.
require(xgboost)
## Loading required package: xgboost
require(methods)
require(plyr)
set.seed(123)
n <- nrow(dataset2)
index <- sample(n, round(0.7*n))
train.xg <- dataset2[index, ]
test.xg <- dataset2[-index, ]
# xgboost works on numeric matrices: column 38 is the label, the rest are features
label <- as.matrix(train.xg[, 38, drop=F])
data <- as.matrix(train.xg[, -38, drop=F])
data2 <- as.matrix(test.xg[, -38, drop=F])
label2 <- as.matrix(test.xg[, 38, drop=F])
xgmat <- xgb.DMatrix(data, label=label, missing=-10000)
param <- list("objective" = "binary:logistic",   # binary classification with probability output
              "bst:eta" = 1,                     # learning rate
              "bst:max_depth" = 2,               # maximum tree depth
              "eval_metric" = "logloss",
              "silent" = 1,
              "nthread" = 16,
              "min_child_weight" = 1.45
              )
nround <- 275
bst <- xgb.train(param, xgmat, nround)
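Here nround = 275 is simply fixed; a sketch of choosing it with xgboost's built-in cross-validation follows (the exact shape of the returned evaluation history depends on the xgboost version).
# Sketch: 5-fold CV over the number of boosting rounds
set.seed(123)
cv_res <- xgb.cv(params = param, data = xgmat, nrounds = 300, nfold = 5)
# pick the round with the lowest mean test logloss from the printed/returned history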
res1 <- predict(bst, data2)       # predicted probabilities on the test set
pre1 <- ifelse(res1 > 0.5, 1, 0)  # threshold at 0.5 to get class labels
table(pre1,label2)
## label2
## pre1 0 1
## 0 91 15
## 1 12 78
mean(pre1 ==label2)
## [1] 0.8622449
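Finally, the test-set accuracies reported above can be collected and compared against the expected ranking (xgboost > GBM > logit > knn). Note that the knn and xgboost models were evaluated on splits of dataset2 rather than on the same test set as logit and GBM, so the comparison is only indicative.
# Collect the accuracies computed above
acc_all <- c(logit   = mean(logit.pred == test$V16),
             gbm     = mean(pred_gbm == test$V16),
             knn     = mean(pred == test_cv$V16),
             xgboost = mean(pre1 == label2))
sort(acc_all, decreasing = TRUE)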