Project 1: Comparing the accuracy of logit, GBM, knn, and xgboost

Reference: http://chiffon.gitcafe.io/2015/05/20/newtry.html#topofpage


The data comes from the UCI repository (http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening). It is a credit-card dataset whose variable names and their meanings are undisclosed (presumably to protect applicants' privacy). This post uses logit, GBM (Gradient Boosting Machines), knn, and xgboost to classify the data and compares their accuracy. The expected accuracy ranking is: xgboost > GBM > logit > knn.

1. Load the data

dataset <- read.table("F:\\R\\Rworkspace\\信用卡数据/crx.data", header=F, sep=",", na.strings="?")
head(dataset)
##   V1    V2    V3 V4 V5 V6 V7   V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1  b 30.83 0.000  u  g  w  v 1.25  t   t   1   f   g 202   0   +
## 2  a 58.67 4.460  u  g  q  h 3.04  t   t   6   f   g  43 560   +
## 3  a 24.50 0.500  u  g  q  h 1.50  t   f   0   f   g 280 824   +
## 4  b 27.83 1.540  u  g  w  v 3.75  t   t   5   t   g 100   3   +
## 5  b 20.17 5.625  u  g  w  v 1.71  t   f   0   f   s 120   0   +
## 6  b 32.08 4.000  u  g  m  v 2.50  t   f   0   t   g 360   0   +
# The rows above show the data layout; next, check for missing values and each column's type
sapply(dataset, function(x) sum(is.na(x)))
##  V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14 V15 V16 
##  12  12   0   6   6   9   9   0   0   0   0   0   0  13   0   0
sapply(dataset, class)
##        V1        V2        V3        V4        V5        V6        V7 
##  "factor" "numeric" "numeric"  "factor"  "factor"  "factor"  "factor" 
##        V8        V9       V10       V11       V12       V13       V14 
## "numeric"  "factor"  "factor" "integer"  "factor"  "factor" "integer" 
##       V15       V16 
## "integer"  "factor"

2. Create the training and test sets

# Split the data into a 70% training set and a 30% test set, with set.seed(123) for reproducibility.
dim(dataset)
## [1] 690  16
# na.omit returns the object with all NA rows removed
dataset <- na.omit(dataset)
dim(dataset)
## [1] 653  16
set.seed(123)  # the comment above promises set.seed(123), but the original code omitted it
index <- sample(nrow(dataset), round(0.7*nrow(dataset)))
train <- dataset[index, ]
test <- dataset[-index, ]
dim(train)
## [1] 457  16
dim(test)
## [1] 196  16
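
The sample() call above does not stratify by class. As a side note (this snippet is an addition, not part of the original post), caret's createDataPartition(), which the knn section below also uses, draws a split that preserves the +/- proportions of V16:

# Stratified 70/30 split preserving the class balance of V16
library(caret)
set.seed(123)
index_strat <- createDataPartition(dataset$V16, p=0.7, list=FALSE)
train_strat <- dataset[index_strat, ]
test_strat <- dataset[-index_strat, ]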

3. Convert factor variables to dummy variables

Factor variables sometimes need to be converted to dummy variables, because some methods cannot work with factors directly, for example: knn, glmnet, svm, xgboost. Other methods can use factor variables as-is, for example: logistic regression, rpart, GBM, randomforest.

dataset2 <- dataset
library(plyr)
into_factor <- function(x) {
  if(class(x) == "factor") {
    n <- length(x)
    # build a dummy design matrix for the factor; [, -1] drops the intercept column
    data.fac <- data.frame(x=x, y=1:n)
    output <- model.matrix(y~x, data.fac)[, -1]
  } else {
    # numeric/integer columns pass through unchanged
    output <- x
  }
  output
}

into_factor(dataset$V4)[1:5, ]
##   xu xy
## 1  1  0
## 2  1  0
## 3  1  0
## 4  1  0
## 5  1  0
dataset2 <- colwise(into_factor)(dataset2)
# Note: all NA values must be removed before this step, otherwise plyr errors with: Error: not all length(rows) == 1 are TRUE

dataset2 <- do.call(cbind, dataset2)
dataset2 <- as.data.frame(dataset2)
head(dataset2)
##   V1    V2    V3 xu xy xgg xp xc xcc xd xe xff xi xj xk xm xq xr xw xx xdd
## 1  1 30.83 0.000  1  0   0  0  0   0  0  0   0  0  0  0  0  0  0  1  0   0
## 2  0 58.67 4.460  1  0   0  0  0   0  0  0   0  0  0  0  0  1  0  0  0   0
## 3  0 24.50 0.500  1  0   0  0  0   0  0  0   0  0  0  0  0  1  0  0  0   0
## 4  1 27.83 1.540  1  0   0  0  0   0  0  0   0  0  0  0  0  0  0  1  0   0
## 5  1 20.17 5.625  1  0   0  0  0   0  0  0   0  0  0  0  0  0  0  1  0   0
## 6  1 32.08 4.000  1  0   0  0  0   0  0  0   0  0  0  0  1  0  0  0  0   0
##   xff xh xj xn xo xv xz   V8 V9 V10 V11 V12 xp xs V14 V15 V16
## 1   0  0  0  0  0  1  0 1.25  1   1   1   0  0  0 202   0   1
## 2   0  1  0  0  0  0  0 3.04  1   1   6   0  0  0  43 560   1
## 3   0  1  0  0  0  0  0 1.50  1   0   0   0  0  0 280 824   1
## 4   0  0  0  0  0  1  0 3.75  1   1   5   1  0  0 100   3   1
## 5   0  0  0  0  0  1  0 1.71  1   0   0   0  0  1 120   0   1
## 6   0  0  0  0  0  1  0 2.50  1   0   0   1  0  0 360   0   1
dim(dataset2)
## [1] 653  38
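
As a one-call alternative (a sketch added here, not part of the original post), base R's model.matrix() can dummy-code every factor in the data frame at once, making the hand-written into_factor() helper unnecessary:

# Build the dummy-coded design matrix in one call; [, -1] drops the intercept
dataset2_alt <- as.data.frame(model.matrix(V16 ~ ., data=dataset)[, -1])
dataset2_alt$V16 <- as.numeric(dataset$V16) - 1   # recode -/+ as 0/1
dim(dataset2_alt)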

4. Logistic regression

Use logistic regression to fit and test a model; the function used is glm.

#1) Fit the model
logit.model <- glm(V16~., data=train, family="binomial")

#2) Model optimization: stepwise AIC selection
logit.model <- step(logit.model)
## Start:  AIC=307.61
## V16 ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + V11 + 
##     V12 + V13 + V14 + V15
## 
## 
## Step:  AIC=307.57
## V16 ~ V1 + V2 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + V11 + V12 + 
##     V13 + V14 + V15
## 
##        Df Deviance    AIC
## - V13   2   235.53 303.53
## - V1    2   235.78 303.78
## - V8    1   236.03 306.03
## - V7    8   250.45 306.45
## - V12   1   237.02 307.02
## - V4    2   239.02 307.02
## - V2    1   237.29 307.29
## <none>      235.57 307.57
## - V10   1   237.69 307.69
## - V3    1   237.91 307.91
## - V11   1   239.35 309.35
## - V14   1   239.88 309.88
## - V6   14   270.37 314.37
## - V15   2   249.63 317.63
## - V9    2   371.49 439.49
## 
## Step:  AIC=303.53
## V16 ~ V1 + V2 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + V11 + V12 + 
##     V14 + V15
## 
##        Df Deviance    AIC
## - V12   1   235.59 301.59
## - V1    1   235.79 301.79
## - V8    1   235.94 301.94
## - V7    8   250.28 302.28
## - V4    2   239.11 303.11
## - V2    1   237.29 303.29
## - V10   1   237.43 303.43
## <none>      235.53 303.53
## - V3    1   237.93 303.93
## - V11   1   239.02 305.02
## - V14   1   239.99 305.99
## - V6   13   270.50 312.50
## - V15   1   249.74 315.74
## - V9    1   371.58 437.58
## 
## Step:  AIC=301.59
## V16 ~ V1 + V2 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + V11 + V14 + 
##     V15
## 
##        Df Deviance    AIC
## - V1    1   235.87 299.87
## - V8    1   236.06 300.06
## - V7    8   250.51 300.51
## - V4    2   239.13 301.13
## - V2    1   237.40 301.40
## - V10   1   237.54 301.54
## <none>      235.59 301.59
## - V3    1   237.96 301.96
## - V11   1   239.08 303.08
## - V14   1   240.05 304.05
## - V6   13   270.53 310.53
## - V15   1   249.75 313.75
## - V9    1   371.58 435.58
## 
## Step:  AIC=299.87
## V16 ~ V2 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + V11 + V14 + V15
## 
##        Df Deviance    AIC
## - V8    1   236.33 298.33
## - V7    8   250.56 298.56
## - V4    2   239.56 299.56
## - V2    1   237.56 299.56
## - V10   1   237.81 299.81
## <none>      235.87 299.87
## - V3    1   238.27 300.27
## - V11   1   239.36 301.36
## - V14   1   240.38 302.38
## - V6   13   270.53 308.53
## - V15   1   250.02 312.02
## - V9    1   371.87 433.87
## 
## Step:  AIC=298.33
## V16 ~ V2 + V3 + V4 + V6 + V7 + V9 + V10 + V11 + V14 + V15
## 
##        Df Deviance    AIC
## - V7    8   250.84 296.84
## - V4    2   239.79 297.79
## - V10   1   238.29 298.29
## <none>      236.33 298.33
## - V3    1   238.61 298.61
## - V2    1   238.70 298.70
## - V11   1   240.12 300.12
## - V14   1   240.72 300.72
## - V6   13   271.43 307.43
## - V15   1   250.16 310.16
## - V9    1   380.26 440.26
## 
## Step:  AIC=296.84
## V16 ~ V2 + V3 + V4 + V6 + V9 + V10 + V11 + V14 + V15
## 
##        Df Deviance    AIC
## - V2    1   252.14 296.14
## - V4    2   254.33 296.33
## - V10   1   252.49 296.49
## <none>      250.84 296.84
## - V3    1   254.44 298.44
## - V14   1   254.54 298.54
## - V11   1   254.57 298.57
## - V15   1   261.39 305.39
## - V6   13   289.21 309.21
## - V9    1   394.85 438.85
## 
## Step:  AIC=296.14
## V16 ~ V3 + V4 + V6 + V9 + V10 + V11 + V14 + V15
## 
##        Df Deviance    AIC
## - V10   1   253.58 295.58
## - V4    2   256.00 296.00
## <none>      252.14 296.14
## - V3    1   255.26 297.26
## - V14   1   255.43 297.43
## - V11   1   256.66 298.66
## - V15   1   263.54 305.54
## - V6   13   289.22 307.22
## - V9    1   402.78 444.78
## 
## Step:  AIC=295.58
## V16 ~ V3 + V4 + V6 + V9 + V11 + V14 + V15
## 
##        Df Deviance    AIC
## <none>      253.58 295.58
## - V4    2   257.68 295.68
## - V3    1   256.82 296.82
## - V14   1   257.00 297.00
## - V15   1   266.01 306.01
## - V11   1   267.64 307.64
## - V6   13   292.03 308.03
## - V9    1   409.96 449.96
#3) Predict
logit.predict <- predict(logit.model, test, type="response")
logit.pred <- ifelse(logit.predict > 0.5, "+", "-")

table(logit.pred, test$V16)
##           
## logit.pred  -  +
##          - 91 13
##          + 19 73
mean(logit.pred==test$V16)
## [1] 0.8367347
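
As a cross-check (an addition to the original post), caret's confusionMatrix() reports the same accuracy along with sensitivity and specificity:

library(caret)
# align factor levels with the test labels before tabulating
confusionMatrix(factor(logit.pred, levels=levels(test$V16)), test$V16)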

5. GBM: Gradient Boosting Machines

Use GBM for prediction; here caret with repeated cross-validation (repeatedcv) selects the best model.

#1. Train the boosted model
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
ctrl <- trainControl(method = "repeatedcv", number = 5, repeats=5)
set.seed(300)
model_gbm <- train(V16~., data=train, method="gbm", metric="Kappa", trControl=ctrl)
## Loading required package: gbm
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.2753             nan     0.1000    0.0480
##      2        1.1983             nan     0.1000    0.0392
## ...
##    100        0.3115             nan     0.1000   -0.0018
#2. Predict
pred_gbm <- predict(model_gbm, test)

#3. Evaluate the model
table(pred_gbm, test$V16)
##         
## pred_gbm  -  +
##        - 90  8
##        + 20 78
mean(pred_gbm==test$V16)
## [1] 0.8571429
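
By default, train() searches only a small built-in grid. A sketch of an explicit tuning grid (the grid values below are illustrative assumptions, not from the original post):

# explicit grid over the four gbm parameters exposed by caret
grid_gbm <- expand.grid(n.trees=c(100, 200, 300),
                        interaction.depth=c(1, 2, 3),
                        shrinkage=0.1,
                        n.minobsinnode=10)
set.seed(300)
model_gbm2 <- train(V16~., data=train, method="gbm", metric="Kappa",
                    trControl=ctrl, tuneGrid=grid_gbm, verbose=FALSE)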

6. knn method for classification

First, test a knn model with no CV, no standardization, and no data-type conversion. Without the conversion, factor variables are discarded and only numeric variables are kept.

## knn without standardization
#1. Fit the model
library(caret)
model_knn <- knn3(V16~., data=train, k=5)

#2. Predict
pred_knn <- predict(model_knn, test, type="prob")   # probability matrix; "class" is not a predict.knn3 argument
#str(pred_knn)
pred_knn1 <- ifelse(pred_knn[, 1] < 0.5, "+", "-")  # column 1 holds P(class "-")
table(pred_knn1, test$V16)
##          
## pred_knn1  -  +
##         - 78 31
##         + 32 55
mean(pred_knn1==test$V16)
## [1] 0.6785714
## knn with standardization
#1. Standardize the dataset
dataset2 <- cbind(colwise(scale)(dataset2[, -38]), V16=as.factor(dataset2$V16))

#2. Recreate the training and test sets
set.seed(123)
library(caret)
index <- createDataPartition(dataset2$V16, p=0.7, list=F)
train_scale <- dataset2[index, ]
test_scale <- dataset2[-index, ]

#3. Fit the model
model_scale <- knn3(V16~., data=train_scale, k=5)

#4. Predict
pred_scale <- predict(model_scale,  test_scale, type="class")
table(pred_scale, test_scale$V16)
##           
## pred_scale  0  1
##          0 85 34
##          1 22 54
mean(pred_scale==test_scale$V16)
## [1] 0.7128205
## k-fold cross-validation for knn
#1. Create the cross-validation function
library(class)
library(caret)
knn_cv <- function(data, n=5, k) {
  # randomly assign each row to one of n folds (the original hardcoded 5 and ignored n)
  index <- sample(1:n, nrow(data), replace=T)
  acc <- numeric(n)
  for(i in 1:n) {
    ind <- index == i
    train <- data[!ind, ]   # !ind, not -ind: a logical vector cannot be negated as row indices
    test <- data[ind, ]
    model_knn <- knn3(V16~., data=train, k=k)
    pred <- predict(model_knn, test, type="class")
    acc[i] <- mean(pred==test$V16)
  }
  mean(acc)
}

#2. Recreate the training and test sets
set.seed(123)
library(caret)
index <- createDataPartition(dataset2$V16, p=0.7, list=F)
train_cv <- dataset2[index, ]
test_cv <- dataset2[-index, ]

#3. Run the k-fold cross-validation function
knn_cv(train_cv, 3, 5)
## [1] 0.8730277
acc <- 0
for(i in 2:20) {
  acc[i] <- knn_cv(train_cv, 3, i)
  print(paste(i, acc[i], sep="——》"))  # print the stored accuracy rather than re-running CV
}
## [1] "2——》0.890464698699993"
## [1] "3——》0.909093242809613"
## [1] "4——》0.883418367346939"
## [1] "5——》0.872213395351693"
## [1] "6——》0.867066796941293"
## [1] "7——》0.854495087053878"
## [1] "8——》0.848111602148951"
## [1] "9——》0.842036790410966"
## [1] "10——》0.840109115714773"
## [1] "11——》0.831505296371422"
## [1] "12——》0.833180115999597"
## [1] "13——》0.817368273676231"
## [1] "14——》0.834854658514452"
## [1] "15——》0.827973456326398"
## [1] "16——》0.827506897124673"
## [1] "17——》0.833968115218115"
## [1] "18——》0.832558368162111"
## [1] "19——》0.835863665139981"
## [1] "20——》0.832136554676483"
# The loop above is equivalent to:
#k <- 1:20
#set.seed(123)
#acc <- sapply(k, function(x) knn_cv(train_cv, 3, x))

#4. Pick the k with the highest accuracy and refit
plot(1:20, acc, type="b")

k.final <- which.max(acc)
k.final
## [1] 3
model <- knn3(V16~., data=train_cv, k=k.final)

#5. Predict
pred <- predict(model, test_cv, type="class")
table(pred, test_cv$V16)
##     
## pred  0  1
##    0 89 37
##    1 18 51
mean(pred==test_cv$V16)
## [1] 0.7179487
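
caret can also run this k search internally. A sketch added here (not in the original post), reusing the standardized split above:

# tune k over 1..20 with 5-fold CV; bestTune holds the winning k
set.seed(123)
model_knn_caret <- train(V16~., data=train_cv, method="knn",
                         trControl=trainControl(method="cv", number=5),
                         tuneGrid=data.frame(k=1:20))
model_knn_caret$bestTune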

7. xgboost

require(xgboost)
## Loading required package: xgboost
require(methods)
require(plyr)
set.seed(123)

n <- nrow(dataset2)
index <- sample(n, round(0.7*n))
train.xg <- dataset2[index, ]
test.xg <- dataset2[-index, ]

label <- as.matrix(train.xg[, 38, drop=F])   # column 38 is the V16 label
data <- as.matrix(train.xg[, -38, drop=F])

data2 <- as.matrix(test.xg[, -38, drop=F])
label2 <- as.matrix(test.xg[, 38, drop=F])

xgmat <- xgb.DMatrix(data, label = label, missing = -10000)
param <- list("objective" = "binary:logistic",   # binary classification, probability output
    "bst:eta" = 1,                 # learning rate
    "bst:max_depth" = 2,           # maximum tree depth
    "eval_metric" = "logloss",
    "silent" = 1,
    "nthread" = 16,
    "min_child_weight" = 1.45
)
nround <- 275
bst <- xgb.train(param, xgmat, nround)

res1 <- predict(bst, data2)
pre1 <- ifelse(res1 > 0.5, 1, 0)
table(pre1,label2)
##     label2
## pre1  0  1
##    0 91 15
##    1 12 78
mean(pre1 ==label2)
## [1] 0.8622449
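
The choice nround = 275 is taken as given. As a sketch (an addition, not part of the original post), xgb.cv can check whether that many boosting rounds is justified:

# 5-fold CV with the same parameters; watch where test logloss stops improving
cv_res <- xgb.cv(params=param, data=xgmat, nrounds=nround, nfold=5)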
