# 1. Load the data
data(iris)
# 2. Create the training and test sets
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
set.seed(2003)
index <- createDataPartition(iris$Species, p=0.8, list=F)
train_iris <- iris[index, ]
test_iris <- iris[-index, ]
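# Optional sanity check (not part of the original run): createDataPartition
# samples within each level of Species, so both sets should keep the
# balanced 1:1:1 class ratio.
table(train_iris$Species)
table(test_iris$Species)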
# 3. Build the model
library(adabag)
## Loading required package: rpart
## Loading required package: mlbench
model_iris <- boosting(Species~., data=train_iris)
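# A hedged sketch of the tunable knobs (not part of the original run):
# boosting() grows mfinal = 100 trees by default, and the individual trees
# can be controlled through rpart.control (rpart is already loaded above).
model_iris_small <- boosting(Species ~ ., data = train_iris, mfinal = 50,
                             control = rpart.control(maxdepth = 3))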
# 4. Evaluate the model
summary(model_iris)
## Length Class Mode
## formula 3 formula call
## trees 100 -none- list
## weights 100 -none- numeric
## votes 360 -none- numeric
## prob 360 -none- numeric
## class 120 -none- character
## importance 4 -none- numeric
## terms 3 terms call
## call 3 -none- call
pred <- predict(model_iris, train_iris)
str(pred)
## List of 6
## $ formula :Class 'formula' length 3 Species ~ .
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## $ votes : num [1:120, 1:3] 100.2 98.1 98.1 98.1 100.2 ...
## $ prob : num [1:120, 1:3] 0.899 0.88 0.88 0.88 0.899 ...
## $ class : chr [1:120] "setosa" "setosa" "setosa" "setosa" ...
## $ confusion: 'table' int [1:3, 1:3] 40 0 0 0 40 0 0 0 40
## ..- attr(*, "dimnames")=List of 2
## .. ..$ Predicted Class: chr [1:3] "setosa" "versicolor" "virginica"
## .. ..$ Observed Class : chr [1:3] "setosa" "versicolor" "virginica"
## $ error : num 0
mean(pred$class==train_iris[, 5])
## [1] 1
# Examine variable importance
model_iris$importance
## Petal.Length Petal.Width Sepal.Length Sepal.Width
## 53.18526 24.87230 11.75237 10.19007
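# A minimal sketch for visualizing these scores with base R (recent adabag
# versions also provide an importanceplot() helper); it only assumes the
# importance vector printed above.
barplot(sort(model_iris$importance), horiz = TRUE, las = 1,
        main = "Variable importance (boosting)")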
# 5. Predict on the test set
pred_test <- predict(model_iris, test_iris)
mean(pred_test$class==test_iris[, 5])
## [1] 0.9666667
table(pred_test$class, test_iris[, 5])
##
## setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
# 6. Examine the error evolution on the test set
error <- errorevol(model_iris, test_iris)
plot(error$error, type="l")
# The plot shows the test error falling sharply over the first few iterations and then levelling off, consistent with the 0.0333 test error computed above.
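# A sketch comparing train vs test error across iterations (it assumes only
# the objects created above; errorevol() accepts any data set containing the
# model's variables):
error_train <- errorevol(model_iris, train_iris)
plot(error$error, type = "l", col = "red",
     ylim = range(error$error, error_train$error),
     xlab = "Iteration", ylab = "Error")
lines(error_train$error, col = "blue")
legend("topright", legend = c("test", "train"),
       col = c("red", "blue"), lty = 1)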
For reference, http://chiffon.gitcafe.io/2015/05/20/newtry.html#topofpage applies logit, GBM, kNN, and xgboost to a credit-card dataset.
The task in this dataset is to judge, from a number of attributes of a mushroom, whether the species is poisonous. Each attribute is marked 1 or 0 for presence or absence, so the sample data come as a sparse matrix:
# 1. Load the data
library(xgboost)
data(agaricus.train)
data("agaricus.test")
# 2. Assign the training and test sets
train <- agaricus.train
test <- agaricus.test
# Inspect the data:
summary(train)
## Length Class Mode
## data 820638 dgCMatrix S4
## label 6513 -none- numeric
summary(test)
## Length Class Mode
## data 202986 dgCMatrix S4
## label 1611 -none- numeric
str(train)
## List of 2
## $ data :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## .. ..@ i : int [1:143286] 2 6 8 11 18 20 21 24 28 32 ...
## .. ..@ p : int [1:127] 0 369 372 3306 5845 6489 6513 8380 8384 10991 ...
## .. ..@ Dim : int [1:2] 6513 126
## .. ..@ Dimnames:List of 2
## .. .. ..$ : NULL
## .. .. ..$ : chr [1:126] "cap-shape=bell" "cap-shape=conical" "cap-shape=convex" "cap-shape=flat" ...
## .. ..@ x : num [1:143286] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..@ factors : list()
## $ label: num [1:6513] 1 0 0 1 0 0 0 1 0 0 ...
# Note: dgCMatrix is a class of sparse numeric matrices stored in compressed, sparse, column-oriented format, with the non-zero elements of each column sorted in increasing row order. It is the "standard" sparse numeric matrix class in the Matrix package.
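# A toy illustration (invented data, not from the mushroom set) of building
# such a matrix by hand with Matrix::sparseMatrix():
library(Matrix)
m <- sparseMatrix(i = c(1, 3, 2),  # row indices of non-zero entries
                  j = c(1, 1, 2),  # column indices
                  x = c(1, 1, 1),  # values (0/1 presence flags here)
                  dims = c(3, 2))
class(m)  # "dgCMatrix"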
# 3. Split the training set into feature data and labels
train_data <- train$data
train_label <- train$label
summary(train_data)
## Length Class Mode
## 820638 dgCMatrix S4
summary(train_label)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4821 1.0000 1.0000
# 4. Build the model
model <- xgboost(data=train_data, label=train_label, max.depth=2, eta=1, nround=2, objective="binary:logistic")
## [0] train-error:0.046522
## [1] train-error:0.022263
# We ran two boosting iterations, and after each one the function printed the model's training error. The data here is a sparse matrix; ordinary dense matrices are supported as well. If the data file is too large to read into R, we can instead set data = 'path_to_file' so that xgboost reads and analyzes it directly from disk; at present, libsvm-format files can be read from disk this way.
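# A hedged sketch of the dense-matrix variant mentioned above: the same call
# with the sparse matrix converted to an ordinary matrix (slower and more
# memory-hungry, but functionally equivalent).
model_dense <- xgboost(data = as.matrix(train_data), label = train_label,
                       max.depth = 2, eta = 1, nround = 2,
                       objective = "binary:logistic")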
# 5. Predict
pred <- predict(model, test$data)
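# With objective = "binary:logistic", predict() returns probabilities, so a
# natural follow-up (assumed here, not shown in the original) is to threshold
# at 0.5 and compute the test error:
pred_label <- as.numeric(pred > 0.5)
mean(pred_label != test$label)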
# 6. Cross-validation: xgb.cv takes essentially the same arguments as the training function; we only need to add nfold on top of the existing parameters
cv.res <- xgb.cv(data=train$data, label=train$label, max.depth=2, eta=1, nround=2, objective="binary:logistic", nfold=5)
## [0] train-error:0.051091+0.011941 test-error:0.054043+0.011023
## [1] train-error:0.021188+0.001648 test-error:0.021650+0.005332
cv.res
## train.error.mean train.error.std test.error.mean test.error.std
## 1: 0.051091 0.011941 0.054043 0.011023
## 2: 0.021188 0.001648 0.021650 0.005332
# The cross-validation function returns its results as a data.table, which makes it convenient to monitor performance on the training and held-out folds and so choose the optimal number of iterations.
cv.res <- xgb.cv(data=train$data, label=train$label, max.depth=2, eta=1, nround=5, objective="binary:logistic", nfold=5)
## [0] train-error:0.050400+0.010525 test-error:0.056808+0.016615
## [1] train-error:0.021150+0.001853 test-error:0.021804+0.004787
## [2] train-error:0.010096+0.006705 test-error:0.009365+0.005675
## [3] train-error:0.014279+0.002215 test-error:0.013819+0.002922
## [4] train-error:0.006142+0.002176 test-error:0.005682+0.003101
cv.res
## train.error.mean train.error.std test.error.mean test.error.std
## 1: 0.050400 0.010525 0.056808 0.016615
## 2: 0.021150 0.001853 0.021804 0.004787
## 3: 0.010096 0.006705 0.009365 0.005675
## 4: 0.014279 0.002215 0.013819 0.002922
## 5: 0.006142 0.002176 0.005682 0.003101
# From the table above, the held-out error actually bottoms out at nround = 5 (0.005682); nround = 3 (0.009365) is only a local minimum before the bump at round 4. Note that these values vary with the random CV folds.
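# Rather than eyeballing the table, the best iteration can be read off
# programmatically and used to retrain (a sketch against the data.table
# columns shown above; the column names differ in newer xgboost versions):
best_nround <- which.min(cv.res$test.error.mean)
model_best <- xgboost(data = train$data, label = train$label,
                      max.depth = 2, eta = 1, nround = best_nround,
                      objective = "binary:logistic")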