1、caret包简介

caret是(Classification And REgression Training)的缩写，该函数集试图简化创建预测模型的过程，包括的工具如下：

1.数据分割
2.数据预处理
3.特征选择
4.模型训练与调优
5.变量重要性评估

2、数据分割

内容：
1.基于结果的简单拆分
2.基于预测因子的拆分
3.时间序列数据的拆分
4.使用重要组进行数据拆分

2.1 基于结果的简单拆分

createDataPartition()函数可用于创建平衡的数据拆分：如果参数y为因子变量，则函数在每个类别内分别进行随机抽样，从而保留数据的总体类别分布。
例如，要创建一个单一的80/20分割的鸢尾花数据:

> library(pacman)
> p_load(caret)
> 
> data(iris)
> 
> table(iris$Species)

## 
##     setosa versicolor  virginica 
##         50         50         50

> set.seed(123)
> ind <- createDataPartition(iris$Species,
+                            p = 0.8,
+                            # FALSE不将数据以列表方式返回
+                            list = F,
+                            # 创建拆分的数量
+                            times = 1)
> train <- iris[ind,]
> table(train$Species)

## 
##     setosa versicolor  virginica 
##         40         40         40

> test <- iris[-ind,]
> table(test$Species)

## 
##     setosa versicolor  virginica 
##         10         10         10

通过createDataPartition()函数实现的数据分割，当设置p=0.8时，就隐含了两层含义，即从总体中抽取80%的样本，同时在各个因子水平下也取80%的样本。

createResample()可用于生成简单的bootstrap有放回抽样样本。

> set.seed(123)
> ind2 <- createResample(iris$Species,
+                        # 指定抽样组数，默认为10组
+                        times = 1,
+                        list = T)
> ind2

## $Resample1
##   [1]   1   4   5   6   7   7   7   9  10  11  13  14  14  16  16  16  16  17
##  [19]  20  21  21  22  22  23  24  24  25  26  26  30  30  32  32  33  33  34
##  [37]  34  35  36  38  39  39  39  40  41  42  43  43  46  46  48  50  50  51
##  [55]  52  52  53  53  54  54  54  54  55  59  60  61  63  63  64  67  69  69
##  [73]  70  72  72  74  74  74  75  76  76  77  77  78  79  79  81  83  83  84
##  [91]  85  85  86  86  88  89  90  90  91  91  91  92  93  94  94  94  97  98
## [109]  99 102 103 104 106 107 107 108 109 109 110 110 111 112 113 116 117 118
## [127] 118 118 121 121 122 125 125 127 127 135 135 135 135 136 136 137 137 137
## [145] 137 140 141 142 142 143

可以看到抽取的样本中有重复的数据。
createFolds()可用于将一组数据划分为k个类别分布均衡的交叉验证子集（折）。

> ind3 <- createFolds(iris$Species,
+                     # 折数，默认为10
+                     k = 1,
+                     list = T,
+                     # 是否返回训练集的索引；默认为FALSE，即返回每一折（留出样本）的索引
+                     returnTrain = F)

与createFolds()相比，createMultiFolds(y, k = 10, times = 5)函数多了times参数，即重复进行k折划分的次数，默认为5。

2.2 基于预测因子的拆分

maxDissim() 函数使用最大差异方法创建子样本

> p_load(mlbench)
> data(BostonHousing)
> str(BostonHousing)

## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : num  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ b      : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

# 标准化
> boston.scale <- scale(BostonHousing[, c("age", "nox")])

> p_load(dplyr, ggplot2)
> # 先使用随机抽样
> samp <- sample(1:nrow(boston.scale), 5)
> 
> df.poor <- boston.scale[samp, ] %>% as_tibble()
>
> # 使用最大差异抽样
> max.dis <- maxDissim(df.poor, boston.scale[-samp, ], n = 20)
> df.max <- boston.scale[max.dis, ] %>% as_tibble()

使用5个初始随机样本，我们可以从数据中选择另外20个样本，这样新的样本与初始的5个样本是最不相同的。

> ggplot() + 
+     geom_point(data = subset(as_tibble(boston.scale),source = boston.scale), 
+         aes(age, nox), col = "gray60") + 
+     geom_point(data = subset(df.poor, source = boston.scale), 
+         aes(age, nox), col = "red", size = 2) + 
+     geom_point(data = subset(df.max, source = boston.scale), 
+         aes(age, nox), col = "blue", size = 2) + theme_bw()

抽样对比

红色是随机抽取的5个初始样本，蓝色是用最大差异法选出的、与红色样本差异最大的20个样本。

2.3 时间序列数据的拆分

> df <- read.csv("data_set/google.csv") %>% 
+   select(Date,Volume) %>% 
+   as_tibble()
> str(df)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1138 obs. of  2 variables:
##  $ Date  : Factor w/ 1138 levels "2006-02-13","2006-02-14",..: 1138 1137 1136 1135 1134 1133 1132 1131 1130 1129 ...
##  $ Volume: int  3925000 2671400 1890700 1305000 1934700 2204600 2223000 2074000 1872200 3319500 ...

> df$Date <- as.Date(df$Date)
> 
> split.df <- createTimeSlices(df$Date,
+                              # 训练集样本中连续值的数量
+                              initialWindow = 910,
+                              # 测试集样本中连续值的数量
+                              horizon = nrow(df) - 910,
+                              # 如果为FALSE，则训练集始终从第一个样本开始，并且训练集的大小将随数据拆分而变化
+                              fixedWindow = T)
> df.train <- df[split.df$train$Training910,]
> df.test <- df[split.df$test$Testing910,]
> dim(df.train)

## [1] 910   2

> dim(df.test)

## [1] 228   2

2.4 使用重要组进行数据拆分

在某些情况下，数据中存在重要的分组（定性）因素，在拆分或重采样时应加以考虑。例如同一受试者有多行观测记录时，应将其所有记录划入同一折，此时可以使用groupKFold()函数按组进行拆分。

> set.seed(123)
> subjects <- sample(1:20, size = 80, replace = T)
> table(subjects)

## subjects
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  1  3  5  2  5  3  6  5  5  6  3  4  3  6  5  3  3  4  5  3

# k的默认值为length(unique(group))，即组的个数
> folds <- groupKFold(subjects, k = 15)
> str(folds)

## List of 11
##  $ Fold01: int [1:74] 1 2 3 4 5 6 8 9 10 11 ...
##  $ Fold02: int [1:72] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Fold03: int [1:74] 1 2 3 4 6 7 8 9 10 11 ...
##  $ Fold04: int [1:70] 1 3 4 5 6 7 8 9 10 11 ...
##  $ Fold05: int [1:79] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Fold06: int [1:71] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Fold07: int [1:75] 2 3 4 5 6 7 8 9 10 11 ...
##  $ Fold08: int [1:77] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Fold09: int [1:75] 1 2 3 5 6 7 8 9 10 11 ...
##  $ Fold10: int [1:62] 1 2 3 4 5 7 10 12 13 14 ...
##  $ Fold11: int [1:71] 1 2 4 5 6 7 8 9 11 12 ...

3、可视化

featurePlot()函数是对lattice包中多种绘图函数的封装，用于对数据进行可视化。

3.1 散点图

> # 画图主题
> p_load(AppliedPredictiveModeling)
> transparentTheme(trans = 0.4)
> 
> # 散点图矩阵
> featurePlot(x = iris[, 1:4], y = iris$Species, plot = "pairs", auto.key = list(columns = 3))

散点图矩阵

> transparentTheme(trans = 0.9)
> # 带椭圆的散点图矩阵
> featurePlot(x = iris[, 1:4], y = iris$Species, plot = "ellipse", auto.key = list(columns = 3))

带椭圆的散点图矩阵

3.2 密度图

> # 密度图
> transparentTheme(trans = 0.9)
> featurePlot(x = iris[,1:4],
+             y = iris$Species,
+             plot = "density",
+             scales = list(x = list(relation = "free"),
+                           y = list(relation = "free")),
+             adjust = 1.5,
+             # 点的形状
+             pch = "|",
+             layout = c(4,1),
+             auto.key = list(columns = 3))

密度图

3.3 箱线图

> # 箱线图
> featurePlot(x = iris[, 1:4], 
+     y = iris$Species, 
+     plot = "box", 
+     scales = list(y = list(relation = "free"), 
+             x = list(rot = 90)), layout = c(4, 1), 
+             auto.key = list(columns = 3))

箱线图

3.4 散点图（回归问题）

当结果变量是连续型（即回归问题）时，featurePlot()函数可以为每个预测变量分别绘制其与结果变量的散点图：

> p_load(mlbench)
> data(BostonHousing)
> regVar <- c("age", "lstat", "tax")
> str(BostonHousing[, regVar])

## 'data.frame':    506 obs. of  3 variables:
##  $ age  : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ lstat: num  4.98 9.14 4.03 2.94 5.33 ...
##  $ tax  : num  296 242 242 222 222 222 311 311 311 311 ...

> # 画图主题
> theme1 <- trellis.par.get()
> theme1$plot.symbol$col = rgb(0.2, 0.2, 0.2, 0.4)
> theme1$plot.symbol$pch = 16
> theme1$plot.line$col = rgb(1, 0, 0, 0.7)
> theme1$plot.line$lwd <- 2
> trellis.par.set(theme1)
> 
> featurePlot(x = BostonHousing[, regVar], 
+     y = BostonHousing$medv, 
+     plot = "scatter", 
+     layout = c(3, 1))

各预测变量与结果变量的散点图

> # 添加平滑曲线（loess）
> featurePlot(x = BostonHousing[, regVar], 
+     y = BostonHousing$medv, 
+     plot = "scatter", 
+     type = c("p", "smooth"), 
+     span = 0.5, 
+     col = rgb(0.2, 0.2, 0.2, 0.4), 
+     pch = 1, 
+     layout = c(3, 1))

添加平滑曲线（loess）

54-caret包学习：数据分割与可视化