R绘图基础指南 | 3. 散点图(二)

scatter_2.jpg

3.6 添加回归模型拟合线

#  运行stat_smooth()函数并设定 method=lm 即可向散点图中添加线性回归拟合线
# 默认情况下 stat_smooth() 函数会为回归拟合线自动添加95% 的置信域，可以设置 level 参数对置信水平进行调整。设置 se = FALSE, 则不添加置信域
library(gcookbook) # For the data set
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn))

sp + geom_point() + stat_smooth(method=lm)
# 99% 置信域
sp + geom_point() + stat_smooth(method=lm, level=0.99)
# 没有置信域
sp + geom_point() + stat_smooth(method=lm, se=FALSE)
# 设置拟合线的颜色
sp + geom_point(colour="grey60") +
  stat_smooth(method=lm, se=FALSE, colour="black")

image-20210818214744424

# stat_smooth()函数默认的模型为 loess 曲线
sp + geom_point(colour="grey60") + stat_smooth()
sp + geom_point(colour="grey60") + stat_smooth(method=loess)

unnamed-chunk-65

# 分组绘制模型拟合线
sps <- ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
  geom_point() +
  scale_colour_brewer(palette="Set1")

sps + geom_smooth()
sps + geom_smooth(method=lm, se=FALSE, fullrange=TRUE)

image-20210819170729235

值得注意的是：loess()函数只能根据数据对应的x轴的范围进行预测。如果想基于数据集对拟合线进行外推，必须使用支持外推的函数，比如lm()，并将fullrange=TRUE参数传递给 stat_smooth() 函数。

3.7 根据已有模型向散点图添加拟合线

使用 lm() 函数建立一个以 ageYear 为预测变量对 heightIn 进行预测的模型。然后，调用 predict() 函数对 heightIn 进行预测。

model <- lm(heightIn ~ ageYear + I(ageYear^2), heightweight)
model

> model

Call:
lm(formula = heightIn ~ ageYear + I(ageYear^2), data = heightweight)

Coefficients:
 (Intercept)       ageYear  I(ageYear^2)  
    -10.3136        8.6673       -0.2478

# 创建一个 ageYear 列，并对其进行插值。
xmin <- min(heightweight$ageYear)
xmax <- max(heightweight$ageYear)
predicted <- data.frame(ageYear=seq(xmin, xmax, length.out=100))

# 计算 heightIn 的预测值
predicted$heightIn <- predict(model, predicted)
head(predicted)

> head(predicted)
   ageYear heightIn
1 11.58000 56.82624
2 11.63980 57.00047
3 11.69960 57.17294
4 11.75939 57.34363
5 11.81919 57.51255
6 11.87899 57.67969

# 将预测曲线绘制的数据点散点图上
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
  geom_point(colour="grey40")

sp + geom_line(data=predicted, size=1)

unnamed-chunk-71

# 应用定义的 predictvals() 函数可以简化向散点图添加模型拟合线的过程
predictvals <- function(model, xvar, yvar, xrange=NULL, samples=100, ...) {
  if (is.null(xrange)) {
    if (any(class(model) %in% c("lm", "glm")))
      xrange <- range(model$model[[xvar]])
    else if (any(class(model) %in% "loess"))
      xrange <- range(model$x)
  }
  
  newdata <- data.frame(x = seq(xrange[1], xrange[2], length.out = samples))
  names(newdata) <- xvar
  newdata[[yvar]] <- predict(model, newdata = newdata, ...)
  newdata
}

# 调用lm() 函数和 loess() 函数对数据集建立线性和LOESS模型
modlinear <- lm(heightIn ~ ageYear, heightweight)

modloess  <- loess(heightIn ~ ageYear, heightweight)


lm_predicted    <- predictvals(modlinear, "ageYear", "heightIn")
loess_predicted <- predictvals(modloess, "ageYear", "heightIn")

ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
  geom_point(colour="grey40") + 
  geom_line(data=lm_predicted, colour="red", size=.8) +
  geom_line(data=loess_predicted, colour="blue", size=.8)

unnamed-chunk-72

3.8 添加来自多个模型的拟合线

根据变量 sex 的水平对 heightweight 数据集进行分组，调用 lm() 函数对每组数据分别建立线性模型，并将模型结果放在一个列表内。随后，通过下面定义的 make_model() 函数建立模型。

make_model <- function(data) {
  lm(heightIn ~ ageYear, data)
}

# 将heighweight 数据集分别切分为男性和女性组并建立模型
ibrary(gcookbook) 
library(plyr)
models <- dlply(heightweight, "sex", .fun = make_model)

# 查看两个lm对象f和m组成的列表
models

> models
$f

Call:
lm(formula = heightIn ~ ageYear, data = data)

Coefficients:
(Intercept)      ageYear  
     43.963        1.209  


$m

Call:
lm(formula = heightIn ~ ageYear, data = data)

Coefficients:
(Intercept)      ageYear  
     30.658        2.301  


attr(,"split_type")
[1] "data.frame"
attr(,"split_labels")
  sex
1   f
2   m

predvals <- ldply(models, .fun=predictvals, xvar="ageYear", yvar="heightIn")
head(predvals)


ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
  geom_point() + geom_line(data=predvals)

unnamed-chunk-81

# 设置 xrange 参数使两组预测线对应的xz轴范围与整个数据集对应的x轴范围详谈
predvals <- ldply(models, .fun=predictvals, xvar="ageYear", yvar="heightIn",
                  xrange=range(heightweight$ageYear))


ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
  geom_point() + geom_line(data=predvals)

unnamed-chunk-82

3.9 向散点图添加模型系数

调用 annotate() 函数在图形中添加文本。

model <- lm(heightIn ~ ageYear, heightweight)
# 查看模型参数
summary(model)

> summary(model)

Call:
lm(formula = heightIn ~ ageYear, data = heightweight)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.3517 -1.9006  0.1378  1.9071  8.3371 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  37.4356     1.8281   20.48   <2e-16 ***
ageYear       1.7483     0.1329   13.15   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.989 on 234 degrees of freedom
Multiple R-squared:  0.4249,    Adjusted R-squared:  0.4225 
F-statistic: 172.9 on 1 and 234 DF,  p-value: < 2.2e-16

pred <- predictvals(model, "ageYear", "heightIn")
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point() +
  geom_line(data=pred)
# x,y参数设置标签位置
sp + annotate("text", label="r^2=0.42", x=16.5, y=52)
# parse = TRUE 调用R的数学表达式语法
sp + annotate("text", label="r^2 == 0.42", parse = TRUE, x=16.5, y=52)

image-20210819215348390

# 自动生成公式
eqn <- as.character(as.expression(
  substitute(italic(y) == a + b * italic(x) * "," ~~ italic(r)^2 ~ "=" ~ r2,
             list(a = format(coef(model)[1], digits=3),
                  b = format(coef(model)[2], digits=3),
                  r2 = format(summary(model)$r.squared, digits=2)
             ))))
eqn

parse(text=eqn)  # Parsing turns it into an expression

sp + annotate("text", label=eqn, parse=TRUE, x=Inf, y=-Inf, hjust=1.1, vjust=-.5)

unnamed-chunk-93

参考书籍

R Graphics Cookbook, 2nd edition.