3.6 添加回归模型拟合线
# 运行stat_smooth()函数并设定 method=lm 即可向散点图中添加线性回归拟合线
# 默认情况下 stat_smooth() 函数会为回归拟合线自动添加95% 的置信域,可以设置 level 参数对置信水平进行调整。设置 se = FALSE, 则不添加置信域
library(gcookbook) # For the data set
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn))
sp + geom_point() + stat_smooth(method=lm)
# 99% 置信域
sp + geom_point() + stat_smooth(method=lm, level=0.99)
# 没有置信域
sp + geom_point() + stat_smooth(method=lm, se=FALSE)
# 设置拟合线的颜色
sp + geom_point(colour="grey60") +
stat_smooth(method=lm, se=FALSE, colour="black")
# stat_smooth()函数默认的模型为 loess 曲线
sp + geom_point(colour="grey60") + stat_smooth()
sp + geom_point(colour="grey60") + stat_smooth(method=loess)
# 分组绘制模型拟合线
sps <- ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
geom_point() +
scale_colour_brewer(palette="Set1")
sps + geom_smooth()
sps + geom_smooth(method=lm, se=FALSE, fullrange=TRUE)
值得注意的是:loess()
函数只能根据数据对应的x轴的范围进行预测。如果想基于数据集对拟合线进行外推,必须使用支持外推的函数,比如lm()
,并将fullrange=TRUE
参数传递给 stat_smooth()
函数。
3.7 根据已有模型向散点图添加拟合线
使用 lm()
函数建立一个以 ageYear
为预测变量对 heightIn
进行预测的模型。然后,调用 predict()
函数对 heightIn
进行预测。
model <- lm(heightIn ~ ageYear + I(ageYear^2), heightweight)
model
> model
Call:
lm(formula = heightIn ~ ageYear + I(ageYear^2), data = heightweight)
Coefficients:
(Intercept) ageYear I(ageYear^2)
-10.3136 8.6673 -0.2478
# 创建一个 ageYear 列,并对其进行插值。
xmin <- min(heightweight$ageYear)
xmax <- max(heightweight$ageYear)
predicted <- data.frame(ageYear=seq(xmin, xmax, length.out=100))
# 计算 heightIn 的预测值
predicted$heightIn <- predict(model, predicted)
head(predicted)
> head(predicted)
ageYear heightIn
1 11.58000 56.82624
2 11.63980 57.00047
3 11.69960 57.17294
4 11.75939 57.34363
5 11.81919 57.51255
6 11.87899 57.67969
# 将预测曲线绘制的数据点散点图上
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
geom_point(colour="grey40")
sp + geom_line(data=predicted, size=1)
# 应用定义的 predictvals() 函数可以简化向散点图添加模型拟合线的过程
predictvals <- function(model, xvar, yvar, xrange=NULL, samples=100, ...) {
if (is.null(xrange)) {
if (any(class(model) %in% c("lm", "glm")))
xrange <- range(model$model[[xvar]])
else if (any(class(model) %in% "loess"))
xrange <- range(model$x)
}
newdata <- data.frame(x = seq(xrange[1], xrange[2], length.out = samples))
names(newdata) <- xvar
newdata[[yvar]] <- predict(model, newdata = newdata, ...)
newdata
}
# 调用lm() 函数和 loess() 函数对数据集建立线性和LOESS模型
modlinear <- lm(heightIn ~ ageYear, heightweight)
modloess <- loess(heightIn ~ ageYear, heightweight)
lm_predicted <- predictvals(modlinear, "ageYear", "heightIn")
loess_predicted <- predictvals(modloess, "ageYear", "heightIn")
ggplot(heightweight, aes(x=ageYear, y=heightIn)) +
geom_point(colour="grey40") +
geom_line(data=lm_predicted, colour="red", size=.8) +
geom_line(data=loess_predicted, colour="blue", size=.8)
3.8 添加来自多个模型的拟合线
根据变量 sex 的水平对 heightweight 数据集进行分组,调用 lm() 函数对每组数据分别建立线性模型,并将模型结果放在一个列表内。随后,通过下面定义的 make_model() 函数建立模型。
make_model <- function(data) {
lm(heightIn ~ ageYear, data)
}
# 将heighweight 数据集分别切分为男性和女性组并建立模型
ibrary(gcookbook)
library(plyr)
models <- dlply(heightweight, "sex", .fun = make_model)
# 查看两个lm对象f和m组成的列表
models
> models
$f
Call:
lm(formula = heightIn ~ ageYear, data = data)
Coefficients:
(Intercept) ageYear
43.963 1.209
$m
Call:
lm(formula = heightIn ~ ageYear, data = data)
Coefficients:
(Intercept) ageYear
30.658 2.301
attr(,"split_type")
[1] "data.frame"
attr(,"split_labels")
sex
1 f
2 m
predvals <- ldply(models, .fun=predictvals, xvar="ageYear", yvar="heightIn")
head(predvals)
ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
geom_point() + geom_line(data=predvals)
# 设置 xrange 参数使两组预测线对应的xz轴范围与整个数据集对应的x轴范围详谈
predvals <- ldply(models, .fun=predictvals, xvar="ageYear", yvar="heightIn",
xrange=range(heightweight$ageYear))
ggplot(heightweight, aes(x=ageYear, y=heightIn, colour=sex)) +
geom_point() + geom_line(data=predvals)
3.9 向散点图添加模型系数
调用 annotate()
函数在图形中添加文本。
model <- lm(heightIn ~ ageYear, heightweight)
# 查看模型参数
summary(model)
> summary(model)
Call:
lm(formula = heightIn ~ ageYear, data = heightweight)
Residuals:
Min 1Q Median 3Q Max
-8.3517 -1.9006 0.1378 1.9071 8.3371
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 37.4356 1.8281 20.48 <2e-16 ***
ageYear 1.7483 0.1329 13.15 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.989 on 234 degrees of freedom
Multiple R-squared: 0.4249, Adjusted R-squared: 0.4225
F-statistic: 172.9 on 1 and 234 DF, p-value: < 2.2e-16
pred <- predictvals(model, "ageYear", "heightIn")
sp <- ggplot(heightweight, aes(x=ageYear, y=heightIn)) + geom_point() +
geom_line(data=pred)
# x,y参数设置标签位置
sp + annotate("text", label="r^2=0.42", x=16.5, y=52)
# parse = TRUE 调用R的数学表达式语法
sp + annotate("text", label="r^2 == 0.42", parse = TRUE, x=16.5, y=52)
# 自动生成公式
eqn <- as.character(as.expression(
substitute(italic(y) == a + b * italic(x) * "," ~~ italic(r)^2 ~ "=" ~ r2,
list(a = format(coef(model)[1], digits=3),
b = format(coef(model)[2], digits=3),
r2 = format(summary(model)$r.squared, digits=2)
))))
eqn
parse(text=eqn) # Parsing turns it into an expression
sp + annotate("text", label=eqn, parse=TRUE, x=Inf, y=-Inf, hjust=1.1, vjust=-.5)
参考书籍
- R Graphics Cookbook, 2nd edition.