《统计学习导论-基于R应用》第三章:线性回归(代码)

库library

库:一组不含在基础R配置内的函数和数据集

library(MASS) # 加载库
library(ISLR)# 安装库 install.packages("ISLR") 

简单线性回归

fix(Boston) # open the Boston data set in the data editor (fix() saves any edits back to Boston)
names(Boston) # list the column names (the predictors plus the response, medv)
  1. 'crim'
  2. 'zn'
  3. 'indus'
  4. 'chas'
  5. 'nox'
  6. 'rm'
  7. 'age'
  8. 'dis'
  9. 'rad'
  10. 'tax'
  11. 'ptratio'
  12. 'black'
  13. 'lstat'
  14. 'medv'
?Boston # 查看数据集的更多信息

线性拟合

# Approach 1: name the data set explicitly through the `data` argument
lm.fit <- lm(medv ~ lstat, data = Boston)
# Approach 2: attach the data set first, then refer to its columns by bare name
attach(Boston)
lm.fit <- lm(medv ~ lstat)
lm.fit # auto-print the call and the coefficient estimates

Call:
lm(formula = medv ~ lstat)

Coefficients:
(Intercept)        lstat  
      34.55        -0.95  
summary(lm.fit) # 给出拟合函数各类信息

Call:
lm(formula = medv ~ lstat)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.168  -3.990  -1.318   2.034  24.500 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 34.55384    0.56263   61.41   <2e-16 ***
lstat       -0.95005    0.03873  -24.53   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.216 on 504 degrees of freedom
Multiple R-squared:  0.5441,	Adjusted R-squared:  0.5432 
F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
names(lm.fit) # 列出拟合函数存储的所有信息种类
  1. 'coefficients'
  2. 'residuals'
  3. 'effects'
  4. 'rank'
  5. 'fitted.values'
  6. 'assign'
  7. 'qr'
  8. 'df.residual'
  9. 'xlevels'
  10. 'call'
  11. 'terms'
  12. 'model'
coef(lm.fit) # 提取拟合参数(系数估计值)
(Intercept) 34.5538408793831
lstat -0.950049353757991
confint(lm.fit) # 系数估计值的置信区间
2.5 % 97.5 %
(Intercept) 33.448457 35.6592247
lstat -1.026148 -0.8739505
# Predict the response at the given predictor values and report the accompanying interval
predict(lm.fit, data.frame(lstat=(c(5,10,15))), interval = "confidence") # confidence interval
fit lwr upr
1 29.80359 29.00741 30.59978
2 25.05335 24.47413 25.63256
3 20.30310 19.73159 20.87461
predict(lm.fit, data.frame(lstat=(c(5,10,15))), interval = "prediction") # prediction interval
fit lwr upr
1 29.80359 17.565675 42.04151
2 25.05335 12.827626 37.27907
3 20.30310 8.077742 32.52846
plot(lstat,medv) # 散点图
abline(lm.fit) # 添加最小二乘回归直线

《统计学习导论-基于R应用》第三章:线性回归(代码)_第1张图片

# abline(a,b) 绘制截距为a,斜率为b的直线
plot(lstat, medv)
abline(lm.fit, lwd = 3) # 线宽

《统计学习导论-基于R应用》第三章:线性回归(代码)_第2张图片

plot(lstat, medv)
abline(lm.fit, lwd = 3, col = "red") # 颜色

《统计学习导论-基于R应用》第三章:线性回归(代码)_第3张图片

plot(lstat, medv,  col="red", pch = "+") #图形符号

《统计学习导论-基于R应用》第三章:线性回归(代码)_第4张图片

plot(lstat, medv,  col="red", pch = 1:3) #图形符号

《统计学习导论-基于R应用》第三章:线性回归(代码)_第5张图片

par(mfrow=c(2,2)) # 划分网格
plot(lm.fit) # 绘制诊断图

《统计学习导论-基于R应用》第三章:线性回归(代码)_第6张图片

plot(predict(lm.fit),residuals(lm.fit)) # 残差对拟合值的散点图

《统计学习导论-基于R应用》第三章:线性回归(代码)_第7张图片

残差图表明,数据中存在非线性关系。

plot(predict(lm.fit),rstudent(lm.fit)) # 学生化残差对拟合值的散点图

《统计学习导论-基于R应用》第三章:线性回归(代码)_第8张图片

plot(hatvalues(lm.fit)) # 杠杆统计量
which.max(hatvalues(lm.fit)) # 最大元素(杠杆统计量)的索引

375: 375

《统计学习导论-基于R应用》第三章:线性回归(代码)_第9张图片

多元线性回归

# Regress medv on two predictors, lstat and age
lm.fit <- lm(medv ~ lstat + age, data = Boston)
summary(lm.fit)

Call:
lm(formula = medv ~ lstat + age, data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.981  -3.978  -1.283   1.968  23.158 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 33.22276    0.73085  45.458  < 2e-16 ***
lstat       -1.03207    0.04819 -21.416  < 2e-16 ***
age          0.03454    0.01223   2.826  0.00491 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.173 on 503 degrees of freedom
Multiple R-squared:  0.5513,	Adjusted R-squared:  0.5495 
F-statistic:   309 on 2 and 503 DF,  p-value: < 2.2e-16
# Regress medv on every other column of Boston (`.` stands for all remaining variables)
lm.fit <- lm(medv ~ ., data = Boston)
summary(lm.fit)

Call:
lm(formula = medv ~ ., data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.595  -2.730  -0.518   1.777  26.199 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
zn           4.642e-02  1.373e-02   3.382 0.000778 ***
indus        2.056e-02  6.150e-02   0.334 0.738288    
chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
age          6.922e-04  1.321e-02   0.052 0.958229    
dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
black        9.312e-03  2.686e-03   3.467 0.000573 ***
lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.745 on 492 degrees of freedom
Multiple R-squared:  0.7406,	Adjusted R-squared:  0.7338 
F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16
?summary.lm # help page listing every component stored in a summary.lm object
# Component names are spelled out in full: an abbreviated name such as `$r.sq`
# only resolves through partial matching of `$`, which yields NULL as soon as
# the prefix becomes ambiguous.
summary(lm.fit)$r.squared # R^2
summary(lm.fit)$sigma # RSE (residual standard error)

0.740642664109409

4.74529818169963

library(car)
vif(lm.fit) # 方差膨胀因子
Loading required package: carData
crim 1.79219154743325
zn 2.29875817874944
indus 3.99159641834604
chas 1.07399532755379
nox 4.3937198475775
rm 1.93374443578326
age 3.10082551281534
dis 3.95594490637273
rad 7.48449633527449
tax 9.00855394759708
ptratio 1.7990840492489
black 1.34852107640638
lstat 2.94149107809193
# Fit on every predictor except one
# Option 1: subtract the unwanted term directly in the formula
lm.fit1 <- lm(medv ~ . - age, data = Boston)
lm.fit1

Call:
lm(formula = medv ~ . - age, data = Boston)

Coefficients:
(Intercept)         crim           zn        indus         chas          nox  
  36.436927    -0.108006     0.046334     0.020562     2.689026   -17.713540  
         rm          dis          rad          tax      ptratio        black  
   3.814394    -1.478612     0.305786    -0.012329    -0.952211     0.009321  
      lstat  
  -0.523852  
# Option 2: start from the existing fit and drop the term with update()
lm.fit1 <- update(lm.fit, ~ . - age)
lm.fit1

Call:
lm(formula = medv ~ crim + zn + indus + chas + nox + rm + dis + 
    rad + tax + ptratio + black + lstat, data = Boston)

Coefficients:
(Intercept)         crim           zn        indus         chas          nox  
  36.436927    -0.108006     0.046334     0.020562     2.689026   -17.713540  
         rm          dis          rad          tax      ptratio        black  
   3.814394    -1.478612     0.305786    -0.012329    -0.952211     0.009321  
      lstat  
  -0.523852  

交互项

summary(lm(medv~lstat:age, data = Boston)) # 对lstat和age的交互项拟合

Call:
lm(formula = medv ~ lstat:age, data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-13.347  -4.372  -1.534   1.914  27.193 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 30.1588631  0.4828240   62.46   <2e-16 ***
lstat:age   -0.0077146  0.0003799  -20.31   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.827 on 504 degrees of freedom
Multiple R-squared:  0.4501,	Adjusted R-squared:  0.449 
F-statistic: 412.4 on 1 and 504 DF,  p-value: < 2.2e-16
summary(lm(medv~lstat*age, data = Boston)) # `lstat*age` is shorthand for lstat + age + lstat:age, the form written out on the next line
summary(lm(medv~lstat+age+lstat:age, data = Boston)) # both main effects plus the lstat:age interaction, spelled out term by term

Call:
lm(formula = medv ~ lstat * age, data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.806  -4.045  -1.333   2.085  27.552 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 36.0885359  1.4698355  24.553  < 2e-16 ***
lstat       -1.3921168  0.1674555  -8.313 8.78e-16 ***
age         -0.0007209  0.0198792  -0.036   0.9711    
lstat:age    0.0041560  0.0018518   2.244   0.0252 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.149 on 502 degrees of freedom
Multiple R-squared:  0.5557,	Adjusted R-squared:  0.5531 
F-statistic: 209.3 on 3 and 502 DF,  p-value: < 2.2e-16

Call:
lm(formula = medv ~ lstat + age + lstat:age, data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.806  -4.045  -1.333   2.085  27.552 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 36.0885359  1.4698355  24.553  < 2e-16 ***
lstat       -1.3921168  0.1674555  -8.313 8.78e-16 ***
age         -0.0007209  0.0198792  -0.036   0.9711    
lstat:age    0.0041560  0.0018518   2.244   0.0252 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.149 on 502 degrees of freedom
Multiple R-squared:  0.5557,	Adjusted R-squared:  0.5531 
F-statistic: 209.3 on 3 and 502 DF,  p-value: < 2.2e-16

非线性变换

# Add a quadratic term in lstat. I() makes `^` act as arithmetic squaring
# rather than as a formula operator. medv and lstat resolve through the
# attach(Boston) call made earlier in the script.
lm.fit2 <- lm(medv ~ lstat + I(lstat^2))
# Summarise the quadratic model itself (lm.fit2); lm.fit holds a different,
# previously fitted model at this point.
summary(lm.fit2)

Call:
lm(formula = medv ~ lstat + I(lstat^2))

Residuals:
     Min       1Q   Median       3Q      Max 
-15.2834  -3.8313  -0.5295   2.3095  25.4148 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 42.862007   0.872084   49.15   <2e-16 ***
lstat       -2.332821   0.123803  -18.84   <2e-16 ***
I(lstat^2)   0.043547   0.003745   11.63   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.524 on 503 degrees of freedom
Multiple R-squared:  0.6407,	Adjusted R-squared:  0.6393 
F-statistic: 448.5 on 2 and 503 DF,  p-value: < 2.2e-16
# Refit the linear-only model, then compare it against the quadratic fit
lm.fit <- lm(medv ~ lstat)
anova(lm.fit, lm.fit2) # analysis-of-variance table contrasting the two fits
Res.Df RSS Df Sum of Sq F Pr(>F)
504 19472.38 NA NA NA NA
503 15347.24 1 4125.138 135.1998 7.630116e-28
par(mfrow=c(2,2))
plot(lm.fit2)

《统计学习导论-基于R应用》第三章:线性回归(代码)_第10张图片

# Fifth-degree polynomial in lstat, with the terms generated by poly()
lm.fit5 <- lm(medv ~ poly(lstat, 5))
summary(lm.fit5)

Call:
lm(formula = medv ~ poly(lstat, 5))

Residuals:
     Min       1Q   Median       3Q      Max 
-13.5433  -3.1039  -0.7052   2.0844  27.1153 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)       22.5328     0.2318  97.197  < 2e-16 ***
poly(lstat, 5)1 -152.4595     5.2148 -29.236  < 2e-16 ***
poly(lstat, 5)2   64.2272     5.2148  12.316  < 2e-16 ***
poly(lstat, 5)3  -27.0511     5.2148  -5.187 3.10e-07 ***
poly(lstat, 5)4   25.4517     5.2148   4.881 1.42e-06 ***
poly(lstat, 5)5  -19.2524     5.2148  -3.692 0.000247 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.215 on 500 degrees of freedom
Multiple R-squared:  0.6817,	Adjusted R-squared:  0.6785 
F-statistic: 214.2 on 5 and 500 DF,  p-value: < 2.2e-16
summary(lm(medv~log(rm),data = Boston)) # 对数变换

Call:
lm(formula = medv ~ log(rm), data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-19.487  -2.875  -0.104   2.837  39.816 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -76.488      5.028  -15.21   <2e-16 ***
log(rm)       54.055      2.739   19.73   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.915 on 504 degrees of freedom
Multiple R-squared:  0.4358,	Adjusted R-squared:  0.4347 
F-statistic: 389.3 on 1 and 504 DF,  p-value: < 2.2e-16

定性预测变量

fix(Carseats)
names(Carseats)
  1. 'Sales'
  2. 'CompPrice'
  3. 'Income'
  4. 'Advertising'
  5. 'Population'
  6. 'Price'
  7. 'ShelveLoc'
  8. 'Age'
  9. 'Education'
  10. 'Urban'
  11. 'US'
# Regress Sales on every other column plus two interaction terms
lm.fit <- lm(Sales ~ . + Income:Advertising + Price:Age, data = Carseats)
summary(lm.fit)

Call:
lm(formula = Sales ~ . + Income:Advertising + Price:Age, data = Carseats)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.9208 -0.7503  0.0177  0.6754  3.3413 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)         6.5755654  1.0087470   6.519 2.22e-10 ***
CompPrice           0.0929371  0.0041183  22.567  < 2e-16 ***
Income              0.0108940  0.0026044   4.183 3.57e-05 ***
Advertising         0.0702462  0.0226091   3.107 0.002030 ** 
Population          0.0001592  0.0003679   0.433 0.665330    
Price              -0.1008064  0.0074399 -13.549  < 2e-16 ***
ShelveLocGood       4.8486762  0.1528378  31.724  < 2e-16 ***
ShelveLocMedium     1.9532620  0.1257682  15.531  < 2e-16 ***
Age                -0.0579466  0.0159506  -3.633 0.000318 ***
Education          -0.0208525  0.0196131  -1.063 0.288361    
UrbanYes            0.1401597  0.1124019   1.247 0.213171    
USYes              -0.1575571  0.1489234  -1.058 0.290729    
Income:Advertising  0.0007510  0.0002784   2.698 0.007290 ** 
Price:Age           0.0001068  0.0001333   0.801 0.423812    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.011 on 386 degrees of freedom
Multiple R-squared:  0.8761,	Adjusted R-squared:  0.8719 
F-statistic:   210 on 13 and 386 DF,  p-value: < 2.2e-16
# R converts qualitative variables into dummy variables automatically; contrasts() returns the coding it uses
attach(Carseats)
contrasts(ShelveLoc)
Good Medium
Bad 0 0
Good 1 0
Medium 0 1

编写函数

# Load the ISLR and MASS libraries, then print a confirmation message.
# Takes no arguments.
LoadLibraries <- function() {
  library(ISLR)
  library(MASS)
  print("The libraries have been loaded.")
}
LoadLibraries # 查看函数内容
function () 
{
    library(ISLR)
    library(MASS)
    print("The libraries have been loaded.")
}
LoadLibraries() # 调用函数
[1] "The libraries have been loaded."

你可能感兴趣的:(概率与统计,小白学R,ISL,统计学习导论,线性回归,第三章)