R Language Notes (Jilin University)

Table of Contents

  • Summary of Common Methods and Usage Scenarios
  • R Language Review
      • Lecture 1
      • 1.1 Data Objects and Types
        • 1.1.1 Vectors
        • Common Statistical and Analysis Functions
        • Character Vectors
        • Logical Vectors
        • Vector Indexing and Subset Extraction
        • 1.1.2 Arrays and Matrices
        • Array and Matrix Operations
        • 1.1.3 Data Frames
    • Lecture 2
      • 2.1 Data Storage
      • 2.2 Data Import
      • 2.3 Programming Basics
      • 2.4 Plotting in R
        • 2.4.1 Scatter Plots
        • 2.4.2 Bar Charts
        • 2.4.3 Pie Charts
        • 2.4.4 Histograms
        • 2.4.5 Box Plots
    • Lecture 3
      • 3.1 Descriptive Statistics
      • 3.2 Correlation and Covariance
      • 3.3 Probability and Distributions
        • 3.3.1 Random Sampling
        • 3.3.2 Permutations and Combinations
        • 3.3.3 Probability Distributions
        • 3.3.4 Uniform Distribution
        • 3.3.5 Normal Distribution
    • Lecture 4
      • Z-test
      • 4.1 t-test
      • 4.2 F-test
      • 4.3 Nonparametric Hypothesis Tests
        • 4.3.1 Chi-squared Test of Independence
        • 4.3.2 Fisher's Exact Test
      • 4.4 Analysis of Variance
        • 4.4.1 Normality Test
        • 4.4.2 Homogeneity-of-Variance Test
        • 4.4.3 One-Way ANOVA
        • 4.4.4 Two-Way ANOVA
          • ANOVA without Interaction
          • ANOVA with Interaction
    • Lecture 5
      • 5.1 Regression Analysis
        • 5.1.2 Multiple Linear Regression
      • 5.2 Stepwise Regression
      • 5.3 Regression Diagnostics
        • 5.3.1 Residual Analysis
        • 5.3.2 Diagnosing Heteroscedasticity
        • 5.3.3 Outlier Detection
        • 5.3.4 Collinearity Diagnostics
      • 5.4 Generalized Linear Models
        • 5.4.1 Logistic Regression
    • Lecture 6
      • 6.1 Cluster Analysis
        • 6.1.1 Distance Computation
        • 6.1.2 Hierarchical Clustering
        • 6.1.3 k-means Clustering
      • 6.2 Discriminant Analysis
        • 6.2.1 Fisher's Discriminant (Linear Discriminant)
        • 6.2.2 Distance-Based Discriminant
        • 6.2.3 Quadratic (Curved) Discriminant
        • 6.2.4 Bayes Discriminant

Summary of Common Methods and Usage Scenarios

#NOTE: this hand-rolled z-test is flawed — the p-value is always taken from
#the upper tail, so alternative="less" is handled incorrectly, and the call
#below passes alpha=0.95 where 0.05 was meant (a corrected sketch follows the
#output; compare the proper call in Lecture 4)
#Hypothesis testing
z.test <- function(x, n, sigma, alpha, u0 = 0, alternative = "two.sided") {
  options(digits = 4)
  result <- list()
  mean <- mean(x)
  z <- (mean - u0) / (sigma / sqrt(n))
  p <- pnorm(z, lower.tail = FALSE)  # always the upper tail
  result$mean <- mean
  result$z <- z
  result$p.value <- p
  if (alternative == "two.sided") {
    result$p.value <- 2 * p
  } else if (alternative == "greater" | alternative == "less") {
    result$p.value <- p
  } else return("your input is wrong")
  result$conf.int <- c(
    mean - sigma * qnorm(1 - alpha / 2) / sqrt(n),
    mean + sigma * qnorm(1 - alpha / 2) / sqrt(n))
  result
}
z.test(0.13,25,0.1,0.95,u0=0.12,alternative="less")
$mean
[1] 0.13
$z
[1] 0.5
$p.value
[1] 0.3085375
$conf.int
[1] 0.1287459 0.1312541
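As flagged above, the function always uses the upper tail, so alternative="less" reports the wrong p-value. A minimal corrected sketch (my own fix, not from the course notes):

z.test2<-function(x,n,sigma,alpha=0.05,u0=0,alternative="two.sided"){
  z<-(mean(x)-u0)/(sigma/sqrt(n))
  p<-switch(alternative,
            two.sided=2*pnorm(abs(z),lower.tail=FALSE),
            greater  =pnorm(z,lower.tail=FALSE),
            less     =pnorm(z),                 #lower tail for "less"
            stop("your input is wrong"))
  half<-sigma*qnorm(1-alpha/2)/sqrt(n)          #half-width of the (1-alpha) CI
  list(mean=mean(x),z=z,p.value=p,
       conf.int=c(mean(x)-half,mean(x)+half))
}
z.test2(0.13,25,0.1,alpha=0.05,u0=0.12,alternative="less")
#p.value is now pnorm(0.5) = 0.6915, the correct lower-tail probability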
#packaged refined salt: one-sample t-test
x=c(490,506,508,502,498,511,510,515,512)
t.test(x,mu = 500)

	One Sample t-test

data:  x
t = 2.2, df = 8, p-value = 0.06
alternative hypothesis: true mean is not equal to 500
95 percent confidence interval:
 499.7 511.8
sample estimates:
mean of x 
    505.8 
#testing monthly living expenses
ttest=read.csv("TTest1.csv",header = TRUE)
t.test(ttest,mu = 1000)

	One Sample t-test

data:  ttest
t = -2.8, df = 584, p-value = 0.005
alternative hypothesis: true mean is not equal to 1000
95 percent confidence interval:
 568.9 922.2
sample estimates:
mean of x 
    745.5 
x=c(20.5,19.8,19.7,20.4,20.1,20.0,19.0,19.9)
y=c(20.7,19.8,19.5,20.8,20.4,19.6,20.2)
t.test(x,y,alternative = "two.sided")

	Welch Two Sample t-test

data:  x and y
t = -0.85, df = 12, p-value = 0.4
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.7768  0.3411
sample estimates:
mean of x mean of y 
    19.93     20.14 
t.test(salary,phd,var.equal = TRUE)#misuse: phd is a 1/2 group code, not a second sample — note "mean of y" below is 1.533; the correct formula call appears further down

	Two Sample t-test

data:  salary and phd
t = 29, df = 88, p-value <2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 7290 8373
sample estimates:
mean of x mean of y 
 7833.333     1.533 
ttest$phd
 [1] 1 1 1 1 1 1 2 2 1 2 2 2 2 1 1 1 1 1 1 1 1 1 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2
[38] 2 2 2 1 1 1 2 2
#monthly salary difference between the phd groups
ttest=read.csv("TTest1.csv",header = TRUE)
attach(ttest)
#the formula interface is used because phd is a categorical grouping variable:
#salary ~ phd splits salary by phd level and compares the two group means
t.test(salary~phd,data = ttest,var.equal = TRUE)
The following objects are masked from ttest (pos = 3):

    bgfriend, costofeat, face, gender, hoursofathlete, howdifficult,
    index, like, phd, ref, salary, star, totalvalue

	Two Sample t-test

data:  salary by phd
t = -3.4, df = 43, p-value = 0.002
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -2643.4  -660.2
sample estimates:
mean in group 1 mean in group 2 
           6952            8604 
t.test(like~phd,data = ttest,var.equal = TRUE)

	Two Sample t-test

data:  like by phd
t = -0.74, df = 43, p-value = 0.5
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.2431  0.5764
sample estimates:
mean in group 1 mean in group 2 
          5.333           5.667 
#paired t-test
x=c(20.5,18.8,19.8,20.9,21.5,19.5,21.0,21.2)
y=c(17.7,20.3,20.0,18.8,19.0,20.1,20.0,19.1)
t.test(x,y,var.equal = TRUE,paired = TRUE)

	Paired t-test

data:  x and y
t = 1.8, df = 7, p-value = 0.1
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.3214  2.3714
sample estimates:
mean of the differences 
                  1.025 
#comparing variances — the F-test
x<-c(20.5, 19.8, 19.7, 20.4, 20.1, 20.0, 19.0, 19.9)
y<-c(20.7, 19.8, 19.5, 20.8, 20.4, 19.6, 20.2)
var.test(x,y)

	F test to compare two variances

data:  x and y
F = 0.79, num df = 7, denom df = 6, p-value = 0.8
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
 0.1393 4.0600
sample estimates:
ratio of variances 
            0.7932 
#chi-squared test of independence
cold=matrix(c(43,28,96,84),nrow = 2,dimnames = list(c("drink","not drink"),c("colding","not colding")))
cold
          colding not colding
drink          43          96
not drink      28          84
#chi-squared independence test for the milk data
chisq.test(cold)#p > 0.05, so we fail to reject H0: drinking milk has no effect on catching a cold

	Pearson's Chi-squared test with Yates' continuity correction

data:  cold
X-squared = 0.8, df = 1, p-value = 0.4
library(vcd)
head(Arthritis)
Loading required package: grid
  ID Treatment  Sex Age Improved
1 57   Treated Male  27     Some
2 46   Treated Male  29     None
3 77   Treated Male  30     None
4 17   Treated Male  32   Marked
5 36   Treated Male  46   Marked
6 23   Treated Male  58   Marked

R Language Review

Lecture 1

  • Assignment
  • Basic operations

"+", "-", "*", "/", "^" (power),

sqrt() square root, exp() exponential function

  • Basic data types

Numeric: integer, single-precision real, double-precision real

Character

Complex (not covered)

Logical: FALSE, TRUE, NA (note the distinction from NaN, "not a number")

mode(x): get the type

length(): get the length

  • Base data-object types
#1. Simple assignment
#with =
n=10
n

10

#assignment with the assign() function
assign("a",20)
a

20

#2. Basic operations
((10+2)*5-2^4)/4#11

11

sqrt(3)+exp(-2)#square root and exponential

1.86738609080549

#3. Basic data types and methods
x=123
mode(x)

‘numeric’

length(x)

1

#logical type
flag=1>3
flag

FALSE

#Inf: positive infinity
x=5/0
x#infinite

Inf

Inf-Inf#NaN: not a number

NaN

1.1 Data Objects and Types


1.1.1 Vectors

  • fully described by a length and a type
  • all elements must be of the same type
  • recycling rule: when vectors have unequal lengths, the shorter one is recycled to match the longer
  • vector arithmetic

^, sqrt(), tan(): each operates element-wise

  • common statistical analysis functions
#1. Creating numeric vectors
# seq(from, to, by = step)
z=seq(1,5,by=0.5)

z
[1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
#the colon operator :
1:10
 [1]  1  2  3  4  5  6  7  8  9 10
1:10-1 #builds 1:10 first, then subtracts 1 from every element
[1] 0 1 2 3 4 5 6 7 8 9
# rep(x, times=, each=)
z=rep(2:5,2)#repeat the whole vector this many times
z
[1] 2 3 4 5 2 3 4 5
z=rep(1:2,each=2)#repeat each element twice
z
[1] 1 1 2 2
#2. Vector arithmetic
#vectors in one expression need not have equal lengths
x=1:5
y=1:4
x+y#y is recycled to the length of x
Warning message in x + y:
"longer object length is not a multiple of shorter object length"
[1] 2 4 6 8 6
sqrt(x)
[1] 1.000000 1.414214 1.732051 2.000000 2.236068

Common Statistical and Analysis Functions

  • Deviation: the distance between an observation and a reference point

  • Note that prod() can be used to compute permutation and combination counts

#compute the permutation count A(3) = 3!
prod(1:3)

6

  • Rank: i.e., an element's position in the sorted order

x=c(3,1,2,5)#ranks are 3,1,2,4 (5 is the largest, hence 4th)
rank(x);
[1] 3 1 2 4

Character Vectors

  • paste(x1,x2,sep="")#combines the vectors x1 and x2 element-wise
z=c("green","Kobe","lebrn")
z
[1] "green" "Kobe"  "lebrn"
labels=paste("x",1:10,sep="")
labels
 [1] "x1"  "x2"  "x3"  "x4"  "x5"  "x6"  "x7"  "x8"  "x9"  "x10"
labs=paste(c("X","Y"),1:10,sep ="")
labs
 [1] "X1"  "Y2"  "X3"  "Y4"  "X5"  "Y6"  "X7"  "Y8"  "X9"  "Y10"

Logical Vectors

x=1:10
flag=x>5#produces a logical vector
flag
 [1] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE

Vector Indexing and Subset Extraction

  • append [ ] to a vector to extract elements
x=c(1:10)
x[1:5]#take the first 5 elements
[1] 1 2 3 4 5
x[-(1:5)]#drop elements 1 through 5
[1]  6  7  8  9 10

1.1.2 Arrays and Matrices

  • Array: a data table with more than one dimension
  • Matrix: a special array (dim = 2)

array(x,dim=c())#data vector, dimensions

dimnames(A)=list(x1,x2)#row names, column names, ...

matrix(x,nc,nr,byrow=TRUE): the recycling rule still applies, but the number of data items must be a multiple of the number of columns

diag(n): creates an identity matrix or a diagonal matrix

#1. Creating arrays
a=array(1:12,dim=c(3,4))#filled column-by-column by default
a
     [,1] [,2] [,3] [,4]
[1,]    1    4    7   10
[2,]    2    5    8   11
[3,]    3    6    9   12
a=array(c(1:9),dim = c(3,3))#note the comma between the dimensions
print(a)
     [,1] [,2] [,3]
[1,]    1    4    7
[2,]    2    5    8
[3,]    3    6    9
dimnames(a)=list(c("a","b","e"),c("c","d","f"))#first component = row names, second = column names
a
  c d f
a 1 4 7
b 2 5 8
e 3 6 9
#2. Creating matrices

#matrix()
m=matrix(1:4,nr=2,nc=2)
m

     [,1] [,2]
[1,]    1    3
[2,]    2    4
#diag: identity matrix
m1=diag(3)
m1
     [,1] [,2] [,3]
[1,]    1    0    0
[2,]    0    1    0
[3,]    0    0    1
#diagonal matrix from a vector
v=1:5
m2=diag(v)
m2
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    0    0    0    0
[2,]    0    2    0    0    0
[3,]    0    0    3    0    0
[4,]    0    0    0    4    0
[5,]    0    0    0    0    5
#rectangular "diagonal" matrix
v=1:5
m3=diag(v,nr=5,nc=6)
m3
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    1    0    0    0    0    0
[2,]    0    2    0    0    0    0
[3,]    0    0    3    0    0    0
[4,]    0    0    0    4    0    0
[5,]    0    0    0    0    5    0

Array and Matrix Operations

  • extraction
  • replacement
  • arithmetic

t(): transpose

diag(): extract the diagonal elements

rbind(m1,m2): bind by rows

cbind(m1,m2): bind by columns

*: element-wise product

%*%: matrix (algebraic) product

m4=matrix(1:12,nr=3,nc=4)
m4
     [,1] [,2] [,3] [,4]
[1,]    1    4    7   10
[2,]    2    5    8   11
[3,]    3    6    9   12
#extraction
m4[c(1,2),c(2,3),drop=FALSE]#rows 1-2, columns 2-3
     [,1] [,2]
[1,]    4    7
[2,]    5    8
m4[-1]#drops the first ELEMENT, not the first row (use m4[-1,] for that)
 [1]  2  3  4  5  6  7  8  9 10 11 12
#replacement and insertion
m4[,3]=NA #replace the third column
m4

     [,1] [,2] [,3] [,4]
[1,]    1    4   NA   10
[2,]    2    5   NA   11
[3,]    3    6   NA   12
m4[is.na(m4)]=-1#replace every missing value with -1
m4
     [,1] [,2] [,3] [,4]
[1,]    1    4   -1   10
[2,]    2    5   -1   11
[3,]    3    6   -1   12
#matrix arithmetic

1.1.3 Data Frames

  • Creation

data.frame(INDEX=y,VALUE=x): the vectors must have the same length, or lengths that are integer multiples of each other

Reading from files: introduced later

  • Indexing and subsetting

Extraction and access

df[row, column]

df$colname: extract a column

subset(df, condition): extract the rows matching a condition

Adding new variables

df$colname=

df$colname=with(df, expression)

df=transform(df, colname=value, colname=value)

df=data.frame(INDEX=1:4,姓名=c("zhang","wang","qian","sun"),
            工龄= c(10,20,30,40))
df
INDEX 姓名 工龄
1 zhang 10
2 wang 20
3 qian 30
4 sun 40
teams <- c("PHI","NYM","FLA","ATL","WSN")
w<-c(92,89,94,72,59)
l<-c(70,73,77,90,102)
df1=data.frame(teams,w,l)
df1
teams w l
PHI 92 70
NYM 89 73
FLA 94 77
ATL 72 90
WSN 59 102
#extract a single element, like a coordinate pair
df1[1,2]

92

#use vectors to pick specific rows and columns
df1[c(1,2,4),c(1,2)]
teams w
1 PHI 92
2 NYM 89
4 ATL 72
#use variable names
df1[c(1,3),c("teams","w")]
teams w
1 PHI 92
3 FLA 94
#extract one column
df1$teams
[1] PHI NYM FLA ATL WSN
Levels: ATL FLA NYM PHI WSN
#extract the rows matching a condition
subset(df1,w>85)
teams w l
PHI 92 70
NYM 89 73
FLA 94 77
#adding new variables
#with $ — careful: w/w+l is (w/w)+l = 1+l because / binds tighter than +
df1$wr=w/w+l
df1
teams w l wr
PHI 92 70 71
NYM 89 73 74
FLA 94 77 78
ATL 72 90 91
WSN 59 102 103
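The wr column above came out as 1+l because of operator precedence. A one-line fix for what was presumably intended (a win ratio):

df1$wr=w/(w+l)#wins over total games, e.g. 92/(92+70) = 0.568 for PHI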
#with(): evaluates the expression using the data frame's columns
df1$wrr=with(df1,w/w+2*l)
df1
teams w l wr wrr
PHI 92 70 71 141
NYM 89 73 74 147
FLA 94 77 78 155
ATL 72 90 91 181
WSN 59 102 103 205
#transform(): add several variables at once
df1=transform(df1,rw=l/(w+l),playoff="Yes")
df1

teams w l wr wrr rw playoff
PHI 92 70 71 141 0.4320988 Yes
NYM 89 73 74 147 0.4506173 Yes
FLA 94 77 78 155 0.4502924 Yes
ATL 72 90 91 181 0.5555556 Yes
WSN 59 102 103 205 0.6335404 Yes

Lecture 2

  • Data storage

  • Data import

2.1 Data Storage

write.table(data,file="",row.names=F,quote=F)

data, file name, whether to write row names, whether to wrap variable names in quotes

save(): store as an R data file

#1. Data storage
df=data.frame(Index=c(1:5),grade=seq(10,50,10));
df
  Index grade
1     1    10
2     2    20
3     3    30
4     4    40
5     5    50
#save as a text file (.txt)
write.table(df,file = "test1",row.names = F,quote = F)#both default to TRUE
#save as a csv file
write.table(df,file = "test2.csv",sep = ",",row.names = F,quote = F)
#save as an R data file
save(df,file = "test3.Rdata")

2.2 Data Import

  • read.table()
  • read.csv() — separator ","
  • read.csv2() — separator ";"
  • read.delim() — separator: tab

All of the above accept the same set of parameters (parameter table omitted)

  • read.fwf(): read fixed-width data (see the sketch below)
  • read_excel(file,sheet=1) — sheet picks the worksheet
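read.fwf is listed above but never demonstrated in these notes; a minimal sketch (the file name "fixed.txt" and the field widths are made up for illustration):

#suppose each row of fixed.txt looks like "01123": a 2-char id then a 3-char score
df_fwf=read.fwf("fixed.txt",widths=c(2,3),col.names=c("id","score"))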
#1. Read a txt file
df=read.table("test1",header=TRUE)
df
  Index grade
1     1    10
2     2    20
3     3    30
4     4    40
5     5    50
#2. Read an excel file
library(readxl)
df1=read_excel("test4.xlsx",sheet=1)
df1[1:12,1:3]
New names:
* `` -> ...1
* `` -> ...5
* `` -> ...6
* `` -> ...7
* `` -> ...8
* ... and 43 more problems
...1 B7养老服务 B7-1
吉林省 72.89000 72.71000
长春市 77.47757 77.84353
吉林市 72.93831 71.76471
延边朝鲜族自治州 71.00000 69.76190
白城市 69.54338 69.77376
白山市 61.18881 60.57554
四平市 67.11039 65.95469
通化市 64.58101 64.21053
松原市 71.34831 72.02817
梅河口市 84.19492 84.15254
辽源市 71.55000 72.28000
长白山保护开发区 68.63014 68.21918

2.3 Programming Basics

  • Conditionals

if(condition) action else action

ifelse(condition,yes,no): vectorized if-else (the notes say "avoid it"; the failure below is a notebook parsing artifact, not a problem with ifelse itself)

  • Loops

for(i in 1:5)print(...) — a loop appears in the base-conversion sketch below

while(condition){...}

repeat{...}

  • Functions

fname=function(x,y){...}

fname=function(x=1,y=2): sets default argument values

#conditionals
x=2
if(x>0)x=x+1 else x=x+2
x

3

grade=85
ifelse(grade>80,"excellent","good")#evaluates to "excellent"; wrapping the branches in print() is unnecessary
#define a function (the exercise was to convert a base-b number to decimal;
#this placeholder just adds its arguments — see the sketch after this cell)
base=function(x,y){
     x+y
}
base(2,3)

5
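The base-b-to-decimal exercise the comment refers to is never implemented above. A sketch under that reading, which also shows the for loop listed earlier (the name base10 is my own):

base10=function(digits,b){       #digits: vector of base-b digits, most significant first
  val=0
  for(d in digits) val=val*b+d   #Horner's rule: shift one base-b place, add the digit
  val
}
base10(c(1,0,1,1),2)             #1011 in binary -> 11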

2.4 Plotting in R

  • scatter plots

  • bar charts

  • pie charts

  • histograms

  • box plots

  • multiple plots

2.4.1 Scatter Plots

plot()
function (x, y = NULL, type = "p", xlim = NULL, ylim = NULL,
log = "", main = NULL, sub = NULL, xlab = NULL, ylab = NULL,
ann = par("ann"), axes = TRUE, frame.plot = axes, panel.first = NULL,
panel.last = NULL, asp = NA, ...)

main: plot title
axes: whether to draw both axes

x=1:5
y=x*x+1;
plot(x,y,main = "scatter plot")


#read the coordinates of clicked points — this needs an open graphics device,
#hence the error when run in a fresh cell
locator(3)
Error in locator(3): plot.new has not been called yet
Traceback:

1. locator(3)
#a special scatter plot: all 25 point symbols (pch) and colors
u=1:25
plot(u~1,pch = u,col=u,cex=3)

2.4.2 Bar Charts

  • side-by-side (grouped) bars
  • stacked bars

barplot(las=2,...): cannot take a data frame directly — convert it to a matrix first

as.matrix(): converts its argument to a matrix

df=read.csv("student.csv");
df_m=as.matrix(df)
barplot(df_m)
Warning message in apply(height, 2L, cumsum):
"NAs introduced by coercion"

barplot(df_m,las=1)#las sets label orientation; las=2 puts labels perpendicular to the axes
Warning message in apply(height, 2L, cumsum):
"NAs introduced by coercion"

#grouped, horizontal bar chart
barplot(df_m,beside = TRUE,horiz = TRUE,legend=TRUE,cex.names = .75)
Error in -0.01 * height: non-numeric argument to binary operator
Traceback:


1. barplot(df_m, beside = TRUE, horiz = TRUE, legend = TRUE, cex.names = 0.75)

2. barplot.default(df_m, beside = TRUE, horiz = TRUE, legend = TRUE, 
 .     cex.names = 0.75)
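The error occurs because as.matrix() on a data frame with any character column yields a character matrix, and barplot() needs numbers. A hedged fix, assuming student.csv mixes a name column with numeric score columns:

num=as.matrix(df[sapply(df,is.numeric)])#keep only the numeric columns
barplot(num,beside = TRUE,horiz = TRUE,legend=TRUE,cex.names = .75)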

2.4.3 Pie Charts

pie(x, labels = names(x), edges = 200, radius = 0.8,
clockwise = FALSE, init.angle = if(clockwise) 90 else 0,
density = NULL, angle = 45, col = NULL, border = NULL,
lty = NULL, main = NULL, ...)

  • labels: a label for each slice
  • edges: the circle is approximated by a polygon with this many edges — larger is smoother
  • radius: radius of the circle; by default slices start at 3 o'clock and run counter-clockwise
  • clockwise: logical, FALSE = counter-clockwise, TRUE = clockwise
  • angle: slope of the shading lines
  • col: a vector of colors for filling the slices
  • border: color of the slice borders
catch=c(7752,1166,463,108)
#name the vector elements (used as slice labels)
names(catch)=c("新疆","周期","篮协","姚明")
pie(catch,cex=.6)

2.4.4 Histograms

  • frequency distribution plot

hist(x)

  • density plot

plot(density(x))

df=data.frame(姓名=c("s","z","s","t"),
                排名=1:4)
df
姓名 排名
s 1
z 2
s 3
t 4
#draw the histogram
hist(df$排名)

#density plot
plot(density(df$排名))

2.4.5 Box Plots

boxplot(x, ...)
Default S3 method:
boxplot(x, ..., range = 1.5, width = NULL, varwidth = FALSE,
notch = FALSE, outline = TRUE, names, plot = TRUE,
border = par("fg"), col = NULL, log = "",
pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5),
horizontal = FALSE, add = FALSE, at = NULL)

boxplot(df$排名)



A <- c(79.98, 80.04, 80.02, 80.04, 80.03, 80.03, 80.04, 79.97, 80.05, 80.03, 
80.02, 80.00, 80.02)
B <- c(80.02, 79.94, 79.98, 79.97, 79.97, 80.03, 79.95,79.97)
#box plot without notches
boxplot(A,B,names = c("A","B"),col=c(2:3))



#notch=T draws notched boxes
boxplot(A,B,notch = T,names = c("A","B"),col=c(2:3))
Warning message in bxp(list(stats = structure(c(80, 80.02, 80.03, 80.04, 80.05, :
"some notches went outside hinges ('box'): maybe set notch=FALSE"



Lecture 3

3.1 Descriptive Statistics

summary(): a rough overview of a dataset

mean(x): mean

range(): minimum and maximum together

quantile(x,probs=c()): the requested quantiles

fivenum(): minimum, 25% quantile, median, 75% quantile, maximum

IQR(): the 75%-25% interquartile range

str(): display the structure of an object

stem(x,scale,width,atom): stem-and-leaf view of a numeric vector's distribution; plot length, width, precision

sd() standard deviation, var() variance

attach(mtcars)
#mean
mean(mpg)

20.090625

#min and max
range(mpg)
[1] 10.4 33.9
#quantiles
quantile(mpg,probs = c(0,0.3,0.6,1))
   0%   30%   60%  100% 
10.40 15.98 21.00 33.90 
fivenum(mpg)
[1] 10.40 15.35 19.20 22.80 33.90
IQR(mpg)

7.375

summary(mpg)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  10.40   15.43   19.20   20.09   22.80   33.90 
str(mtcars)
'data.frame':	32 obs. of  11 variables:
 $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
 $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
 $ disp: num  160 160 108 258 360 ...
 $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
 $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
 $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
 $ qsec: num  16.5 17 18.6 19.4 17 ...
 $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
 $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
 $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
 $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
stem(subset(mtcars,mpg>15)$mpg)#stem-and-leaf of mpg for the rows of mtcars with mpg > 15
  The decimal point is at the |

  14 | 2258
  16 | 438
  18 | 17227
  20 | 00445
  22 | 88
  24 | 4
  26 | 03
  28 | 
  30 | 44
  32 | 49

sd(mpg)

6.0269480520891

var(mpg)

36.3241028225806

3.2 Correlation and Covariance

  • the notion of correlation
  • correlation analysis

Computing correlation coefficients

cor(x, y = NULL, use = "everything",
method = c("pearson", "kendall", "spearman"))

use: how NA values are handled — all.obs: error; everything: return NA; complete.obs: drop NAs

Correlation test

cor.test(x,y)

  • Covariance

cov(): same parameters as cor()

#1. Draw a scatter plot for a rough impression
x<-c(1.21, 1.30, 1.39, 1.42, 1.47, 1.56, 1.68, 1.72, 1.98, 2.10)
y<-c(3.90, 4.50, 4.20, 4.83, 4.16, 4.93, 4.32, 4.99, 4.70, 5.20)
level=data.frame(x,y)
plot(level)

#the plot suggests a positive correlation

#2. Correlation analysis: computing the Pearson coefficient
cor(x,y)#compute

0.68072825953986

#correlation test
cor.test(x,y)#p < 0.05, so reject H0 of zero correlation in favor of the alternative

	Pearson's product-moment correlation

data:  x and y
t = 2.6284, df = 8, p-value = 0.03025
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.08943359 0.91722701
sample estimates:
      cor 
0.6807283 
x1=1:5
y1=6:10
y2=10:14
cor(x1,y1 )

1

cor(x1,y2)#the result is the same: correlation is unaffected by a constant shift

1

cov(x,y)

0.0830455555555556

3.3 Probability and Distributions

  • Random sampling

sample(x,n,replace,prob)

replace=T: sampling with replacement; prob=y: the probabilities with which the elements of x are drawn

  • Permutations and combinations

prod(): product of a vector's elements, usable for permutation counts

choose(): combination counts

  • Probability distributions

3.3.1 Random Sampling

#equal-probability sampling without replacement
sample(1:52,10)
 [1] 25 39  7 11 49 30 22  8 24 14
#equal-probability sampling with replacement
sample(1:11,10,replace = T)
 [1]  9 11  9 10  1  8 11 11 11  4
#sampling with unequal probabilities
sample(1:5,2,prob = c(0.1,0.2,0.3,0.2,0.2))
[1] 2 5

3.3.2 Permutations and Combinations

prod(1:10)

3628800

choose(52,4)

270725

3.3.3 Probability Distributions

  • Bernoulli distribution: probability of success/failure in a single trial
  • Binomial distribution: repeated Bernoulli trials

binom(n,p): reduces to the Bernoulli distribution when n is 1

dbinom(x, size, prob): probability of exactly x successes

pbinom(q, size, prob): probability of at most q successes, i.e. the cumulative probability

qbinom(p, size, prob): the corresponding quantile x (see the sketch below)

rbinom(n, size, prob): the number of successes in each of n runs

  • Poisson distribution: the count of occurrences of an event

pois(k)

poisson.test(): hypothesis test based on the Poisson distribution
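A quick sketch of the d/p/q/r binomial family described above; for size=2, prob=0.5 the values are easy to check by hand:

dbinom(1,size=2,prob=0.5)    #P(X = 1)  = 0.5
pbinom(1,size=2,prob=0.5)    #P(X <= 1) = 0.75
qbinom(0.75,size=2,prob=0.5) #smallest x with P(X <= x) >= 0.75, i.e. 1
rbinom(5,size=10,prob=0.3)   #successes in each of 5 runs of 10 trials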

(figure omitted: slide 3-29 exercise, tested with binom.test below)

binom.test(x=23,n=1000,p=0.02,alternative="greater",conf.level=0.95)

	Exact binomial test

data:  23 and 1000
number of successes = 23, number of trials = 1000, p-value = 0.2778
alternative hypothesis: true probability of success is greater than 0.02
95 percent confidence interval:
 0.01576927 1.00000000
sample estimates:
probability of success 
                 0.023 

(figure omitted: exercise tested with poisson.test below)

#Poisson distribution
poisson.test(x=60,T = 50,alternative = "greater",conf.level = 0.95)

	Exact Poisson test

data:  60 time base: 50
number of events = 60, time base = 50, p-value = 0.09227
alternative hypothesis: true event rate is greater than 1
95 percent confidence interval:
 0.9570464       Inf
sample estimates:
event rate 
       1.2 


3.3.4 Uniform Distribution

unif(a,b): uniform distribution on [a,b]

#generate a vector of 10 uniform random numbers
runif(10)
 [1] 0.5307315 0.3139859 0.2454645 0.1198640 0.1093357 0.7106377 0.2012557
 [8] 0.3001853 0.0965774 0.1099896
runif(4,min = 85,max = 95)
[1] 92.85100 86.23296 93.13140 86.83527

3.3.5 Normal Distribution

norm()
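The same d/p/q prefixes apply here; two standard-normal values worth keeping in mind:

qnorm(0.975) #1.96, the quantile behind every 95% interval in these notes
pnorm(1.96)  #0.975, the reverse direction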

#generate normally distributed random numbers
rnorm(10,mean = 1,sd = 10)
 [1]   0.2197130  -1.3127702   0.9096391  -7.3883404  13.1470032   4.0198189
 [7]  -8.6846574 -10.9816259   2.4959037  -9.1094635
curve(sin,-2*pi,2*pi)



#normal Q-Q plot of a continuous sample
qqnorm(rnorm(1000))



Lecture 4

  • z-test: write it yourself

  • t-test (comparing means): test of the mean of one normal population, and of equality of the means of two normal populations

  • F-test: test of variances (var.test())

t.test(x, y = NULL,
alternative = c("two.sided", "less", "greater"),
mu = 0, paired = FALSE, var.equal = FALSE,
conf.level = 0.95, ...)
Z-test

#z-test (the same hand-rolled function as in the summary section; the same caveat about the "less" tail applies)
z.test <- function(x, n, sigma, alpha, u0 = 0, alternative = "two.sided") {
  options(digits = 4)
  result <- list()
  mean <- mean(x)
  z <- (mean - u0) / (sigma / sqrt(n))
  p <- pnorm(z, lower.tail = FALSE)  # always the upper tail
  result$mean <- mean
  result$z <- z
  result$p.value <- p
  if (alternative == "two.sided") {
    result$p.value <- 2 * p
  } else if (alternative == "greater" | alternative == "less") {
    result$p.value <- p
  } else return("your input is wrong")
  result$conf.int <- c(
    mean - sigma * qnorm(1 - alpha / 2) / sqrt(n),
    mean + sigma * qnorm(1 - alpha / 2) / sqrt(n))
  result
}
z.test(0.13,25,0.1,0.05,u0 = 0.12,alternative = "less")
$mean
[1] 0.13
$z
[1] 0.5
$p.value
[1] 0.3085375
$conf.int
[1] 0.09080072 0.16919928

4.1 t-test

  • one normal population: test u == c
  • two normal populations (unpaired): compare the means u1, u2
  • paired data: test the mean of the differences
#test a single population mean u==c
salt=c(490 , 506, 508, 502, 498, 511, 510, 515 , 512)
t.test(salt,mu = 500)

	One Sample t-test

data:  salt
t = 2.2, df = 8, p-value = 0.06
alternative hypothesis: true mean is not equal to 500
95 percent confidence interval:
 499.7 511.8
sample estimates:
mean of x 
    505.8 
#compare two population means: u1 == u2
x<-c(20.5, 19.8, 19.7, 20.4, 20.1, 20.0, 19.0, 19.9)
y<-c(20.7, 19.8, 19.5, 20.8, 20.4, 19.6, 20.2)
t.test(x,y,var.equal = TRUE)

	Two Sample t-test

data:  x and y
t = -0.85, df = 13, p-value = 0.4
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.7684  0.3327
sample estimates:
mean of x mean of y 
    19.93     20.14 
#paired t-test, using the paired argument
x<-c(20.5, 18.8, 19.8, 20.9, 21.5, 19.5, 21.0, 21.2)
y<-c(17.7, 20.3, 20.0, 18.8, 19.0, 20.1, 20.0, 19.1)
t.test(x,y,var.equal = TRUE,paired = TRUE)

	Paired t-test

data:  x and y
t = 1.8, df = 7, p-value = 0.1
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.3214  2.3714
sample estimates:
mean of the differences 
                  1.025 

4.2 F-test

var.test()

4.3 Nonparametric Hypothesis Tests

  • chi-squared test of independence: valid when fewer than 20% of cells have expected counts below 5

chisq.test()

  • Fisher's exact test

4.3.1 Chi-squared Test of Independence

  • H0: the two variables are unrelated
#chi-squared test of independence
compare=matrix(c(60,32,3,11),nr=2,dimnames = list(c("cancer","normal"),c("smoke","Not smoke")))
chisq.test(compare)

	Pearson's Chi-squared test with Yates' continuity correction

data:  compare
X-squared = 7.9, df = 1, p-value = 0.005

4.3.2 Fisher's Exact Test

#Fisher's exact test
fisher.test(compare,alternative = "greater")

	Fisher's Exact Test for Count Data

data:  compare
p-value = 0.002
alternative hypothesis: true odds ratio is greater than 1
95 percent confidence interval:
 1.95  Inf
sample estimates:
odds ratio 
     6.747 

4.4 Analysis of Variance

ANOVA does not analyze variances for their own sake; it compares means by decomposing the variance

  • normality test
  • homogeneity-of-variance test
  • one-way ANOVA: aov()
  • two-way ANOVA

4.4.1 Normality Test

  • H0: the data are normally distributed
shapiro.test(rnorm(1000))

	Shapiro-Wilk normality test

data:  rnorm(1000)
W = 1, p-value = 0.6

4.4.2 Homogeneity-of-Variance Test

  • H0: the variance is the same at every factor level
  • Bartlett test
  • Levene test
bartlett.test(X~A,data = miscellany)#the p-value is large, so the variances are equal

	Bartlett test of homogeneity of variances

data:  X by A
Bartlett's K-squared = 7.1, df = 4, p-value = 0.1
library(car)
Loading required package: carData
#Levene test (leveneTest from the car package)
leveneTest(miscellany$X,miscellany$A)
      Df F value Pr(>F)
group  4   1.737 0.1942
      15

4.4.3 One-Way ANOVA

aov.is= aov()

summary(aov.is)

X<-c(25.6, 22.2, 28.0, 29.8, 24.4, 30.0, 29.0, 27.5, 25.0, 27.7,
23.0, 32.2, 28.8, 28.0, 31.5, 25.9, 20.6, 21.2, 22.0, 21.2)
A=factor(rep(1:5,each=4))#a factor stores categorical data
A
 [1] 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5
Levels: 1 2 3 4 5
#build the data frame
miscellany=data.frame(X,A)
#fit the ANOVA model with aov()
aov.mis=aov(X~A,data = miscellany)#X holds the measurements, A the method labels
summary(aov.mis)

            Df Sum Sq Mean Sq F value Pr(>F)  
A            4    132    33.0    4.31  0.016 *
Residuals   15    115     7.7                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Df: degrees of freedom; Sum Sq: sum of squares; Mean Sq: mean square; F value: the F statistic; Pr(>F): the p-value.
Here p = 0.016 < 0.05, so we reject H0: the five removal methods do not all have the same effect.

4.4.4 Two-Way ANOVA

Compared with the one-way model X~A:

  • ANOVA without interaction

fitted with aov() as well; the model is X ~ A + B (model formula figure omitted)

  • A and B are the factors, X the response
  • ANOVA with interaction: the model is X ~ A * B (model formula figure omitted)

ANOVA without Interaction

gl(x, k, length, labels, ordered)

  • x: number of levels
  • k: number of repetitions
  • length: length of the result
  • labels: labels for the levels (optional)
  • ordered: boolean, whether to order the levels


 juice<-data.frame(
 X = c(0.05, 0.46, 0.12, 0.16, 0.84, 1.30, 0.08, 0.38, 0.4,
 0.10, 0.92, 1.57, 0.11, 0.43, 0.05, 0.10, 0.94, 1.10,
 0.11, 0.44, 0.08, 0.03, 0.93, 1.15),
 A = gl(4, 6),# levels 1 to 4, each repeated 6 times
 B = gl(6, 1, 24)# levels 1 to 6, once each, recycled to length 24
     
 )
juice
X A B
0.05 1 1
0.46 1 2
0.12 1 3
0.16 1 4
0.84 1 5
1.30 1 6
0.08 2 1
0.38 2 2
0.40 2 3
0.10 2 4
0.92 2 5
1.57 2 6
0.11 3 1
0.43 3 2
0.05 3 3
0.10 3 4
0.94 3 5
1.10 3 6
0.11 4 1
0.44 4 2
0.08 4 3
0.03 4 4
0.93 4 5
1.15 4 6
avo.juice=aov(X~A+B,data=juice)
summary(avo.juice)#A: p = 0.22 > 0.05, no significant A effect; B is significant
            Df Sum Sq Mean Sq F value Pr(>F)    
A            3   0.06   0.019    1.63   0.22    
B            5   4.90   0.980   83.98  2e-10 ***
Residuals   15   0.18   0.012                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ANOVA with Interaction


rats=data.frame(Time=c(0.31, 0.45, 0.46, 0.43, 0.82, 1.10, 0.88, 0.72, 0.43, 0.45,
 0.63, 0.76, 0.45, 0.71, 0.66, 0.62, 0.38, 0.29, 0.40, 0.23,
 0.92, 0.61, 0.49, 1.24, 0.44, 0.35, 0.31, 0.40, 0.56, 1.02,
 0.71, 0.38, 0.22, 0.21, 0.18, 0.23, 0.30, 0.37, 0.38, 0.29,
 0.23, 0.25, 0.24, 0.22, 0.30, 0.36, 0.31, 0.33),
 Toxicant=gl(3, 16, 48, labels = c("I", "II", "III")),
 Cure=gl(4, 4, 48, labels = c("A", "B", "C", "D")))
rats
Time Toxicant Cure
0.31 I A
0.45 I A
0.46 I A
0.43 I A
0.82 I B
1.10 I B
0.88 I B
0.72 I B
0.43 I C
0.45 I C
0.63 I C
0.76 I C
0.45 I D
0.71 I D
0.66 I D
0.62 I D
0.38 II A
0.29 II A
0.40 II A
0.23 II A
0.92 II B
0.61 II B
0.49 II B
1.24 II B
0.44 II C
0.35 II C
0.31 II C
0.40 II C
0.56 II D
1.02 II D
0.71 II D
0.38 II D
0.22 III A
0.21 III A
0.18 III A
0.23 III A
0.30 III B
0.37 III B
0.38 III B
0.29 III B
0.23 III C
0.25 III C
0.24 III C
0.22 III C
0.30 III D
0.36 III D
0.31 III D
0.33 III D
aov.rat=aov(Time~Toxicant*Cure,data = rats)
summary(aov.rat)
              Df Sum Sq Mean Sq F value  Pr(>F)    
Toxicant       2  1.036   0.518   23.23 3.3e-07 ***
Cure           3  0.915   0.305   13.67 4.1e-06 ***
Toxicant:Cure  6  0.248   0.041    1.85    0.12    
Residuals     36  0.803   0.022                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The three rows give the effects of the toxicant, the cure, and their interaction on survival time; the interaction is not significant (p = 0.12).

Lecture 5

  • Regression analysis

5.1 Regression Analysis

lm(formula, data, subset, weights, na.action,
method = "qr", model = TRUE, x = FALSE, y = FALSE, qr = TRUE,
singular.ok = TRUE, contrasts = NULL, offset, ...)

formula specifies the regression model; data is the data frame; subset is a subset of the observations; weights is a weight vector for the fit; na.action says how missing values are handled; method is the fitting method; model, x, y, qr are logicals — if TRUE the corresponding component is returned. Only formula is required; everything else is optional.

confint(object,parm,level=0.95...): model, parameters to estimate, confidence level (confidence intervals for the regression coefficients)

predict(object,x,interval="prediction",level)#interval="prediction" also returns the prediction interval

x<-c(318, 910, 200, 409, 415, 502, 314, 1210, 1022, 1225)
y<-c(524, 1019, 638, 815, 913, 928, 605, 1516, 1219, 1624)
lm.reg=lm(y~x)
summary(lm.reg)

Call:
lm(formula = y ~ x)

Residuals:
   Min     1Q Median     3Q    Max 
-191.8  -87.0   44.8   77.9  145.7 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  395.567     80.261    4.93   0.0012 ** 
x              0.896      0.107    8.40  3.1e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 127 on 8 degrees of freedom
Multiple R-squared:  0.898,	Adjusted R-squared:  0.886 
F-statistic: 70.6 on 1 and 8 DF,  p-value: 3.06e-05
confint(lm.reg,level = 0.95)
             2.5 %  97.5 %
(Intercept) 210.48 580.650
x             0.65   1.142
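predict() is described above but never run; a sketch continuing with lm.reg (the new x value 1000 is arbitrary):

predict(lm.reg,data.frame(x=1000),interval="prediction",level=0.95)
#returns fit, lwr, upr: the point prediction and its 95% prediction interval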

5.1.2 Multiple Linear Regression

  • model fitting
  • significance of individual coefficients: the coefficients' p-values
  • significance of the whole equation: the F statistic's p-value
#model fitting
y<-c(162, 120, 223, 131, 67, 169, 81, 192, 116, 55,
 252, 232, 144, 103, 212)
 x1<-c(274, 180, 375, 205, 86, 265, 98, 330, 195, 53,
 430, 372, 236, 157, 370)
 x2<-c(2450, 3250, 3802, 2838, 2347, 3782, 3008, 2450,
 2137, 2560, 4020, 4427, 2660, 2088, 2605)
sales=data.frame(y,x1,x2)
lm.reg=lm(y~x1+x2,data = sales)
summary(lm.reg)

Call:
lm(formula = y ~ x1 + x2, data = sales)

Residuals:
   Min     1Q Median     3Q    Max 
-3.831 -1.206 -0.244  1.482  3.302 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 3.445728   2.426693    1.42     0.18    
x1          0.495972   0.006046   82.04  < 2e-16 ***
x2          0.009205   0.000967    9.52  6.1e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.17 on 12 degrees of freedom
Multiple R-squared:  0.999,	Adjusted R-squared:  0.999 
F-statistic: 5.7e+03 on 2 and 12 DF,  p-value: <2e-16

Conclusion: the coefficient p-values and the F-test p-value are all tiny, so both the equation and the coefficients are significant.

 y<-c(11.2, 8.8, 12.3, 11.6, 13.4, 18.3, 11.1, 12.1, 9.6, 8.4, 9.3, 10.6, 8.4, 9.6, 
10.9, 10.1,14.8, 9.1, 10.8, 10.2, 13.6, 14.9, 16.0, 13.2,20.0, 13.3, 10.4)
x1<-c(5.68, 3.79, 6.02, 4.85, 4.60, 6.05, 4.90, 7.08,3.85,4.65, 4.59, 4.29, 7.97, 
6.19, 6.13, 5.71,6.40,6.06, 5.09, 6.13, 5.78, 5.43, 6.50, 7.98,11.54,5.84, 3.84)
 x2<-c(1.90, 1.64, 3.56, 1.07, 2.32, 0.64, 8.50, 3.00,2.11, 0.63, 1.97, 1.97, 
1.93, 1.18, 2.06, 1.78,2.40, 3.67, 1.03, 1.71, 3.36, 1.13, 6.21, 7.92,10.89, 0.92, 
1.20)
x3<-c(4.53, 7.32, 6.95, 5.88, 4.05, 1.42, 12.60, 6.75,16.28, 6.59, 3.61, 6.61, 
7.57, 1.42, 10.35, 8.53,4.53,12.79, 2.53, 5.28, 2.96, 4.31, 3.47, 3.37,1.20, 8.61, 
6.45)
x4<-c(8.2, 6.9, 10.8, 8.3, 7.5, 13.6, 8.5, 11.5,7.9, 7.1, 8.7, 7.8, 9.9, 6.9, 
10.5, 8.0,10.3, 7.1, 8.9, 9.9, 8.0, 11.3, 12.3, 9.8, 10.5, 6.4, 9.6)
blood=data.frame(y,x1,x2,x3,x4)
lm.reg=lm(y~x1+x2+x3+x4,data = blood)
summary(lm.reg)

Call:
lm(formula = y ~ x1 + x2 + x3 + x4, data = blood)

Residuals:
   Min     1Q Median     3Q    Max 
-3.627 -1.200 -0.228  1.539  4.447 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)  
(Intercept)    5.943      2.829    2.10    0.047 *
x1             0.142      0.366    0.39    0.701  
x2             0.351      0.204    1.72    0.099 .
x3            -0.271      0.121   -2.23    0.036 *
x4             0.638      0.243    2.62    0.016 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.01 on 22 degrees of freedom
Multiple R-squared:  0.601,	Adjusted R-squared:  0.528 
F-statistic: 8.28 on 4 and 22 DF,  p-value: 0.000312

Conclusion: the coefficients are only weakly significant (x1 in particular) — the motivation for the stepwise selection that follows.

5.2 Stepwise Regression

Builds on the OLS linear model

step(object, scope, scale = 0, direction = c("both", "backward", "forward"),
trace = 1, keep = NULL, steps = 1000, k = 2, ...)

object is the result of a linear or generalized linear model fit; scope determines the region of the stepwise search; direction sets the search direction: "both" is bidirectional search, "backward" is backward elimination, "forward" is forward selection; the default is both.

lm.step=step(lm.reg)
summary(lm.step)
Start:  AIC=42.16
y ~ x1 + x2 + x3 + x4

       Df Sum of Sq   RSS  AIC
- x1    1      0.61  89.5 40.3
               88.8 42.2
- x2    1     11.96 100.8 43.6
- x3    1     20.06 108.9 45.7
- x4    1     27.79 116.6 47.5

Step:  AIC=40.34
y ~ x2 + x3 + x4

       Df Sum of Sq   RSS  AIC
               89.5 40.3
- x3    1      25.7 115.1 45.2
- x2    1      26.5 116.0 45.4
- x4    1      32.3 121.7 46.7




Call:
lm(formula = y ~ x2 + x3 + x4, data = blood)

Residuals:
   Min     1Q Median     3Q    Max 
-3.269 -1.231 -0.202  1.489  4.657 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)    6.500      2.396    2.71   0.0124 * 
x2             0.402      0.154    2.61   0.0156 * 
x3            -0.287      0.112   -2.57   0.0171 * 
x4             0.663      0.230    2.88   0.0084 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.97 on 23 degrees of freedom
Multiple R-squared:  0.598,	Adjusted R-squared:  0.546 
F-statistic: 11.4 on 3 and 23 DF,  p-value: 8.79e-05

The AIC is 42.16; removing x1 lowers it to 40.34, which is the smallest AIC among all single deletions, so R drops x1 and starts the next round. In the next round removing any further variable would raise the AIC, so R stops, giving the "best" regression equation.

#extract the summary again — this is the model after selection
summary(lm.step)

Call:
lm(formula = y ~ x2 + x3 + x4, data = blood)

Residuals:
   Min     1Q Median     3Q    Max 
-3.269 -1.231 -0.202  1.489  4.657 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)    6.500      2.396    2.71   0.0124 * 
x2             0.402      0.154    2.61   0.0156 * 
x3            -0.287      0.112   -2.57   0.0171 * 
x4             0.663      0.230    2.88   0.0084 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.97 on 23 degrees of freedom
Multiple R-squared:  0.598,	Adjusted R-squared:  0.546 
F-statistic: 11.4 on 3 and 23 DF,  p-value: 8.79e-05

5.3 Regression Diagnostics

Mainly checks whether the regression assumptions hold:

  • are the residuals random, normal, and free of heteroscedasticity;
  • do highly correlated predictors cause collinearity;
  • is the functional form wrong, or is an important predictor missing;
  • are there outliers in the sample.

Main tools

  • Residual analysis (each function takes the fitted model object):

residuals(): compute residuals

rstandard(): standardized residuals

rstudent(): studentized residuals

predict(): fitted/predicted values

update(): update the model

coef(): extract the coefficient estimates

  • Influence analysis
  • Collinearity diagnostics

5.3.1 Residual Analysis

#compute residuals
y<-c(162, 120, 223, 131, 67, 169, 81, 192, 116, 55, 252, 232, 144, 103, 212)
x1<-c(274, 180, 375, 205, 86, 265, 98, 330, 195, 53,430, 372, 236, 157, 370)
x2<-c(2450, 3250, 3802, 2838, 2347, 3782, 3008, 2450,2137, 2560, 4020, 4427, 
2660, 2088, 2605)
sales<-data.frame(y,x1,x2)
lm.reg=lm(y~x1+x2,data = sales)
summary(lm.reg)

Call:
lm(formula = y ~ x1 + x2, data = sales)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.8312 -1.2063 -0.2436  1.4819  3.3025 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 3.4457284  2.4266934   1.420    0.181    
x1          0.4959724  0.0060455  82.039  < 2e-16 ***
x2          0.0092049  0.0009668   9.521 6.07e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.173 on 12 degrees of freedom
Multiple R-squared:  0.9989,	Adjusted R-squared:  0.9988 
F-statistic:  5699 on 2 and 12 DF,  p-value: < 2.2e-16
#compute residuals
y.res=residuals(lm.reg)
#compute standardized residuals
y.rst=rstandard(lm.reg)
y.fit=predict(lm.reg)
op=par(mfrow = c(1,2))
plot(y.res~y.fit)
plot(y.rst~y.fit)
par(op)



5.3.2 Diagnosing Heteroscedasticity

 x<-c(294, 247, 267, 358, 423, 311, 450, 534, 438, 697,688, 630, 709, 627, 
 615, 999, 1022, 1015, 700, 850, 980, 1025, 1021, 1200, 1250, 1500, 1650)
 y<-c(50, 40, 45, 55, 70, 65, 55, 62, 68, 78,80, 84, 88, 97, 100, 109, 114, 
117, 106, 128,130, 160, 97, 180, 112, 210, 135)
 persons<-data.frame(x,y)
lm.reg=lm(y~x+1,data = persons)
summary(lm.reg)

Call:
lm(formula = y ~ x + 1, data = persons)

Residuals:
    Min      1Q  Median      3Q     Max 
-47.645 -11.136  -4.278  11.683  41.677 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 25.09434    9.27542   2.705   0.0121 *  
x            0.09549    0.01099   8.691 5.02e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 21.08 on 25 degrees of freedom
Multiple R-squared:  0.7513,	Adjusted R-squared:  0.7414 
F-statistic: 75.54 on 1 and 25 DF,  p-value: 5.018e-09
rst=rstandard(lm.reg)
res=residuals(lm.reg)
lm.fit=predict(lm.reg)
#standardized residual plot
plot(rst~lm.fit)

#residual plot
plot(res~lm.fit)

#refit after a square-root transform of the response
lm.ng=update(lm.reg,sqrt(.)~.)
coef(lm.ng)
(Intercept)           x 
6.044644223 0.004780664 

5.3.3 Outlier Detection

  • compute the ordinary and standardized residuals (sketch below)
  • absolute value >= 2: suspicious point
  • absolute value >= 3: outlier
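A sketch applying these cutoffs to the persons model fitted in 5.3.2:

rst=rstandard(lm.reg)
which(abs(rst)>=2)#suspicious points
which(abs(rst)>=3)#outliers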

5.3.4 Collinearity Diagnostics

  • linear relationships among the predictors inflate the error of the coefficient estimates

Methods

  • eigenvalue method
  • condition-number method
  • variance inflation factor (VIF)

eigen(x,symmetric,only.values=FALSE,EISPACK=FALSE): x is the matrix; symmetric declares the matrix symmetric; only.values=TRUE returns the eigenvalues only

kappa(x,exact=FALSE): exact=TRUE computes the exact condition number

vif(lmobj,digits=5): lmobj is an lm object, digits the number of decimal places

library(DAAG)
x1<-c(7, 1, 11, 11, 7, 11, 3, 1, 2, 21, 1, 11, 10)
x2<-c(26, 29, 56, 31, 52, 55, 71, 31, 54, 47, 40, 66, 68) 
x3<-c(6, 15, 8, 8, 6, 9, 17, 22, 18, 4, 23, 9, 8)
x4<-c(60, 52, 20, 57, 33, 22, 6, 44, 22, 18, 34, 12, 12) 
y<-c(78.5, 74.3, 104.3, 87.6, 95.9, 109.2, 102.7, 72.5, 93.1, 115.9, 83.8, 113.3, 
109.4)
df=data.frame(x1,x2,x3,x4,y)
lm.reg=lm(y~x1+x2+x3+x4,data=df)
summary(lm.reg)

Call:
lm(formula = y ~ x1 + x2 + x3 + x4, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.2777 -1.3956 -0.2374  1.1650  4.0379 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  64.8044    22.8867   2.832  0.02210 * 
x1            1.4805     0.3598   4.115  0.00337 **
x2            0.4918     0.2285   2.153  0.06351 . 
x3            0.0510     0.3299   0.155  0.88097   
x4           -0.1563     0.2120  -0.737  0.48205   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.373 on 8 degrees of freedom
Multiple R-squared:  0.9834,	Adjusted R-squared:  0.9751 
F-statistic: 118.6 on 4 and 8 DF,  p-value: 3.736e-07
#inspect the variance inflation factors
vif(lm.reg,digits = 3)
   x1    x2    x3    x4 
 9.54 26.90  9.51 31.40 

Conclusion: x2 and x4 have VIFs above 10, so they are likely collinear (see also the condition-number sketch below).

#check the correlation between the two
cor(x2,x4)

-0.947969951546351
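The eigenvalue and condition-number methods listed above can be run on the design matrix; a sketch of my own on the same fit:

XX=model.matrix(lm.reg)                      #design matrix, including the intercept
kappa(XX,exact=TRUE)                         #condition number; values over ~1000 are a common red flag
eigen(crossprod(XX),only.values=TRUE)$values #near-zero eigenvalues also signal collinearity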

5.4 Generalized Linear Models

  • logistic regression: y is categorical
  • Poisson regression: y is a count

glm(formula,family=family(link=function),data=)

(figure omitted: table of glm families and their link functions)

  • binomial: the family used for logistic regression below

5.4.1 Logistic Regression

x1=rep(c(1,0,1,0,1),c(5,10,10,10,10))
x2<-c(17, 44, 48, 55, 75, 35, 42, 57, 28, 20, 38, 45, 47, 52, 55, 68, 18, 68, 48, 
17, 70, 72, 35, 19, 62, 39, 40, 55, 68, 25, 17, 45, 44, 67, 55, 61, 19, 69, 23, 19,
72, 74, 31, 16, 61)
x3<-c(1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 
1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1) 
y<-c(1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 
0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0)
accident=data.frame(x1,x2,x3,y)
#fit the generalized linear model
log.glm=glm(y~x1+x2+x3,family =binomial,data = accident)
summary(log.glm)

Call:
glm(formula = y ~ x1 + x2 + x3, family = binomial, data = accident)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5636  -0.9131  -0.7892   0.9637   1.6000  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)  
(Intercept)  0.597610   0.894831   0.668   0.5042  
x1          -1.496084   0.704861  -2.123   0.0338 *
x2          -0.001595   0.016758  -0.095   0.9242  
x3           0.315865   0.701093   0.451   0.6523  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 62.183  on 44  degrees of freedom
Residual deviance: 57.026  on 41  degrees of freedom
AIC: 65.026

Number of Fisher Scoring iterations: 4
lm.step=step(log.glm)
summary(lm.step)
Start:  AIC=65.03
y ~ x1 + x2 + x3

       Df Deviance    AIC
- x2    1   57.035 63.035
- x3    1   57.232 63.232
      57.026 65.026
- x1    1   61.936 67.936

Step:  AIC=63.03
y ~ x1 + x3

       Df Deviance    AIC
- x3    1   57.241 61.241
      57.035 63.035
- x1    1   61.991 65.991

Step:  AIC=61.24
y ~ x1

       Df Deviance    AIC
      57.241 61.241
- x1    1   62.183 64.183




Call:
glm(formula = y ~ x1, family = binomial, data = accident)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4490  -0.8782  -0.8782   0.9282   1.5096  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)  
(Intercept)   0.6190     0.4688   1.320   0.1867  
x1           -1.3728     0.6353  -2.161   0.0307 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 62.183  on 44  degrees of freedom
Residual deviance: 57.241  on 43  degrees of freedom
AIC: 61.241

Number of Fisher Scoring iterations: 4
log.pre=predict(lm.step,data.frame(x1=1))
p1=exp(log.pre)/(1+exp(log.pre))
p1

1: 0.320000000000098

log.pre=predict(lm.step,data.frame(x1=0))
p2=exp(log.pre)/(1+exp(log.pre))
p2

1: 0.649999999999986
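The two manual exp/(1+exp) conversions above are just the inverse logit; predict() does this directly with type="response":

predict(lm.step,data.frame(x1=c(1,0)),type="response")
#0.32 0.65 — the same probabilities as p1 and p2 above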

data("Affairs",package="AER")
summary(Affairs)

    affairs          gender         age         yearsmarried    children 
 Min.   : 0.000   female:315   Min.   :17.50   Min.   : 0.125   no :171  
 1st Qu.: 0.000   male  :286   1st Qu.:27.00   1st Qu.: 4.000   yes:430  
 Median : 0.000                Median :32.00   Median : 7.000            
 Mean   : 1.456                Mean   :32.49   Mean   : 8.178            
 3rd Qu.: 0.000                3rd Qu.:37.00   3rd Qu.:15.000            
 Max.   :12.000                Max.   :57.00   Max.   :15.000            
 religiousness     education       occupation        rating     
 Min.   :1.000   Min.   : 9.00   Min.   :1.000   Min.   :1.000  
 1st Qu.:2.000   1st Qu.:14.00   1st Qu.:3.000   1st Qu.:3.000  
 Median :3.000   Median :16.00   Median :5.000   Median :4.000  
 Mean   :3.116   Mean   :16.17   Mean   :4.195   Mean   :3.932  
 3rd Qu.:4.000   3rd Qu.:18.00   3rd Qu.:6.000   3rd Qu.:5.000  
 Max.   :5.000   Max.   :20.00   Max.   :7.000   Max.   :5.000  
table(Affairs$affairs)

  0   1   2   3   7  12 
451  34  17  19  42  38 
#recode the affairs count as a binary variable
Affairs$ynaffairs[Affairs$affairs>0]=1;
Affairs$ynaffairs[Affairs$affairs==0]=0;
Affairs$ynaffairs=factor(Affairs$ynaffairs,levels = c(0,1),labels = c("No","Yes"))
table(Affairs$ynaffairs)

 No Yes 
451 150 
aff_glm=glm(ynaffairs~gender+age+yearsmarried+children+religiousness+education+occupation+rating,family = binomial, data = Affairs)
summary(aff_glm)

Call:
glm(formula = ynaffairs ~ gender + age + yearsmarried + children + 
    religiousness + education + occupation + rating, family = binomial, 
    data = Affairs)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5713  -0.7499  -0.5690  -0.2539   2.5191  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)    1.37726    0.88776   1.551 0.120807    
gendermale     0.28029    0.23909   1.172 0.241083    
age           -0.04426    0.01825  -2.425 0.015301 *  
yearsmarried   0.09477    0.03221   2.942 0.003262 ** 
childrenyes    0.39767    0.29151   1.364 0.172508    
religiousness -0.32472    0.08975  -3.618 0.000297 ***
education      0.02105    0.05051   0.417 0.676851    
occupation     0.03092    0.07178   0.431 0.666630    
rating        -0.46845    0.09091  -5.153 2.56e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 675.38  on 600  degrees of freedom
Residual deviance: 609.51  on 592  degrees of freedom
AIC: 627.51

Number of Fisher Scoring iterations: 4
#stepwise selection on the logistic model
aff.step=step(aff_glm)
summary(aff.step)
Start:  AIC=627.51
ynaffairs ~ gender + age + yearsmarried + children + religiousness + 
    education + occupation + rating

                Df Deviance    AIC
- education      1   609.68 625.68
- occupation     1   609.70 625.70
- gender         1   610.89 626.89
- children       1   611.40 627.40
               609.51 627.51
- age            1   615.67 631.67
- yearsmarried   1   618.34 634.34
- religiousness  1   622.92 638.92
- rating         1   636.75 652.75

Step:  AIC=625.68
ynaffairs ~ gender + age + yearsmarried + children + religiousness + 
    occupation + rating

                Df Deviance    AIC
- occupation     1   610.15 624.15
- gender         1   611.29 625.29
- children       1   611.62 625.62
               609.68 625.68
- age            1   615.78 629.78
- yearsmarried   1   618.46 632.46
- religiousness  1   623.27 637.27
- rating         1   636.93 650.93

Step:  AIC=624.15
ynaffairs ~ gender + age + yearsmarried + children + religiousness + 
    rating

                Df Deviance    AIC
- children       1   611.86 623.86
               610.15 624.15
- gender         1   613.41 625.41
- age            1   616.00 628.00
- yearsmarried   1   619.07 631.07
- religiousness  1   623.98 635.98
- rating         1   637.23 649.23

Step:  AIC=623.86
ynaffairs ~ gender + age + yearsmarried + religiousness + rating

                Df Deviance    AIC
               611.86 623.86
- gender         1   615.36 625.36
- age            1   618.05 628.05
- religiousness  1   625.57 635.57
- yearsmarried   1   626.23 636.23
- rating         1   639.93 649.93




Call:
glm(formula = ynaffairs ~ gender + age + yearsmarried + religiousness + 
    rating, family = binomial, data = Affairs)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5623  -0.7495  -0.5664  -0.2671   2.3975  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)    1.94760    0.61234   3.181 0.001470 ** 
gendermale     0.38612    0.20703   1.865 0.062171 .  
age           -0.04393    0.01806  -2.432 0.015011 *  
yearsmarried   0.11133    0.02983   3.732 0.000190 ***
religiousness -0.32714    0.08947  -3.656 0.000256 ***
rating        -0.46721    0.08928  -5.233 1.67e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 675.38  on 600  degrees of freedom
Residual deviance: 611.86  on 595  degrees of freedom
AIC: 623.86

Number of Fisher Scoring iterations: 4

Lecture 6

  • cluster analysis
  • discriminant analysis
  • decision trees (optional)

6.1 Cluster Analysis

  • computing distances

dist(x, method = "euclidean", diag = FALSE, upper = FALSE, p = 2)

  • euclidean (Euclidean distance), manhattan (Manhattan distance)
  • clustering models

6.1.1 Distance Computation

x=c(1,2,4.5,6,8)
dim(x)=c(5,1)

x
     [,1]
[1,]  1.0
[2,]  2.0
[3,]  4.5
[4,]  6.0
[5,]  8.0
d=dist(x)
d
    1   2   3   4
2 1.0            
3 3.5 2.5        
4 5.0 4.0 1.5    
5 7.0 6.0 3.5 2.0
dist(x,upper = TRUE)
    1   2   3   4   5
1     1.0 3.5 5.0 7.0
2 1.0     2.5 4.0 6.0
3 3.5 2.5     1.5 3.5
4 5.0 4.0 1.5     2.0
5 7.0 6.0 3.5 2.0    
dist(x,upper = TRUE,diag = TRUE)
    1   2   3   4   5
1 0.0 1.0 3.5 5.0 7.0
2 1.0 0.0 2.5 4.0 6.0
3 3.5 2.5 0.0 1.5 3.5
4 5.0 4.0 1.5 0.0 2.0
5 7.0 6.0 3.5 2.0 0.0
dist(x,method = "manhattan",upper = TRUE,diag = TRUE)
    1   2   3   4   5
1 0.0 1.0 3.5 5.0 7.0
2 1.0 0.0 2.5 4.0 6.0
3 3.5 2.5 0.0 1.5 3.5
4 5.0 4.0 1.5 0.0 2.0
5 7.0 6.0 3.5 2.0 0.0

6.1.2 Hierarchical Clustering

hclust(d, method = "complete", members=NULL)

method

  • complete: maximum (complete-linkage) distance

  • single: minimum (single-linkage) distance

  • median: median distance

  • ward.D: Ward's minimum sum of squared deviations

  • choosing the number of clusters

rect.hclust(tree, k = NULL, which = NULL, x = NULL, h = NULL, border = 2, cluster = NULL)

tree is the structure produced by hclust; k is the number of clusters; h is a height threshold in the dendrogram, requiring the distance between clusters to exceed h; border is a number or vector giving the color of the rectangles.

x<-c(1, 2, 4.5, 6, 8)
dim(x)=c(5,1)
d=dist(x)
d
    1   2   3   4
2 1.0            
3 3.5 2.5        
4 5.0 4.0 1.5    
5 7.0 6.0 3.5 2.0
#complete-linkage model
model_com=hclust(d,method = "complete",members = NULL)
summary(model_com)
            Length Class  Mode     
merge       8      -none- numeric  
height      4      -none- numeric  
order       5      -none- numeric  
labels      0      -none- NULL     
method      1      -none- character
call        4      -none- call     
dist.method 1      -none- character
#single linkage
mod_sing=hclust(d,method = "single",members = NULL)
#median distance
mod_med=hclust(d,method = "median")
mod_ward=hclust(d,"ward.D")
opar=par(mfrow = c(2,2))
plot(model_com,hang =-1)#a negative hang draws the leaves down to the baseline
plot(mod_med,hang=-1)
plot(mod_sing,hang=-1)
plot(mod_ward,hang=-1)
par(opar)



X<-data.frame(x1=c(2959.19, 2459.77, 1495.63, 1046.33, 1303.97, 1730.84, 
1561.86, 1410.11, 3712.31, 2207.58, 2629.16, 1844.78, 2709.46, 1563.78, 
1675.75, 1427.65, 1783.43, 1942.23, 3055.17, 2033.87, 2057.86, 2303.29, 
1974.28, 1673.82, 2194.25, 2646.61, 1472.95, 1525.57, 1654.69, 1375.46, 
1608.82),
x2=c(730.79, 495.47, 515.90, 477.77, 524.29, 553.90, 492.42, 510.71, 
550.74, 449.37, 557.32, 430.29, 428.11, 303.65, 613.32, 431.79, 511.88, 
512.27, 353.23, 300.82, 186.44, 589.99, 507.76, 437.75, 537.01, 839.70, 
390.89, 472.98, 437.77, 480.99, 536.05),
x3=c(749.41, 697.33, 362.37, 290.15, 254.83, 246.91, 200.49, 211.88, 
893.37, 572.40, 689.73, 271.28, 334.12, 233.81, 550.71, 288.55, 282.84, 
401.39, 564.56, 338.65, 202.72, 516.21, 344.79, 461.61, 369.07, 204.44, 
447.95, 328.90, 258.78, 273.84, 432.46),
x4=c(513.34, 302.87, 285.32, 208.57, 192.17, 279.81, 218.36, 277.11, 
346.93, 211.92, 435.69, 126.33, 160.77, 107.90, 219.79, 208.14, 201.01, 
206.06, 356.27, 157.78, 171.79, 236.55, 203.21, 153.32, 249.54, 209.11, 
259.51, 219.86, 303.00, 317.32, 235.82),
x5=c(467.87, 284.19, 272.95, 201.50, 249.81, 239.18, 220.69, 224.65, 
527.00, 302.09, 514.66, 250.56, 405.14, 209.70, 272.59, 217.00, 237.60, 
321.29, 811.88, 329.06, 329.65, 403.92, 240.24, 254.66, 290.84, 379.30, 
230.61, 206.65, 244.93, 251.08, 250.28),
x6=c(1141.82, 735.97, 540.58, 414.72, 463.09, 445.20, 459.62, 376.82, 
1034.98, 585.23, 795.87, 513.18, 461.67, 393.99, 599.43, 337.76, 617.74, 
697.22, 873.06, 621.74, 477.17, 730.05, 575.10, 445.59, 561.91, 371.04, 
490.90, 449.69, 479.53, 424.75, 541.30),
x7=c(478.42, 570.84, 364.91, 281.84, 287.87, 330.24, 360.48,317.61, 
720.33, 429.77, 575.76, 314.00, 535.13, 509.39, 371.62, 421.31, 523.52, 
492.60, 1082.82, 587.02, 312.93, 438.41, 430.36, 346.11, 407.70, 269.59, 
469.10, 249.66, 288.56, 228.73, 344.85),
x8=c(457.64, 305.08, 188.63, 212.10, 192.96, 163.86, 147.76, 152.85, 
462.03, 252.54, 323.36, 151.39, 232.29, 160.12, 211.84, 165.32, 182.52, 
226.45, 420.81, 218.27, 279.19, 225.80, 223.46, 191.48, 330.95, 389.33, 
191.34, 228.19, 236.51, 195.93, 214.40),
row.names=c("北京","天津","河北","山西","内蒙古","辽宁","吉林","黑龙江","上海","江苏","浙江","安徽","福建","江西","山东","河南","湖北","湖南","广东","广西","海南","重庆","四川","贵州","云南","西藏","陕西","甘肃","青海","宁夏","新疆"))

#distances on the standardized data
d=dist(scale(X))
hc1=hclust(d,"single")
hc2=hclust(d,"complete")
hc3=hclust(d,"median")
hc4=hclust(d,"ward.D")
opar=par(mfrow = c(2,2))
plot(hc1,hang=-1)
plot(hc2,hang=-1)
plot(hc3,hang=-1)
plot(hc4,hang=-1)
par(opar)



plot(hc2,hang=-1)



df_4.1=read.csv("eg4.1.csv",header = TRUE)
df_4.1
x1 x2 x3 x4 x5 x6
4.65 4.22 5.01 4.50 4.15 4.12
6.32 6.11 6.21 6.85 6.52 6.33
4.87 4.60 4.95 4.15 4.02 4.11
4.88 4.68 4.43 4.12 4.03 4.14
6.73 6.65 6.72 6.13 6.51 6.36
7.45 7.56 7.60 7.80 7.20 7.18
8.10 8.23 8.01 7.95 8.31 8.26
8.42 8.54 8.12 7.88 8.26 7.98
6.45 6.81 6.52 6.31 6.27 6.06
7.50 7.32 7.42 7.52 7.10 6.95
d=dist(df_4.1)
hc1=hclust(d,"single")
hc2=hclust(d,"complete")
plot(hc1,hang=-1)
rect.hclust(hc1,k=3,border = "red")#three-cluster solution boxed in red
rect.hclust(hc1,k=2,border = "blue")#two-cluster solution boxed in blue



plot(hc2,hang=-1)
rect.hclust(hc2,k=3,border = "red")
rect.hclust(hc2,k=2,border = "blue")



cutree(hc1,k=3)#print the cluster assignments

 [1] 1 2 1 1 2 3 3 3 2 3
cutree(hc1,k=2)
 [1] 1 2 1 1 2 2 2 2 2 2

6.1.3 k-means Clustering

kmeans(x, centers, iter.max = 10, nstart = 1,
algorithm = c("Hartigan-Wong", "Lloyd", "Forgy",
"MacQueen"), trace=FALSE)

x is a matrix or data frame of the data; centers is the number of clusters or a set of initial cluster centers; iter.max is the maximum number of iterations; nstart is the number of random starting sets (when centers is a count); algorithm selects the clustering algorithm, defaulting to Hartigan-Wong.

#cluster the 31 provinces/municipalities
km <- kmeans(scale(X), 5, nstart = 20)
km
K-means clustering with 5 clusters of sizes 16, 1, 10, 1, 3

Cluster means:
          x1          x2         x3         x4          x5         x6
1 -0.7008593 -0.33291790 -0.5450901 -0.2500165 -0.54749319 -0.6131804
2  1.1255255  2.91079330 -1.0645632 -0.4082114  0.53291392 -1.0476079
3  0.2646918  0.04585518  0.2487958 -0.3405821 -0.01812541  0.2587437
4  1.8042004 -1.12776493  0.9368961  1.2959544  3.90904835  1.6014419
5  1.8790347  1.02836873  2.1203833  2.1727806  1.49972764  2.2232050
          x7          x8
1 -0.5420723 -0.57966702
2 -0.9562089  1.66126641
3  0.2874133 -0.02413414
4  3.8803141  2.01876530
5  0.9583064  1.94532737

Clustering vector:
  北京   天津   河北   山西 内蒙古   辽宁   吉林 黑龙江   上海   江苏   浙江 
     5      3      1      1      1      1      1      1      5      3      5 
  安徽   福建   江西   山东   河南   湖北   湖南   广东   广西   海南   重庆 
     1      3      1      3      1      3      3      4      3      1      3 
  四川   贵州   云南   西藏   陕西   甘肃   青海   宁夏   新疆 
     3      1      3      2      1      1      1      1      1 

Within cluster sum of squares by cluster:
[1] 30.14432  0.00000 22.12662  0.00000 10.19134
 (between_SS / total_SS =  74.0 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
sort(km$cluster)
  河北   山西 内蒙古   辽宁   吉林 黑龙江   安徽   江西   河南   海南   贵州 
     1      1      1      1      1      1      1      1      1      1      1 
  陕西   甘肃   青海   宁夏   新疆   西藏   天津   江苏   福建   山东   湖北 
     1      1      1      1      1      2      3      3      3      3      3 
  湖南   广西   重庆   四川   云南   广东   北京   上海   浙江 
     3      3      3      3      3      4      5      5      5 
#data preprocessing (if the row labels come out garbled, the csv is likely
#GBK-encoded — pass fileEncoding="GBK" to read.csv)
df4.2=read.csv("eg4.2.csv",header = T)
df4.2
X x1 x2 x3 x4 x5 x6 x7 x8
北京 7584.2 2425.7 10350.2 2098.3 4489.6 3634.6 2228.6 991.4
天津 7709.9 1949.4 5237.5 1514.0 3185.9 2096.0 1757.1 712.6
河北 3515.5 1055.3 2995.8 832.2 1807.6 1338.6 1192.0 293.6
山西 3089.4 1146.7 2297.4 672.5 1501.4 1628.0 1102.0 291.5
内蒙古 4919.8 1759.7 2918.9 1030.9 2569.0 2067.1 1384.3 528.9
辽宁 4858.0 1561.6 3471.6 1028.7 2282.0 1973.3 1522.4 502.2
吉林 3683.5 1254.9 2692.3 718.4 1810.0 1683.6 1527.5 393.7
黑龙江 3704.1 1288.3 2619.6 672.4 1675.1 1526.2 1577.0 339.8
上海 9271.5 1622.7 11307.5 1484.6 4206.5 3718.1 2268.3 904.3
江苏 5936.0 1415.1 4551.6 1238.2 2984.7 2423.8 1409.6 596.5
浙江 6975.8 1646.5 5964.2 1159.3 3961.3 2428.3 1433.0 548.4
安徽 4424.2 924.6 2630.0 698.8 1622.3 1339.3 932.3 268.8
福建 6440.0 1134.6 4638.2 1047.5 2305.3 1784.7 1028.6 471.3
江西 4181.7 929.1 2783.4 736.6 1444.4 1354.0 698.8 275.5
山东 4166.2 1276.6 2903.3 1038.1 2104.2 1557.3 1180.1 352.7
河南 3373.7 1141.9 2387.9 910.6 1355.4 1337.2 1023.1 305.4
湖北 4499.9 1073.1 3007.0 868.6 1722.5 1577.6 1252.5 315.3
湖南 4535.5 1028.0 2810.8 883.6 1624.6 2049.7 998.3 336.8
广东 7236.7 1103.4 4677.1 1245.3 3020.2 2117.3 976.1 599.8
广西 3960.8 503.1 2559.9 672.5 1445.7 1280.1 778.1 200.7
海南 5364.1 568.3 2628.0 697.9 1783.1 1278.0 987.0 268.7
重庆 5325.5 1334.7 2743.4 1064.3 1746.4 1513.4 1117.9 294.0
四川 5001.4 1071.3 2400.9 918.4 1629.2 1207.9 1071.2 331.8
贵州 3375.8 719.2 2185.7 636.2 1321.6 1401.2 604.6 169.5
云南 3587.7 625.7 2146.4 644.3 1632.6 1281.5 875.7 211.5
西藏 3919.8 764.3 1374.6 394.9 1025.9 314.1 229.2 223.1
陕西 3646.4 989.4 2786.1 887.3 1537.1 1608.4 1363.5 269.1
甘肃 3447.6 967.7 2120.8 708.8 1214.7 1315.9 949.5 225.7
青海 3958.2 1232.0 2352.8 793.4 2263.3 1383.4 1318.2 310.0
宁夏 3694.8 1237.9 2607.3 885.4 1806.1 1707.9 1482.9 393.4
新疆 4092.8 1274.5 2227.9 788.8 1842.4 1282.1 1078.3 280.6
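
The notes never show how the matrix X passed to kmeans above was built; a plausible construction (an assumption, not from the original) is:

# assumed preprocessing: numeric matrix with the province names as row names
X <- as.matrix(df4.2[, -1])
rownames(X) <- df4.2$X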
library(factoextra)
df=USArrests
df
Murder Assault UrbanPop Rape
Alabama 13.2 236 58 21.2
Alaska 10.0 263 48 44.5
Arizona 8.1 294 80 31.0
Arkansas 8.8 190 50 19.5
California 9.0 276 91 40.6
Colorado 7.9 204 78 38.7
Connecticut 3.3 110 77 11.1
Delaware 5.9 238 72 15.8
Florida 15.4 335 80 31.9
Georgia 17.4 211 60 25.8
Hawaii 5.3 46 83 20.2
Idaho 2.6 120 54 14.2
Illinois 10.4 249 83 24.0
Indiana 7.2 113 65 21.0
Iowa 2.2 56 57 11.3
Kansas 6.0 115 66 18.0
Kentucky 9.7 109 52 16.3
Louisiana 15.4 249 66 22.2
Maine 2.1 83 51 7.8
Maryland 11.3 300 67 27.8
Massachusetts 4.4 149 85 16.3
Michigan 12.1 255 74 35.1
Minnesota 2.7 72 66 14.9
Mississippi 16.1 259 44 17.1
Missouri 9.0 178 70 28.2
Montana 6.0 109 53 16.4
Nebraska 4.3 102 62 16.5
Nevada 12.2 252 81 46.0
New Hampshire 2.1 57 56 9.5
New Jersey 7.4 159 89 18.8
New Mexico 11.4 285 70 32.1
New York 11.1 254 86 26.1
North Carolina 13.0 337 45 16.1
North Dakota 0.8 45 44 7.3
Ohio 7.3 120 75 21.4
Oklahoma 6.6 151 68 20.0
Oregon 4.9 159 67 29.3
Pennsylvania 6.3 106 72 14.9
Rhode Island 3.4 174 87 8.3
South Carolina 14.4 279 48 22.5
South Dakota 3.8 86 45 12.8
Tennessee 13.2 188 59 26.9
Texas 12.7 201 80 25.5
Utah 3.2 120 80 22.9
Vermont 2.2 48 32 11.2
Virginia 8.5 156 63 20.7
Washington 4.0 145 73 26.2
West Virginia 5.7 81 39 9.3
Wisconsin 2.6 53 66 10.8
Wyoming 6.8 161 60 15.6
df=na.omit(df)# drop rows containing NA values
head(df)
Murder Assault UrbanPop Rape
Alabama 13.2 236 58 21.2
Alaska 10.0 263 48 44.5
Arizona 8.1 294 80 31.0
Arkansas 8.8 190 50 19.5
California 9.0 276 91 40.6
Colorado 7.9 204 78 38.7
df=scale(df)# standardize each column to mean 0, sd 1
head(df)
Murder Assault UrbanPop Rape
Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
Arizona 0.07163341 1.4788032 0.9989801 1.042878388
Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
California 0.27826823 1.2628144 1.7589234 2.067820292
Colorado 0.02571456 0.3988593 0.8608085 1.864967207
km1=kmeans(df,4,nstart = 20,algorithm = "Hartigan-Wong")
km1
K-means clustering with 4 clusters of sizes 13, 8, 16, 13

Cluster means:
      Murder    Assault   UrbanPop        Rape
1  0.6950701  1.0394414  0.7226370  1.27693964
2  1.4118898  0.8743346 -0.8145211  0.01927104
3 -0.4894375 -0.3826001  0.5758298 -0.26165379
4 -0.9615407 -1.1066010 -0.9301069 -0.96676331

Clustering vector:
       Alabama         Alaska        Arizona       Arkansas     California 
             2              1              1              2              1 
      Colorado    Connecticut       Delaware        Florida        Georgia 
             1              3              3              1              2 
        Hawaii          Idaho       Illinois        Indiana           Iowa 
             3              4              1              3              4 
        Kansas       Kentucky      Louisiana          Maine       Maryland 
             3              4              2              4              1 
 Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
             3              1              4              2              1 
       Montana       Nebraska         Nevada  New Hampshire     New Jersey 
             4              4              1              4              3 
    New Mexico       New York North Carolina   North Dakota           Ohio 
             1              1              2              4              3 
      Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
             3              3              3              3              2 
  South Dakota      Tennessee          Texas           Utah        Vermont 
             4              2              1              3              4 
      Virginia     Washington  West Virginia      Wisconsin        Wyoming 
             3              3              4              4              3 

Within cluster sum of squares by cluster:
[1] 19.922437  8.316061 16.212213 11.952463
 (between_SS / total_SS =  71.2 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
sort(km1$cluster)
        Alaska        Arizona     California       Colorado        Florida 
             1              1              1              1              1 
      Illinois       Maryland       Michigan       Missouri         Nevada 
             1              1              1              1              1 
    New Mexico       New York          Texas        Alabama       Arkansas 
             1              1              1              2              2 
       Georgia      Louisiana    Mississippi North Carolina South Carolina 
             2              2              2              2              2 
     Tennessee    Connecticut       Delaware         Hawaii        Indiana 
             2              3              3              3              3 
        Kansas  Massachusetts     New Jersey           Ohio       Oklahoma 
             3              3              3              3              3 
        Oregon   Pennsylvania   Rhode Island           Utah       Virginia 
             3              3              3              3              3 
    Washington        Wyoming          Idaho           Iowa       Kentucky 
             3              3              4              4              4 
         Maine      Minnesota        Montana       Nebraska  New Hampshire 
             4              4              4              4              4 
  North Dakota   South Dakota        Vermont  West Virginia      Wisconsin 
             4              4              4              4              4 
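Since factoextra is already loaded, the partition can also be visualized; fviz_cluster projects the clusters onto the first two principal components:

# sketch: plot the 4-cluster partition of the scaled USArrests data
fviz_cluster(km1, data = df)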

6.2 Discriminant Analysis

  • Fisher discriminant

library(MASS)

lda(formula, data, … , subset, na.action)

The formula takes the form groups ~ x1 + x2 + ..., where groups identifies the source population and x1, x2, ... are the classification variables; subset selects the training sample (see the sketch after this paragraph).
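
As a hedged illustration of subset, not an example from the course notes, one can train on a random half of iris and measure hold-out accuracy:

library(MASS)
set.seed(1)
train <- sample(seq_len(nrow(iris)), 75)               # indices of the training sample
fit <- lda(Species ~ ., data = iris, subset = train)   # fit on the training rows only
mean(predict(fit, iris[-train, ])$class == iris$Species[-train])   # hold-out accuracy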

6.2.1 Fisher Discriminant (Linear Discriminant)

library(MASS)
data(iris)# load the iris data
attach(iris)
iris
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
5.1 3.5 1.4 0.2 setosa
4.9 3.0 1.4 0.2 setosa
4.7 3.2 1.3 0.2 setosa
4.6 3.1 1.5 0.2 setosa
5.0 3.6 1.4 0.2 setosa
5.4 3.9 1.7 0.4 setosa
4.6 3.4 1.4 0.3 setosa
5.0 3.4 1.5 0.2 setosa
4.4 2.9 1.4 0.2 setosa
4.9 3.1 1.5 0.1 setosa
5.4 3.7 1.5 0.2 setosa
4.8 3.4 1.6 0.2 setosa
4.8 3.0 1.4 0.1 setosa
4.3 3.0 1.1 0.1 setosa
5.8 4.0 1.2 0.2 setosa
5.7 4.4 1.5 0.4 setosa
5.4 3.9 1.3 0.4 setosa
5.1 3.5 1.4 0.3 setosa
5.7 3.8 1.7 0.3 setosa
5.1 3.8 1.5 0.3 setosa
5.4 3.4 1.7 0.2 setosa
5.1 3.7 1.5 0.4 setosa
4.6 3.6 1.0 0.2 setosa
5.1 3.3 1.7 0.5 setosa
4.8 3.4 1.9 0.2 setosa
5.0 3.0 1.6 0.2 setosa
5.0 3.4 1.6 0.4 setosa
5.2 3.5 1.5 0.2 setosa
5.2 3.4 1.4 0.2 setosa
4.7 3.2 1.6 0.2 setosa
... ... ... ... ...
6.9 3.2 5.7 2.3 virginica
5.6 2.8 4.9 2.0 virginica
7.7 2.8 6.7 2.0 virginica
6.3 2.7 4.9 1.8 virginica
6.7 3.3 5.7 2.1 virginica
7.2 3.2 6.0 1.8 virginica
6.2 2.8 4.8 1.8 virginica
6.1 3.0 4.9 1.8 virginica
6.4 2.8 5.6 2.1 virginica
7.2 3.0 5.8 1.6 virginica
7.4 2.8 6.1 1.9 virginica
7.9 3.8 6.4 2.0 virginica
6.4 2.8 5.6 2.2 virginica
6.3 2.8 5.1 1.5 virginica
6.1 2.6 5.6 1.4 virginica
7.7 3.0 6.1 2.3 virginica
6.3 3.4 5.6 2.4 virginica
6.4 3.1 5.5 1.8 virginica
6.0 3.0 4.8 1.8 virginica
6.9 3.1 5.4 2.1 virginica
6.7 3.1 5.6 2.4 virginica
6.9 3.1 5.1 2.3 virginica
5.8 2.7 5.1 1.9 virginica
6.8 3.2 5.9 2.3 virginica
6.7 3.3 5.7 2.5 virginica
6.7 3.0 5.2 2.3 virginica
6.3 2.5 5.0 1.9 virginica
6.5 3.0 5.2 2.0 virginica
6.2 3.4 5.4 2.3 virginica
5.9 3.0 5.1 1.8 virginica
iris_lda=lda(Species~Sepal.Length+Sepal.Width+Petal.Length+Petal.Width)
iris_lda
Call:
lda(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width)

Prior probabilities of groups:
    setosa versicolor  virginica 
 0.3333333  0.3333333  0.3333333 

Group means:
           Sepal.Length Sepal.Width Petal.Length Petal.Width
setosa            5.006       3.428        1.462       0.246
versicolor        5.936       2.770        4.260       1.326
virginica         6.588       2.974        5.552       2.026

Coefficients of linear discriminants:
                    LD1         LD2
Sepal.Length  0.8293776  0.02410215
Sepal.Width   1.5344731  2.16452123
Petal.Length -2.2012117 -0.93192121
Petal.Width  -2.8104603  2.83918785

Proportion of trace:
   LD1    LD2 
0.9912 0.0088 
  • Group means: the mean vector of each group
  • Coefficients of linear discriminants: the coefficients of the linear discriminant functions
  • Proportion of trace: the share of between-group separation contributed by the i-th discriminant function
# use the fitted model for prediction
iris.pre=predict(iris_lda)$class
table(iris.pre,Species)# table() builds a contingency table: the first argument gives rows, the second columns
            Species
iris.pre     setosa versicolor virginica
  setosa         50          0         0
  versicolor      0         48         1
  virginica       0          2        49
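Reading off the diagonal, 147 of the 150 flowers are classified correctly; as a one-line check:

sum(diag(table(iris.pre, Species))) / length(Species)   # = 147/150 = 0.98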
# two-group example: classifying days as rainy or sunny
x1<-c( -1.9, -6.9,5.2 , 5.0 , 7.3, 6.8 , 0.9, -12.5, 1.5, 3.8, 0.2, -0.1,
0.4 , 2.7, 2.1 , -4.6, -1.7 , -2.6 , 2.6 , -2.8)
x2 <- c( 3.2 , 0.4 , 2.0 , 2.5 , 0.0, 12.7, -5.4, -2.5 , 1.3 , 6.8 , 6.2, 7.5, 14.6 ,
8.3, 0.8 , 4.3, 10.9,13.1, 12.8, 10.0)
G<-c(1, 1 ,1, 1 ,1 ,1, 1, 1, 1, 1 ,2 ,2, 2 ,2, 2 ,2, 2, 2, 2, 2)
d6.1<-data.frame(G,x1,x2)

d6.lda=lda(G~x1+x2,data = d6.1)
d6.lda
Call:
lda(G ~ x1 + x2, data = d6.1)

Prior probabilities of groups:
  1   2 
0.5 0.5 

Group means:
     x1   x2
1  0.92 2.10
2 -0.38 8.85

Coefficients of linear discriminants:
          LD1
x1 -0.1035305
x2  0.2247957
# predicted classes
d6.pre=predict(d6.lda)$class
cbind(d6.1$G,d6.pre)

   G d6.pre
1 1
1 1
1 1
1 1
1 1
1 2
1 1
1 1
1 1
1 1
2 2
2 2
2 2
2 2
2 1
2 2
2 2
2 2
2 2
2 2
tab=table(G,d6.pre)
# correct-classification rate: prop.table() turns the counts into proportions,
# and the diagonal holds the correctly classified share
sum(diag(prop.table(tab)))

0.9

data("iris")
library(MASS)
attach(iris)
iris.lda <- lda(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width)
newdata=data.frame( Sepal.Length=c(5.1,5.9,6.6), Sepal.Width =c(3.5,2.8,2.9), 
Petal.Length=c(1.5,4.3,5.6), Petal.Width =c(0.25,1.3,2.1))

predict(iris.lda,newdata)
$class
[1] setosa     versicolor virginica 
Levels: setosa versicolor virginica
$posterior
        setosa   versicolor    virginica
1 1.000000e+00 1.117074e-20 3.314219e-40
2 2.963609e-20 9.998273e-01 1.727265e-04
3 4.169387e-42 3.505610e-05 9.999649e-01

$x
        LD1        LD2
1  7.701156  0.3491879
2 -1.823849 -0.7749274
3 -6.199781  0.5182489

6.2.2 Distance Discriminant

  • Assigns each sample to the class whose center it is closest to

mahalanobis(x, center, cov, inverted = FALSE): computes the squared Mahalanobis distance

  • x: the data object (vector or matrix) whose distances are to be computed

  • center: the mean of the distribution, i.e. the population mean vector

  • cov: the covariance of the distribution, i.e. the population covariance, usually estimated by the sample covariance

  • inverted: if TRUE, cov is interpreted as the inverse of the covariance matrix

The overall idea: define a function that compares a point's squared distances to the two class centers.

compare=function(x,mu1,mu2,S){ # data point, center 1, center 2, common covariance matrix
    mahalanobis(x,mu1,S)-mahalanobis(x,mu2,S)
}
x=c(7.2,5.6)
mu1=c(6,5)
mu2=c(8,6)
S=matrix(c(9,2,2,4),nrow = 2)
S
9 2
2 4
compare(x,mu1,mu2,S)# positive, so x is closer to mu2 and is classified as male

0.10625
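
The same rule extends beyond two groups by assigning x to the nearest center; a small sketch reusing the objects above:

# nearest-center assignment among several groups
centers <- rbind(mu1, mu2)
which.min(apply(centers, 1, function(m) mahalanobis(x, m, S)))   # returns 2, i.e. mu2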

6.2.3 Quadratic Discriminant

qda(formula, data, …, subset, na.action)

Here formula is the model formula and data is the data frame containing the variables.

# typo: a "." was typed instead of ", data =", so R looks for a variable named x2.d6.1
qd=qda(G~x1+x2.d6.1)
Error in eval(predvars, data, env): object 'x2.d6.1' not found
# corrected call:
qd=qda(G~x1+x2,data = d6.1)
Q<-c(8.3, 9.5, 8.0,7.4,8.8 ,9.0,7.0,9.2,8.0,7.6,7.2,6.4,7.3,6.0,6.4,6.8,5.2,5.8,
5.5, 6.0)
C=c(4.0 ,7.0 ,5.0,7.0,6.5,7.5,6.0,8.0,7.0,9.0,8.5,7.0,5.0,2.0,4.0,5.0,3.0,3.5,
4.0,4.5)
P<-c( 29,68,39,50,55,58,75,82,67,90,86,53,48,20,39,48,29,32,34,36)
G3<-c(1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3)
d6.3<-data.frame(G3,Q,C,P)
d6.3
G3 Q C P
1 8.3 4.0 29
1 9.5 7.0 68
1 8.0 5.0 39
1 7.4 7.0 50
1 8.8 6.5 55
2 9.0 7.5 58
2 7.0 6.0 75
2 9.2 8.0 82
2 8.0 7.0 67
2 7.6 9.0 90
2 7.2 8.5 86
2 6.4 7.0 53
2 7.3 5.0 48
3 6.0 2.0 20
3 6.4 4.0 39
3 6.8 5.0 48
3 5.2 3.0 29
3 5.8 3.5 32
3 5.5 4.0 34
3 6.0 4.5 36
library(MASS)
mod.lda=lda(G3~Q+C+P,data = d6.3)
mod.lda
Call:
lda(G3 ~ Q + C + P, data = d6.3)

Prior probabilities of groups:
   1    2    3 
0.25 0.40 0.35 

Group means:
         Q        C      P
1 8.400000 5.900000 48.200
2 7.712500 7.250000 69.875
3 5.957143 3.714286 34.000

Coefficients of linear discriminants:
          LD1         LD2
Q -0.81173396  0.88406311
C -0.63090549  0.20134565
P  0.01579385 -0.08775636

Proportion of trace:
   LD1    LD2 
0.7403 0.2597 
# prediction with the linear discriminant
pre.lda=predict(mod.lda,data.frame(Q=8.0,C=7.5,P=65))
pre.lda
$class
[1] 2
Levels: 1 2 3

$posterior
          1        2           3
1 0.2114514 0.786773 0.001775594

$x
        LD1        LD2
1 -1.537069 -0.1367865

# G3 has 20 elements but the prediction has a single row, hence the recycling warning
cbind(G3,Wx=pre.lda$x,newG=pre.lda$class)
Warning message in cbind(G3, Wx = pre.lda$x, newG = pre.lda$class):
"number of rows of result is not a multiple of vector length (arg 1)"
  G3       LD1        LD2 newG
1  1 -1.537069 -0.1367865    2
mod.qda=qda(G3~Q+C+P,data = d6.3)
mod.qda
Call:
qda(G3 ~ Q + C + P, data = d6.3)

Prior probabilities of groups:
   1    2    3 
0.25 0.40 0.35 

Group means:
         Q        C      P
1 8.400000 5.900000 48.200
2 7.712500 7.250000 69.875
3 5.957143 3.714286 34.000
cbind(G3,newG=predict(mod.qda)$class)
G3 newG
1 1
1 1
1 1
1 1
1 1
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 3
3 3
3 3
3 3
3 3
3 3
3 3
3 3
predict(mod.qda,data.frame(Q=8,C=7.5,P=65))
$class
[1] 2
Levels: 1 2 3

$posterior
            1         2            3
1 0.008221165 0.9915392 0.0002396287

6.2.4 Bayes Discriminant

  • Assign each observation to the class with the largest posterior probability; with lda this amounts to specifying the prior probabilities of the groups

ld1=lda(G3~Q+C+P,prior=c(1,1,1)/3,data=d6.3)# equal prior for each class

library(MASS)
Q<-c(8.3, 9.5, 8.0,7.4,8.8 ,9.0,7.0,9.2,8.0,7.6,7.2,6.4,7.3,6.0,6.4,6.8,5.2,5.8,
5.5, 6.0)
C=c(4.0 ,7.0 ,5.0,7.0,6.5,7.5,6.0,8.0,7.0,9.0,8.5,7.0,5.0,2.0,4.0,5.0,3.0,3.5,
4.0,4.5)
P<-c( 29,68,39,50,55,58,75,82,67,90,86,53,48,20,39,48,29,32,34,36)
G3<-c(1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3)
d6.3<-data.frame(G3,Q,C,P)

ld=lda(G3~Q+C+P,prior = c(1,1,1)/3,data = d6.3)
ld
Call:
lda(G3 ~ Q + C + P, data = d6.3, prior = c(1, 1, 1)/3)

Prior probabilities of groups:
        1         2         3 
0.3333333 0.3333333 0.3333333 

Group means:
         Q        C      P
1 8.400000 5.900000 48.200
2 7.712500 7.250000 69.875
3 5.957143 3.714286 34.000

Coefficients of linear discriminants:
          LD1         LD2
Q -0.92307369  0.76708185
C -0.65222524  0.11482179
P  0.02743244 -0.08484154

Proportion of trace:
   LD1    LD2 
0.7259 0.2741 
ld2=lda(G3~Q+C+P,prior=c(5,8,7)/20,data=d6.3)# priors set to the sample class proportions 5/20, 8/20, 7/20
ld2
Call:
lda(G3 ~ Q + C + P, data = d6.3, prior = c(5, 8, 7)/20)

Prior probabilities of groups:
   1    2    3 
0.25 0.40 0.35 

Group means:
         Q        C      P
1 8.400000 5.900000 48.200
2 7.712500 7.250000 69.875
3 5.957143 3.714286 34.000

Coefficients of linear discriminants:
          LD1         LD2
Q -0.81173396  0.88406311
C -0.63090549  0.20134565
P  0.01579385 -0.08775636

Proportion of trace:
   LD1    LD2 
0.7403 0.2597 
z2=predict(ld2)
z2
$class
 [1] 1 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3 3 3 3
Levels: 1 2 3
$posterior
              1            2            3
1  9.747144e-01 0.0088198914 1.646573e-02
2  7.069622e-01 0.2929273720 1.104490e-04
3  9.066215e-01 0.0667161901 2.666229e-02
4  5.420713e-01 0.4473098858 1.061882e-02
5  8.565024e-01 0.1428619923 6.355979e-04
6  8.893310e-01 0.1106146104 5.440087e-05
7  2.123192e-03 0.8789838980 1.188929e-01
8  1.188275e-01 0.8811091490 6.337076e-05
9  1.241051e-01 0.8713952505 4.499663e-03
10 1.781107e-03 0.9980655088 1.533841e-04
11 1.374025e-03 0.9977941856 8.317893e-04
12 7.358709e-02 0.8254938102 1.009191e-01
13 2.163910e-01 0.3857093332 3.978997e-01
14 5.425064e-04 0.0002260742 9.992314e-01
15 8.646125e-03 0.0260022554 9.653516e-01
16 5.604906e-02 0.2740442884 6.699067e-01
17 5.674877e-05 0.0004331326 9.995101e-01
18 9.943552e-04 0.0032114683 9.957942e-01
19 7.112081e-04 0.0049072777 9.943815e-01
20 9.840783e-03 0.0288659040 9.612933e-01

$x
LD1 LD2
-0.1409984 2.582951755
-2.3918356 0.825366275
-0.3704452 1.641514840
-0.9714835 0.548448277
-1.7134891 1.246681993
-2.4593598 1.361571174
0.3789617 -2.200431689
-2.5581070 -0.467096091
-1.1900285 -0.412972027
-1.7638874 -2.382302324
-1.1869165 -2.485574940
-0.1123680 -0.598883922
0.3399132 0.232863397
2.8456561 0.936722573
1.5592346 0.025668216
0.7457802 -0.209168159
3.0062824 -0.358989534
2.2511708 0.008852067
2.2108260 -0.331206768
1.5210939 0.035984885
# confusion matrix
table(G3,z2$class)

G3  1 2 3
  1 5 0 0
  2 1 6 1
  3 0 0 7
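
Eighteen of the twenty samples lie on the diagonal, so the apparent error rate under these priors is:

1 - sum(diag(table(G3, z2$class))) / length(G3)   # 2/20 = 0.1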
