#有问题
#假设检验,
# One-sample z-test for the mean of a normal population with known sigma.
#
# x           : sample observations (or a pre-computed sample mean)
# n           : sample size
# sigma       : known population standard deviation
# alpha       : significance level (e.g. 0.05 gives a 95% confidence interval)
# u0          : hypothesized mean under H0 (default 0)
# alternative : one of "two.sided", "greater", "less"
#
# Returns a list with the sample mean, the z statistic, the p-value for the
# requested alternative, and a two-sided (1 - alpha) confidence interval.
z.test <- function(x, n, sigma, alpha, u0 = 0, alternative = "two.sided") {
  # Reject bad input up front, before doing any computation
  # (the original checked only after results were already filled in).
  if (!alternative %in% c("two.sided", "greater", "less"))
    return("your input is wrong")
  # NOTE: the original called options(digits=4), mutating global state;
  # that side effect is dropped here -- it only affected printing.
  result <- list()
  mean <- mean(x)
  z <- (mean - u0) / (sigma / sqrt(n))
  # The p-value must follow the direction of the alternative hypothesis.
  # The original used the upper tail for every alternative, which gave the
  # complementary (wrong) p-value for alternative = "less", and doubled the
  # raw upper tail for "two.sided" (p > 1 whenever z < 0); abs(z) fixes that.
  p <- switch(alternative,
    two.sided = 2 * pnorm(abs(z), lower.tail = FALSE),
    greater   = pnorm(z, lower.tail = FALSE),
    less      = pnorm(z, lower.tail = TRUE))
  result$mean <- mean
  result$z <- z
  result$p.value <- p
  # Two-sided CI: mean +/- z_{1-alpha/2} * sigma / sqrt(n)
  half.width <- sigma * qnorm(1 - alpha / 2) / sqrt(n)
  result$conf.int <- c(mean - half.width, mean + half.width)
  result
}
# alpha is the significance level, so pass 0.05 (not the 0.95 confidence
# level) -- with alpha = 0.95 the "confidence interval" is meaningless.
z.test(0.13, 25, 0.1, 0.05, u0 = 0.12, alternative = "less")
#包装精盐
x=c(490,506,508,502,498,511,510,515,512)
t.test(x,mu = 500)
One Sample t-test
data: x
t = 2.2, df = 8, p-value = 0.06
alternative hypothesis: true mean is not equal to 500
95 percent confidence interval:
499.7 511.8
sample estimates:
mean of x
505.8
#生活费检验
ttest=read.csv("TTest1.csv",header = TRUE)
t.test(ttest,mu = 1000)
One Sample t-test
data: ttest
t = -2.8, df = 584, p-value = 0.005
alternative hypothesis: true mean is not equal to 1000
95 percent confidence interval:
568.9 922.2
sample estimates:
mean of x
745.5
x=c(20.5,19.8,19.7,20.4,20.1,20.0,19.0,19.9)
y=c(20.7,19.8,19.5,20.8,20.4,19.6,20.2)
t.test(x,y,alternative = "two.sided")
Welch Two Sample t-test
data: x and y
t = -0.85, df = 12, p-value = 0.4
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.7768 0.3411
sample estimates:
mean of x mean of y
19.93 20.14
t.test(salary,phd,var.equal = TRUE)
Two Sample t-test
data: salary and phd
t = 29, df = 88, p-value <2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
7290 8373
sample estimates:
mean of x mean of y
7833.333 1.533
ttest$phd
#月薪差异
ttest=read.csv("TTest1.csv",header = TRUE)
attach(ttest)
#这样写是因为phd是一个离散型变量么???
t.test(salary~phd,data = ttest,var.equal = TRUE)
The following objects are masked from ttest (pos = 3):
bgfriend, costofeat, face, gender, hoursofathlete, howdifficult,
index, like, phd, ref, salary, star, totalvalue
Two Sample t-test
data: salary by phd
t = -3.4, df = 43, p-value = 0.002
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-2643.4 -660.2
sample estimates:
mean in group 1 mean in group 2
6952 8604
t.test(like~phd,data = ttest,var.equal = TRUE)
Two Sample t-test
data: like by phd
t = -0.74, df = 43, p-value = 0.5
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.2431 0.5764
sample estimates:
mean in group 1 mean in group 2
5.333 5.667
#成对数据的t检验
x=c(20.5,18.8,19.8,20.9,21.5,19.5,21.0,21.2)
y=c(17.7,20.3,20.0,18.8,19.0,20.1,20.0,19.1)
t.test(x,y,var.equal = TRUE,paired = TRUE)
Paired t-test
data: x and y
t = 1.8, df = 7, p-value = 0.1
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.3214 2.3714
sample estimates:
mean of the differences
1.025
#方差比较——F检验
x<-c(20.5, 19.8, 19.7, 20.4, 20.1, 20.0, 19.0, 19.9)
y<-c(20.7, 19.8, 19.5, 20.8, 20.4, 19.6, 20.2)
var.test(x,y)
F test to compare two variances
data: x and y
F = 0.79, num df = 7, denom df = 6, p-value = 0.8
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
0.1393 4.0600
sample estimates:
ratio of variances
0.7932
#卡方独立性检验
cold=matrix(c(43,28,96,84),nrow = 2,dimnames = list(c("drink","not drink"),c("colding","not colding")))
cold
colding | not colding | |
---|---|---|
drink | 43 | 96 |
not drink | 28 | 84 |
#牛奶卡方独立性检验
chisq.test(cold)#P值>0.05所以接受原假设,喝牛奶对感冒没有影响
Pearson's Chi-squared test with Yates' continuity correction
data: cold
X-squared = 0.8, df = 1, p-value = 0.4
library(vcd)
head(Arthritis)
Loading required package: grid
ID | Treatment | Sex | Age | Improved |
---|---|---|---|---|
57 | Treated | Male | 27 | Some |
46 | Treated | Male | 29 | None |
77 | Treated | Male | 30 | None |
17 | Treated | Male | 32 | Marked |
36 | Treated | Male | 46 | Marked |
23 | Treated | Male | 58 | Marked |
“+” , “-”, “*”, “/”, “^”(指数),
sqrt()开方(平方根),exp()指数函数
数值型:整型,单精度实型,双精度实型
字符型
复数型(不讨论)
逻辑型:FALSE,TRUE,NA(注意与NaN(不是一个数字)相区分)
mode(x):获取类型
length():获得长度
#1.简单的赋值操作
#=
n=10
n
10
#assign 函数赋值
assign("a",20)
a
20
#2.基本运算
((10+2)*5-2^4)/4#11
11
sqrt(3)+exp(-2)#开方,指数函数
1.86738609080549
#3.基本数据类型与方法
x=123
mode(x)
‘numeric’
length(x)
1
#逻辑型
flag=1>3
flag
FALSE
#Inf:正无穷
x=5/0
x#无穷
Inf
Inf-Inf#NaN:不是一个数字
NaN
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-YxoGnF9x-1678242376633)(attachment:0ac2a92512490b88bad8a14631f4451.png)]
^,sqrt(),tan():分别为每一个元素的对应操作
#1.数值型向量的建立
# seq(起,终,间距)
z=seq(1,5,by=0.5)
z
# :
1:10
1:10-1 #先生成向量后全部减一
# rep( ,times=,each)
z=rep(2:5,2)#重复几次
z=rep(1:2,each=2)#=
z
#2.向量的计算
#同一个表达式不需要向量有相同的长度
x=1:5
y=1:4
x+y#y重复自己与x等长
Warning message in x + y:
"长的对象长度不是短的对象长度的整倍数"
sqrt(x)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-SQKzexw3-1678242376634)(attachment:54ad8d1be750d1fb0991c6533b295dd.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-K4IByAbS-1678242376635)(attachment:fcd8efc2dea0908c89d0e0d88df4954.png)]
#计算A(3)
prod(1:3)
6
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-vsVLRGEK-1678242376635)(attachment:e11e8ce447e534adfd5295ec33e6d22.png)]
x=c(3,1,2,5)#3,1,2,4(5为第四大)
rank(x);
z=c("green","Kobe","lebrn")
z
labels=paste("x",1:10,sep="")
labels
labs=paste(c("X","Y"),1:10,sep ="")
labs
x=1:10
flag=x>5#生成一个逻辑型向量
flag
x=c(1:10)
x[1:5]#选取前5个
x[-(1:5)]#删除前1到5个元素
array(x,dim=c())#向量,维度
dimnames(A)=list(x1,x2)#行名,列名。。。。。
matrix(x,nc,nr,byrow=TRUE):循环准则依然适用,但是数据项个数必须是矩阵列数的
倍数。
diag(n):生成单位阵,对角阵
#1.数组的建立
a=array(1:12,dim=c(3:4))#默认按列排列
a
1 | 4 | 7 | 10 |
2 | 5 | 8 | 11 |
3 | 6 | 9 | 12 |
a=array(c(1:9),dim = c(3,3))#注意用逗号
print(a)
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
dimnames(a)=list(c("a","b","e"),c("c","d","f"))#第一个参数为行名,第二个为列名
a
c | d | f | |
---|---|---|---|
a | 1 | 4 | 7 |
b | 2 | 5 | 8 |
e | 3 | 6 | 9 |
#2.矩阵的建立
#matrix()
m=matrix(1:4,nr=2,nc=2)
m
1 | 3 |
2 | 4 |
#diag:生成单位阵
m1=diag(3)
m1
1 | 0 | 0 |
0 | 1 | 0 |
0 | 0 | 1 |
#生成对角阵
v=1:5
m2=diag(v)
m2
1 | 0 | 0 | 0 | 0 |
0 | 2 | 0 | 0 | 0 |
0 | 0 | 3 | 0 | 0 |
0 | 0 | 0 | 4 | 0 |
0 | 0 | 0 | 0 | 5 |
#生成特殊对角阵
v=1:5
m3=diag(v,nr=5,nc=6)
m3
1 | 0 | 0 | 0 | 0 | 0 |
0 | 2 | 0 | 0 | 0 | 0 |
0 | 0 | 3 | 0 | 0 | 0 |
0 | 0 | 0 | 4 | 0 | 0 |
0 | 0 | 0 | 0 | 5 | 0 |
t():转置
diag():提取对角元素
rbind(m1,m2):按行合并
cbind(m1,m2):按列合并
*:逐元乘积
%*%:代数乘积
m4=matrix(1:12,nr=3,nc=4)
m4
1 | 4 | 7 | 10 |
2 | 5 | 8 | 11 |
3 | 6 | 9 | 12 |
#提取
m4[c(1,2),c(2,3),drop=FALSE]#选取第1,2行的2,3列
4 | 7 |
5 | 8 |
m4[-1]#去掉第一个元素(注意:去掉第一行应写 m4[-1,])
#替换与添加
m4[,3]=NA #第三列替换为NA
m4
1 | 4 | NA | 10 |
2 | 5 | NA | 11 |
3 | 6 | NA | 12 |
m4[is.na(m4)]=-1#缺失值全部用-1替换
m4
1 | 4 | -1 | 10 |
2 | 5 | -1 | 11 |
3 | 6 | -1 | 12 |
#矩阵运算
data.frame(INDEX=y,VALUE=x),向量必须有相同的长度,或则倍数关系
读取文件:在后续进行介绍
提取与访问
df[行,列 ]
df$列名:提取某一列
subset(df,条件):提取符合条件的子集
添加新变量
df$列名=
df$列名=with(df,数值)
df=transform(df,列名=value,列名=value)
df=data.frame(INDEX=1:4,姓名=c("zhang","wang","qian","sun"),
工龄= c(10,20,30,40))
df
INDEX | 姓名 | 工龄 |
---|---|---|
1 | zhang | 10 |
2 | wang | 20 |
3 | qian | 30 |
4 | sun | 40 |
teams <- c("PHI","NYM","FLA","ATL","WSN")
w<-c(92,89,94,72,59)
l<-c(70,73,77,90,102)
df1=data.frame(teams,w,l)
df1
teams | w | l |
---|---|---|
PHI | 92 | 70 |
NYM | 89 | 73 |
FLA | 94 | 77 |
ATL | 72 | 90 |
WSN | 59 | 102 |
#提取单个元素,类似坐标定位
df1[1,2]
92
#指定列则使用向量
df1[c(1,2,4),c(1,2)]
teams | w | |
---|---|---|
1 | PHI | 92 |
2 | NYM | 89 |
4 | ATL | 72 |
#使用变量名
df1[c(1,3),c("teams","w")]
teams | w | |
---|---|---|
1 | PHI | 92 |
3 | FLA | 94 |
#提取某一列
df1$teams
#提取符合特定条件的子集
subset(df1,w>85)
teams | w | l |
---|---|---|
PHI | 92 | 70 |
NYM | 89 | 73 |
FLA | 94 | 77 |
#添加新变量
#$
df1$wr=w/w+l
df1
teams | w | l | wr |
---|---|---|---|
PHI | 92 | 70 | 71 |
NYM | 89 | 73 | 74 |
FLA | 94 | 77 | 78 |
ATL | 72 | 90 | 91 |
WSN | 59 | 102 | 103 |
#with():函数
df1$wrr=with(df1,w/w+2*l)
df1
teams | w | l | wr | wrr |
---|---|---|---|---|
PHI | 92 | 70 | 71 | 141 |
NYM | 89 | 73 | 74 | 147 |
FLA | 94 | 77 | 78 | 155 |
ATL | 72 | 90 | 91 | 181 |
WSN | 59 | 102 | 103 | 205 |
#transform():一次性添加多个变量
df1=transform(df1,rw=l/(w+l),playoff="Yes")
df1
teams | w | l | wr | wrr | rw | playoff |
---|---|---|---|---|---|---|
PHI | 92 | 70 | 71 | 141 | 0.4320988 | Yes |
NYM | 89 | 73 | 74 | 147 | 0.4506173 | Yes |
FLA | 94 | 77 | 78 | 155 | 0.4502924 | Yes |
ATL | 72 | 90 | 91 | 181 | 0.5555556 | Yes |
WSN | 59 | 102 | 103 | 205 | 0.6335404 | Yes |
数据的存储
数据的读取
write.table(data,file=“”,row.names=F,quote=F)
数据,文件名,行名是否写入,变量名不放入双引号
save():存储为R文件
#1.数据存储
df=data.frame(Index=c(1:5),grade=seq(10,50,10));
df
Index | grade |
---|---|
1 | 10 |
2 | 20 |
3 | 30 |
4 | 40 |
5 | 50 |
#保存为文本文件.txt
write.table(df,file = "test1",row.names = F,quote = F)#默认为True;
#保存为csv文件
write.table(df,file = "test2.csv",sep = ",",row.names = F,quote = F)
#保存为R文件
save(df,file = "test3.Rdata")
以上方法均可以使用下图参数
#1.读取txt文件
df=read.table("test1",header=TRUE)
df
Index | grade |
---|---|
1 | 10 |
2 | 20 |
3 | 30 |
4 | 40 |
5 | 50 |
#2.读取excel文件
library(readxl)
df1=read_excel("test4.xlsx",sheet=1)
df1[1:12,1:3]
New names:
* `` -> ...1
* `` -> ...5
* `` -> ...6
* `` -> ...7
* `` -> ...8
* ... and 43 more problems
...1 | B7养老服务 | B7-1 |
---|---|---|
吉林省 | 72.89000 | 72.71000 |
长春市 | 77.47757 | 77.84353 |
吉林市 | 72.93831 | 71.76471 |
延边朝鲜族自治州 | 71.00000 | 69.76190 |
白城市 | 69.54338 | 69.77376 |
白山市 | 61.18881 | 60.57554 |
四平市 | 67.11039 | 65.95469 |
通化市 | 64.58101 | 64.21053 |
松原市 | 71.34831 | 72.02817 |
梅河口市 | 84.19492 | 84.15254 |
辽源市 | 71.55000 | 72.28000 |
长白山保护开发区 | 68.63014 | 68.21918 |
if(conditon) 操作 else 操作
ifelse(condition,yes,no):别用
for(i in 1:5)print()
while(condition){操作}
repeat()
函数名=function(x,y){}
函数名=function(x=1,y=2),设置默认值
#条件语句
x=2
if(x>0)x=x+1 else x=x+2
x
3
grade=85
ifelse(grade>80,print("优秀"),print("良好"))
Error in parse(text = x, srcfile = src): :2:30: unexpected input
1: grade=85
2: ifelse(grade>80,print("优秀")n ^
Traceback:
# NOTE(review): the original note said this "converts a base-b number to
# decimal", but the body simply returns the (vectorized) sum of its two
# arguments -- confirm the intended behavior with the author.
base <- function(x, y) {
  x + y
}
base(2,3)
5
散点图
柱状图
饼图
直方图
箱线图
多图
plot()
function (x, y = NULL, type = “p”, xlim = NULL, ylim = NULL,
log = “”, main = NULL, sub = NULL, xlab = NULL, ylab = NULL,
ann = par(“ann”), axes = TRUE, frame.plot = axes, panel.first = NULL,
panel.last = NULL, asp = NA, …
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-QUAmvfrs-1678242376637)(attachment:96a8d98f1c9ff18f7237157f820b478.png)]
main:图形标题
axes:两个坐标轴是否都画出
x=1:5
y=x*x+1;
plot(x,y,main = "散点图")
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-DSaEtCwl-1678242376637)(output_89_0.png)]
#获得一个点或者一个点的坐标
locator(3)
Error in locator(3): plot.new has not been called yet
Traceback:
1. locator(3)
#一个特别的散点图
u=1:25
plot(u~1,pch = u,col=u,cex=3)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-I2As5CMN-1678242376638)(output_91_0.png)]
barplot(las=2,):无法直接处理数据框,需要将数据框转换为矩阵
as.matrix():将其参数转换为矩阵类型
df=read.csv("student.csv");
df_m=as.matrix(df)
barplot(df_m)
Warning message in apply(height, 2L, cumsum):
"强制改变过程中产生了NA"
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-7qCqov6V-1678242376638)(output_93_1.png)]
barplot(df_m,las=1)#las=2可以修改标签样式,等于2时——标签垂直于坐标轴
Warning message in apply(height, 2L, cumsum):
"强制改变过程中产生了NA"
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-s3CiHr4C-1678242376639)(output_94_1.png)]
#横向并列的柱状图
barplot(df_m,beside = TRUE,horiz = TRUE,legend=TRUE,cex.names = .75)
Error in -0.01 * height: 二进列运算符中有非数值参数
Traceback:
1. barplot(df_m, beside = TRUE, horiz = TRUE, legend = TRUE, cex.names = 0.75)
2. barplot.default(df_m, beside = TRUE, horiz = TRUE, legend = TRUE,
. cex.names = 0.75)
pie(x, labels = names(x), edges = 200, radius = 0.8,
clockwise = FALSE, init.angle = if(clockwise) 90 else 0,
density = NULL, angle = 45, col = NULL, border = NULL,
lty = NULL, main = NULL, …)
catch=c(7752,1166,463,108)
#为向量赋值
names(catch)=c("新疆","周期","篮协","姚明")
pie(catch,cex=.6)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-R2e0FPLb-1678242376639)(output_97_0.png)]
hist(x)
plot(density(x))
df=data.frame(姓名=c("s","z","s","t"),
排名=1:4)
df
姓名 | 排名 |
---|---|
s | 1 |
z | 2 |
s | 3 |
t | 4 |
#直方图绘制
hist(df$排名)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-gtVTfHgP-1678242376639)(output_100_0.png)]
#密度图
plot(density(df$排名))
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-X4DeyMun-1678242376640)(output_101_0.png)]
boxplot(x, …)
Default S3 method:
boxplot(x, …, range = 1.5, width = NULL, varwidth = FALSE,
notch = FALSE, outline = TRUE, names, plot = TRUE,
border = par(“fg”), col = NULL, log = “”,
pars = list(boxwex = 0.8, staplewex = 0.5, outwex = 0.5),
horizontal = FALSE, add = FALSE, at = NULL)
boxplot(df$排名)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-CSJLKdE3-1678242376640)(output_103_0.png)]
A <- c(79.98, 80.04, 80.02, 80.04, 80.03, 80.03, 80.04, 79.97, 80.05, 80.03,
80.02, 80.00, 80.02)
B <- c(80.02, 79.94, 79.98, 79.97, 79.97, 80.03, 79.95,79.97)
#不带切口的图像
boxplot(A,B,names = c("A","B"),col=c(2:3))
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-UvakDZnB-1678242376640)(output_104_0.png)]
#notch=T画出的图形带切口
boxplot(A,B,notch = T,names = c("A","B"),col=c(2:3))
Warning message in bxp(list(stats = structure(c(80, 80.02, 80.03, 80.04, 80.05, :
"some notches went outside hinges ('box'): maybe set notch=FALSE"
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-cZIM4cIY-1678242376640)(output_105_1.png)]
summary():查看一个概述,数据集大致
mean(x):计算均值
range():同时计算最大值,最小值
quantile(x,probs=c()):返回不同的分位数
fivenum():计算 最小值,25%,中位数,75%分位数,最大值
IQR():计算75%-25%的差
str():展示一个对象的结构
stem(x,scale,width,atom)查看数值型向量分布信息,图长度,宽度,精度
sd()标准差,var()方差
attach(mtcars)
#均值
mean(mpg)
20.090625
#最值
range(mpg)
#分位数
quantile(mpg,probs = c(0,0.3,0.6,1))
fivenum(mpg)
IQR(mpg)
7.375
summary(mpg)
Min. 1st Qu. Median Mean 3rd Qu. Max.
10.40 15.43 19.20 20.09 22.80 33.90
str(mtcars)
'data.frame': 32 obs. of 11 variables:
$ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
$ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
$ disp: num 160 160 108 258 360 ...
$ hp : num 110 110 93 110 175 105 245 62 95 123 ...
$ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
$ wt : num 2.62 2.88 2.32 3.21 3.44 ...
$ qsec: num 16.5 17 18.6 19.4 17 ...
$ vs : num 0 0 1 1 0 1 0 1 1 1 ...
$ am : num 1 1 1 0 0 0 0 0 0 0 ...
$ gear: num 4 4 4 3 3 3 3 4 4 4 ...
$ carb: num 4 4 1 1 2 1 4 2 2 4 ...
stem(subset(mtcars,mpg>15)$mpg)#截取mtcars中 mpg大于15的子集的mpg列
The decimal point is at the |
14 | 2258
16 | 438
18 | 17227
20 | 00445
22 | 88
24 | 4
26 | 03
28 |
30 | 44
32 | 49
sd(mpg)
6.0269480520891
var(mpg)
36.3241028225806
计算相关系数
cor(x, y = NULL, use = “everything”,
method = c(“pearson”, “kendall”, “spearman”))use:处理NA值的方式,all.obs 报错; everything:返回NA; complete.obs:过滤NA值
相关分析
cor.test(x,y)
cov():参数与cor()相同
#1.画出散点图大致判断
x<-c(1.21, 1.30, 1.39, 1.42, 1.47, 1.56, 1.68, 1.72, 1.98, 2.10)
y<-c(3.90, 4.50, 4.20, 4.83, 4.16, 4.93, 4.32, 4.99, 4.70, 5.20)
level=data.frame(x,y)
plot(level)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jd0Bk0ni-1678242376641)(output_119_0.png)]
#大致推测有正相关关系
#2.相关分析,pearson系数计算
cor(x,y)#计算
0.68072825953986
#相关分析
cor.test(x,y)#p小于0.05备,大于原假设
Pearson's product-moment correlation
data: x and y
t = 2.6284, df = 8, p-value = 0.03025
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.08943359 0.91722701
sample estimates:
cor
0.6807283
x1=1:5
y1=6:10
y2=10:14
cor(x1,y1 )
1
cor(x1,y2)#我们可发现结果相同
1
cov(x,y)
0.0830455555555556
sample(x,n,replace,prob)
replace=T:有放回抽样;prob=y:指定x中元素出现的概率
prod():计算向量的乘积,可用作排列数
choose()组合数
#等可能不放回随机抽样
sample(1:52,10)
#等可能有放回的随机抽样
sample(1:11,10,replace = T)
#不等可能随机抽样
sample(1:5,2,prob = c(0.1,0.2,0.3,0.2,0.2))
prod(1:10)
3628800
choose(52,4)
270725
binom(n,p):n为1时贝努利分布
dbinom(x, size, prob):返回成功x次的概率
pbinom(q, size, prob):返回至多成功x次的概率,即累积概率
qbinom(p, size, prob):返回相应分位点x,详情见下面的例子
rbinom(n, size, prob):返回每组试验的成功次数
pois(k)
poisson.test():使用泊松分布进行假设检验
课件三-29
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-R1yg9eGQ-1678242376641)(attachment:%E5%9B%BE%E7%89%87.png)]
binom.test(x=23,n=1000,p=0.02,alternative="greater",conf.level=0.95)
Exact binomial test
data: 23 and 1000
number of successes = 23, number of trials = 1000, p-value = 0.2778
alternative hypothesis: true probability of success is greater than 0.02
95 percent confidence interval:
0.01576927 1.00000000
sample estimates:
probability of success
0.023
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-GtjvpruU-1678242376641)(attachment:%E5%9B%BE%E7%89%87.png)]
#泊松分布
poisson.test(x=60,T = 50,alternative = "greater",conf.level = 0.95)
Exact Poisson test
data: 60 time base: 50
number of events = 60, time base = 50, p-value = 0.09227
alternative hypothesis: true event rate is greater than 1
95 percent confidence interval:
0.9570464 Inf
sample estimates:
event rate
1.2
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IqbeILku-1678242376642)(attachment:%E5%9B%BE%E7%89%87.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-eNkxAR8e-1678242376642)(attachment:%E5%9B%BE%E7%89%87-2.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-nffv7r0R-1678242376642)(attachment:%E5%9B%BE%E7%89%87-3.png)]
unif(a,b):[a,b]上服从均匀分布
#生成一个长度为10的向量
runif(10)
runif(4,min = 85,max = 95)
norm()
#生成服从正态分布的随机数
rnorm(10,mean = 1,sd = 10)
curve(sin,-2*pi,2*pi)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ADmcNyVH-1678242376642)(output_145_0.png)]
#连续分布的密度函数,qq图
qqnorm(rnorm(1000))
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-AmuyK7yr-1678242376643)(output_146_0.png)]
z检验:自己定义
t检验(均值的比较):单个正态总体的u检验,两个正态总体u是否相等的检验
f检验:方差的检验(var.test())
t.test(x, y = NULL,
alternative = c(“two.sided”, “less”, “greater”),
mu = 0, paired = FALSE, var.equal = FALSE,
conf.level = 0.95, …)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-FoacXq0l-1678242376643)(attachment:%E5%9B%BE%E7%89%87.png)]
#z检验
# One-sample z-test for the mean of a normal population with known sigma.
#
# x           : sample observations (or a pre-computed sample mean)
# n           : sample size
# sigma       : known population standard deviation
# alpha       : significance level (e.g. 0.05 gives a 95% confidence interval)
# u0          : hypothesized mean under H0 (default 0)
# alternative : one of "two.sided", "greater", "less"
#
# Returns a list with the sample mean, the z statistic, the p-value for the
# requested alternative, and a two-sided (1 - alpha) confidence interval.
z.test <- function(x, n, sigma, alpha, u0 = 0, alternative = "two.sided") {
  # Reject bad input up front, before doing any computation
  # (the original checked only after results were already filled in).
  if (!alternative %in% c("two.sided", "greater", "less"))
    return("your input is wrong")
  # NOTE: the original called options(digits=4), mutating global state;
  # that side effect is dropped here -- it only affected printing.
  result <- list()
  mean <- mean(x)
  z <- (mean - u0) / (sigma / sqrt(n))
  # The p-value must follow the direction of the alternative hypothesis.
  # The original used the upper tail for every alternative, which gave the
  # complementary (wrong) p-value for alternative = "less", and doubled the
  # raw upper tail for "two.sided" (p > 1 whenever z < 0); abs(z) fixes that.
  p <- switch(alternative,
    two.sided = 2 * pnorm(abs(z), lower.tail = FALSE),
    greater   = pnorm(z, lower.tail = FALSE),
    less      = pnorm(z, lower.tail = TRUE))
  result$mean <- mean
  result$z <- z
  result$p.value <- p
  # Two-sided CI: mean +/- z_{1-alpha/2} * sigma / sqrt(n)
  half.width <- sigma * qnorm(1 - alpha / 2) / sqrt(n)
  result$conf.int <- c(mean - half.width, mean + half.width)
  result
}
z.test(0.13,25,0.1,0.05,u0 = 0.12,alternative = "less")
#检验单个总体 u==c
salt=c(490 , 506, 508, 502, 498, 511, 510, 515 , 512)
t.test(salt,mu = 500)
One Sample t-test
data: salt
t = 2.2, df = 8, p-value = 0.06
alternative hypothesis: true mean is not equal to 500
95 percent confidence interval:
499.7 511.8
sample estimates:
mean of x
505.8
#检验多个总体 u1==u2
x<-c(20.5, 19.8, 19.7, 20.4, 20.1, 20.0, 19.0, 19.9)
y<-c(20.7, 19.8, 19.5, 20.8, 20.4, 19.6, 20.2)
t.test(x,y,var.equal = TRUE)
Two Sample t-test
data: x and y
t = -0.85, df = 13, p-value = 0.4
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.7684 0.3327
sample estimates:
mean of x mean of y
19.93 20.14
x<-c(20.5, 19.8, 19.7, 20.4, 20.1, 20.0, 19.0, 19.9)
y=c(20.7,19.8,19.5,20.8,20.4,19.6,20.2)
t.test(x,y,var.equal = TRUE)
Two Sample t-test
data: x and y
t = -0.85, df = 13, p-value = 0.4
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.7684 0.3327
sample estimates:
mean of x mean of y
19.93 20.14
#成对数据的t检验,s使用paired参数
x<-c(20.5, 18.8, 19.8, 20.9, 21.5, 19.5, 21.0, 21.2)
y<-c(17.7, 20.3, 20.0, 18.8, 19.0, 20.1, 20.0, 19.1)
t.test(x,y,var.equal = TRUE,paired = TRUE)
Paired t-test
data: x and y
t = 1.8, df = 7, p-value = 0.1
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.3214 2.3714
sample estimates:
mean of the differences
1.025
var.test()
chisq.test()
#卡方独立性检验
compare=matrix(c(60,32,3,11),nr=2,dimnames = list(c("cancer","normal"),c("smoke","Not smoke")))
chisq.test(compare)
Pearson's Chi-squared test with Yates' continuity correction
data: compare
X-squared = 7.9, df = 1, p-value = 0.005
#Fisher精确检验
fisher.test(compare,alternative = "greater")
Fisher's Exact Test for Count Data
data: compare
p-value = 0.002
alternative hypothesis: true odds ratio is greater than 1
95 percent confidence interval:
1.95 Inf
sample estimates:
odds ratio
6.747
并非对方差进行分析,而是通过对方差的分析比较均值
shapiro.test(rnorm(1000))
Shapiro-Wilk normality test
data: rnorm(1000)
W = 1, p-value = 0.6
bartlett.test(X~A,data = miscellany)#p值够大,方差相同
Bartlett test of homogeneity of variances
data: X by A
Bartlett's K-squared = 7.1, df = 4, p-value = 0.1
library(car)
Loading required package: carData
#LeveneTest检验
leveneTest(miscellany$X,miscellany$A)
Df | F value | Pr(>F) | |
---|---|---|---|
group | 4 | 1.737 | 0.1942 |
15 | NA | NA |
aov.is= aov()
summary(aov.is)
X<-c(25.6, 22.2, 28.0, 29.8, 24.4, 30.0, 29.0, 27.5, 25.0, 27.7,
23.0, 32.2, 28.8, 28.0, 31.5, 25.9, 20.6, 21.2, 22.0, 21.2)
A=factor(rep(1:5,each=4))#因子存储不同类型的数据
A
#生成数据框
miscellany=data.frame(X,A)
#拟合ANOVA模型:aov()
aov.mis=aov(X~A,data = miscellany)#X是实验数据,A是对应方法
summary(aov.mis)
Df Sum Sq Mean Sq F value Pr(>F)
A 4 132 33.0 4.31 0.016 *
Residuals 15 115 7.7
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Df:自由度 Sum Sq :平方和 Mean Sq:均方和 Fvalue f统计量值 Pr :p值
由计算结果可知这里p值(Pr) 0.016<0.05 所以拒绝原假设,5种不同的方法除杂效果不完全相同
对比单因素方差分析 X~A
同样使用aov方法,模型如下
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-mi5JxnV8-1678242376644)(attachment:%E5%9B%BE%E7%89%87.png)]
- A,B为各种因素,x为数据集
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-D1Xs9egy-1678242376644)(attachment:%E5%9B%BE%E7%89%87-2.png)]
gl(x, k, length, labels, ordered)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-snzluevZ-1678242376645)(attachment:%E5%9B%BE%E7%89%87.png)]
juice<-data.frame(
X = c(0.05, 0.46, 0.12, 0.16, 0.84, 1.30, 0.08, 0.38, 0.4,
0.10, 0.92, 1.57, 0.11, 0.43, 0.05, 0.10, 0.94, 1.10,
0.11, 0.44, 0.08, 0.03, 0.93, 1.15),
A = gl(4, 6),# 1 到4 每个数重复6次
B = gl(6, 1, 24)# 1:6 每个一次,由于不够24 数组循环4次
)
juice
X | A | B |
---|---|---|
0.05 | 1 | 1 |
0.46 | 1 | 2 |
0.12 | 1 | 3 |
0.16 | 1 | 4 |
0.84 | 1 | 5 |
1.30 | 1 | 6 |
0.08 | 2 | 1 |
0.38 | 2 | 2 |
0.40 | 2 | 3 |
0.10 | 2 | 4 |
0.92 | 2 | 5 |
1.57 | 2 | 6 |
0.11 | 3 | 1 |
0.43 | 3 | 2 |
0.05 | 3 | 3 |
0.10 | 3 | 4 |
0.94 | 3 | 5 |
1.10 | 3 | 6 |
0.11 | 4 | 1 |
0.44 | 4 | 2 |
0.08 | 4 | 3 |
0.03 | 4 | 4 |
0.93 | 4 | 5 |
1.15 | 4 | 6 |
avo.juice=aov(X~A+B,data=juice)
summary(avo.juice)#A p值0.22>0.05 A无B有
Df Sum Sq Mean Sq F value Pr(>F)
A 3 0.06 0.019 1.63 0.22
B 5 4.90 0.980 83.98 2e-10 ***
Residuals 15 0.18 0.012
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Afcfnkkq-1678242376645)(attachment:%E5%9B%BE%E7%89%87.png)]
rats=data.frame(Time=c(0.31, 0.45, 0.46, 0.43, 0.82, 1.10, 0.88, 0.72, 0.43, 0.45,
0.63, 0.76, 0.45, 0.71, 0.66, 0.62, 0.38, 0.29, 0.40, 0.23,
0.92, 0.61, 0.49, 1.24, 0.44, 0.35, 0.31, 0.40, 0.56, 1.02,
0.71, 0.38, 0.22, 0.21, 0.18, 0.23, 0.30, 0.37, 0.38, 0.29,
0.23, 0.25, 0.24, 0.22, 0.30, 0.36, 0.31, 0.33),
Toxicant=gl(3, 16, 48, labels = c("I", "II", "III")),
Cure=gl(4, 4, 48, labels = c("A", "B", "C", "D")))
rats
Time | Toxicant | Cure |
---|---|---|
0.31 | I | A |
0.45 | I | A |
0.46 | I | A |
0.43 | I | A |
0.82 | I | B |
1.10 | I | B |
0.88 | I | B |
0.72 | I | B |
0.43 | I | C |
0.45 | I | C |
0.63 | I | C |
0.76 | I | C |
0.45 | I | D |
0.71 | I | D |
0.66 | I | D |
0.62 | I | D |
0.38 | II | A |
0.29 | II | A |
0.40 | II | A |
0.23 | II | A |
0.92 | II | B |
0.61 | II | B |
0.49 | II | B |
1.24 | II | B |
0.44 | II | C |
0.35 | II | C |
0.31 | II | C |
0.40 | II | C |
0.56 | II | D |
1.02 | II | D |
0.71 | II | D |
0.38 | II | D |
0.22 | III | A |
0.21 | III | A |
0.18 | III | A |
0.23 | III | A |
0.30 | III | B |
0.37 | III | B |
0.38 | III | B |
0.29 | III | B |
0.23 | III | C |
0.25 | III | C |
0.24 | III | C |
0.22 | III | C |
0.30 | III | D |
0.36 | III | D |
0.31 | III | D |
0.33 | III | D |
aov.rat=aov(Time~Toxicant*Cure,data = rats)
summary(aov.rat)
Df Sum Sq Mean Sq F value Pr(>F)
Toxicant 2 1.036 0.518 23.23 3.3e-07 ***
Cure 3 0.915 0.305 13.67 4.1e-06 ***
Toxicant:Cure 6 0.248 0.041 1.85 0.12
Residuals 36 0.803 0.022
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
三行分别为 毒药,治疗,交互作用对于存活时间的影响
lm(formula, data, subset, weights, na.action,
method = “qr”, model = TRUE, x = FALSE, y = FALSE, qr = TRUE,
singular.ok = TRUE, contrasts = NULL, offset, …): formula是显示回归模型,data是数据框,subset是样本观察的子
集,weights是用于拟合的加权向量,na.action显示数据是否包含缺失值,
method是指出用于拟合的方法,model, x, y, qr是逻辑表达,如果是TRUE,
应返回其值,除了第一个选项formula是必选项,其它都是可选项
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-X2pWlvdU-1678242376645)(attachment:%E5%9B%BE%E7%89%87.png)]
confint(object,parm,level=0.95…)回归模型,所求区间估计的参数,置信水平(求解回归参数的置信区间)
predict(object,x,interval=“predction”,level)#predction 表示同时给出相应的预测区间
x<-c(318, 910, 200, 409, 415, 502, 314, 1210, 1022, 1225)
y<-c(524, 1019, 638, 815, 913, 928, 605, 1516, 1219, 1624)
lm.reg=lm(y~x)
summary(lm.reg)
Call:
lm(formula = y ~ x)
Residuals:
Min 1Q Median 3Q Max
-191.8 -87.0 44.8 77.9 145.7
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 395.567 80.261 4.93 0.0012 **
x 0.896 0.107 8.40 3.1e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 127 on 8 degrees of freedom
Multiple R-squared: 0.898, Adjusted R-squared: 0.886
F-statistic: 70.6 on 1 and 8 DF, p-value: 3.06e-05
confint(lm.reg,level = 0.95)
2.5 % | 97.5 % | |
---|---|---|
(Intercept) | 210.48 | 580.650 |
x | 0.65 | 1.142 |
#模型建立
y<-c(162, 120, 223, 131, 67, 169, 81, 192, 116, 55,
252, 232, 144, 103, 212)
x1<-c(274, 180, 375, 205, 86, 265, 98, 330, 195, 53,
430, 372, 236, 157, 370)
x2<-c(2450, 3250, 3802, 2838, 2347, 3782, 3008, 2450,
2137, 2560, 4020, 4427, 2660, 2088, 2605)
sales=data.frame(y,x1,x2)
lm.reg=lm(y~x1+x2,data = sales)
summary(lm.reg)
Call:
lm(formula = y ~ x1 + x2, data = sales)
Residuals:
Min 1Q Median 3Q Max
-3.831 -1.206 -0.244 1.482 3.302
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.445728 2.426693 1.42 0.18
x1 0.495972 0.006046 82.04 < 2e-16 ***
x2 0.009205 0.000967 9.52 6.1e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.17 on 12 degrees of freedom
Multiple R-squared: 0.999, Adjusted R-squared: 0.999
F-statistic: 5.7e+03 on 2 and 12 DF, p-value: <2e-16
结论:系数的p值,F分布的p值很小,所以回归方程与回归系数的检验是显著的
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-j1A56ySu-1678242376646)(attachment:%E5%9B%BE%E7%89%87.png)]
y<-c(11.2, 8.8, 12.3, 11.6, 13.4, 18.3, 11.1, 12.1, 9.6, 8.4, 9.3, 10.6, 8.4, 9.6,
10.9, 10.1,14.8, 9.1, 10.8, 10.2, 13.6, 14.9, 16.0, 13.2,20.0, 13.3, 10.4)
x1<-c(5.68, 3.79, 6.02, 4.85, 4.60, 6.05, 4.90, 7.08,3.85,4.65, 4.59, 4.29, 7.97,
6.19, 6.13, 5.71,6.40,6.06, 5.09, 6.13, 5.78, 5.43, 6.50, 7.98,11.54,5.84, 3.84)
x2<-c(1.90, 1.64, 3.56, 1.07, 2.32, 0.64, 8.50, 3.00,2.11, 0.63, 1.97, 1.97,
1.93, 1.18, 2.06, 1.78,2.40, 3.67, 1.03, 1.71, 3.36, 1.13, 6.21, 7.92,10.89, 0.92,
1.20)
x3<-c(4.53, 7.32, 6.95, 5.88, 4.05, 1.42, 12.60, 6.75,16.28, 6.59, 3.61, 6.61,
7.57, 1.42, 10.35, 8.53,4.53,12.79, 2.53, 5.28, 2.96, 4.31, 3.47, 3.37,1.20, 8.61,
6.45)
x4<-c(8.2, 6.9, 10.8, 8.3, 7.5, 13.6, 8.5, 11.5,7.9, 7.1, 8.7, 7.8, 9.9, 6.9,
10.5, 8.0,10.3, 7.1, 8.9, 9.9, 8.0, 11.3, 12.3, 9.8, 10.5, 6.4, 9.6)
blood=data.frame(y,x1,x2,x3,x4)
lm.reg=lm(y~x1+x2+x3+x4,data = blood)
summary(lm.reg)
Call:
lm(formula = y ~ x1 + x2 + x3 + x4, data = blood)
Residuals:
Min 1Q Median 3Q Max
-3.627 -1.200 -0.228 1.539 4.447
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.943 2.829 2.10 0.047 *
x1 0.142 0.366 0.39 0.701
x2 0.351 0.204 1.72 0.099 .
x3 -0.271 0.121 -2.23 0.036 *
x4 0.638 0.243 2.62 0.016 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.01 on 22 degrees of freedom
Multiple R-squared: 0.601, Adjusted R-squared: 0.528
F-statistic: 8.28 on 4 and 22 DF, p-value: 0.000312
结论:回归系数方程显著性不高
在OLS线性回归的基础上进行建模
step(object, scope, scale = 0,direction = c(“both”, “backward”, “forward”),
trace = 1, keep = NULL, steps = 1000, k = 2, …)object是线性模型或广义线性模型分析的结果,scope是确定逐步
搜索的区域,direction确定逐步搜索的方向:“both”是“一切子集回归
法”,“backward”是“向后法”,“forward”是向前法, 默认值为both
lm.step=step(lm.reg)
summary(lm.step)
Start: AIC=42.16
y ~ x1 + x2 + x3 + x4
Df Sum of Sq RSS AIC
- x1 1 0.61 89.5 40.3
88.8 42.2
- x2 1 11.96 100.8 43.6
- x3 1 20.06 108.9 45.7
- x4 1 27.79 116.6 47.5
Step: AIC=40.34
y ~ x2 + x3 + x4
Df Sum of Sq RSS AIC
89.5 40.3
- x3 1 25.7 115.1 45.2
- x2 1 26.5 116.0 45.4
- x4 1 32.3 121.7 46.7
Call:
lm(formula = y ~ x2 + x3 + x4, data = blood)
Residuals:
Min 1Q Median 3Q Max
-3.269 -1.231 -0.202 1.489 4.657
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.500 2.396 2.71 0.0124 *
x2 0.402 0.154 2.61 0.0156 *
x3 -0.287 0.112 -2.57 0.0171 *
x4 0.663 0.230 2.88 0.0084 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.97 on 23 degrees of freedom
Multiple R-squared: 0.598, Adjusted R-squared: 0.546
F-statistic: 11.4 on 3 and 23 DF, p-value: 8.79e-05
AIC统计量42.16 如果去掉变量x1 AIC 40.34.
依次类推。由于去掉X1
使AIC统计量达到最小,因此R软件
会自动去掉变量X1,进入下一轮计
算。在下一轮中,无论去掉哪一个
变量,AIC统计量的值均会升高,因
此R软件自动终止计算,得到“最优”
回归方程
#再次提取相关信息,这是优化之后的
summary(lm.step)
Call:
lm(formula = y ~ x2 + x3 + x4, data = blood)
Residuals:
Min 1Q Median 3Q Max
-3.269 -1.231 -0.202 1.489 4.657
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.500 2.396 2.71 0.0124 *
x2 0.402 0.154 2.61 0.0156 *
x3 -0.287 0.112 -2.57 0.0171 *
x4 0.663 0.230 2.88 0.0084 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.97 on 23 degrees of freedom
Multiple R-squared: 0.598, Adjusted R-squared: 0.546
F-statistic: 11.4 on 3 and 23 DF, p-value: 8.79e-05
主要用于检验回归假设是否成立
主要内容
residuals():计算残差
rstandard():计算标准化残差
rstudent():计算学生化残差
predict():求预测值
update():更新模型
coef():提取参数的估计
#残差计算
y<-c(162, 120, 223, 131, 67, 169, 81, 192, 116, 55, 252, 232, 144, 103, 212)
x1<-c(274, 180, 375, 205, 86, 265, 98, 330, 195, 53,430, 372, 236, 157, 370)
x2<-c(2450, 3250, 3802, 2838, 2347, 3782, 3008, 2450,2137, 2560, 4020, 4427,
2660, 2088, 2605)
sales<-data.frame(y,x1,x2)
lm.reg=lm(y~x1+x2,data = sales)
summary(lm.reg)
Call:
lm(formula = y ~ x1 + x2, data = sales)
Residuals:
Min 1Q Median 3Q Max
-3.8312 -1.2063 -0.2436 1.4819 3.3025
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.4457284 2.4266934 1.420 0.181
x1 0.4959724 0.0060455 82.039 < 2e-16 ***
x2 0.0092049 0.0009668 9.521 6.07e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.173 on 12 degrees of freedom
Multiple R-squared: 0.9989, Adjusted R-squared: 0.9988
F-statistic: 5699 on 2 and 12 DF, p-value: < 2.2e-16
#残差计算
y.res=residuals(lm.reg)
#计算标准化残差
y.rst=rstandard(lm.reg)
y.fit=predict(lm.reg)
op=par(mfrow = c(1,2))
plot(y.res~y.fit)
plot(y.rst~y.fit)
par(op)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-mNRy5BPy-1678242376646)(output_204_0.png)]
x<-c(294, 247, 267, 358, 423, 311, 450, 534, 438, 697,688, 630, 709, 627,
615, 999, 1022, 1015, 700, 850, 980, 1025, 1021, 1200, 1250, 1500, 1650)
y<-c(50, 40, 45, 55, 70, 65, 55, 62, 68, 78,80, 84, 88, 97, 100, 109, 114,
117, 106, 128,130, 160, 97, 180, 112, 210, 135)
persons<-data.frame(x,y)
lm.reg=lm(y~x+1,data = persons)
summary(lm.reg)
Call:
lm(formula = y ~ x + 1, data = persons)
Residuals:
Min 1Q Median 3Q Max
-47.645 -11.136 -4.278 11.683 41.677
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 25.09434 9.27542 2.705 0.0121 *
x 0.09549 0.01099 8.691 5.02e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 21.08 on 25 degrees of freedom
Multiple R-squared: 0.7513, Adjusted R-squared: 0.7414
F-statistic: 75.54 on 1 and 25 DF, p-value: 5.018e-09
rst=rstandard(lm.reg)
res=residuals(lm.reg)
lm.fit=predict(lm.reg)
#标准残差化图
plot(rst~lm.fit)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-r9kcxJhu-1678242376646)(output_209_0.png)]
#残差图
plot(res~lm.fit)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-4YE47ETX-1678242376647)(output_210_0.png)]
# Refit the model after a square-root transform of the response.
# (The original comment said "square"(平方) transform, but sqrt(.) ~ . applies
# the square-root to the left-hand side of the existing formula.)
lm.ng=update(lm.reg,sqrt(.)~.)
coef(lm.ng)
方法
eigen(x, symmetric, only.values = FALSE, EISPACK = FALSE):x 为所求矩阵;symmetric 规定矩阵是否对称;only.values = TRUE 时只返回特征值
kappa(x, exact = FALSE):exact = TRUE 时精确计算条件数
vif(lmobj, digits = 5):lmobj 为 lm() 生成的对象,digits 为保留的小数位数
# Classic Hald-cement style data: regress y on four correlated predictors
library(DAAG)  # provides vif() used further below
x1 <- c(7, 1, 11, 11, 7, 11, 3, 1, 2, 21, 1, 11, 10)
x2 <- c(26, 29, 56, 31, 52, 55, 71, 31, 54, 47, 40, 66, 68)
x3 <- c(6, 15, 8, 8, 6, 9, 17, 22, 18, 4, 23, 9, 8)
x4 <- c(60, 52, 20, 57, 33, 22, 6, 44, 22, 18, 34, 12, 12)
y <- c(78.5, 74.3, 104.3, 87.6, 95.9, 109.2, 102.7, 72.5, 93.1, 115.9, 83.8,
       113.3, 109.4)
df <- data.frame(x1, x2, x3, x4, y)
lm.reg <- lm(y ~ x1 + x2 + x3 + x4, data = df)
summary(lm.reg)
Call:
lm(formula = y ~ x1 + x2 + x3 + x4, data = df)
Residuals:
Min 1Q Median 3Q Max
-3.2777 -1.3956 -0.2374 1.1650 4.0379
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 64.8044 22.8867 2.832 0.02210 *
x1 1.4805 0.3598 4.115 0.00337 **
x2 0.4918 0.2285 2.153 0.06351 .
x3 0.0510 0.3299 0.155 0.88097
x4 -0.1563 0.2120 -0.737 0.48205
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.373 on 8 degrees of freedom
Multiple R-squared: 0.9834, Adjusted R-squared: 0.9751
F-statistic: 118.6 on 4 and 8 DF, p-value: 3.736e-07
# Variance inflation factors (DAAG::vif); VIF > 10 flags serious multicollinearity
vif(lm.reg,digits = 3)
结论:由于x2,x4的因子大于10,所以他们之间可能存在共线性
# Pairwise correlation of the two suspect predictors
cor(x2,x4)
-0.947969951546351
glm(formula,family=family(link=function),data=)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-PmSpic20-1678242376647)(attachment:%E5%9B%BE%E7%89%87.png)]
# Traffic-accident data: binary outcome y modelled with logistic regression
x1 <- rep(c(1, 0, 1, 0, 1), c(5, 10, 10, 10, 10))   # 45 observations
x2 <- c(17, 44, 48, 55, 75, 35, 42, 57, 28, 20, 38, 45, 47, 52, 55, 68, 18, 68,
        48, 17, 70, 72, 35, 19, 62, 39, 40, 55, 68, 25, 17, 45, 44, 67, 55, 61,
        19, 69, 23, 19, 72, 74, 31, 16, 61)
x3 <- c(1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1)
y <- c(1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0)
accident <- data.frame(x1, x2, x3, y)
# Generalized linear model with a binomial (logit) link
log.glm <- glm(y ~ x1 + x2 + x3, family = binomial, data = accident)
summary(log.glm)
Call:
glm(formula = y ~ x1 + x2 + x3, family = binomial, data = accident)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.5636 -0.9131 -0.7892 0.9637 1.6000
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.597610 0.894831 0.668 0.5042
x1 -1.496084 0.704861 -2.123 0.0338 *
x2 -0.001595 0.016758 -0.095 0.9242
x3 0.315865 0.701093 0.451 0.6523
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 62.183 on 44 degrees of freedom
Residual deviance: 57.026 on 41 degrees of freedom
AIC: 65.026
Number of Fisher Scoring iterations: 4
# AIC-based stepwise selection starting from the full logistic model
lm.step=step(log.glm)
summary(lm.step)
Start: AIC=65.03
y ~ x1 + x2 + x3
Df Deviance AIC
- x2 1 57.035 63.035
- x3 1 57.232 63.232
57.026 65.026
- x1 1 61.936 67.936
Step: AIC=63.03
y ~ x1 + x3
Df Deviance AIC
- x3 1 57.241 61.241
57.035 63.035
- x1 1 61.991 65.991
Step: AIC=61.24
y ~ x1
Df Deviance AIC
57.241 61.241
- x1 1 62.183 64.183
Call:
glm(formula = y ~ x1, family = binomial, data = accident)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.4490 -0.8782 -0.8782 0.9282 1.5096
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.6190 0.4688 1.320 0.1867
x1 -1.3728 0.6353 -2.161 0.0307 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 62.183 on 44 degrees of freedom
Residual deviance: 57.241 on 43 degrees of freedom
AIC: 61.241
Number of Fisher Scoring iterations: 4
# Predicted log-odds for x1 = 1, converted to a probability.
log.pre <- predict(lm.step, data.frame(x1 = 1))
# plogis() is the numerically stable inverse logit, identical in value to
# exp(z) / (1 + exp(z)) but safe against overflow for large |z|.
p1 <- plogis(log.pre)
p1
1: 0.320000000000098
# Predicted log-odds for x1 = 0, converted to a probability.
log.pre <- predict(lm.step, data.frame(x1 = 0))
# plogis() is the numerically stable inverse logit, identical in value to
# exp(z) / (1 + exp(z)) but safe against overflow for large |z|.
p2 <- plogis(log.pre)
p2
1: 0.649999999999986
# Load the Affairs data set shipped with the AER package
data("Affairs",package="AER")
summary(Affairs)
affairs gender age yearsmarried children
Min. : 0.000 female:315 Min. :17.50 Min. : 0.125 no :171
1st Qu.: 0.000 male :286 1st Qu.:27.00 1st Qu.: 4.000 yes:430
Median : 0.000 Median :32.00 Median : 7.000
Mean : 1.456 Mean :32.49 Mean : 8.178
3rd Qu.: 0.000 3rd Qu.:37.00 3rd Qu.:15.000
Max. :12.000 Max. :57.00 Max. :15.000
religiousness education occupation rating
Min. :1.000 Min. : 9.00 Min. :1.000 Min. :1.000
1st Qu.:2.000 1st Qu.:14.00 1st Qu.:3.000 1st Qu.:3.000
Median :3.000 Median :16.00 Median :5.000 Median :4.000
Mean :3.116 Mean :16.17 Mean :4.195 Mean :3.932
3rd Qu.:4.000 3rd Qu.:18.00 3rd Qu.:6.000 3rd Qu.:5.000
Max. :5.000 Max. :20.00 Max. :7.000 Max. :5.000
# Frequency table of the affairs count variable
table(Affairs$affairs)
0 1 2 3 7 12
451 34 17 19 42 38
# Recode the affairs count into a binary factor:
# "No" when affairs == 0, "Yes" when affairs > 0.
Affairs$ynaffairs <- factor(Affairs$affairs > 0,
                            levels = c(FALSE, TRUE),
                            labels = c("No", "Yes"))
table(Affairs$ynaffairs)
No Yes
451 150
# Logistic regression of the binary outcome on all candidate predictors
aff_glm=glm(ynaffairs~gender+age+yearsmarried+children+religiousness+education+occupation+rating,family = binomial, data = Affairs)
summary(aff_glm)
Call:
glm(formula = ynaffairs ~ gender + age + yearsmarried + children +
religiousness + education + occupation + rating, family = binomial,
data = Affairs)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.5713 -0.7499 -0.5690 -0.2539 2.5191
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.37726 0.88776 1.551 0.120807
gendermale 0.28029 0.23909 1.172 0.241083
age -0.04426 0.01825 -2.425 0.015301 *
yearsmarried 0.09477 0.03221 2.942 0.003262 **
childrenyes 0.39767 0.29151 1.364 0.172508
religiousness -0.32472 0.08975 -3.618 0.000297 ***
education 0.02105 0.05051 0.417 0.676851
occupation 0.03092 0.07178 0.431 0.666630
rating -0.46845 0.09091 -5.153 2.56e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 675.38 on 600 degrees of freedom
Residual deviance: 609.51 on 592 degrees of freedom
AIC: 627.51
Number of Fisher Scoring iterations: 4
# AIC-based stepwise selection from the full logistic model
aff.step=step(aff_glm)
summary(aff.step)
Start: AIC=627.51
ynaffairs ~ gender + age + yearsmarried + children + religiousness +
education + occupation + rating
Df Deviance AIC
- education 1 609.68 625.68
- occupation 1 609.70 625.70
- gender 1 610.89 626.89
- children 1 611.40 627.40
609.51 627.51
- age 1 615.67 631.67
- yearsmarried 1 618.34 634.34
- religiousness 1 622.92 638.92
- rating 1 636.75 652.75
Step: AIC=625.68
ynaffairs ~ gender + age + yearsmarried + children + religiousness +
occupation + rating
Df Deviance AIC
- occupation 1 610.15 624.15
- gender 1 611.29 625.29
- children 1 611.62 625.62
609.68 625.68
- age 1 615.78 629.78
- yearsmarried 1 618.46 632.46
- religiousness 1 623.27 637.27
- rating 1 636.93 650.93
Step: AIC=624.15
ynaffairs ~ gender + age + yearsmarried + children + religiousness +
rating
Df Deviance AIC
- children 1 611.86 623.86
610.15 624.15
- gender 1 613.41 625.41
- age 1 616.00 628.00
- yearsmarried 1 619.07 631.07
- religiousness 1 623.98 635.98
- rating 1 637.23 649.23
Step: AIC=623.86
ynaffairs ~ gender + age + yearsmarried + religiousness + rating
Df Deviance AIC
611.86 623.86
- gender 1 615.36 625.36
- age 1 618.05 628.05
- religiousness 1 625.57 635.57
- yearsmarried 1 626.23 636.23
- rating 1 639.93 649.93
Call:
glm(formula = ynaffairs ~ gender + age + yearsmarried + religiousness +
rating, family = binomial, data = Affairs)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.5623 -0.7495 -0.5664 -0.2671 2.3975
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.94760 0.61234 3.181 0.001470 **
gendermale 0.38612 0.20703 1.865 0.062171 .
age -0.04393 0.01806 -2.432 0.015011 *
yearsmarried 0.11133 0.02983 3.732 0.000190 ***
religiousness -0.32714 0.08947 -3.656 0.000256 ***
rating -0.46721 0.08928 -5.233 1.67e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 675.38 on 600 degrees of freedom
Residual deviance: 611.86 on 595 degrees of freedom
AIC: 623.86
Number of Fisher Scoring iterations: 4
dist(x, method = "euclidean", diag = FALSE, upper = FALSE, p = 2)
- method 可取 euclidean(欧式距离)、manhattan(曼哈顿距离)等
# Five points on the real line arranged as a 5x1 column matrix
x <- matrix(c(1, 2, 4.5, 6, 8), nrow = 5, ncol = 1)
x
1.0 |
2.0 |
4.5 |
6.0 |
8.0 |
# Pairwise Euclidean distances (only the lower triangle is printed by default)
d=dist(x)
d
1 2 3 4
2 1.0
3 3.5 2.5
4 5.0 4.0 1.5
5 7.0 6.0 3.5 2.0
# Also print the upper triangle of the distance matrix
dist(x,upper = TRUE)
1 2 3 4 5
1 1.0 3.5 5.0 7.0
2 1.0 2.5 4.0 6.0
3 3.5 2.5 1.5 3.5
4 5.0 4.0 1.5 2.0
5 7.0 6.0 3.5 2.0
# Print both triangles plus the zero diagonal
dist(x,upper = TRUE,diag = TRUE)
1 2 3 4 5
1 0.0 1.0 3.5 5.0 7.0
2 1.0 0.0 2.5 4.0 6.0
3 3.5 2.5 0.0 1.5 3.5
4 5.0 4.0 1.5 0.0 2.0
5 7.0 6.0 3.5 2.0 0.0
# Manhattan (L1) distances; identical to Euclidean here because the data are 1-D
dist(x,method = "manhattan",upper = TRUE,diag = TRUE)
1 2 3 4 5
1 0.0 1.0 3.5 5.0 7.0
2 1.0 0.0 2.5 4.0 6.0
3 3.5 2.5 0.0 1.5 3.5
4 5.0 4.0 1.5 0.0 2.0
5 7.0 6.0 3.5 2.0 0.0
hclust(d, method = "complete", members = NULL)
method 可取:
complete:最长距离
single:最短距离
median:中间距离
ward.D:离差平方和
类个数的确定
rect.hclust(tree, k = NULL, which = NULL, x = NULL, h = NULL, border = 2, cluster = NULL)
tree是由hclust生成的结构。k是类的个数。h是谱系图中的阈值,
要求分成的各类的距离大于h。border是数或向量,表明矩形框的颜色
# Rebuild the 5-point column matrix and its Euclidean distance object
x <- c(1, 2, 4.5, 6, 8)
dim(x) <- c(5, 1)
d <- dist(x)
d
1 2 3 4
2 1.0
3 3.5 2.5
4 5.0 4.0 1.5
5 7.0 6.0 3.5 2.0
# Complete-linkage ("longest distance") hierarchical clustering
model_com=hclust(d,method = "complete",members = NULL)
summary(model_com)
Length Class Mode
merge 8 -none- numeric
height 4 -none- numeric
order 5 -none- numeric
labels 0 -none- NULL
method 1 -none- character
call 4 -none- call
dist.method 1 -none- character
# Single linkage (shortest distance)
mod_sing=hclust(d,method = "single",members = NULL)
# Median linkage. NOTE(review): the original comment said "最长距离" (longest
# distance), but method = "median" is the median/中间距离 method.
mod_med=hclust(d,method = "median")
# Ward's minimum-variance method (离差平方和)
mod_ward=hclust(d,"ward.D")
opar=par(mfrow = c(2,2))
plot(model_com,hang =-1)# hang = -1 draws every leaf from the bottom of the plot
plot(mod_med,hang=-1)
plot(mod_sing,hang=-1)
plot(mod_ward,hang=-1)
par(opar)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jLcwcPto-1678242376648)(output_246_0.png)]
# Per-capita consumption data for 31 provinces (8 expenditure categories).
# FIX: in the original, the row.names strings were broken across physical
# lines, embedding newlines inside several province names ("吉\n林",
# "\n湖北", "\n陕西") — visible later in the kmeans clustering vector.
# Each name is now written on a single line.
X<-data.frame(x1=c(2959.19, 2459.77, 1495.63, 1046.33, 1303.97, 1730.84,
1561.86, 1410.11, 3712.31, 2207.58, 2629.16, 1844.78, 2709.46, 1563.78,
1675.75, 1427.65, 1783.43, 1942.23, 3055.17, 2033.87, 2057.86, 2303.29,
1974.28, 1673.82, 2194.25, 2646.61, 1472.95, 1525.57, 1654.69, 1375.46,
1608.82),
x2=c(730.79, 495.47, 515.90, 477.77, 524.29, 553.90, 492.42, 510.71,
550.74, 449.37, 557.32, 430.29, 428.11, 303.65, 613.32, 431.79, 511.88,
512.27, 353.23, 300.82, 186.44, 589.99, 507.76, 437.75, 537.01, 839.70,
390.89, 472.98, 437.77, 480.99, 536.05),
x3=c(749.41, 697.33, 362.37, 290.15, 254.83, 246.91, 200.49, 211.88,
893.37, 572.40, 689.73, 271.28, 334.12, 233.81, 550.71, 288.55, 282.84,
401.39, 564.56, 338.65, 202.72, 516.21, 344.79, 461.61, 369.07, 204.44,
447.95, 328.90, 258.78, 273.84, 432.46),
x4=c(513.34, 302.87, 285.32, 208.57, 192.17, 279.81, 218.36, 277.11,
346.93, 211.92, 435.69, 126.33, 160.77, 107.90, 219.79, 208.14, 201.01,
206.06, 356.27, 157.78, 171.79, 236.55, 203.21, 153.32, 249.54, 209.11,
259.51, 219.86, 303.00, 317.32, 235.82),
x5=c(467.87, 284.19, 272.95, 201.50, 249.81, 239.18, 220.69, 224.65,
527.00, 302.09, 514.66, 250.56, 405.14, 209.70, 272.59, 217.00, 237.60,
321.29, 811.88, 329.06, 329.65, 403.92, 240.24, 254.66, 290.84, 379.30,
230.61, 206.65, 244.93, 251.08, 250.28),
x6=c(1141.82, 735.97, 540.58, 414.72, 463.09, 445.20, 459.62, 376.82,
1034.98, 585.23, 795.87, 513.18, 461.67, 393.99, 599.43, 337.76, 617.74,
697.22, 873.06, 621.74, 477.17, 730.05, 575.10, 445.59, 561.91, 371.04,
490.90, 449.69, 479.53, 424.75, 541.30),
x7=c(478.42, 570.84, 364.91, 281.84, 287.87, 330.24, 360.48, 317.61,
720.33, 429.77, 575.76, 314.00, 535.13, 509.39, 371.62, 421.31, 523.52,
492.60, 1082.82, 587.02, 312.93, 438.41, 430.36, 346.11, 407.70, 269.59,
469.10, 249.66, 288.56, 228.73, 344.85),
x8=c(457.64, 305.08, 188.63, 212.10, 192.96, 163.86, 147.76, 152.85,
462.03, 252.54, 323.36, 151.39, 232.29, 160.12, 211.84, 165.32, 182.52,
226.45, 420.81, 218.27, 279.19, 225.80, 223.46, 191.48, 330.95, 389.33,
191.34, 228.19, 236.51, 195.93, 214.40),
row.names=c("北京","天津","河北","山西","内蒙古","辽宁","吉林","黑龙江",
"上海","江苏","浙江","安徽","福建","江西","山东","河南","湖北","湖南",
"广东","广西","海南","重庆","四川","贵州","云南","西藏","陕西","甘肃",
"青海","宁夏","新疆"))
# Distance matrix on standardized columns (scale() => mean 0, sd 1)
d=dist(scale(X))
# Four hierarchical clusterings of the provinces, one per linkage method,
# displayed as a 2x2 panel of dendrograms.
hc1 <- hclust(d, method = "single")    # single linkage
hc2 <- hclust(d, method = "complete")  # complete linkage
hc3 <- hclust(d, method = "median")    # median linkage
hc4 <- hclust(d, method = "ward.D")    # Ward's minimum-variance method
opar <- par(mfrow = c(2, 2))
plot(hc1, hang = -1)
plot(hc2, hang = -1)
plot(hc3, hang = -1)
plot(hc4, hang = -1)
par(opar)                              # restore the plotting layout
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-LmOBtRRg-1678242376648)(output_249_0.png)]
# Redraw the complete-linkage dendrogram on its own
plot(hc2,hang=-1)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-drsCspZt-1678242376648)(output_250_0.png)]
# Load example data eg4.1.csv (10 samples, 6 variables per the table below)
df_4.1=read.csv("eg4.1.csv",header = TRUE)
df_4.1
x1 | x2 | x3 | x4 | x5 | x6 |
---|---|---|---|---|---|
4.65 | 4.22 | 5.01 | 4.50 | 4.15 | 4.12 |
6.32 | 6.11 | 6.21 | 6.85 | 6.52 | 6.33 |
4.87 | 4.60 | 4.95 | 4.15 | 4.02 | 4.11 |
4.88 | 4.68 | 4.43 | 4.12 | 4.03 | 4.14 |
6.73 | 6.65 | 6.72 | 6.13 | 6.51 | 6.36 |
7.45 | 7.56 | 7.60 | 7.80 | 7.20 | 7.18 |
8.10 | 8.23 | 8.01 | 7.95 | 8.31 | 8.26 |
8.42 | 8.54 | 8.12 | 7.88 | 8.26 | 7.98 |
6.45 | 6.81 | 6.52 | 6.31 | 6.27 | 6.06 |
7.50 | 7.32 | 7.42 | 7.52 | 7.10 | 6.95 |
# Single- and complete-linkage clusterings of the eg4.1 samples
d=dist(df_4.1)
hc1=hclust(d,"single")
hc2=hclust(d,"complete")
plot(hc1,hang=-1)
rect.hclust(hc1,k=3,border = "red")# three-cluster solution outlined in red
rect.hclust(hc1,k=2,border = "blue")# two-cluster solution outlined in blue
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ZKvmJCaC-1678242376649)(output_252_0.png)]
# Complete-linkage dendrogram with 3- and 2-cluster boxes
plot(hc2,hang=-1)
rect.hclust(hc2,k=3,border = "red")
rect.hclust(hc2,k=2,border = "blue")
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jIyrZczm-1678242376649)(output_253_0.png)]
cutree(hc1,k=3)# cluster membership under the 3-cluster cut
cutree(hc1,k=2)# cluster membership under the 2-cluster cut
kmeans(x, centers, iter.max = 10, nstart = 1,
algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"), trace = FALSE)
其中 x 是由数据构成的矩阵或数据框;centers 是聚类的个数或者是初始类的中心;
iter.max 为最大迭代次数;nstart 为随机集合的个数(当 centers 为聚类的个数时);
algorithm 为动态聚类的算法,缺省值为 Hartigan-Wong 方法
# K-means clustering of the 31 provinces on standardized data, k = 5,
# with 20 random starts.
# NOTE(review): kmeans uses the RNG — call set.seed() first for reproducibility.
km <- kmeans(scale(X), 5, nstart = 20)
km
K-means clustering with 5 clusters of sizes 16, 1, 10, 1, 3
Cluster means:
x1 x2 x3 x4 x5 x6
1 -0.7008593 -0.33291790 -0.5450901 -0.2500165 -0.54749319 -0.6131804
2 1.1255255 2.91079330 -1.0645632 -0.4082114 0.53291392 -1.0476079
3 0.2646918 0.04585518 0.2487958 -0.3405821 -0.01812541 0.2587437
4 1.8042004 -1.12776493 0.9368961 1.2959544 3.90904835 1.6014419
5 1.8790347 1.02836873 2.1203833 2.1727806 1.49972764 2.2232050
x7 x8
1 -0.5420723 -0.57966702
2 -0.9562089 1.66126641
3 0.2874133 -0.02413414
4 3.8803141 2.01876530
5 0.9583064 1.94532737
Clustering vector:
北京 天津 河北 山西 内蒙古 辽宁 吉\n林 黑龙江 上海 江苏 浙江
5 3 1 1 1 1 1 1 5 3 5
安徽 福建 江西 山东 河南 \n湖北 湖南 广东 广西 海南 重庆
1 3 1 3 1 3 3 4 3 1 3
四川 贵州 云南 西藏 \n陕西 甘肃 青海 宁夏 新疆
3 1 3 2 1 1 1 1 1
Within cluster sum of squares by cluster:
[1] 30.14432 0.00000 22.12662 0.00000 10.19134
(between_SS / total_SS = 74.0 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
# Cluster labels sorted by cluster number
sort(km$cluster)
# Load the pre-processing example data
# NOTE(review): prefer header = TRUE over the abbreviation T (T can be reassigned)
df4.2=read.csv("eg4.2.csv",header = T)
df4.2
Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"Warning message in FUN(X[[i]], ...):
"输入的字符串1不适用于此语言环境"ERROR while rich displaying an object: Error in gsub(" &\\", "\\", r, fixed = TRUE): input string 1 is invalid in this locale
Traceback:
1. FUN(X[[i]], ...)
2. tryCatch(withCallingHandlers({
. if (!mime %in% names(repr::mime2repr))
. stop("No repr_* for mimetype ", mime, " in repr::mime2repr")
. rpr <- repr::mime2repr[[mime]](obj)
. if (is.null(rpr))
. return(NULL)
. prepare_content(is.raw(rpr), rpr)
. }, error = error_handler), error = outer_handler)
3. tryCatchList(expr, classes, parentenv, handlers)
4. tryCatchOne(expr, names, parentenv, handlers[[1L]])
5. doTryCatch(return(expr), name, parentenv, handler)
6. withCallingHandlers({
. if (!mime %in% names(repr::mime2repr))
. stop("No repr_* for mimetype ", mime, " in repr::mime2repr")
. rpr <- repr::mime2repr[[mime]](obj)
. if (is.null(rpr))
. return(NULL)
. prepare_content(is.raw(rpr), rpr)
. }, error = error_handler)
7. repr::mime2repr[[mime]](obj)
8. repr_latex.data.frame(obj)
9. gsub(" &\\", "\\", r, fixed = TRUE)
X | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 |
---|---|---|---|---|---|---|---|---|
北京 | 7584.2 | 2425.7 | 10350.2 | 2098.3 | 4489.6 | 3634.6 | 2228.6 | 991.4 |
天津 | 7709.9 | 1949.4 | 5237.5 | 1514.0 | 3185.9 | 2096.0 | 1757.1 | 712.6 |
河北 | 3515.5 | 1055.3 | 2995.8 | 832.2 | 1807.6 | 1338.6 | 1192.0 | 293.6 |
山西 | 3089.4 | 1146.7 | 2297.4 | 672.5 | 1501.4 | 1628.0 | 1102.0 | 291.5 |
内蒙古 | 4919.8 | 1759.7 | 2918.9 | 1030.9 | 2569.0 | 2067.1 | 1384.3 | 528.9 |
辽宁 | 4858.0 | 1561.6 | 3471.6 | 1028.7 | 2282.0 | 1973.3 | 1522.4 | 502.2 |
吉林 | 3683.5 | 1254.9 | 2692.3 | 718.4 | 1810.0 | 1683.6 | 1527.5 | 393.7 |
黑龙江 | 3704.1 | 1288.3 | 2619.6 | 672.4 | 1675.1 | 1526.2 | 1577.0 | 339.8 |
上海 | 9271.5 | 1622.7 | 11307.5 | 1484.6 | 4206.5 | 3718.1 | 2268.3 | 904.3 |
江苏 | 5936.0 | 1415.1 | 4551.6 | 1238.2 | 2984.7 | 2423.8 | 1409.6 | 596.5 |
浙江 | 6975.8 | 1646.5 | 5964.2 | 1159.3 | 3961.3 | 2428.3 | 1433.0 | 548.4 |
安徽 | 4424.2 | 924.6 | 2630.0 | 698.8 | 1622.3 | 1339.3 | 932.3 | 268.8 |
福建 | 6440.0 | 1134.6 | 4638.2 | 1047.5 | 2305.3 | 1784.7 | 1028.6 | 471.3 |
江西 | 4181.7 | 929.1 | 2783.4 | 736.6 | 1444.4 | 1354.0 | 698.8 | 275.5 |
山东 | 4166.2 | 1276.6 | 2903.3 | 1038.1 | 2104.2 | 1557.3 | 1180.1 | 352.7 |
河南 | 3373.7 | 1141.9 | 2387.9 | 910.6 | 1355.4 | 1337.2 | 1023.1 | 305.4 |
湖北 | 4499.9 | 1073.1 | 3007.0 | 868.6 | 1722.5 | 1577.6 | 1252.5 | 315.3 |
湖南 | 4535.5 | 1028.0 | 2810.8 | 883.6 | 1624.6 | 2049.7 | 998.3 | 336.8 |
广东 | 7236.7 | 1103.4 | 4677.1 | 1245.3 | 3020.2 | 2117.3 | 976.1 | 599.8 |
广西 | 3960.8 | 503.1 | 2559.9 | 672.5 | 1445.7 | 1280.1 | 778.1 | 200.7 |
海南 | 5364.1 | 568.3 | 2628.0 | 697.9 | 1783.1 | 1278.0 | 987.0 | 268.7 |
重庆 | 5325.5 | 1334.7 | 2743.4 | 1064.3 | 1746.4 | 1513.4 | 1117.9 | 294.0 |
四川 | 5001.4 | 1071.3 | 2400.9 | 918.4 | 1629.2 | 1207.9 | 1071.2 | 331.8 |
贵州 | 3375.8 | 719.2 | 2185.7 | 636.2 | 1321.6 | 1401.2 | 604.6 | 169.5 |
云南 | 3587.7 | 625.7 | 2146.4 | 644.3 | 1632.6 | 1281.5 | 875.7 | 211.5 |
西藏 | 3919.8 | 764.3 | 1374.6 | 394.9 | 1025.9 | 314.1 | 229.2 | 223.1 |
陕西 | 3646.4 | 989.4 | 2786.1 | 887.3 | 1537.1 | 1608.4 | 1363.5 | 269.1 |
甘肃 | 3447.6 | 967.7 | 2120.8 | 708.8 | 1214.7 | 1315.9 | 949.5 | 225.7 |
青海 | 3958.2 | 1232.0 | 2352.8 | 793.4 | 2263.3 | 1383.4 | 1318.2 | 310.0 |
宁夏 | 3694.8 | 1237.9 | 2607.3 | 885.4 | 1806.1 | 1707.9 | 1482.9 | 393.4 |
新疆 | 4092.8 | 1274.5 | 2227.9 | 788.8 | 1842.4 | 1282.1 | 1078.3 | 280.6 |
# factoextra provides clustering visual helpers; USArrests ships with base R
library(factoextra)
df=USArrests
Warning message:
"package 'factoextra' was built under R version 3.6.3"Loading required package: ggplot2
Registered S3 methods overwritten by 'ggplot2':
method from
[.quosures rlang
c.quosures rlang
print.quosures rlang
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Show the full USArrests data (50 states, 4 variables)
df
Murder | Assault | UrbanPop | Rape | |
---|---|---|---|---|
Alabama | 13.2 | 236 | 58 | 21.2 |
Alaska | 10.0 | 263 | 48 | 44.5 |
Arizona | 8.1 | 294 | 80 | 31.0 |
Arkansas | 8.8 | 190 | 50 | 19.5 |
California | 9.0 | 276 | 91 | 40.6 |
Colorado | 7.9 | 204 | 78 | 38.7 |
Connecticut | 3.3 | 110 | 77 | 11.1 |
Delaware | 5.9 | 238 | 72 | 15.8 |
Florida | 15.4 | 335 | 80 | 31.9 |
Georgia | 17.4 | 211 | 60 | 25.8 |
Hawaii | 5.3 | 46 | 83 | 20.2 |
Idaho | 2.6 | 120 | 54 | 14.2 |
Illinois | 10.4 | 249 | 83 | 24.0 |
Indiana | 7.2 | 113 | 65 | 21.0 |
Iowa | 2.2 | 56 | 57 | 11.3 |
Kansas | 6.0 | 115 | 66 | 18.0 |
Kentucky | 9.7 | 109 | 52 | 16.3 |
Louisiana | 15.4 | 249 | 66 | 22.2 |
Maine | 2.1 | 83 | 51 | 7.8 |
Maryland | 11.3 | 300 | 67 | 27.8 |
Massachusetts | 4.4 | 149 | 85 | 16.3 |
Michigan | 12.1 | 255 | 74 | 35.1 |
Minnesota | 2.7 | 72 | 66 | 14.9 |
Mississippi | 16.1 | 259 | 44 | 17.1 |
Missouri | 9.0 | 178 | 70 | 28.2 |
Montana | 6.0 | 109 | 53 | 16.4 |
Nebraska | 4.3 | 102 | 62 | 16.5 |
Nevada | 12.2 | 252 | 81 | 46.0 |
New Hampshire | 2.1 | 57 | 56 | 9.5 |
New Jersey | 7.4 | 159 | 89 | 18.8 |
New Mexico | 11.4 | 285 | 70 | 32.1 |
New York | 11.1 | 254 | 86 | 26.1 |
North Carolina | 13.0 | 337 | 45 | 16.1 |
North Dakota | 0.8 | 45 | 44 | 7.3 |
Ohio | 7.3 | 120 | 75 | 21.4 |
Oklahoma | 6.6 | 151 | 68 | 20.0 |
Oregon | 4.9 | 159 | 67 | 29.3 |
Pennsylvania | 6.3 | 106 | 72 | 14.9 |
Rhode Island | 3.4 | 174 | 87 | 8.3 |
South Carolina | 14.4 | 279 | 48 | 22.5 |
South Dakota | 3.8 | 86 | 45 | 12.8 |
Tennessee | 13.2 | 188 | 59 | 26.9 |
Texas | 12.7 | 201 | 80 | 25.5 |
Utah | 3.2 | 120 | 80 | 22.9 |
Vermont | 2.2 | 48 | 32 | 11.2 |
Virginia | 8.5 | 156 | 63 | 20.7 |
Washington | 4.0 | 145 | 73 | 26.2 |
West Virginia | 5.7 | 81 | 39 | 9.3 |
Wisconsin | 2.6 | 53 | 66 | 10.8 |
Wyoming | 6.8 | 161 | 60 | 15.6 |
df=na.omit(df)# drop any rows that contain NA values
head(df)
Murder | Assault | UrbanPop | Rape | |
---|---|---|---|---|
Alabama | 13.2 | 236 | 58 | 21.2 |
Alaska | 10.0 | 263 | 48 | 44.5 |
Arizona | 8.1 | 294 | 80 | 31.0 |
Arkansas | 8.8 | 190 | 50 | 19.5 |
California | 9.0 | 276 | 91 | 40.6 |
Colorado | 7.9 | 204 | 78 | 38.7 |
df=scale(df)# standardize each column to mean 0, sd 1
head(df)
Murder | Assault | UrbanPop | Rape | |
---|---|---|---|---|
Alabama | 1.24256408 | 0.7828393 | -0.5209066 | -0.003416473 |
Alaska | 0.50786248 | 1.1068225 | -1.2117642 | 2.484202941 |
Arizona | 0.07163341 | 1.4788032 | 0.9989801 | 1.042878388 |
Arkansas | 0.23234938 | 0.2308680 | -1.0735927 | -0.184916602 |
California | 0.27826823 | 1.2628144 | 1.7589234 | 2.067820292 |
Colorado | 0.02571456 | 0.3988593 | 0.8608085 | 1.864967207 |
# K-means with k = 4, 20 random starts, Hartigan-Wong algorithm.
# NOTE(review): call set.seed() first to make the clustering reproducible.
km1=kmeans(df,4,nstart = 20,algorithm = "Hartigan-Wong")
km1
K-means clustering with 4 clusters of sizes 13, 8, 16, 13
Cluster means:
Murder Assault UrbanPop Rape
1 0.6950701 1.0394414 0.7226370 1.27693964
2 1.4118898 0.8743346 -0.8145211 0.01927104
3 -0.4894375 -0.3826001 0.5758298 -0.26165379
4 -0.9615407 -1.1066010 -0.9301069 -0.96676331
Clustering vector:
Alabama Alaska Arizona Arkansas California
2 1 1 2 1
Colorado Connecticut Delaware Florida Georgia
1 3 3 1 2
Hawaii Idaho Illinois Indiana Iowa
3 4 1 3 4
Kansas Kentucky Louisiana Maine Maryland
3 4 2 4 1
Massachusetts Michigan Minnesota Mississippi Missouri
3 1 4 2 1
Montana Nebraska Nevada New Hampshire New Jersey
4 4 1 4 3
New Mexico New York North Carolina North Dakota Ohio
1 1 2 4 3
Oklahoma Oregon Pennsylvania Rhode Island South Carolina
3 3 3 3 2
South Dakota Tennessee Texas Utah Vermont
4 2 1 3 4
Virginia Washington West Virginia Wisconsin Wyoming
3 3 4 4 3
Within cluster sum of squares by cluster:
[1] 19.922437 8.316061 16.212213 11.952463
(between_SS / total_SS = 71.2 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
# Cluster labels sorted by cluster number
sort(km1$cluster)
library(MASS)# provides lda()/qda() used below
lda(formula, data, ..., subset, na.action)
formula 用法为 groups ~ x1 + x2 + ...,其中 groups 表明总体来源,
x1, x2, ... 表示分类指标;subset 指明训练样本。
library(MASS)
data(iris)# load the iris data set
# NOTE(review): attach() is discouraged (masking surprises); kept here because
# the lda() call below references the iris columns directly.
attach(iris)
iris
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|
5.1 | 3.5 | 1.4 | 0.2 | setosa |
4.9 | 3.0 | 1.4 | 0.2 | setosa |
4.7 | 3.2 | 1.3 | 0.2 | setosa |
4.6 | 3.1 | 1.5 | 0.2 | setosa |
5.0 | 3.6 | 1.4 | 0.2 | setosa |
5.4 | 3.9 | 1.7 | 0.4 | setosa |
4.6 | 3.4 | 1.4 | 0.3 | setosa |
5.0 | 3.4 | 1.5 | 0.2 | setosa |
4.4 | 2.9 | 1.4 | 0.2 | setosa |
4.9 | 3.1 | 1.5 | 0.1 | setosa |
5.4 | 3.7 | 1.5 | 0.2 | setosa |
4.8 | 3.4 | 1.6 | 0.2 | setosa |
4.8 | 3.0 | 1.4 | 0.1 | setosa |
4.3 | 3.0 | 1.1 | 0.1 | setosa |
5.8 | 4.0 | 1.2 | 0.2 | setosa |
5.7 | 4.4 | 1.5 | 0.4 | setosa |
5.4 | 3.9 | 1.3 | 0.4 | setosa |
5.1 | 3.5 | 1.4 | 0.3 | setosa |
5.7 | 3.8 | 1.7 | 0.3 | setosa |
5.1 | 3.8 | 1.5 | 0.3 | setosa |
5.4 | 3.4 | 1.7 | 0.2 | setosa |
5.1 | 3.7 | 1.5 | 0.4 | setosa |
4.6 | 3.6 | 1.0 | 0.2 | setosa |
5.1 | 3.3 | 1.7 | 0.5 | setosa |
4.8 | 3.4 | 1.9 | 0.2 | setosa |
5.0 | 3.0 | 1.6 | 0.2 | setosa |
5.0 | 3.4 | 1.6 | 0.4 | setosa |
5.2 | 3.5 | 1.5 | 0.2 | setosa |
5.2 | 3.4 | 1.4 | 0.2 | setosa |
4.7 | 3.2 | 1.6 | 0.2 | setosa |
... | ... | ... | ... | ... |
6.9 | 3.2 | 5.7 | 2.3 | virginica |
5.6 | 2.8 | 4.9 | 2.0 | virginica |
7.7 | 2.8 | 6.7 | 2.0 | virginica |
6.3 | 2.7 | 4.9 | 1.8 | virginica |
6.7 | 3.3 | 5.7 | 2.1 | virginica |
7.2 | 3.2 | 6.0 | 1.8 | virginica |
6.2 | 2.8 | 4.8 | 1.8 | virginica |
6.1 | 3.0 | 4.9 | 1.8 | virginica |
6.4 | 2.8 | 5.6 | 2.1 | virginica |
7.2 | 3.0 | 5.8 | 1.6 | virginica |
7.4 | 2.8 | 6.1 | 1.9 | virginica |
7.9 | 3.8 | 6.4 | 2.0 | virginica |
6.4 | 2.8 | 5.6 | 2.2 | virginica |
6.3 | 2.8 | 5.1 | 1.5 | virginica |
6.1 | 2.6 | 5.6 | 1.4 | virginica |
7.7 | 3.0 | 6.1 | 2.3 | virginica |
6.3 | 3.4 | 5.6 | 2.4 | virginica |
6.4 | 3.1 | 5.5 | 1.8 | virginica |
6.0 | 3.0 | 4.8 | 1.8 | virginica |
6.9 | 3.1 | 5.4 | 2.1 | virginica |
6.7 | 3.1 | 5.6 | 2.4 | virginica |
6.9 | 3.1 | 5.1 | 2.3 | virginica |
5.8 | 2.7 | 5.1 | 1.9 | virginica |
6.8 | 3.2 | 5.9 | 2.3 | virginica |
6.7 | 3.3 | 5.7 | 2.5 | virginica |
6.7 | 3.0 | 5.2 | 2.3 | virginica |
6.3 | 2.5 | 5.0 | 1.9 | virginica |
6.5 | 3.0 | 5.2 | 2.0 | virginica |
6.2 | 3.4 | 5.4 | 2.3 | virginica |
5.9 | 3.0 | 5.1 | 1.8 | virginica |
# LDA of species on the four iris measurements (columns come from attach(iris))
iris_lda=lda(Species~Sepal.Length+Sepal.Width+Petal.Length+Petal.Width)
iris_lda
Call:
lda(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width)
Prior probabilities of groups:
setosa versicolor virginica
0.3333333 0.3333333 0.3333333
Group means:
Sepal.Length Sepal.Width Petal.Length Petal.Width
setosa 5.006 3.428 1.462 0.246
versicolor 5.936 2.770 4.260 1.326
virginica 6.588 2.974 5.552 2.026
Coefficients of linear discriminants:
LD1 LD2
Sepal.Length 0.8293776 0.02410215
Sepal.Width 1.5344731 2.16452123
Petal.Length -2.2012117 -0.93192121
Petal.Width -2.8104603 2.83918785
Proportion of trace:
LD1 LD2
0.9912 0.0088
# Classify the training data with the fitted model
iris.pre=predict(iris_lda)$class
# table() builds the confusion matrix: first argument = rows (predicted),
# second argument = columns (actual species)
table(iris.pre,Species)
Species
iris.pre setosa versicolor virginica
setosa 50 0 0
versicolor 0 48 1
virginica 0 2 49
# Weather (sunny/rainy) discrimination: two numeric predictors, two classes
x1 <- c(-1.9, -6.9, 5.2, 5.0, 7.3, 6.8, 0.9, -12.5, 1.5, 3.8, 0.2, -0.1,
        0.4, 2.7, 2.1, -4.6, -1.7, -2.6, 2.6, -2.8)
x2 <- c(3.2, 0.4, 2.0, 2.5, 0.0, 12.7, -5.4, -2.5, 1.3, 6.8, 6.2, 7.5, 14.6,
        8.3, 0.8, 4.3, 10.9, 13.1, 12.8, 10.0)
G <- c(rep(1, 10), rep(2, 10))   # first 10 observations are class 1, rest class 2
d6.1 <- data.frame(G, x1, x2)
# Linear discriminant analysis of class on the two predictors
d6.lda <- lda(G ~ x1 + x2, data = d6.1)
d6.lda
Call:
lda(G ~ x1 + x2, data = d6.1)
Prior probabilities of groups:
1 2
0.5 0.5
Group means:
x1 x2
1 0.92 2.10
2 -0.38 8.85
Coefficients of linear discriminants:
LD1
x1 -0.1035305
x2 0.2247957
# Predicted classes from the fitted LDA model
d6.pre=predict(d6.lda)$class
cbind(d6.1$G,d6.pre)# column 1 = true group, column 2 = predicted group
d6.pre | |
---|---|
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 2 |
1 | 1 |
1 | 1 |
1 | 1 |
1 | 1 |
2 | 2 |
2 | 2 |
2 | 2 |
2 | 2 |
2 | 1 |
2 | 2 |
2 | 2 |
2 | 2 |
2 | 2 |
2 | 2 |
tab=table(G,d6.pre)# confusion matrix: true group vs prediction
# Correct-classification rate: prop.table() converts counts to proportions
# and the diagonal holds the correctly classified cases.
sum(diag(prop.table(tab)))
0.9
data("iris")
library(MASS)
# NOTE(review): attach() is discouraged; kept to match the lda() call below
attach(iris)
iris.lda <- lda(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width)
# Three new flowers to classify
newdata=data.frame( Sepal.Length=c(5.1,5.9,6.6), Sepal.Width =c(3.5,2.8,2.9),
Petal.Length=c(1.5,4.3,5.6), Petal.Width =c(0.25,1.3,2.1))
The following objects are masked from iris (pos = 3):
Petal.Length, Petal.Width, Sepal.Length, Sepal.Width, Species
# Posterior probabilities and discriminant scores for the three new flowers
predict(iris.lda,newdata)
$posterior
        setosa        versicolor    virginica
1  1.000000e+00  1.117074e-20  3.314219e-40
2  2.963609e-20  9.998273e-01  1.727265e-04
3  4.169387e-42  3.505610e-05  9.999649e-01
$x
         LD1         LD2
1   7.701156   0.3491879
2  -1.823849  -0.7749274
3  -6.199781   0.5182489
6.2.2距离判别法
- 根据距离的远近对样品的类别进行判定
mahalanobis(x,center,cov,inverted=FALSE):马氏距离的计算
-
x:用于计算距离的指定数据对象(向量或者矩阵)
-
center:分布的均值,即总体均值
-
cov:分布的协方差,即总体协方差,一般用样本的协方差进行评估
-
inverted:TRUE,表示参数cov应该包括协方差的逆
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-QXPs30p4-1678242376650)(attachment:%E5%9B%BE%E7%89%87-2.png)]
整体思路,定义一个函数,比较到两个中心点的距离
# Distance-based discrimination: difference of squared Mahalanobis distances
# from x to two group centres. Positive result => x is closer to mu2.
compare <- function(x, mu1, mu2, S) {
  # S is the common covariance matrix shared by both groups
  mahalanobis(x, mu1, S) - mahalanobis(x, mu2, S)
}
x <- c(7.2, 5.6)                        # new observation
mu1 <- c(6, 5)                          # centre of group 1
mu2 <- c(8, 6)                          # centre of group 2
S <- matrix(c(9, 2, 2, 4), nrow = 2)    # common covariance matrix
S
9
2
2
4
compare(x,mu1,mu2,S)# positive result => x is closer to mu2 (classified as male)
0.10625
6.2.3 二次(曲线)判别 — quadratic discriminant analysis
qda(formula, data, ..., subset, na.action)
formula 为公式,data 为数据框
# NOTE(review): this call contains a typo — "x2.d6.1" should be
# "x2, data = d6.1" (a comma is missing), which is why R reports
# "object 'x2.d6.1' not found" in the traceback below.
qd=qda(G~x1+x2.d6.1)
Error in eval(predvars, data, env): 找不到对象'x2.d6.1'
Traceback:
1. qda(G ~ x1 + x2.d6.1)
2. qda.formula(G ~ x1 + x2.d6.1)
3. eval.parent(m)
4. eval(expr, p)
5. eval(expr, p)
6. stats::model.frame(formula = G ~ x1 + x2.d6.1)
7. model.frame.default(formula = G ~ x1 + x2.d6.1)
8. eval(predvars, data, env)
9. eval(predvars, data, env)
# Three-group example: quality (Q), competitiveness (C), popularity (P)
Q <- c(8.3, 9.5, 8.0, 7.4, 8.8, 9.0, 7.0, 9.2, 8.0, 7.6, 7.2, 6.4, 7.3, 6.0,
       6.4, 6.8, 5.2, 5.8, 5.5, 6.0)
C <- c(4.0, 7.0, 5.0, 7.0, 6.5, 7.5, 6.0, 8.0, 7.0, 9.0, 8.5, 7.0, 5.0, 2.0,
       4.0, 5.0, 3.0, 3.5, 4.0, 4.5)
P <- c(29, 68, 39, 50, 55, 58, 75, 82, 67, 90, 86, 53, 48, 20, 39, 48, 29, 32,
       34, 36)
G3 <- c(rep(1, 5), rep(2, 8), rep(3, 7))   # group labels: 5 / 8 / 7 cases
d6.3 <- data.frame(G3, Q, C, P)
d6.3
G3
Q
C
P
1
8.3
4.0
29
1
9.5
7.0
68
1
8.0
5.0
39
1
7.4
7.0
50
1
8.8
6.5
55
2
9.0
7.5
58
2
7.0
6.0
75
2
9.2
8.0
82
2
8.0
7.0
67
2
7.6
9.0
90
2
7.2
8.5
86
2
6.4
7.0
53
2
7.3
5.0
48
3
6.0
2.0
20
3
6.4
4.0
39
3
6.8
5.0
48
3
5.2
3.0
29
3
5.8
3.5
32
3
5.5
4.0
34
3
6.0
4.5
36
library(MASS)
# LDA on the three-group data; priors default to the observed class frequencies
mod.lda=lda(G3~Q+C+P,data = d6.3)
mod.lda
Call:
lda(G3 ~ Q + C + P, data = d6.3)
Prior probabilities of groups:
1 2 3
0.25 0.40 0.35
Group means:
Q C P
1 8.400000 5.900000 48.200
2 7.712500 7.250000 69.875
3 5.957143 3.714286 34.000
Coefficients of linear discriminants:
LD1 LD2
Q -0.81173396 0.88406311
C -0.63090549 0.20134565
P 0.01579385 -0.08775636
Proportion of trace:
LD1 LD2
0.7403 0.2597
# Classify a new sample with the linear discriminant model
pre.lda=predict(mod.lda,data.frame(Q=8.0,C=7.5,P=65))
pre.lda
$class
-
2 Levels:
- '1'
- '2'
- '3'
$posterior
-
1
2
3
0.2114514
0.786773
0.001775594
$x
-
LD1
LD2
-1.537069
-0.1367865
Warning message in cbind(G3, Wx = pre.lda$x, newG = pre.lda$class):
"number of rows of result is not a multiple of vector length (arg 1)"
G3
LD1
LD2
newG
1
-1.537069
-0.1367865
2
# Quadratic discriminant analysis on the same three-group data
mod.qda=qda(G3~Q+C+P,data = d6.3)
mod.qda
Call:
qda(G3 ~ Q + C + P, data = d6.3)
Prior probabilities of groups:
1 2 3
0.25 0.40 0.35
Group means:
Q C P
1 8.400000 5.900000 48.200
2 7.712500 7.250000 69.875
3 5.957143 3.714286 34.000
# Compare the true groups with QDA in-sample predictions
cbind(G3,newG=predict(mod.qda)$class)
G3
newG
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
# QDA posterior probabilities for the new sample
predict(mod.qda,data.frame(Q=8,C=7.5,P=65))
$class
-
2 Levels:
- '1'
- '2'
- '3'
$posterior
-
1
2
3
0.008221165
0.9915392
0.0002396287
6.2.4Bayes判别
- 即求解条件概率最大的那个类别
# Bayes discriminant: LDA with explicitly specified prior probabilities.
library(MASS)
Q <- c(8.3, 9.5, 8.0, 7.4, 8.8, 9.0, 7.0, 9.2, 8.0, 7.6, 7.2, 6.4, 7.3, 6.0,
       6.4, 6.8, 5.2, 5.8, 5.5, 6.0)
C <- c(4.0, 7.0, 5.0, 7.0, 6.5, 7.5, 6.0, 8.0, 7.0, 9.0, 8.5, 7.0, 5.0, 2.0,
       4.0, 5.0, 3.0, 3.5, 4.0, 4.5)
P <- c(29, 68, 39, 50, 55, 58, 75, 82, 67, 90, 86, 53, 48, 20, 39, 48, 29, 32,
       34, 36)
G3 <- c(rep(1, 5), rep(2, 8), rep(3, 7))
d6.3 <- data.frame(G3, Q, C, P)
# Equal priors (1/3 each) — equivalent to a pure distance-based rule
ld1 <- lda(G3~Q+C+P,prior=c(1,1,1)/3,data=d6.3)
ld <- lda(G3~Q+C+P,prior = c(1,1,1)/3,data = d6.3)
ld
Call:
lda(G3 ~ Q + C + P, data = d6.3, prior = c(1, 1, 1)/3)
Prior probabilities of groups:
1 2 3
0.3333333 0.3333333 0.3333333
Group means:
Q C P
1 8.400000 5.900000 48.200
2 7.712500 7.250000 69.875
3 5.957143 3.714286 34.000
Coefficients of linear discriminants:
LD1 LD2
Q -0.92307369 0.76708185
C -0.65222524 0.11482179
P 0.02743244 -0.08484154
Proportion of trace:
LD1 LD2
0.7259 0.2741
# LDA with priors set to the observed class frequencies (5, 8, 7 out of 20)
ld2=lda(G3~Q+C+P,prior=c(5,8,7)/20,data=d6.3)
ld2
Call:
lda(G3 ~ Q + C + P, data = d6.3, prior = c(5, 8, 7)/20)
Prior probabilities of groups:
1 2 3
0.25 0.40 0.35
Group means:
Q C P
1 8.400000 5.900000 48.200
2 7.712500 7.250000 69.875
3 5.957143 3.714286 34.000
Coefficients of linear discriminants:
LD1 LD2
Q -0.81173396 0.88406311
C -0.63090549 0.20134565
P 0.01579385 -0.08775636
Proportion of trace:
LD1 LD2
0.7403 0.2597
# In-sample predictions (classes, posteriors, scores) under frequency priors
z2=predict(ld2)
z2
$class
-
- 1
- 1
- 1
- 1
- 1
- 1
- 2
- 2
- 2
- 2
- 2
- 2
- 3
- 3
- 3
- 3
- 3
- 3
- 3
- 3
Levels:
- '1'
- '2'
- '3'
$posterior
123 9.747144e-010.00881989141.646573e-02 7.069622e-010.29292737201.104490e-04 9.066215e-010.06671619012.666229e-02 5.420713e-010.44730988581.061882e-02 8.565024e-010.14286199236.355979e-04 8.893310e-010.11061461045.440087e-05 2.123192e-030.87898389801.188929e-01 1.188275e-010.88110914906.337076e-05 1.241051e-010.87139525054.499663e-03 1.781107e-030.99806550881.533841e-04 1.374025e-030.99779418568.317893e-04 7.358709e-020.82549381021.009191e-01 2.163910e-010.38570933323.978997e-01 5.425064e-040.00022607429.992314e-01 8.646125e-030.02600225549.653516e-01 5.604906e-020.27404428846.699067e-01 5.674877e-050.00043313269.995101e-01 9.943552e-040.00321146839.957942e-01 7.112081e-040.00490727779.943815e-01 9.840783e-030.02886590409.612933e-01 $x
LD1
LD2
-0.1409984
2.582951755
-2.3918356
0.825366275
-0.3704452
1.641514840
-0.9714835
0.548448277
-1.7134891
1.246681993
-2.4593598
1.361571174
0.3789617
-2.200431689
-2.5581070
-0.467096091
-1.1900285
-0.412972027
-1.7638874
-2.382302324
-1.1869165
-2.485574940
-0.1123680
-0.598883922
0.3399132
0.232863397
2.8456561
0.936722573
1.5592346
0.025668216
0.7457802
-0.209168159
3.0062824
-0.358989534
2.2511708
0.008852067
2.2108260
-0.331206768
1.5210939
0.035984885
# Confusion matrix: true group (rows) vs predicted class (columns)
table(G3,z2$class)
G3 1 2 3
1 5 0 0
2 1 6 1
3 0 0 7
你可能感兴趣的:(R,吉林大学,r语言,开发语言,数据挖掘)