一、分布检验
1 四种常用函数
- dnorm: density,表示正态分布的概率密度函数 f(x),即某一点处的密度值(对连续分布而言是密度,不是该点取值的概率)。对一组序列点求值并绘图,即得到正态密度曲线
- pnorm:probability,表示正态分布的累积分布,最终生成CDF线
- qnorm:与pnorm相反,pnorm根据数值求累积分布(0-1),qnorm根据累积分布求数值
- rnorm:生成一组正态随机数。
2 各种分布与检验
2.1 对数正态分布和检验
library(MASS)
# 1.1 Log-normal distribution
## Fit a log-normal model by maximum likelihood (MASS::fitdistr).
lognormal <- as.vector(as.array(data[, 1]))
lognormal_distr <- fitdistr(lognormal, "lognormal")
## Print, in order: parameter estimates, their standard errors, log-likelihood.
lognormal_distr$estimate
lognormal_distr$sd
lognormal_distr$loglik
lognormal_meanlog <- lognormal_distr$estimate[["meanlog"]]
lognormal_sdlog <- lognormal_distr$estimate[["sdlog"]]
## Histogram of the raw data with the fitted density drawn on top.
par(pin = c(5, 5))
h_lognormal <- hist(lognormal, ylim = c(0, 230),
                    main = "Histogram of lognormal", xlab = "data")
xfit <- seq(min(lognormal), max(lognormal), length.out = 1001)
yfit <- dlnorm(xfit, meanlog = lognormal_meanlog, sdlog = lognormal_sdlog)
## The histogram shows counts, so the density must be scaled by
## bin width * number of observations (not by the number of grid points).
yfit <- yfit * diff(h_lognormal$mids[1:2]) * length(lognormal)
lines(xfit, yfit, col = "blue", lwd = 2)
# K-S test
## X is log-normal exactly when log(X) is normal, so the log-transformed data
## are tested against the fitted normal distribution.
lognormal_to_normal <- log(lognormal)
## Pass the fitted parameters explicitly: a bare "pnorm" would test against
## N(0, 1). Because the parameters are estimated from the same data, the
## reported p-value is only approximate (it tends to be too large).
lognormal_ks_test <- ks.test(lognormal_to_normal, "pnorm",
                             mean = lognormal_meanlog, sd = lognormal_sdlog)
lognormal_ks_test
# A-D test
## fBasics::adTest is an Anderson-Darling normality test, so it is applied to
## the log-transformed data.
library(fBasics)
lognormal_ad_test <- adTest(lognormal_to_normal)
lognormal_ad_test
# Q-Q plot
## Hand-rolled Q-Q plot: plotting positions (rank - 0.5) / n mapped through
## the standard normal quantile function.
t <- (rank(lognormal_to_normal) - 0.5) / length(lognormal_to_normal)
q <- qnorm(t)
plot(q, lognormal_to_normal, main = "Lognormal Q-Q plot",
     xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")
## Reference line y = mean + sd * q, since q holds standard normal quantiles.
abline(mean(lognormal_to_normal), sd(lognormal_to_normal), col = 2, lwd = 3)
2.2 gamma分布
# 1.2 Gamma distribution
## Fit a gamma model by maximum likelihood (MASS::fitdistr).
gamma_data <- as.vector(as.array(data[, 1]))
gamma_distr <- fitdistr(gamma_data, "gamma")
## Print, in order: parameter estimates, their standard errors, log-likelihood.
gamma_distr$estimate
gamma_distr$sd
gamma_distr$loglik
gamma_shape <- gamma_distr$estimate[["shape"]]
gamma_rate <- gamma_distr$estimate[["rate"]]
## Histogram of the raw data with the fitted density drawn on top.
par(pin = c(5, 5))
h_gamma <- hist(gamma_data, ylim = c(0, 230),
                main = "Histogram of Gamma", xlab = "data")
xfit <- seq(min(gamma_data), max(gamma_data), length.out = 1001)
yfit <- dgamma(xfit, shape = gamma_shape, rate = gamma_rate)
## The histogram shows counts, so the density must be scaled by
## bin width * number of observations (not by the number of grid points).
yfit <- yfit * diff(h_gamma$mids[1:2]) * length(gamma_data)
lines(xfit, yfit, col = "blue", lwd = 2)
# K-S test
## One-sample test against the fitted gamma CDF. The CDF is "pgamma"; the name
## "gamma" would resolve to the Gamma function. Because the parameters are
## estimated from the same data, the p-value is only approximate.
gamma_ks_test <- ks.test(gamma_data, "pgamma",
                         shape = gamma_shape, rate = gamma_rate)
gamma_ks_test
# A-D test
## fBasics::adTest only tests normality, so the Anderson-Darling statistic for
## the gamma fit is taken from fitdistrplus::gofstat, which needs a `fitdist`
## object (hence the refit). `adtest` holds the test decision when
## fitdistrplus is able to compute one for the fitted family.
gamma_ad_test <- fitdistrplus::gofstat(
  fitdistrplus::fitdist(gamma_data, "gamma")
)
gamma_ad_test$ad
gamma_ad_test$adtest
# Q-Q plot (hand-rolled, against the fitted gamma quantiles)
t <- (rank(gamma_data) - 0.5) / length(gamma_data)
q <- qgamma(t, shape = gamma_shape, rate = gamma_rate)
plot(q, gamma_data, main = "Gamma Q-Q plot",
     xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")
## Both axes are on the data scale, so the reference line is y = x.
abline(0, 1, col = 2, lwd = 3)
2.3 帕累托分布
# 1.3 Pareto distribution
## actuar supplies dpareto/ppareto/qpareto; fitdistrplus supplies fitdist and
## gofstat.
library(actuar)
library(fitdistrplus)
pareto_data <- as.vector(as.array(data[, 1]))
## Fit a Pareto model; method = "mle" and starting values must be given.
pareto_distr <- fitdist(pareto_data, "pareto", method = "mle",
                        start = list(shape = 0.1, scale = 0.1))
## Print, in order: parameter estimates, their standard errors, log-likelihood.
pareto_distr$estimate
pareto_distr$sd
pareto_distr$loglik
pareto_shape <- pareto_distr$estimate[["shape"]]
pareto_scale <- pareto_distr$estimate[["scale"]]
## Histogram of the raw data with the fitted density drawn on top.
par(pin = c(5, 5))
h_pareto <- hist(pareto_data, ylim = c(0, 230),
                 main = "Histogram of Pareto", xlab = "data")
xfit <- seq(min(pareto_data), max(pareto_data), length.out = 1001)
yfit <- dpareto(xfit, shape = pareto_shape, scale = pareto_scale)
## The histogram shows counts, so the density must be scaled by
## bin width * number of observations (not by the number of grid points).
yfit <- yfit * diff(h_pareto$mids[1:2]) * length(pareto_data)
lines(xfit, yfit, col = "blue", lwd = 2)
# K-S test
## One-sample test against the fitted Pareto CDF. Because the parameters are
## estimated from the same data, the p-value is only approximate.
pareto_ks_test <- ks.test(pareto_data, "ppareto",
                          shape = pareto_shape, scale = pareto_scale)
pareto_ks_test
# A-D test
## fBasics::adTest only tests normality, so the Anderson-Darling statistic for
## the Pareto fit is taken from fitdistrplus::gofstat. `adtest` holds the test
## decision when fitdistrplus is able to compute one for the fitted family.
pareto_ad_test <- gofstat(pareto_distr)
pareto_ad_test$ad
pareto_ad_test$adtest
# Q-Q plot (hand-rolled, against the fitted Pareto quantiles)
t <- (rank(pareto_data) - 0.5) / length(pareto_data)
q <- qpareto(t, shape = pareto_shape, scale = pareto_scale)
plot(q, pareto_data, main = "Pareto Q-Q plot",
     xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")
## Both axes are on the data scale, so the reference line is y = x.
abline(0, 1, col = 2, lwd = 3)
2.4 weibull分布
## Fit a Weibull model by maximum likelihood (MASS::fitdistr).
weibull_data <- as.vector(as.array(data[, 1]))
weibull_distr <- fitdistr(weibull_data, "weibull")
## Print, in order: parameter estimates, their standard errors, log-likelihood.
weibull_distr$estimate
weibull_distr$sd
weibull_distr$loglik
weibull_shape <- weibull_distr$estimate[["shape"]]
weibull_scale <- weibull_distr$estimate[["scale"]]
## Histogram of the raw data with the fitted density drawn on top.
par(pin = c(5, 5))
h_weibull <- hist(weibull_data, ylim = c(0, 230),
                  main = "Histogram of Weibull", xlab = "data")
xfit <- seq(min(weibull_data), max(weibull_data), length.out = 1001)
yfit <- dweibull(xfit, shape = weibull_shape, scale = weibull_scale)
## The histogram shows counts, so the density must be scaled by
## bin width * number of observations (not by the number of grid points).
yfit <- yfit * diff(h_weibull$mids[1:2]) * length(weibull_data)
lines(xfit, yfit, col = "blue", lwd = 2)
# K-S test
## One-sample test against the fitted Weibull CDF. Because the parameters are
## estimated from the same data, the p-value is only approximate.
weibull_ks_test <- ks.test(weibull_data, "pweibull",
                           shape = weibull_shape, scale = weibull_scale)
weibull_ks_test
# A-D test
## fBasics::adTest only tests normality, so the Anderson-Darling statistic for
## the Weibull fit is taken from fitdistrplus::gofstat, which needs a `fitdist`
## object (hence the refit). `adtest` holds the test decision when
## fitdistrplus is able to compute one for the fitted family.
weibull_ad_test <- fitdistrplus::gofstat(
  fitdistrplus::fitdist(weibull_data, "weibull")
)
weibull_ad_test$ad
weibull_ad_test$adtest
# Q-Q plot (hand-rolled, against the fitted Weibull quantiles)
t <- (rank(weibull_data) - 0.5) / length(weibull_data)
q <- qweibull(t, shape = weibull_shape, scale = weibull_scale)
plot(q, weibull_data, main = "Weibull Q-Q plot",
     xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")
## Both axes are on the data scale, so the reference line is y = x.
abline(0, 1, col = 2, lwd = 3)
二、方差分析
1 方差分析
1.1 单因素分析
# One-way ANOVA of Yield across the Plot groups of `linseed`.
# Columns are reached through with()/data = instead of attach(), so nothing
# is left on the search path if a step fails.
with(linseed, table(Plot))  # group sizes
result_mean <- with(linseed, aggregate(Yield, by = list(Plot), FUN = mean))
result_sd <- with(linseed, aggregate(Yield, by = list(Plot), FUN = sd))
fit <- aov(Yield ~ Plot, data = linseed)
summary(fit)
# Pairwise comparisons between groups.
# NOTE(review): TukeyHSD needs Plot to be a factor - confirm its column type.
TukeyHSD(fit)
1.2 双因素方差分析
# Two-way ANOVA of Thickness by Furnace and Wafer_Type in `wafer`.
# Columns are reached through with()/data = instead of attach(), so nothing
# is left on the search path if a step fails.
with(wafer, table(Furnace, Wafer_Type))  # cross-tabulate the two factors
# Per-cell means.
result_mean <- with(
  wafer,
  aggregate(Thickness, by = list(Furnace, Wafer_Type), FUN = mean)
)
# Per-cell standard deviations (FUN = sd, not variance).
result_df <- with(
  wafer,
  aggregate(Thickness, by = list(Furnace, Wafer_Type), FUN = sd)
)
# Main effects plus interaction.
# NOTE(review): the column is spelled Wafer_Type here to match the lines
# above; the original mixed Wafer_Type and Wafer.Type - confirm against the
# actual column name in `wafer`.
fit <- aov(Thickness ~ Furnace * Wafer_Type, data = wafer)
summary(fit)   # print the ANOVA table
TukeyHSD(fit)  # Tukey honest significant differences for every pair of groups
2 列联表分析
- 双向无序列联表:行和列均只有两个且无序,使用Pearson卡方检验、Fisher精确概率
- 单向有序的列联表:常见的情况是结果变量有序,而原因变量无序。用Mann–Whitney U 检验、Kruskal-Wallis H检验
- 行列有序且属性相同:比如行和列都是“阴性/阳性”两类。行列变量独立时:Kappa一致性检验(基于交叉表)。配对行列表:McNemar检验、Bowker检验。
2.1 Pearson卡方检验
# Cell counts of a 2 x 2 design (Response by Gender), one row per cell.
df <- tibble(
  count = c(56, 283, 55, 360),
  Gender = rep(c("Male", "Female"), each = 2),
  Response = rep(c("Mentioned", "Not Mentioned"), times = 2)
)
# Turn the long-format counts into a Response x Gender contingency table.
tbl <- xtabs(count ~ Response + Gender, data = df)
# Pearson chi-squared test of independence; for a 2 x 2 table chisq.test
# applies Yates' continuity correction by default.
chisq.test(tbl)
2.2 Kruskal-Wallis H
# Number of students per grade within each major, one row per cell.
grade_levels <- c("A", "B", "C", "D-F")
df <- tibble(
  Grade = rep(grade_levels, 3),
  count = c(8, 14, 15, 3, 15, 19, 4, 1, 13, 15, 7, 4),
  major = c(rep("Psychology", 4), rep("Biology", 4), rep("Other", 4))
)
# Render the Grade x major contingency table as a centered HTML table.
xtabs(count ~ Grade + major, df) %>%
  kable("html", table.attr = "style='width:50%;'", align = "c") %>%
  kable_styling(position = "center")
# Major is unordered, grade is ordered, and there are more than two groups,
# so Kruskal-Wallis H is used. The test has to see one observation per
# student: expand each cell into `count` rows and code the grade by its
# position in grade_levels. Testing the 12 cell counts grouped by a unique
# Grade~major label would give one observation per group and would not
# compare grade distributions between majors.
students <- df[rep(seq_len(nrow(df)), times = df$count), ]
students$grade_rank <- match(students$Grade, grade_levels)
kruskal.test(grade_rank ~ major, data = students)
三、相关性分析
1 皮尔森相关系数
# Pearson correlation test between age and confidence in `tem_data`.
# cor.test has no `use` argument (it would be silently absorbed by `...`);
# it already drops incomplete pairs itself, so none is needed.
pearson_test <- cor.test(tem_data$age, tem_data$confidence,
                         method = "pearson")