Confidence interval for the population variance
from scipy.stats import chi2

def CI(n, sigma2, alpha):
    """
    n:      sample size
    sigma2: sample variance
    alpha:  confidence level (e.g. 0.95)
    """
    a = (1 - alpha) / 2
    b = (n - 1) * sigma2
    lower = b / chi2.ppf(1 - a, df=n - 1)   # divide by the upper chi-square quantile
    upper = b / chi2.ppf(a, df=n - 1)       # divide by the lower chi-square quantile
    return (lower, upper)
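A quick usage sketch (the sample size, sample variance, and confidence level below are illustrative values, not from the original):

print(CI(30, 4.2, 0.95))   # roughly (2.66, 7.59): a 95% confidence interval for the population variance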
Hypothesis test for a population variance (chi-square test)
from scipy.stats import chi2

def chi2_test(n, s, sigma):
    """
    n:     sample size
    s:     sample variance
    sigma: hypothesized population variance (the value under H0)
    """
    chisquare = (n - 1) * s / sigma
    return chi2.sf(chisquare, df=n - 1)   # right-tail (upper-tail) p-value
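A hedged usage sketch (illustrative numbers only): testing H0: σ² = 25 against H1: σ² > 25 with n = 20 and a sample variance of 35.

print(chi2_test(20, 35, 25))   # right-tail p-value; reject H0 if it falls below the chosen significance level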
Test for equality of two population variances (F-test)
The two sample variances are used to compare the relative sizes of the two population variances.
from scipy.stats import f

def f_test(n1, n2, sig_square1, sig_square2):
    """
    n1, n2:      sample sizes
    sig_square1: the larger sample variance (sig_square1 > sig_square2)
    sig_square2: the smaller sample variance
    """
    f_stat = sig_square1 / sig_square2           # avoid shadowing the imported f distribution
    return f.sf(f_stat, n1 - 1, n2 - 1)          # one-tailed p-value with df = (n1 - 1, n2 - 1)
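A hedged usage sketch (made-up sample sizes and variances; the larger sample variance goes in the numerator):

print(f_test(25, 30, 8.4, 5.1))   # one-tailed p-value for H1: σ1² > σ2²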
Test of equality of several population proportions (chi-square test of observed vs. expected frequencies)
import pandas as pd
import numpy as np
from scipy import stats
data = pd.read_excel(r"C:\Users\liuhao\Desktop\a.xls")
Data in a.xls:

品牌 (brand)       | 是 (yes) | 否 (no)
雪佛兰 (Chevrolet) | 69       | 56
福特 (Ford)        | 120      | 80
本田 (Honda)       | 123      | 52
kf = stats.chi2_contingency(data[["是","否"]])
(7.891045125087675, 0.019341106790498865, 2, array([[ 78. , 47. ],[124.8, 75.2],[109.2, 65.8]]))
(statistic, p-value, degrees of freedom, expected frequency for each cell)
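If the Excel file is not at hand, the same contingency table can be built inline; this is a minimal sketch (the column and index labels simply mirror the table above), and note that newer SciPy versions wrap the same four values in a result object rather than a plain tuple.

import pandas as pd
from scipy import stats

# Rebuild the a.xls table directly in code
data = pd.DataFrame({"是": [69, 120, 123], "否": [56, 80, 52]},
                    index=["雪佛兰", "福特", "本田"])
print(stats.chi2_contingency(data[["是", "否"]]))   # same statistic, p-value, df, expected frequencies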
Multiple comparison method (Marascuilo procedure)
import pandas as pd
import numpy as np
from scipy import stats
data = pd.read_excel(r"C:\Users\liuhao\Desktop\a.xls",index_col=0)
data["sum"] = data.sum(axis="columns")
data["fre"] = data["是"]/data["sum"]
chi_square = stats.chi2.ppf(0.95,df=2)
a = []
for i in list(range(len(data.index))):
for j in list(range(i+1,len(data.index))):
p = abs(data.iloc[i,3] - data.iloc[i+1,3])
cv = np.sqrt(5.991)*np.sqrt((data.iloc[i,3]*(1-data.iloc[i,3])/data.iloc[i,2])+(data.iloc[j,3]*(1-data.iloc[j,3])/data.iloc[j,2]))
a.append([p,cv])
# 若p>cv,则差异显著
Output (rounded to four decimal places):
[[0.0480, 0.1380],
 [0.1509, 0.1379],
 [0.1029, 0.1198]]
Only the second pair, 雪佛兰 vs 本田 (Chevrolet vs Honda), has p > cv, so only that difference in proportions is significant at the 5% level.
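To label each comparison, the pairs in a can be matched back to the brand names; a small follow-up sketch (it assumes the loop above has just run and relies on the same nested-loop order):

brands = list(data.index)
pairs = [(i, j) for i in range(len(brands)) for j in range(i + 1, len(brands))]
for (i, j), (p, cv) in zip(pairs, a):
    verdict = "significant" if p > cv else "not significant"
    print(f"{brands[i]} vs {brands[j]}: |diff| = {p:.4f}, CV = {cv:.4f} -> {verdict}")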
Test of independence between two categorical variables (chi-square test)
import pandas as pd
import numpy as np
from scipy import stats
data = pd.read_excel(r"C:\Users\liuhao\Desktop\s.xls",index_col=0)
Data in s.xls:

啤酒 (beer)    | 男 (men) | 女 (women)
淡 (light)     | 51       | 39
普通 (regular) | 56       | 21
黑 (dark)      | 25       | 8

stats.chi2_contingency(data)
(6.446821152703504,
0.03981902059909915,
2,
array([[59.4 , 30.6 ],[50.82, 26.18],[21.78, 11.22]]))
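As before, the tuple holds the statistic, p-value, degrees of freedom, and expected frequencies. They can also be unpacked and compared with a significance level directly; a minimal sketch, assuming α = 0.05 and a SciPy version whose return value behaves like a 4-tuple (as the output above suggests):

chi2_stat, p_value, dof, expected = stats.chi2_contingency(data)
print(p_value < 0.05)   # True for the table above, so independence of gender and beer type is rejected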
Goodness-of-fit test (chi-square test)
Multinomial probability distribution (testing whether the sample distribution matches a historical distribution)
import pandas as pd
import numpy as np
from scipy import stats
f_obs = [48,98,54]   # observed frequencies
f_exp = [60,100,40]  # expected frequencies
stats.chisquare(f_obs,f_exp)
#Power_divergenceResult(statistic=7.34, pvalue=0.02547646994668102)
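chisquare requires the observed and expected frequencies to have the same total; if the historical distribution is given as proportions rather than counts, scale the proportions by the sample size first. A minimal sketch (the proportions 0.3/0.5/0.2 are simply the ones implied by the expected counts above):

import numpy as np
from scipy import stats

f_obs = [48, 98, 54]
hist_props = [0.3, 0.5, 0.2]                 # historical proportions
f_exp = np.array(hist_props) * sum(f_obs)    # scale to the observed total -> [60., 100., 40.]
print(stats.chisquare(f_obs, f_exp))         # same result as above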
Normality tests
import numpy as np
from scipy import stats
data=[71,66,61,65,54,93,60,86,70,70,73,73,55,63,56,62,76,54,82,79,76,68,53,58,85,80,56,61,61,64,65,62,90,69,76,79,77,54,64,74,65,65,61,56,63,80,56,71,79,84]
stats.normaltest(data, axis=0)
#NormaltestResult(statistic=3.220501948981951, pvalue=0.19983745367770303)
stats.kstest(data, 'norm', (np.mean(data),np.std(data)))
stats.shapiro(data)
#(0.9567651152610779, 0.06508207321166992)
# If the p-value exceeds the significance level, the null hypothesis cannot be rejected: the data are consistent with a normal distribution
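A small wrap-up sketch that applies the same decision rule to all three tests (the 5% significance level is an assumed choice); it reuses data, np, and stats from the block above:

alpha = 0.05
results = {
    "D'Agostino-Pearson": stats.normaltest(data),
    "Kolmogorov-Smirnov": stats.kstest(data, 'norm', (np.mean(data), np.std(data))),
    "Shapiro-Wilk": stats.shapiro(data),
}
for name, res in results.items():
    # res[1] is the p-value in each of these result objects
    print(name, "cannot reject normality" if res[1] > alpha else "rejects normality")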