根据伯努利大数定律, 只要 $n$ 足够大, $F_n(x)$ 依概率收敛于 $F(x)$, 也就是说当样本量足够大时, 经验分布函数是母体分布函数的一个良好的近似.
from scipy.stats import norm
# Bug fix: ECDF is used below (and throughout this script) but was only
# imported much later — running top-to-bottom raised a NameError here.
from statsmodels.distributions.empirical_distribution import ECDF
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(20)  # fix the seed so the sampling experiment is reproducible
mu = 0       # mean of the normal population
sig = 1      # standard deviation of the normal population
xmin = -3    # plotting range on the x-axis
xmax = 3
s4 = np.random.normal(mu, sig, 20)    # first sample, size n = 20
axis_x = np.arange(xmin, xmax, 0.01)  # evaluation grid for both curves
axis_y = norm.cdf(axis_x)             # true N(0, 1) cdf on the grid
norm_ecdf = ECDF(s4)                  # empirical cdf built from the sample
def _normal_panel(position, ecdf_fn, title):
    """Draw one subplot comparing the true N(0,1) cdf with a sample ecdf.

    Parameters
    ----------
    position : int
        Three-digit subplot code (e.g. 221) passed to plt.subplot.
    ecdf_fn : callable
        Empirical distribution function evaluated on axis_x.
    title : str
        Panel title (the sample size, in math text).
    """
    ax = plt.subplot(position)
    ax.plot(axis_x, axis_y, label="cdf")
    ax.plot(axis_x, ecdf_fn(axis_x), label="ecdf")
    plt.ylim(-0.05, 1.05)
    plt.vlines(x=0, ymin=-0.05, ymax=1.05, color="k")
    #plt.hlines(y=0, xmin=xmin, xmax=xmax, color="k")
    plt.title(title)
    plt.legend()


plt.figure(figsize=(12, 8), dpi=70)
# First panel reuses the n = 20 sample drawn above; the remaining panels
# draw fresh samples in the same order (100, 200, 400) as the original
# hand-unrolled code, so RNG consumption is unchanged.
_normal_panel(221, norm_ecdf, r'$n=20$')
for position, size in [(222, 100), (223, 200), (224, 400)]:
    s4 = np.random.normal(mu, sig, size)
    norm_ecdf = ECDF(s4)
    _normal_panel(position, norm_ecdf, r'$n=%d$' % size)
#plt.savefig("D:\\Desktop\\ecdf.png", dpi=200)
可以看到随着样本量的增加, 经验分布函数的曲线逐渐与真实cdf曲线重合.
import math


def cdf(k, Lam):
    """Return the Poisson cdf P(X <= k) for X ~ Poisson(Lam).

    Computes exp(-Lam) * sum_{i=0}^{k} Lam**i / i! directly from the
    Poisson pmf.

    Parameters
    ----------
    k : int
        Upper limit of the sum; must be non-negative.
    Lam : float
        Poisson rate parameter (lambda).

    Returns
    -------
    float
        The cumulative probability P(X <= k).
    """
    # Local accumulator instead of the original `global Sum`, which
    # needlessly mutated module-level state on every call.
    partial = 0.0
    for i in range(k + 1):
        partial += (Lam ** i) / math.factorial(i)
    # exp(-Lam) is constant across terms, so factor it out of the sum.
    return np.exp(-Lam) * partial
n = 5
# Theoretical Poisson(1) cdf values at the integers 0 .. n-1.
f = [cdf(j, 1) for j in range(n)]
#print(f)
# The cdf of a discrete distribution is a step function: it is constant
# at height f[j] on the interval [j, j+1]. (Variable renamed from `l`,
# which PEP 8 flags as ambiguous.)
for j, height in enumerate(f):
    plt.plot([j, j + 1], [height, height], "b")
plt.ylim(0, 1.05)
接下来给出实验结果
import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF
import matplotlib.pyplot as plt
from scipy.stats import poisson
Lam3 = 1 # Poisson rate parameter lambda
xmin, xmax = 0, 10 # x-axis range for the step plots
n = [20, 100, 200, 400] # sample sizes, one per subplot panel
axis_x = np.arange(xmin, xmax, 1) # integer support 0..9 — appears unused below; verify before removing
def _draw_step(ax, value_at, color, label):
    """Plot a step function on [0, 10] from per-integer values.

    Reproduces the original hand-unrolled loop exactly: a horizontal
    segment at height value_at(i) on [i, i+1] with a marker at each
    integer for i = 0..8, plus one final labelled segment on [9, 10]
    (the label is attached only once so the legend shows a single entry).
    """
    for i in range(10):
        ax.plot([i, i + 1], [value_at(i), value_at(i)],
                color=color, markersize=1)
        ax.scatter(i, value_at(i), color=color)
        if i == 8:
            # Final segment carries the legend label; then stop, as the
            # original loops broke out here.
            ax.plot([i + 1, i + 2], [value_at(i), value_at(i)],
                    color=color, markersize=1, label=label)
            ax.scatter(i, value_at(i), color=color)
            break


def _poisson_panel(position, ecdf_fn, title):
    """Draw one subplot comparing the Poisson(Lam3) cdf with a sample ecdf."""
    ax = plt.subplot(position)
    _draw_step(ax, ecdf_fn, "orange", "ecdf")
    _draw_step(ax, lambda j: poisson.cdf(j, Lam3), "b", "cdf")
    plt.ylim(0, 1.05)
    plt.legend()
    plt.title(title)


plt.figure(figsize=(12, 8), dpi=100)
# Samples are drawn in the same order (20, 100, 200, 400) as the original
# unrolled code, so RNG consumption is unchanged; s4 and ecdf keep their
# module-level bindings (last panel's values), as before.
for position, size in zip([221, 222, 223, 224], n):
    s4 = np.random.poisson(Lam3, size)  # draw the sample
    ecdf = ECDF(s4)                     # build its empirical cdf
    _poisson_panel(position, ecdf, r"$n=%d$" % size)
#plt.savefig("D:\\Desktop\\ecdf_pois.png", dpi=200)
可以看到随着样本量的增加, 我们得到了与前面标准正态分布的例子相同的结论: 经验分布函数逐渐逼近真实分布函数. 同时也可以从数值角度来观察这一过程, 例如下面的代码给出了不同方法下 $F(1)$ 的计算结果:
s4 = np.random.poisson(Lam3, 400)  # draw a sample of size 400
prob_sci = poisson.cdf(1, Lam3)    # SciPy's Poisson cdf
prob_cdf = cdf(1, Lam3)            # the hand-written cdf defined above
# Bug fix: the original called ECDF(s3), but no `s3` exists anywhere in
# this script — the sample just drawn is s4.
ecdf = ECDF(s4)                    # empirical distribution function
prob_ecdf = ecdf(1)                # empirical estimate of F(1)
print(prob_sci, prob_cdf, prob_ecdf)
>>>0.7357588823428847 0.7357588823428847 0.73
可见在样本量为 400 的条件下, 真实的 $F(1)$ 与 $\hat{F}(1)$ 几乎不存在差距.