import numpy as np
from sklearn.datasets import make_regression
# Synthetic regression data set: 100 samples with 5 features, 3 of which are
# informative; the fixed random_state makes the draw reproducible.
feature, target = make_regression(
    n_samples=100,
    n_features=5,
    n_informative=3,
    random_state=666,
)
复相关系数即可决系数 $R^{2}$ 的算术平方根,也即拟合优度的算术平方根。
这个可决系数 $R_{i}^{2}$ 是指用 $X_{i}$ 做因变量,对其他全部 $X_{j}\ (i=1,2,3,\cdots,k;\ i \ne j)$ 做一个新的回归以后得到的可决系数。
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Arguments of variance_inflation_factor as used below:
#   exog     -- matrix holding all explanatory variables
#   exog_idx -- column index of the explanatory variable being examined
# NOTE(review): `feature` is passed as-is, without an intercept column --
# confirm this is intended, since VIF is commonly computed on a design
# matrix that includes a constant.
n_columns = feature.shape[1]
for i in range(n_columns):
    vif_value = variance_inflation_factor(feature, i)
    # Report with a 1-based variable number.
    print("第{}个解释变量的VIF值为{}".format(i + 1, vif_value))
from scipy import stats

# Print the number of samples (rows of `feature`).
print("样本的数据长度:", feature.shape[0])

# Run scipy's Jarque-Bera test on each feature column.
# stats.jarque_bera returns a pair that is unpacked as:
#   jb_value -- the test statistic (not a significance level)
#   p        -- the p-value of the test
for i in range(feature.shape[1]):
    jb_value, p = stats.jarque_bera(feature[:, i])
    # The null hypothesis is reported as rejected when p is below 0.05.
    judge = '拒绝原假设' if p < 0.05 else '接受原假设'
    # Use a 1-based variable number so the report lines up with the
    # numbering used by the VIF report earlier in this file.
    print("第{}个变量的Test Statstic为{}, P值为:{}, {}".format(i + 1, jb_value, p, judge))
import pandas as pd

# Pairwise Pearson correlation matrix of the feature columns. The bare
# expression is kept so a notebook cell still displays the result.
feature_frame = pd.DataFrame(feature)
feature_frame.corr(method='pearson')
# Critical values of the F and t distributions computed with SciPy.
from scipy.stats import f, t

# Sample size and number of regressors; these match the n_samples=100 and
# n_features=5 used to generate the data set at the top of this file.
n_obs = 100
k_regressors = 5
# Residual degrees of freedom: n - k - 1 (the extra 1 is the intercept).
resid_df = n_obs - k_regressors - 1

# Upper 5% critical value of F(k, n - k - 1). The denominator degrees of
# freedom previously read 10-5-1, which disagreed with the sample size and
# with the t critical value below.
F_Theroy = f.ppf(q=0.95, dfn=k_regressors, dfd=resid_df)
print('F: {}'.format(F_Theroy))

# Two-sided 5% critical value of t(n - k - 1).
T_Theroy = t.ppf(q=0.975, df=resid_df)
print('T: {}'.format(T_Theroy))
import statsmodels.api as sm

# Fit an ordinary least squares regression of the target on the features.
Y = target
X = sm.add_constant(feature)  # add the intercept term
model = sm.OLS(endog=Y, exog=X).fit()
# Bare expression kept so a notebook cell still displays the summary table.
model.summary()
# The import must come before the first use of `plt`; it previously appeared
# only after a first copy of the plotting code, which therefore raised
# NameError when the file was run top to bottom.
import matplotlib.pyplot as plt

# Plot the observed targets together with the model's predictions on the
# same design matrix X it was fitted on (variable `y_pred`).
# The plotting stanza used to appear twice verbatim; it is drawn once here.
y_pred = model.predict(X)
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(Y, color='#00b0ff', label="Observations", linewidth=1.5)
ax.plot(y_pred, color='#ff3d00', label="Prediction", linewidth=1.5)
ax.legend(loc="upper left", fontsize=12)
ax.grid(alpha=0.6)
ax.tick_params(labelsize=14)
from statsmodels.stats.diagnostic import spec_white

# Run statsmodels' spec_white on the fitted model's residuals and the
# design matrix (features plus intercept).
error = model.resid  # residuals of the fitted model
X = sm.add_constant(feature)  # add the intercept term
# The three returned values are reported below as the test statistic, the
# p-value and the degrees of freedom.
statistic, p, n = spec_white(error, X)
white_report = (
    "The test statistic: {};\n"
    "P Value: {};\n"
    "The degree of Freedom: {};"
)
print(white_report.format(statistic, p, n))
from statsmodels.stats.diagnostic import acorr_breusch_godfrey

# Run statsmodels' acorr_breusch_godfrey on the fitted model. The four
# returned values are reported below as the Lagrange multiplier statistic,
# its p-value, the F statistic and its p-value.
# NOTE: `f` rebinds the name imported earlier via
# `from scipy.stats import f, t`; after this line `f` is the F statistic,
# not the distribution object. The name is kept for backward compatibility.
lm, p_lm, f, p_f = acorr_breusch_godfrey(model)
# The last label previously read "P Value-F{}" with no separator, so the
# value was glued to the label; it now matches the other lines.
print(
    "Lagrange multiplier test statistic: {};\n"
    "P Value-LM: {};\n"
    "F Statistic: {};\n"
    "P Value-F: {}".format(lm, p_lm, f, p_f)
)