import pandas as pd
import numpy as np
import statsmodels.api as sm
# Method 2 (alternative, formula interface): import statsmodels.formula.api as smf
# Sample data for a simple (single-predictor) linear regression.
x = np.array([2, 6, 8, 8, 12, 16, 20, 20, 22, 26])
y = np.array([58, 105, 88, 118, 117, 137, 157, 169, 149, 202])

# Method 2 (alternative, not executed): the same fit via the formula interface.
#     s = pd.DataFrame()
#     s['x'] = x
#     s['y'] = y
#     model2 = smf.ols(formula='y ~ x', data=s).fit()
#     model2.summary()

# Prepend a constant column so the fitted model includes an intercept term.
X = sm.add_constant(x)
model = sm.OLS(endog=y, exog=X).fit()
model.summary()
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
# Multiple linear regression of `time` on a numeric predictor (`re`) and a
# two-level categorical predictor (`typ`), followed by influence diagnostics.
re = np.array([2, 6, 8, 3, 2, 7, 9, 8, 4, 6])
# `typ` is the category encoded as a single dummy variable (k - 1 dummies for
# k categories): 1 = "电子" (electronic), 0 = "机械" (mechanical).
# pd.get_dummies() could be used to build this encoding instead.
typ = np.array([1, 0, 1, 0, 1, 1, 0, 0, 1, 1])
time = np.array([2.9, 3.0, 4.8, 1.8, 2.9, 4.9, 4.2, 4.8, 4.4, 4.5])

# Name the columns so the formula terms resolve against `data` itself rather
# than falling back to same-named variables in the calling namespace.
data = pd.DataFrame({"re": re, "typ": typ, "time": time})

# `ols` is reached through the imported `smf` module; a bare `ols` name is
# never imported in this file and would raise NameError.
model = smf.ols('time ~ re+typ', data=data).fit()
model.summary()

# Influence / outlier diagnostics for the fitted model.
outliers = model.get_influence()
# outliers.summary_table()  # full per-observation summary table

# Leverage: diagonal of the hat matrix (high values flag high-leverage points).
leverage = outliers.hat_matrix_diag
# DFFITS: the property returns a pair; element 0 holds the per-observation values.
dffits = outliers.dffits[0]
# Externally studentized residuals.
resid_stu = outliers.resid_studentized_external
# Cook's distance: the property returns a pair; element 0 holds the distances.
cook = outliers.cooks_distance[0]
# Diagnostic plots: scatter with fitted line, residuals against x, residual
# density curve, and studentized residuals with +/-2 reference lines.
# NOTE(review): this section reads `x`, `y`, `X`, `model` and `resid_stu` from
# earlier top-level code. `x`, `y`, `X` are assigned by the simple regression,
# while `model` and `resid_stu` are last assigned by the multiple-regression
# section — confirm which fit these plots are meant to show.
# Scatter plot and fitted line
import matplotlib.pyplot as plt
plt.scatter(x,y)
Y = model.predict(X)
plt.plot(x,Y,"r-")
plt.show()
# Residuals plotted against x
plt.scatter(x,y-Y)
plt.axhline(0)
# Density curve of the residuals
model.resid.plot.density()
# Standardized (studentized) residual plot
plt.scatter(x,resid_stu)
plt.axhline(0)
plt.axhline(2)
plt.axhline(-2)# studentized residuals should normally lie within (-2, 2), i.e. the random errors are consistent with a normal distribution
plt.show()
import pandas as pd
import numpy as np
import statsmodels.api as sma
# Logistic regression with statsmodels on the bank-loan spreadsheet.
# Alternative input file (not used): r"C:\Users\liuhao\Desktop\a.xls"
data = pd.read_excel(
    r"C:\Users\liuhao\Desktop\python_work\Python数据分析与挖掘实战\chapter5\demo\data\bankloan.xls"
)

# Predictors: years employed, address, debt ratio, credit-card debt.
feature_columns = ["工龄", "地址", "负债率", "信用卡负债"]
x = data.loc[:, feature_columns].values
# Response: the ninth column (index 8) — presumably the default label; confirm
# against the spreadsheet.
y = data.iloc[:, 8].values
# Alternative variable selection for the other file (not used):
#     x = data[["消费支出","信用卡"]].values
#     y = data.iloc[:,-1].values

# Add the intercept column, then specify and fit the logit model.
X = sma.add_constant(x)
logit = sma.Logit(endog=y, exog=X)
result = logit.fit()
result.summary2()
from sklearn.linear_model import LogisticRegression
# Fit the same predictors/response with scikit-learn for comparison.
# fit() returns the estimator itself, so construction and fitting are chained.
modelLR = LogisticRegression().fit(x, y)

# a: intercept term, b: coefficient array.
a, b = modelLR.intercept_, modelLR.coef_
print(a, b)

# Mean accuracy of the fitted classifier on the data it was trained on.
modelLR.score(x, y)
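# Note: the two methods (statsmodels Logit and scikit-learn LogisticRegression)
# produce slightly different parameter estimates, largely because scikit-learn
# applies L2 regularization by default while statsmodels fits an unpenalized model.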