import numpy as np
import pandas as pd
import matplotlib.pylab as plt
# Render plot text with the SimHei font (the plot labels further down are
# Chinese) and draw minus signs as plain ASCII hyphens.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
from sklearn import linear_model

# Ordinary least-squares regressor; it is fitted and evaluated further down.
model = linear_model.LinearRegression()
# Load the data
from sklearn import datasets  # provides the Boston housing dataset loader
from sklearn.model_selection import train_test_split  # train/test split helper
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this call needs scikit-learn < 1.2 -- confirm the pinned version.
boston = datasets.load_boston()
boston  # notebook-style display; has no effect when run as a script
# Convert the format
x = pd.DataFrame(boston.data)  # feature matrix as a DataFrame
x  # notebook-style display; has no effect when run as a script
y = boston.target  # regression target values
y
# Name the columns of x after the dataset's features
x.columns = boston.feature_names
x
# Split into training and test sets (20% of the rows held out for testing)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# Reset the row index of both feature frames to 0..n-1; the split keeps the
# row labels of the original frame.
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
# Standardize the features. The mean and standard deviation come from the
# training set only and are applied to both sets, so the test features are on
# the same scale the model is fitted on and no test-set statistics are used.
train_mean = x_train.mean()
# A constant column has std 0; divide by 1 instead so it becomes all zeros
# rather than NaN/inf.
train_std = x_train.std().replace(0, 1)
x_train = (x_train - train_mean) / train_std
x_test = (x_test - train_mean) / train_std
x_test  # notebook-style display; has no effect when run as a script
# Fit
model.fit(x_train, y_train)
# Predict
y_pre = model.predict(x_test)
y_pre  # notebook-style display; has no effect when run as a script
# Intercept
model.intercept_
# Slopes (one coefficient per feature column)
model.coef_
# Pair every feature name with its fitted coefficient.
a = list(zip(x_train.columns, model.coef_))
print(a)
# Same pairing, but with the magnitude of each coefficient.
coef_magnitudes = np.abs(model.coef_)
b = list(zip(x_train.columns, coef_magnitudes))
print(b)
# Tabulate the (name, magnitude) pairs and order them by magnitude, ascending
# (column 1 holds the magnitudes).
magnitude_table = pd.DataFrame(b)
c = magnitude_table.sort_values(by=1)
print(c)
# Rows from position 10 onward of the ascending sort, i.e. the largest
# magnitudes; notebook-style display with no effect when run as a script.
c.iloc[10:]
# Evaluate the model
# Plot
plt.figure(figsize=(15, 8), dpi=80)
# Each series is sorted on its own, so the two curves compare the overall
# distribution of true and predicted values, not per-sample errors.
plt.plot(range(len(y_test)), sorted(y_test), label="真实")
plt.plot(range(len(y_pre)), sorted(y_pre), label="预测")
plt.legend()
plt.show()
# Two metrics
# MSE measures the deviation between the model's predictions and the true
# values; the larger it is, the worse the predictions.
from sklearn.metrics import mean_squared_error as MSE  # mean squared error
# scikit-learn's argument order is (y_true, y_pred); y_pre holds the
# predictions for x_test.
MSE(y_test, y_pre)
from sklearn.metrics import r2_score
# R^2 from the predictions already computed above; this is the same value
# model.score(x_test, y_test) returns, without predicting a second time.
r2 = r2_score(y_test, y_pre)
r2  # the closer r2 is to 1, the better the regression fit