回到正题,对于简单线性回归有如下问题:
下面通过程序来实现简单的线性回归:
import numpy as np
import matplotlib.pyplot as plt

# Toy data for simple (single-feature) linear regression.
x = np.array([1, 2, 3, 4, 5])
y = np.array([1, 3, 2, 3, 5])
plt.scatter(x, y)

# Least-squares estimates:
#   a = sum((x_i - x_mean) * (y_i - y_mean)) / sum((x_i - x_mean)^2)
#   b = y_mean - a * x_mean
x_mean = np.mean(x)
y_mean = np.mean(y)
num = 0.0
den = 0.0
for xi, yi in zip(x, y):
    num += (xi - x_mean) * (yi - y_mean)
    den += (xi - x_mean) ** 2
a = num / den
b = y_mean - a * x_mean
print(a, b)

# Draw the fitted line over the scatter plot.
y_hat = a * x + b
plt.plot(x, y_hat, c='red')
plt.axis([0, 6, 0, 6])
plt.show()
创建自己的线性回归类:
import numpy as np
class SimpleLinearRegression1:
    """Single-feature linear regression fitted by least squares.

    The sums in the slope formula are accumulated with an explicit
    Python loop (see SimpleLinearRegression2 for the vectorized version).
    """

    def __init__(self):
        # Slope and intercept; populated by fit().
        self.a_ = None
        self.b_ = None

    def fit(self, x_train, y_train):
        """Estimate slope a_ and intercept b_ from 1-D training data.

        Returns self so calls can be chained.
        """
        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)
        num, den = 0.0, 0.0
        for xi, yi in zip(x_train, y_train):
            num += (xi - x_mean) * (yi - y_mean)
            den += (xi - x_mean) ** 2
        self.a_ = num / den
        self.b_ = y_mean - self.a_ * x_mean
        return self

    def _predict(self, x_single):
        # Prediction for a single scalar input.
        return self.a_ * x_single + self.b_

    def predict(self, x_predict):
        """Return an array of predictions for an iterable of scalars."""
        return np.array([self._predict(xi) for xi in x_predict])
# Improvement over SimpleLinearRegression1: the Python loop is replaced by
# vectorized NumPy operations for efficiency.
class SimpleLinearRegression2:
    """Single-feature linear regression fitted by vectorized least squares."""

    def __init__(self):
        # Slope and intercept; populated by fit().
        self.a_ = None
        self.b_ = None

    def fit(self, x_train, y_train):
        """Estimate slope a_ and intercept b_ from 1-D training data.

        Returns self so calls can be chained.
        """
        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)
        # Deviations from the means; slope = sum(dx*dy) / sum(dx*dx).
        dx = x_train - x_mean
        dy = y_train - y_mean
        self.a_ = np.sum(dx * dy) / np.sum(dx * dx)
        self.b_ = y_mean - self.a_ * x_mean
        return self

    def _predict(self, x_single):
        # Prediction for a single scalar input.
        return self.a_ * x_single + self.b_

    def predict(self, x_predict):
        """Return an array of predictions for an iterable of scalars."""
        return np.array([self._predict(xi) for xi in x_predict])
from ML import SimpleLinearRegression

s = SimpleLinearRegression.SimpleLinearRegression2()
s.fit(x, y)
# Bug fix: the original stored the prediction for x=6 in y_hat and then
# immediately overwrote y_hat with the fitted line, so the prediction was
# computed and silently discarded. Print it so it is actually reported.
print(s.predict(np.array([6])))
# Fitted line over the training inputs.
y_hat = s.a_ * x + s.b_
plt.plot(x, y_hat, c='red')
plt.axis([0, 6, 0, 6])
plt.title('y=%s*x+%s' % (s.a_, s.b_))
plt.show()
均方误差MSE | 均方根误差RMSE(与样本同量纲) | 平均绝对误差MAE |
编程实现:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Load the Boston housing data.
# NOTE(review): datasets.load_boston was removed in scikit-learn 1.2 — this
# snippet requires an older scikit-learn version.
boston = datasets.load_boston()
print(boston.DESCR)
print(boston.feature_names)
x = boston.data[:, 5]   # single feature: column index 5
y = boston.target
# Drop samples at the upper bound (target values appear capped at 50 —
# TODO confirm against the dataset description above).
x = x[y < 50]
y = y[y < 50]

# Train/test split
from ML.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, seed=666)

# Fit the simple linear regression
from ML.SimpleLinearRegression import SimpleLinearRegression2
s = SimpleLinearRegression2()
s.fit(x_train, y_train)
print(s.a_, s.b_)
plt.scatter(x, y)
plt.plot(x, s.a_ * x + s.b_, c='red')
plt.show()

# Evaluation metrics for the regression model
y_predict = s.predict(x_test)
# MSE: mean squared error
mse_test = np.mean((y_predict - y_test) ** 2)
print(mse_test)
# RMSE: root mean squared error (same units as the target)
rmse_test = np.sqrt(mse_test)
print(rmse_test)
# MAE: mean absolute error
mae_test = np.mean(np.abs(y_predict - y_test))
print(mae_test)
mse_test 24.1566021344
rmse_test 4.91493663585
mae_test 3.54309744095
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Bug fix: the original called the sklearn metric functions but discarded
# their return values, then printed the hand-computed mse_test / mae_test
# from above under the 'sk_' labels — so sklearn's results were never
# actually shown. Capture and print sklearn's own values instead.
sk_mse_test = mean_squared_error(y_test, y_predict)
print('sk_mse_test', sk_mse_test)
sk_mae_test = mean_absolute_error(y_test, y_predict)
print('sk_mae_test', sk_mae_test)
# R Square: R^2 = 1 - MSE(y, y_hat) / Var(y)
R = 1 - mean_squared_error(y_test, y_predict) / np.var(y_test)
print(R)
R Square: 0.612931680394
使用sklearn计算R Square
from sklearn.metrics import r2_score

# sklearn's R^2; argument order is (y_true, y_pred).
r2 = r2_score(y_test, y_predict)
print('r2_score', r2)
r2_score 0.612931680394
import numpy as np
from .metrics import r2_score
class LinearRegression:
    """Multivariate linear regression solved via the normal equation."""

    def __init__(self):
        self.coef_ = None          # feature coefficients, theta[1:]
        self.interception_ = None  # intercept, theta[0] (name kept for existing callers)
        self._theta = None         # full parameter vector [b, w1, ..., wn]

    def fit_normal(self, X_train, y_train):
        """Fit theta from the normal equation (X_b^T X_b) theta = X_b^T y.

        X_train is a 2-D feature matrix; a column of ones is prepended to
        model the intercept. Uses np.linalg.solve instead of explicitly
        inverting X_b^T X_b, which is faster and numerically more stable.
        Returns self so calls can be chained.
        """
        x_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        self._theta = np.linalg.solve(x_b.T.dot(x_b), x_b.T.dot(y_train))
        self.coef_ = self._theta[1:]
        self.interception_ = self._theta[0]
        return self

    def predict(self, x_predict):
        """Return predictions for a 2-D feature matrix."""
        x_b = np.hstack([np.ones((len(x_predict), 1)), x_predict])
        return x_b.dot(self._theta)

    def score(self, x_test, y):
        """Return R^2 of the predictions on (x_test, y).

        Bug fix: r2_score takes (y_true, y_pred); the original passed the
        arguments reversed, which changes the result since R^2 is not
        symmetric in its arguments.
        """
        y_predict = self.predict(x_test)
        return r2_score(y, y_predict)
使用自己的类实现线性回归:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from ML.LinearRegression import LinearRegression

# NOTE(review): datasets.load_boston was removed in scikit-learn 1.2 — this
# snippet requires an older scikit-learn version.
boston = datasets.load_boston()
# This time use every feature column, not just one.
X = boston.data
y = boston.target
x = X[y < 50]
y = y[y < 50]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=666)

L = LinearRegression()
L.fit_normal(x_train, y_train)
print(L)
print(L.coef_)
print(L.interception_)
score = L.score(x_test, y_test)
print(score)
使用sklearn实现线性回归
from sklearn.linear_model import LinearRegression

# Fit sklearn's model on the same split; score() reports R^2 on the test set.
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
score = lin_reg.score(x_test, y_test)
print(score)