Linear regression is one of the simplest machine learning models: its hypothesis is simple in form and easy to implement. Below are two from-scratch implementations (ordinary least squares and gradient descent) and an implementation using the sklearn library, for comparison.
The linear regression hypothesis function:
h(x) = w·x + b
where w and b are the model parameters, also called the regression coefficients: w is the weight vector and b is the bias.
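Both from-scratch implementations below fold the bias b into the weight vector by prepending a constant feature x0 = 1 to every sample, so the hypothesis becomes h(x) = x·w with w = (b, w1, ..., wn). A minimal sketch of this trick (the numbers are purely illustrative):
import numpy as np

x = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])     # two samples, three features
w = np.array([0.5, 0.1, 0.2, 0.3])  # w[0] plays the role of the bias b
# Prepend a column of ones so the bias is handled by the matrix product.
x_aug = np.hstack([np.ones((x.shape[0], 1)), x])
print(np.matmul(x_aug, w))          # same as np.matmul(x, w[1:]) + w[0]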
1. Linear regression via ordinary least squares (OLS)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Dataset download URL:
# https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
class OLSLinearRegression(object):
    def _ols(self, X, y):
        """
        Ordinary least squares estimate: w = (XᵀX)⁻¹ Xᵀ y
        :param X: matrix of input values
        :param y: vector of target values
        :return: estimated weight vector w
        """
        tmp = np.linalg.inv(np.matmul(X.T, X))  # invert XᵀX
        tmp = np.matmul(tmp, X.T)  # multiply the inverse by the transpose Xᵀ
        return np.matmul(tmp, y)
    def _preprocess_data_x(self, x):
        """
        Preprocess the input: prepend a column x0 = 1 so the bias is absorbed into w.
        :param x: matrix of input values
        :return: augmented matrix with a leading column of ones
        """
        m, n = x.shape
        x_ = np.empty((m, n + 1))
        x_[:, 0] = 1
        x_[:, 1:] = x
        return x_
    def train(self, x_train, y_train):
        """
        Train the model.
        :param x_train: training inputs
        :param y_train: training targets
        :return:
        """
        # Preprocess x_train (prepend x0 = 1)
        x_train = self._preprocess_data_x(x_train)
        # Estimate w by ordinary least squares
        self.w = self._ols(x_train, y_train)
    def predict(self, x):
        """
        Predict targets for the given inputs.
        :param x: matrix of input values
        :return: predicted values
        """
        x = self._preprocess_data_x(x)
        return np.matmul(x, self.w)
if __name__ == '__main__':
    data = np.genfromtxt('winequality-red.csv', delimiter=";", skip_header=1)
    x = data[:, :-1]
    y = data[:, -1]
    ols_lr = OLSLinearRegression()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    ols_lr.train(x_train, y_train)
    y_pred = ols_lr.predict(x_test)
    # mean_squared_error computes the mean squared error regression loss
    # (exact values vary between runs because train_test_split is not seeded)
    mse = mean_squared_error(y_test, y_pred)
    print(mse)  # 0.4501581189034397
    y_train_pred = ols_lr.predict(x_train)
    mse_train = mean_squared_error(y_train, y_train_pred)
    print(mse_train)  # 0.4046507467209402
    # MAE: the mean of the absolute differences between predictions and actual values
    mae = mean_absolute_error(y_test, y_pred)
    print(mae)  # 0.5161459887179464
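A note on the normal equation: explicitly inverting XᵀX fails when the matrix is singular and can be numerically unstable when it is ill-conditioned. A more robust alternative is to solve the least-squares problem directly. Below is a minimal sketch of a drop-in replacement for the _ols body using np.linalg.lstsq (not part of the original code):
import numpy as np

def ols_lstsq(X, y):
    # Solve min ||Xw - y||^2 directly; rcond=None uses NumPy's default
    # cutoff for small singular values, so rank-deficient X is handled too.
    w, residuals, rank, singular_values = np.linalg.lstsq(X, y, rcond=None)
    return w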
2. Implementation via gradient descent
import numpy as np
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23; use joblib directly
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
class GDLinearRegression(object):
    """Linear regression trained with batch gradient descent."""
    def __init__(self, n_iter=200, eta=1e-3, tol=None):
        # number of training iterations
        self.n_iter = n_iter
        # learning rate
        self.eta = eta
        # threshold on the change in loss (enables early stopping)
        self.tol = tol
        # model parameters w (initialized at training time)
        self.w = None
    def _loss(self, y, y_pred):
        """
        Compute the mean squared error loss.
        :param y: actual values
        :param y_pred: predicted values
        :return: MSE
        """
        return np.sum((y_pred - y) ** 2) / y.size

    def _gradient(self, x, y, y_pred):
        """
        Compute the gradient of the loss with respect to w
        (the constant factor 2 is absorbed into the learning rate).
        :param x: augmented input matrix
        :param y: actual values
        :param y_pred: predicted values
        :return: gradient vector
        """
        return np.matmul(y_pred - y, x) / y.size
    def _gradient_descent(self, x, y):
        """
        Batch gradient descent.
        :param x: augmented input matrix
        :param y: target values
        :return:
        """
        # If the user specified tol, enable early stopping
        if self.tol is not None:
            loss_old = np.inf
        # Run gradient descent for at most n_iter iterations, updating w
        for step_i in range(self.n_iter):
            # predict
            y_pred = self._predict(x, self.w)
            # compute the loss
            loss = self._loss(y, y_pred)
            print(f"iteration: {step_i}, loss: {loss}")
            # early stopping
            if self.tol is not None:
                # stop once the decrease in loss falls below the threshold
                if loss_old - loss < self.tol:
                    break
                loss_old = loss
            # compute the gradient
            grad = self._gradient(x, y, y_pred)
            # update the parameters w
            self.w -= self.eta * grad
    def _preprocess_data_x(self, x):
        """
        Preprocess the input: prepend a column x0 = 1.
        :param x: matrix of input values
        :return: augmented matrix with a leading column of ones
        """
        m, n = x.shape
        x_ = np.empty((m, n + 1))
        x_[:, 0] = 1
        x_[:, 1:] = x
        return x_
    def train(self, x_train, y_train):
        """
        Train the model.
        :param x_train: training inputs
        :param y_train: training targets
        :return:
        """
        # Preprocess x_train (prepend x0 = 1)
        x_train = self._preprocess_data_x(x_train)
        # Initialize the parameter vector w with small random values
        _, n = x_train.shape
        self.w = np.random.random(n) * 0.05
        # Run gradient descent to fit w
        self._gradient_descent(x_train, y_train)
    def _predict(self, x, w):
        """
        Internal prediction helper implementing h(x) = x·w.
        :param x: augmented input matrix
        :param w: weight vector
        :return: predicted values
        """
        return np.matmul(x, w)

    def predict(self, x):
        """Predict targets for the given inputs."""
        x = self._preprocess_data_x(x)
        return self._predict(x, self.w)
if __name__ == '__main__':
    data = np.genfromtxt('winequality-red.csv', delimiter=";", skip_header=1)
    x = data[:, :-1]
    y = data[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    # Standardize the features: fit the scaler on the training set only,
    # then apply the same transformation to the test set (fitting a new
    # scaler on the test set would leak test statistics into evaluation).
    ss = StandardScaler()
    x_train_std = ss.fit_transform(x_train)
    x_test_std = ss.transform(x_test)
    gd_lr = GDLinearRegression(n_iter=3000, eta=0.05, tol=0.00001)
    gd_lr.train(x_train_std, y_train)
    # joblib.dump(gd_lr, 'gd_lr.pkl')  # save the trained model
    # gd_lr = joblib.load('gd_lr.pkl')  # load a saved model
    y_pred = gd_lr.predict(x_test_std)
    mse = mean_squared_error(y_test, y_pred)
    print(mse)  # 0.4050383474602159
    mae = mean_absolute_error(y_test, y_pred)
    print(mae)  # 0.49031095345068276
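A quick way to verify _gradient is to compare it against a central finite-difference approximation of _loss. The sketch below assumes GDLinearRegression is already defined as above; the toy data, seed, and epsilon are illustrative assumptions, not part of the original code:
import numpy as np

def numeric_gradient(model, x, y, w, eps=1e-6):
    # Central-difference approximation of d(loss)/dw, one coordinate at a time.
    grad = np.zeros_like(w)
    for i in range(w.size):
        w_plus, w_minus = w.copy(), w.copy()
        w_plus[i] += eps
        w_minus[i] -= eps
        loss_plus = model._loss(y, np.matmul(x, w_plus))
        loss_minus = model._loss(y, np.matmul(x, w_minus))
        grad[i] = (loss_plus - loss_minus) / (2 * eps)
    return grad

model = GDLinearRegression()
rng = np.random.default_rng(0)
x = rng.normal(size=(5, 3))
y = rng.normal(size=5)
w = rng.normal(size=3)
# _gradient drops the factor 2 from d(MSE)/dw, so multiply it back for the check.
analytic = 2 * model._gradient(x, y, np.matmul(x, w))
print(np.allclose(analytic, numeric_gradient(model, x, y, w)))  # expect True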
3. Linear regression via the sklearn library
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

data = np.genfromtxt('winequality-red.csv', delimiter=";", skip_header=1)
x = data[:, :-1]
y = data[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print(mse) # 0.43312318561250157
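After fitting, the learned parameters can be read off the estimator for comparison with the from-scratch versions: in sklearn's LinearRegression, coef_ holds the weight vector w and intercept_ the bias b. For example:
print(lr.intercept_)  # bias b
print(lr.coef_)       # weight vector w, one coefficient per feature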