Machine Learning Algorithms: Linear Regression

Linear regression is the simplest machine learning model: its form is simple and it is easy to implement. Below are two from-scratch implementations (ordinary least squares and gradient descent) and an implementation using the sklearn library, presented for comparison.

The hypothesis function of the linear regression model is:

    h(x) = wᵀx + b

where w and b are the model parameters, also called regression coefficients: w is the weight vector and b is the bias.
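
As a quick illustration, the hypothesis is just a dot product plus an offset. A minimal sketch (the feature values, weights, and bias below are made-up numbers for demonstration only):

import numpy as np

x = np.array([1.0, 2.0, 3.0])   # one sample's feature vector (hypothetical)
w = np.array([0.5, -0.2, 0.1])  # weight vector (hypothetical)
b = 0.3                          # bias (hypothetical)

h = np.dot(w, x) + b  # h(x) = w^T x + b
print(h)  # ≈ 0.7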

1. Linear Regression via the Least Squares Method

Ordinary least squares minimizes the squared error and has the closed-form (normal equation) solution w = (XᵀX)⁻¹Xᵀy, where X is the design matrix with a leading column of ones so that the bias is absorbed into w.

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Dataset download URL:
# https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv


class OLSLinearRegression(object):
    def _ols(self, X, y):
        """
        最小二乘法估算 方程 w=(XTX)-1XTy
        :param X: 输入值x的矩阵
        :param y: 预测值
        :return:
        """

        tmp = np.linalg.inv(np.matmul(X.T, X))  # invert XᵀX
        tmp = np.matmul(tmp, X.T)  # multiply the inverse by Xᵀ
        return np.matmul(tmp, y)

    def _preprocess_data_x(self, x):
        """
        数据预处理
        :param x: 输入值x的矩阵
        :return:
        """
        m, n = x.shape
        x_ = np.empty((m, n + 1))
        x_[:, 0] = 1
        x_[:, 1:] = x
        return x_

    def train(self, x_train, y_train):
        """
        训练模型
        :param x_train: x的训练数据
        :param y_train: y的训练数据:目标值
        :return:
        """
        # 预处理x_train(添加x0 = 1)
        x_train = self._preprocess_data_x(x_train)
        # 使用最小二乘法预估
        self.w = self._ols(x_train, y_train)

    def predict(self, x):
        """
        预测
        :param x:
        :return:
        """
        x = self._preprocess_data_x(x)
        return np.matmul(x, self.w)


if __name__ == '__main__':
    data = np.genfromtxt('winequality-red.csv', delimiter=";", skip_header=1)
    x = data[:, :-1]
    # print(x)
    y = data[:, -1]
    # print(y)
    ols_lr = OLSLinearRegression()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    ols_lr.train(x_train, y_train)
    y_pred = ols_lr.predict(x_test)
    # print(y_pred)
    # mean_squared_error computes the mean squared error regression loss
    mse = mean_squared_error(y_test, y_pred)
    print(mse)  # 0.4501581189034397
    y_train_pred = ols_lr.predict(x_train)
    mse_train = mean_squared_error(y_train, y_train_pred)
    print(mse_train)  # 0.4046507467209402
    # MAE: the mean of the absolute errors between predictions and actual values
    mae = mean_absolute_error(y_test, y_pred)
    print(mae)  # 0.5161459887179464
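
A note on the explicit inverse: np.linalg.inv(XᵀX) can become numerically unstable when features are strongly correlated. A minimal sketch of a drop-in alternative for _ols (my suggestion, not part of the original code) uses np.linalg.lstsq, which solves the same least-squares problem through a more stable decomposition:

import numpy as np

def _ols_lstsq(X, y):
    # Solve min ||Xw - y||² directly; avoids forming (XᵀX)⁻¹ explicitly
    w, residuals, rank, singular_values = np.linalg.lstsq(X, y, rcond=None)
    return w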


2. Linear Regression via Gradient Descent
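
Gradient descent starts from a random w and repeatedly steps against the gradient of the loss. With the mean squared error loss L(w) = Σ(ŷ − y)² / m used below, each iteration applies the update w ← w − η·∇L(w), where η is the learning rate. Note that the gradient implemented below, Xᵀ(ŷ − y) / m, drops the constant factor 2 from the exact derivative; that factor is simply absorbed into the learning rate.
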
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

import joblib  # in older sklearn versions: from sklearn.externals import joblib


class GDLinearRegression(object):
    """梯度下降法"""
    def __init__(self, n_iter=200, eta=1e-3, tol=None):
        # 训练迭代次数
        self.n_iter = n_iter
        # 学习率
        self.eta = eta
        # 误差变化阈值
        self.tol = tol
        # 模型参数w(训练时初始化)
        self.w = None

    def _loss(self, y, y_pred):
        """
        计算损失
        :param y:  实际值
        :param y_pred:  预测值
        :return:
        """
        return np.sum((y_pred - y) ** 2) / y.size

    def _gradient(self, x, y, y_pred):
        """
        计算梯度
        :param x:
        :param y:
        :param y_pred: 预测值y
        :return:
        """
        return np.matmul(y_pred - y, x) / y.size

    def _gradient_descent(self, x, y):
        """
        Batch gradient descent
        :param x: input feature matrix (with the bias column added)
        :param y: target values
        :return:
        """
        # If the user specified tol, enable early stopping
        if self.tol is not None:
            loss_old = np.inf
        # Run gradient descent for at most n_iter iterations, updating w each time
        for step_i in range(self.n_iter):
            # Predict with the current parameters
            y_pred = self._predict(x, self.w)
            # Compute the loss
            loss = self._loss(y, y_pred)
            print(f"iteration: {step_i}, loss: {loss}")
            # print(self.w)
            # Early stopping: terminate if the loss decrease falls below the threshold
            if self.tol is not None:
                if loss_old - loss < self.tol:
                    break
                loss_old = loss
            # Compute the gradient (this must run every iteration, even when tol is None)
            grad = self._gradient(x, y, y_pred)
            # Update the parameters w
            self.w -= self.eta * grad

    def _preprocess_data_x(self, x):
        """
        数据预处理
        :param x:
        :return:
        """
        # 扩展x, 添加x0列并设置为1
        m, n = x.shape
        x_ = np.empty((m, n + 1))
        x_[:, 0] = 1
        x_[:, 1:] = x
        return x_

    def train(self, x_train, y_train):
        """
        训练
        :param x_train:
        :param y_train:
        :return:
        """
        # 预处理x_train(添加x0=1)
        x_train = self._preprocess_data_x(x_train)
        # 初始化参数向量w
        _, n = x_train.shape
        self.w = np.random.random(n) * 0.05
        # 执行梯度下降训练w
        self._gradient_descent(x_train, y_train)

    def _predict(self, x, w):
        """
        预测内部接口,实现函数h(x)
        :param x: 输入值x矩阵
        :param w: 权重矩阵
        :return:
        """
        return np.matmul(x, w)

    def predict(self, x):
        """预测"""
        x = self._preprocess_data_x(x)
        return self._predict(x, self.w)


if __name__ == '__main__':
    data = np.genfromtxt('winequality-red.csv', delimiter=";", skip_header=1)
    x = data[:, :-1]
    y = data[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    # Standardize the features: fit the scaler on the training set only,
    # then apply the same transformation to the test set
    ss = StandardScaler()
    x_train_std = ss.fit_transform(x_train)
    x_test_std = ss.transform(x_test)
    gd_lr = GDLinearRegression(n_iter=3000, eta=0.05, tol=0.00001)
    gd_lr.train(x_train_std, y_train)

    # joblib.dump(gd_lr, 'gd_lr.pkl')  # save the trained model
    # gd_lr = joblib.load('gd_lr.pkl')  # load a saved model

    y_pred = gd_lr.predict(x_test_std)
    mse = mean_squared_error(y_test, y_pred)
    print(mse)  # 0.4050383474602159
    mae = mean_absolute_error(y_test, y_pred)
    print(mae)  # 0.49031095345068276
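
When implementing gradients by hand it is easy to get a sign or scaling wrong. A quick way to sanity-check _gradient is to compare the analytic gradient against a numerical finite-difference estimate of the loss. A minimal sketch (my addition, not part of the original code; the helper name, toy data, and step size are arbitrary choices):

import numpy as np

def numerical_gradient(loss_fn, w, eps=1e-6):
    # Central finite differences, one coordinate at a time
    grad = np.zeros_like(w)
    for i in range(w.size):
        w_plus, w_minus = w.copy(), w.copy()
        w_plus[i] += eps
        w_minus[i] -= eps
        grad[i] = (loss_fn(w_plus) - loss_fn(w_minus)) / (2 * eps)
    return grad

# Check on a tiny random problem
rng = np.random.default_rng(0)
X = rng.random((5, 3))
y = rng.random(5)
w = rng.random(3)

loss = lambda w_: np.sum((X @ w_ - y) ** 2) / y.size
analytic = 2 * X.T @ (X @ w - y) / y.size  # exact gradient, including the factor 2
print(np.allclose(analytic, numerical_gradient(loss, w)))  # True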

3. Linear Regression with the sklearn Library

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

data = np.genfromtxt(r'winequality-red.csv', delimiter=";", skip_header=1)
x = data[:, :-1]
# print(x)
y = data[:, -1]
# print(y)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print(mse)  # 0.43312318561250157
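
For comparison with the from-scratch versions, the fitted parameters of the sklearn model can be inspected directly: coef_ holds the weight vector w and intercept_ holds the bias b (the exact values depend on the random train/test split):

print(lr.coef_)       # weight vector w, one entry per feature
print(lr.intercept_)  # bias b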