线性回归实现及线性回归中衡量算法模型的标准(R-Squared,MSE,RMSE,MAE)

MSE:均方误差,mean_squared_error
RMSE:均方根误差,root_mean_squared_error
MAE:平均绝对误差,mean_absolute_error

05 衡量回归算法的标准,MSE vs MAE

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

波士顿房产数据(注意:`load_boston` 已在 scikit-learn 1.2 中被移除,以下代码需要在旧版本的 scikit-learn 中运行)

boston = datasets.load_boston()
boston.keys()
dict_keys(['data', 'target', 'feature_names', 'DESCR'])

特征向量名称:

boston.feature_names
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], 
      dtype='<U7')
x = boston.data[:,5] # 只使用房间数量(RM)这个特征:取所有行、索引为5的列(即第6列)
x.shape
(506,)
y = boston.target
y.shape
(506,)
plt.scatter(x, y)
plt.show()

线性回归实现及线性回归中衡量算法模型的标准(R-Squared,MSE.RMSE,MAE)_第1张图片

np.max(y)
50.0
x = x[y < 50.0]#采用最大值的点可能不是真实的点
y = y[y < 50.0]
x.shape
(490,)
y.shape
(490,)
plt.scatter(x, y)
plt.show()

线性回归实现及线性回归中衡量算法模型的标准(R-Squared,MSE.RMSE,MAE)_第2张图片

使用简单线性回归法

封装:

import numpy as np
class LinearRegression1:
    """Simple (single-feature) linear regression solved by least squares.

    ``fit`` accumulates the numerator and denominator of the slope with an
    explicit Python loop over the training samples.
    """

    def __init__(self):
        # Slope (a_) and intercept (b_) of the fitted line; None until fit() runs.
        self.a_=None
        self.b_=None

    def fit(self,x_train,y_train):
        """Fit the line ``y = a_ * x + b_`` to the training samples.

        Args:
            x_train: sequence of feature values.
            y_train: sequence of target values, same length as ``x_train``.

        Returns:
            self, so that calls can be chained.

        Raises:
            AssertionError: if ``x_train`` and ``y_train`` differ in length.
        """
        # zip() below would silently drop the unmatched tail while the means
        # are still taken over the full inputs, yielding wrong coefficients.
        assert len(x_train) == len(y_train), \
            "the size of x_train must be equal to the size of y_train"

        num=0.0
        d=0.0
        x_mean=np.mean(x_train)
        y_mean=np.mean(y_train)
        for x_i,y_i in zip(x_train,y_train):
            num+=(x_i-x_mean)*(y_i-y_mean)
            d+=(x_i-x_mean)**2
        self.a_=num/d
        self.b_=y_mean-self.a_*(x_mean)
        return self

    def _predict(self,X_train):
        """Predict element by element and return the results as an ndarray.

        Args:
            X_train: iterable of feature values.

        Returns:
            np.ndarray holding ``predict(x)`` for every ``x`` in ``X_train``.
        """
        return np.array([self.predict(x) for x in X_train])

    def predict(self,x_single):
        """Return ``a_ * x_single + b_`` for the given input.

        Raises:
            AssertionError: if the model has not been fitted yet.
        """
        # Without this check an unfitted model fails with an opaque
        # "unsupported operand" TypeError on None.
        assert self.a_ is not None and self.b_ is not None, \
            "must fit before predict!"
        return self.a_*x_single+self.b_
import numpy as np


class LinearRegression2:
    """Simple (single-feature) linear regression using vectorized dot products."""

    def __init__(self):
        # Slope and intercept of the fitted line; both None before fit().
        self.a_ = None
        self.b_ = None

    def fit(self, x_train, y_train):
        """Compute the least-squares slope ``a_`` and intercept ``b_``.

        The slope is the dot product of the centered inputs and centered
        targets divided by the dot product of the centered inputs with
        themselves. Returns ``self``.
        """
        mean_x = np.mean(x_train)
        mean_y = np.mean(y_train)
        centered_x = x_train - mean_x
        centered_y = y_train - mean_y
        numerator = centered_x.dot(centered_y)
        denominator = centered_x.dot(centered_x)
        self.a_ = numerator / denominator
        self.b_ = mean_y - self.a_ * mean_x
        return self

    def _predict(self, X_train):
        """Apply ``predict`` to each element of ``X_train``; return an ndarray."""
        predictions = list(map(self.predict, X_train))
        return np.array(predictions)

    def predict(self, x_single):
        """Evaluate the fitted line ``a_ * x_single + b_``."""
        scaled = self.a_ * x_single
        return scaled + self.b_

随机划分训练集和测试集的方法(train_test_split):

import numpy as np


def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Randomly split X and y into X_train, X_test, y_train, y_test.

    Args:
        X: array of samples; indexed along its first axis.
        y: array of targets with the same number of rows as ``X``.
        test_ratio: fraction of the samples placed in the test set, in [0, 1].
        seed: optional seed passed to ``np.random.seed`` for a reproducible
            shuffle; ``None`` leaves the global random state untouched.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        AssertionError: if the sizes of ``X`` and ``y`` differ or
            ``test_ratio`` is outside [0, 1].
    """
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"

    # Compare against None explicitly: 0 is a valid seed, and a plain
    # truthiness test would silently ignore it.
    if seed is not None:
        np.random.seed(seed)

    shuffled_indexes = np.random.permutation(len(X))

    # The first test_size shuffled indexes form the test set, the rest train.
    test_size = int(len(X) * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]

    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test

测试:

from playML.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, seed=666)
x_train.shape
(392,)
y_train.shape
(392,)
x_test.shape
(98,)
y_test.shape
(98,)
from playML.SimpleLinearRegression import SimpleLinearRegression
reg = SimpleLinearRegression()
reg.fit(x_train, y_train)
SimpleLinearRegression()
reg.a_#斜率
7.8608543562689555
reg.b_#截距
-27.459342806705543
plt.scatter(x_train, y_train)
plt.plot(x_train, reg.predict(x_train), color='r')
plt.show()

线性回归实现及线性回归中衡量算法模型的标准(R-Squared,MSE.RMSE,MAE)_第3张图片

plt.scatter(x_train, y_train)
plt.scatter(x_test, y_test, color="c")
plt.plot(x_train, reg.predict(x_train), color='r')
plt.show()

线性回归实现及线性回归中衡量算法模型的标准(R-Squared,MSE.RMSE,MAE)_第4张图片

y_predict = reg.predict(x_test)

MSE

mse_test = np.sum((y_predict - y_test)**2) / len(y_test)
mse_test#MSE和实际的y的量纲不一样
24.156602134387438

RMSE

from math import sqrt

rmse_test = sqrt(mse_test)
rmse_test#平均误差在4.91左右
4.914936635846635

MAE

mae_test = np.sum(np.absolute(y_predict - y_test))/len(y_test)
mae_test#MAE得到的结果比RMSE得到的结果小,因为RMSE有平方操作,相当于放大了较大的误差
3.5430974409463873

封装我们自己的评测函数

代码为:

import numpy as np
from math import sqrt


def accuracy_score(y_true, y_predict):
    """Return the fraction of positions where y_true equals y_predict.

    Args:
        y_true: sequence or array of ground-truth labels.
        y_predict: sequence or array of predicted labels, same length.

    Returns:
        Number of matching positions divided by ``len(y_true)``.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"

    # Coerce to arrays so that == is element-wise; comparing two plain lists
    # yields a single bool and would produce a silently wrong score.
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)

    return np.sum(y_true == y_predict) / len(y_true)

# Compute MSE
def mean_squared_error(y_true, y_predict):
    """Return the mean squared error between y_true and y_predict.

    Args:
        y_true: sequence or array of ground-truth values.
        y_predict: sequence or array of predicted values, same length.

    Returns:
        Sum of squared differences divided by ``len(y_true)``.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"

    # Coerce to arrays so plain Python lists are accepted as well
    # (list - list is not defined).
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)

    return np.sum((y_true - y_predict)**2) / len(y_true)

# Compute RMSE
def root_mean_squared_error(y_true, y_predict):
    """Return the root mean squared error between y_true and y_predict.

    This is the square root of ``mean_squared_error`` for the same inputs.
    """
    mse = mean_squared_error(y_true, y_predict)
    return sqrt(mse)

# Compute MAE
def mean_absolute_error(y_true, y_predict):
    """Return the mean absolute error between y_true and y_predict.

    Args:
        y_true: sequence or array of ground-truth values.
        y_predict: sequence or array of predicted values, same length.

    Returns:
        Sum of absolute differences divided by ``len(y_true)``.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"

    # Coerce to arrays so plain Python lists are accepted as well
    # (list - list is not defined).
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)

    return np.sum(np.absolute(y_true - y_predict)) / len(y_true)

测试前面写的算法:

from playML.metrics import mean_squared_error
from playML.metrics import root_mean_squared_error
from playML.metrics import mean_absolute_error
mean_squared_error(y_test, y_predict)
24.156602134387438
root_mean_squared_error(y_test, y_predict)
4.914936635846635
mean_absolute_error(y_test, y_predict)
3.5430974409463873

scikit-learn中的MSE和MAE

没有RMSE,对MSE求平方根即可得到

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
mean_squared_error(y_test, y_predict)
24.156602134387438
mean_absolute_error(y_test, y_predict)
3.5430974409463873

R Squared

R^2 越靠近1算法精确度越好

MSE
mse_test=np.sum((y_predict-y_test)**2)/len(y_test)

RMSE
rmse_test=sqrt(mse_test)

MAE
mae_test=np.sum((np.absolute(y_predict-y_test))/len(y_test))

R Square

from playML.metrics import mean_squared_error

1 - mean_squared_error(y_test, y_predict)/np.var(y_test)

封装我们自己的 R Square(r2_score)

def r2_score(y_true, y_predict):
    """Return the R Square score between y_true and y_predict.

    Computed as one minus the ratio of ``mean_squared_error(y_true,
    y_predict)`` to ``np.var(y_true)``.
    """
    mse = mean_squared_error(y_true, y_predict)
    variance = np.var(y_true)
    return 1 - mse / variance
from playML.metrics import r2_score

r2_score(y_test, y_predict)
0.61293168039373225

scikit-learn中的 r2_score

from sklearn.metrics import r2_score

r2_score(y_test, y_predict)
0.61293168039373236

scikit-learn中的LinearRegression中的score返回r2_score:http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

在我们的SimpleLinearRegression中添加score

reg.score(x_test, y_test)
0.61293168039373225

你可能感兴趣的:(机器学习)