MSE: Mean Squared Error, mean_squared_error
RMSE: Root Mean Squared Error, root_mean_squared_error
MAE: Mean Absolute Error, mean_absolute_error
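For reference, a sketch of the three definitions, written for m samples with true values y^(i) and predictions ŷ^(i):

MSE = \frac{1}{m} \sum_{i=1}^{m} (\hat{y}^{(i)} - y^{(i)})^2
RMSE = \sqrt{\frac{1}{m} \sum_{i=1}^{m} (\hat{y}^{(i)} - y^{(i)})^2}
MAE = \frac{1}{m} \sum_{i=1}^{m} \left| \hat{y}^{(i)} - y^{(i)} \right|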
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
boston = datasets.load_boston()
boston.keys()
dict_keys(['data', 'target', 'feature_names', 'DESCR'])
Feature names:
boston.feature_names
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'],
      dtype='<U7')
x = boston.data[:, 5]  # use only the RM feature (number of rooms): all rows, column index 5
x.shape
(506,)
y = boston.target
y.shape
(506,)
plt.scatter(x, y)
plt.show()
np.max(y)
50.0
x = x[y < 50.0]  # points capped at the maximum value (50.0) may not be the true prices
y = y[y < 50.0]
x.shape
(490,)
y.shape
(490,)
plt.scatter(x, y)
plt.show()
Encapsulated as a class:
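The fit method below implements the standard closed-form least-squares solution for simple linear regression ŷ = a·x + b (a sketch of the math the code follows):

a = \frac{\sum_{i=1}^{m} (x^{(i)} - \bar{x})(y^{(i)} - \bar{y})}{\sum_{i=1}^{m} (x^{(i)} - \bar{x})^2}, \qquad b = \bar{y} - a\bar{x}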
import numpy as np

class LinearRegression1:
    """Simple linear regression, fitted with an explicit for loop."""

    def __init__(self):
        self.a_ = None  # slope
        self.b_ = None  # intercept

    def fit(self, x_train, y_train):
        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)
        num = 0.0  # numerator:   sum of (x_i - x_mean) * (y_i - y_mean)
        d = 0.0    # denominator: sum of (x_i - x_mean) ** 2
        for x_i, y_i in zip(x_train, y_train):
            num += (x_i - x_mean) * (y_i - y_mean)
            d += (x_i - x_mean) ** 2
        self.a_ = num / d
        self.b_ = y_mean - self.a_ * x_mean
        return self

    def predict(self, x_predict):
        """Predict for a vector of feature values."""
        return np.array([self._predict(x) for x in x_predict])

    def _predict(self, x_single):
        """Predict for a single feature value."""
        return self.a_ * x_single + self.b_
import numpy as np

class LinearRegression2:
    """Simple linear regression, fitted with vectorized dot products."""

    def __init__(self):
        self.a_ = None  # slope
        self.b_ = None  # intercept

    def fit(self, x_train, y_train):
        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)
        # Replace the for loop with two dot products over the centered vectors.
        num = (x_train - x_mean).dot(y_train - y_mean)
        d = (x_train - x_mean).dot(x_train - x_mean)
        self.a_ = num / d
        self.b_ = y_mean - self.a_ * x_mean
        return self

    def predict(self, x_predict):
        """Predict for a vector of feature values."""
        return np.array([self._predict(x) for x in x_predict])

    def _predict(self, x_single):
        """Predict for a single feature value."""
        return self.a_ * x_single + self.b_
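A minimal sanity check (the toy data below is made up for this sketch) confirming that the loop-based and vectorized versions produce the same slope and intercept:

import numpy as np

# Hypothetical toy data: y is roughly 2*x + 1 plus a little noise.
x_toy = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y_toy = np.array([3.1, 4.9, 7.2, 9.0, 10.8])

reg1 = LinearRegression1().fit(x_toy, y_toy)
reg2 = LinearRegression2().fit(x_toy, y_toy)

print(reg1.a_, reg1.b_)  # slope and intercept from the for-loop version
print(reg2.a_, reg2.b_)  # identical values from the vectorized version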
Random train/test split:
import numpy as np

def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Split X and y into X_train, X_test, y_train, y_test according to test_ratio."""
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"

    if seed is not None:
        np.random.seed(seed)

    # Shuffle the indices, then use the first test_size of them as the test set.
    shuffled_indexes = np.random.permutation(len(X))
    test_size = int(len(X) * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]
    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test
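scikit-learn ships an equivalent helper, sklearn.model_selection.train_test_split; a sketch of the same 80/20 split done with it (test_size and random_state play the roles of test_ratio and seed):

from sklearn.model_selection import train_test_split as sk_train_test_split

# Same split on the x and y prepared above, with a fixed random state for reproducibility.
x_train, x_test, y_train, y_test = sk_train_test_split(
    x, y, test_size=0.2, random_state=666)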
Testing:
from playML.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, seed=666)
x_train.shape
(392,)
y_train.shape
(392,)
x_test.shape
(98,)
y_test.shape
(98,)
from playML.SimpleLinearRegression import SimpleLinearRegression
reg = SimpleLinearRegression()
reg.fit(x_train, y_train)
SimpleLinearRegression()
reg.a_  # slope
7.8608543562689555
reg.b_  # intercept
-27.459342806705543
plt.scatter(x_train, y_train)
plt.plot(x_train, reg.predict(x_train), color='r')
plt.show()
plt.scatter(x_train, y_train)
plt.scatter(x_test, y_test, color="c")
plt.plot(x_train, reg.predict(x_train), color='r')
plt.show()
y_predict = reg.predict(x_test)
mse_test = np.sum((y_predict - y_test)**2) / len(y_test)
mse_test  # the units of MSE are not the same as those of y (they are squared)
24.156602134387438
from math import sqrt
rmse_test = sqrt(mse_test)
rmse_test  # the average error is about 4.91, in the same units as y
4.914936635846635
mae_test = np.sum(np.absolute(y_predict - y_test))/len(y_test)
mae_test  # MAE is smaller than RMSE because RMSE squares the errors, which amplifies the larger ones
3.5430974409463873
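A small illustration (toy numbers, made up for this sketch) of why RMSE ≥ MAE: squaring weights the large errors more heavily:

import numpy as np

# Hypothetical residuals: three small errors and one large one.
errors = np.array([1.0, 1.0, 1.0, 9.0])

mae = np.mean(np.abs(errors))         # (1 + 1 + 1 + 9) / 4 = 3.0
rmse = np.sqrt(np.mean(errors ** 2))  # sqrt((1 + 1 + 1 + 81) / 4) ≈ 4.58

print(mae, rmse)  # the single large error dominates RMSE much more than MAE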
The metrics code:
import numpy as np
from math import sqrt

def accuracy_score(y_true, y_predict):
    """Compute the classification accuracy between y_true and y_predict."""
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"
    return np.sum(y_true == y_predict) / len(y_true)

# Compute MSE
def mean_squared_error(y_true, y_predict):
    """Compute the MSE between y_true and y_predict."""
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"
    return np.sum((y_true - y_predict) ** 2) / len(y_true)

# Compute RMSE
def root_mean_squared_error(y_true, y_predict):
    """Compute the RMSE between y_true and y_predict."""
    return sqrt(mean_squared_error(y_true, y_predict))

# Compute MAE
def mean_absolute_error(y_true, y_predict):
    """Compute the MAE between y_true and y_predict."""
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"
    return np.sum(np.absolute(y_true - y_predict)) / len(y_true)
Test the functions written above:
from playML.metrics import mean_squared_error
from playML.metrics import root_mean_squared_error
from playML.metrics import mean_absolute_error
mean_squared_error(y_test, y_predict)
24.156602134387438
root_mean_squared_error(y_test, y_predict)
4.914936635846635
mean_absolute_error(y_test, y_predict)
3.5430974409463873
scikit-learn does not provide an RMSE function here; taking the square root of its MSE gives the same value:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
mean_squared_error(y_test, y_predict)
24.156602134387438
mean_absolute_error(y_test, y_predict)
3.5430974409463873
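A sketch of getting RMSE from scikit-learn's MSE (some newer scikit-learn versions also accept mean_squared_error(..., squared=False), which returns RMSE directly):

from math import sqrt
from sklearn.metrics import mean_squared_error

rmse_test = sqrt(mean_squared_error(y_test, y_predict))
rmse_test  # about 4.91, matching root_mean_squared_error above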
Recap of the three error metrics:
MSE:
mse_test = np.sum((y_predict - y_test) ** 2) / len(y_test)
RMSE:
rmse_test = sqrt(mse_test)
MAE:
mae_test = np.sum(np.absolute(y_predict - y_test)) / len(y_test)
R^2: the closer it is to 1, the better the model fits.
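The definition behind the computation below: R^2 compares the model's squared error against the squared error of always predicting the mean, which is why it reduces to 1 minus MSE over the variance of y:

R^2 = 1 - \frac{\sum_{i=1}^{m} (\hat{y}^{(i)} - y^{(i)})^2}{\sum_{i=1}^{m} (\bar{y} - y^{(i)})^2} = 1 - \frac{MSE(\hat{y}, y)}{Var(y)}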
from playML.metrics import mean_squared_error
1 - mean_squared_error(y_test, y_predict)/np.var(y_test)
def r2_score(y_true, y_predict):
    """Compute the R Square between y_true and y_predict."""
    return 1 - mean_squared_error(y_true, y_predict) / np.var(y_true)
from playML.metrics import r2_score
r2_score(y_test, y_predict)
0.61293168039373225
from sklearn.metrics import r2_score
r2_score(y_test, y_predict)
0.61293168039373236
The score method of LinearRegression in scikit-learn returns the r2_score: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
reg.score(x_test, y_test)
0.61293168039373225