def preprocess():
    """Load the Boston housing data, visualize feature/price relations,
    keep the most relevant features, min-max normalize them, and split
    into train/test sets.

    Returns:
        Xtrain, Xtest, Ytrain, Ytest: 70/30 split (random_state=10).
        X: full normalized feature matrix (selected columns only).
    """
    # Load the dataset once instead of three separate boston() calls.
    dataset = boston()
    X = dataset.data
    y = dataset.target
    name_data = dataset.feature_names
    # Scatter-plot each feature against the price to eyeball relevance.
    plt.figure()
    for i in range(len(X[0])):
        plt.subplot(4, 4, i + 1)
        plt.scatter(X[:, i], y, s=20)
        plt.title(name_data[i])
    plt.show()
    # Drop the weakly correlated columns; keeps indices 2, 5, 10, 12
    # (INDUS, RM, PTRATIO, LSTAT in the standard Boston feature order).
    X = np.delete(X, [0, 1, 3, 4, 6, 7, 8, 9, 11], axis=1)
    # Min-max normalize each remaining feature column to [0, 1].
    for i in range(len(X[0])):
        col = X[:, i]
        X[:, i] = (col - col.min()) / (col.max() - col.min())
    # Fixed seed so both models see the identical split.
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=10)
    return Xtrain, Xtest, Ytrain, Ytest, X
这个直接贴代码:
def lr(Xtrain, Xtest, Ytrain, Ytest, if_figure):
    """Fit sklearn LinearRegression and report the test-set MSE.

    Args:
        Xtrain, Xtest, Ytrain, Ytest: train/test split of the data.
        if_figure: if True, plot real vs. predicted test targets.

    Returns:
        Mean squared error of the predictions on the test set.
    """
    reg = LR().fit(Xtrain, Ytrain)
    y_pred = reg.predict(Xtest)
    loss = mean_squared_error(Ytest, y_pred)
    print("*************LR*****************")
    print("w\t= {}".format(reg.coef_))
    print("b\t= {:.4f}".format(reg.intercept_))
    # Overlay predictions on the ground truth for a visual sanity check.
    if if_figure:
        plt.figure()
        plt.plot(range(len(Ytest)), Ytest, c="blue", label="real")
        plt.plot(range(len(y_pred)), y_pred, c="red", linestyle=':', label="predict")
        # Fixed typo in the title: "row LR" -> "raw LR".
        plt.title("predict results from raw LR")
        plt.legend()
        plt.show()
    return loss
梯度下降主要思想就是以梯度作为每次迭代优化的方向,以步长更新参数,直到最优
为了两种方法比对方便,这里也使用均方误差(MSE)作为Loss函数
def gradDescnet(Xtrain, Xtest, Ytrain, Ytest, X, if_figure, rate):
    """Train linear regression by per-sample (stochastic) gradient descent.

    Args:
        Xtrain, Xtest, Ytrain, Ytest: train/test split of the data.
        X: full feature matrix; only used for its column count.
        if_figure: if True, plot the loss curves and the predictions.
        rate: learning-rate step size.

    Returns:
        (final test MSE, weight row vector w of shape (1, n_features), bias b)
    """
    def grad(y, yp, x):
        # Gradient of 0.5 * (y - yp)^2 w.r.t. w and b (the 1/2 factor is
        # absorbed; it only rescales the effective learning rate).
        grad_w = (y - yp) * (-x)
        grad_b = (y - yp) * (-1)
        return grad_w, grad_b

    # Training hyperparameters.
    epoch_train = 100
    learning_rate = rate
    # Random N(0, 1) initialization of the weight row vector.
    w = np.random.normal(0.0, 1.0, (1, len(X[0])))
    b = 0.0
    loss_train = []
    loss_test = []
    for epoch in range(epoch_train + 1):
        loss1 = 0
        for i in range(len(Xtrain)):
            yp = w.dot(Xtrain[i]) + b
            # Accumulate the squared error for the epoch's training loss.
            err = Ytrain[i] - yp
            loss1 += err ** 2
            # Compute the gradient ONCE per sample (the original called
            # grad() twice, doubling the inner-loop work), then update.
            gw, gb = grad(Ytrain[i], yp, Xtrain[i])
            w = w - learning_rate * gw
            b = b - learning_rate * gb
        # Record mean training loss for this epoch.
        loss_train.append(loss1 / len(Xtrain))
        # Evaluate mean squared error on the held-out test set.
        loss11 = 0
        for i in range(len(Xtest)):
            yp2 = w.dot(Xtest[i]) + b
            err2 = Ytest[i] - yp2
            loss11 += err2 ** 2
        loss_test.append(loss11 / len(Xtest))
        # Reshuffle the training data between epochs (stochastic ordering).
        Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    # Plot train/test loss curves over epochs.
    if if_figure:
        plt.figure()
        plt.title("figure of loss")
        plt.plot(range(len(loss_train)), loss_train, c="blue", linestyle=":", label="train")
        plt.plot(range(len(loss_test)), loss_test, c="red", label="test")
        plt.legend()
        plt.show()
    # Plot predictions against ground truth on the test set.
    if if_figure:
        Predict_value = []
        for i in range(len(Xtest)):
            Predict_value.append(w.dot(Xtest[i]) + b)
        plt.figure()
        plt.title("predict results from gradScent")
        plt.plot(range(len(Xtest)), Ytest, c="blue", label="real")
        plt.plot(range(len(Xtest)), Predict_value, c="red", linestyle=':', label="predict")
        plt.legend()
        plt.show()
    return loss_test[-1], w, b
为了最终代码整洁,这里也封装为一个函数
梯度下降的步长选择0.01,这个超参数在下一部分会进行优化选择
def test():
    """Run both models once (figures enabled, SGD rate 0.01) and print
    the learned parameters plus a side-by-side loss comparison."""
    show_figures = True
    Xtrain, Xtest, Ytrain, Ytest, X = preprocess()
    # Baseline: sklearn's closed-form linear regression.
    loss_lr = lr(Xtrain, Xtest, Ytrain, Ytest, show_figures)
    # Hand-rolled stochastic gradient descent with a fixed step size.
    loss_gd, w, b = gradDescnet(Xtrain, Xtest, Ytrain, Ytest, X, show_figures, 0.01)
    print("*************GD*****************")
    print("w\t: {}".format(w))
    print("b\t: {}".format(b))
    print("************loss****************")
    print("lr\t: %.4f" % loss_lr)
    print("gd\t: %.4f" % loss_gd)
输出结果:
*************LR*****************
w = [ -0.39200523 21.25173835 -8.18006811 -21.61002144]
b = 23.0543
*************GD*****************
w : [[ -0.43534889 21.65996503 -8.10720196 -21.3622824 ]]
b : [22.83733711]
************loss****************
lr : 31.4272
gd : 31.2842
由于迭代步长过小容易造成更新速度慢,而过大容易导致错过最优点
这里选择从0.001到0.05之间,输出步长和loss值的关系
同样封装成一个函数
def searchRate():
    """Sweep SGD learning rates over [0.001, 0.05) in 0.001 steps, plot
    test loss vs. rate, and report the best run against the LR baseline."""
    show_figures = False
    Xtrain, Xtest, Ytrain, Ytest, X = preprocess()
    loss_grad, w_grad, b_grad = [], [], []
    rates = list(np.arange(0.001, 0.05, 0.001))
    # One full SGD training run per candidate learning rate.
    for run, rate in enumerate(rates, start=1):
        loss, w, b = gradDescnet(Xtrain, Xtest, Ytrain, Ytest, X, show_figures, rate)
        loss_grad.append(loss[0])
        w_grad.append(w)
        b_grad.append(b)
        print("epoch %d: %.4f" % (run, loss_grad[-1]))
    # Visualize how the final test loss varies with the step size.
    plt.figure()
    plt.plot(rates, loss_grad)
    plt.title("loss under different rate")
    plt.show()
    # Select the run with the smallest test loss.
    loss_grad_min = min(loss_grad)
    best = loss_grad.index(loss_grad_min)
    w = w_grad[best]
    b = b_grad[best]
    rate = rates[best]
    loss_lr = lr(Xtrain, Xtest, Ytrain, Ytest, show_figures)
    print("*************GD*****************")
    print("w\t: {}".format(w))
    print("b\t: {}".format(b))
    print("rate: %.3f" % rate)
    print("************loss****************")
    print("lr\t: %.4f" % loss_lr)
    print("gd\t: %.4f" % loss_grad_min)
输出结果:
epoch 1: 35.1047
epoch 2: 31.9512
epoch 3: 31.6400
epoch 4: 31.8814
epoch 5: 31.3429
epoch 6: 31.7260
epoch 7: 31.5825
epoch 8: 31.5523
epoch 9: 32.4876
epoch 10: 31.4287
epoch 11: 31.1475
epoch 12: 32.0841
epoch 13: 32.0033
epoch 14: 31.5768
epoch 15: 31.1828
epoch 16: 31.6558
epoch 17: 32.2582
epoch 18: 32.4916
epoch 19: 31.2118
epoch 20: 32.2877
epoch 21: 31.7237
epoch 22: 32.1203
epoch 23: 32.7307
epoch 24: 32.7434
epoch 25: 32.6421
epoch 26: 31.8588
epoch 27: 31.1762
epoch 28: 33.0360
epoch 29: 32.5580
epoch 30: 32.4591
epoch 31: 31.4191
epoch 32: 31.1398
epoch 33: 31.4291
epoch 34: 31.3900
epoch 35: 31.2239
epoch 36: 31.4200
epoch 37: 31.2967
epoch 38: 32.5322
epoch 39: 32.3174
epoch 40: 34.3984
epoch 41: 31.1794
epoch 42: 31.8992
epoch 43: 32.0060
epoch 44: 34.0944
epoch 45: 34.3244
epoch 46: 31.1479
epoch 47: 32.8374
epoch 48: 31.7111
epoch 49: 33.6676
*************LR*****************
w = [ -0.39200523 21.25173835 -8.18006811 -21.61002144]
b = 23.0543
*************GD*****************
w : [[ -0.29030409 21.60092767 -8.02647596 -21.79164094]]
b : [23.35049725]
rate: 0.032
************loss****************
lr : 31.4272
gd : 31.1398
可见调整步长对于最终结果还是有较大影响的
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston as boston
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import mean_squared_error