使用pytorch实现
2.2 线性回归
2.2.1 数据集构建
构造一个小的回归数据集:
生成 150 个带噪音的样本,其中 100 个训练样本,50 个测试样本,并打印出训练数据的可视化分布。
import math
import os

import torch
from matplotlib import pyplot as plt
def linear_func(x, w=1.2, b=0.5):
    """Evaluate the affine map ``w * x + b``.

    Args:
        x: Input value(s). Anything that supports ``*`` and ``+`` with
            numbers works; the callers in this file pass ``torch.Tensor``.
        w: Slope of the line. Defaults to 1.2.
        b: Intercept of the line. Defaults to 0.5.

    Returns:
        The result of ``w * x + b``.
    """
    return w * x + b
def create_toy_data(func, interval, sample_num, noise=0.0, add_outlier=False, outlier_ratio=0.001):
    """Sample a noisy 1-D toy dataset from ``func``.

    Args:
        func: Callable mapping a tensor of inputs to a tensor of targets.
        interval: Pair ``(low, high)``; inputs are drawn uniformly from it.
        sample_num: Number of samples to draw.
        noise: Standard deviation of the zero-mean Gaussian noise added to
            the targets.
        add_outlier: When true, a fraction of the targets is multiplied by 5.
        outlier_ratio: Fraction of samples turned into outliers; the count is
            ``int(sample_num * outlier_ratio)``.

    Returns:
        Tuple ``(X, y)`` of 1-D tensors with ``sample_num`` elements each.
    """
    low, high = interval[0], interval[1]
    X = torch.rand(size=[sample_num]) * (high - low) + low
    y = func(X)
    y = y + torch.normal(0, noise, y.shape)
    if add_outlier:
        outlier_num = int(len(y) * outlier_ratio)
        if outlier_num != 0:
            # Draw the indices without replacement so that exactly
            # ``outlier_num`` distinct samples are perturbed (sampling with
            # replacement could pick the same index more than once).
            outlier_idx = torch.randperm(len(y))[:outlier_num]
            y[outlier_idx] = y[outlier_idx] * 5
    return X, y
# Build the linear-regression toy data and plot it.
func = linear_func
interval = (-10, 10)
train_num = 100
test_num = 50
# Passed to create_toy_data as the standard deviation of the Gaussian noise.
noise = 2
X_train, y_train = create_toy_data(func=func, interval=interval, sample_num=train_num, noise = noise, add_outlier = False)
X_test, y_test = create_toy_data(func=func, interval=interval, sample_num=test_num, noise = noise, add_outlier = False)
# Larger 5000-sample training set; it is created here but not used in this block.
X_train_large, y_train_large = create_toy_data(func=func, interval=interval, sample_num=5000, noise = noise, add_outlier = False)
# Noise-free reference line, evaluated on train_num evenly spaced points.
X_underlying = torch.linspace(interval[0],interval[1],train_num)
y_underlying = linear_func(X_underlying)
# Scatter the noisy train/test samples and overlay the noise-free line.
plt.scatter(X_train, y_train, marker='*', facecolor="none", edgecolor='#e4007f', s=50, label="train data")
plt.scatter(X_test, y_test, facecolor="none", edgecolor='#f19ec2', s=50, label="test data")
plt.plot(X_underlying, y_underlying, c='#000000', label=r"underlying distribution")
plt.legend(fontsize='x-large')
plt.show()
2.2.2 模型构建
# Seed the global RNG before the class is used; the weights below are drawn
# with torch.randn.
torch.manual_seed(10)


class Linear(Op):
    """Linear operator computing ``X @ w + b``.

    Parameters live in ``self.params``: ``'w'`` starts as a random
    ``(input_size, 1)`` float32 tensor and ``'b'`` as a zero tensor of
    shape ``(1,)``.

    NOTE(review): the base class ``Op`` is not defined in this file; it is
    presumably provided by the surrounding project - confirm.
    """

    def __init__(self, input_size):
        """Create the operator for inputs with ``input_size`` features."""
        self.input_size = input_size
        self.params = {}
        self.params['w'] = torch.randn(size=[self.input_size, 1], dtype=torch.float32)
        self.params['b'] = torch.zeros(size=[1], dtype=torch.float32)

    def __call__(self, X):
        """Alias for :meth:`forward`."""
        return self.forward(X)

    def forward(self, X):
        """Apply the operator to a 2-D input.

        Args:
            X: Tensor of shape ``(N, D)``.

        Returns:
            ``X @ w + b``. When ``input_size`` is 0 the model is a constant
            and an ``(N, 1)`` tensor filled with the bias is returned.

        Raises:
            AssertionError: If ``D`` differs from ``input_size`` (only
                checked when ``input_size`` is non-zero).
        """
        N, D = X.shape
        if self.input_size == 0:
            # torch.full expects a scalar fill value. The bias may be a
            # one-element tensor of shape (1,) (as created in __init__) or a
            # 0-dim tensor, so convert it to a Python float first.
            return torch.full([N, 1], float(self.params['b']))
        assert D == self.input_size
        return torch.matmul(X, self.params['w']) + self.params['b']
# Smoke test of the Linear operator: N random samples with input_size features.
input_size = 3
N = 2
X = torch.randn(N, input_size)
model = Linear(input_size)
# Calling the model dispatches to Linear.forward.
y_pred = model(X)
print("y_pred:", y_pred)
y_pred: tensor([[1.8529],
[0.6011]])
2.2.3 损失函数
回归任务中常用的评估指标是均方误差
均方误差(mean-square error, MSE)是反映估计量与被估计量之间差异程度的一种度量。
def mean_squared_error(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Args:
        y_true: Tensor of ground-truth values; the first dimension is the
            number of samples.
        y_pred: Tensor of predictions with the same number of samples.

    Returns:
        0-dim tensor holding ``mean((y_true - y_pred) ** 2)``.

    Raises:
        AssertionError: If the two tensors differ in their first dimension.
    """
    assert y_true.shape[0] == y_pred.shape[0]
    # Shapes such as (N, 1) and (N,) would otherwise broadcast to (N, N) and
    # silently average over all pairs; align them when the element counts
    # agree.
    if y_pred.shape != y_true.shape and y_pred.numel() == y_true.numel():
        y_pred = y_pred.reshape(y_true.shape)
    return torch.mean(torch.square(y_true - y_pred))
# Two-sample check of mean_squared_error; .item() converts the result to a Python float.
y_true = torch.tensor([[-0.2], [4.9]], dtype=torch.float32)
y_pred = torch.tensor([[1.3], [2.5]], dtype=torch.float32)
error = mean_squared_error(y_true=y_true, y_pred=y_pred).item()
print("error:", error)
error: 4.005000114440918
【注意:代码实现中没有除2】思考:没有除2合理么?谈谈自己的看法,写到实验报告。
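合理。系数 1/2 只是为了在对平方项求导时与产生的因子 2 相消,使梯度表达式更简洁;它只是对损失做常数缩放,不改变最优参数的位置,因此不影响求解结果。此外这里的均方误差同时用作评价指标,不除 2 才是标准的 MSE 定义。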
2.2.4 模型优化
经验风险 ( Empirical Risk ),即在训练集上的平均损失。
def optimizer_lsm(model, X, y, reg_lambda=0):
    """Fit ``model`` in closed form by (regularised) least squares.

    The features are centred, the weights are obtained from the normal
    equations ``(Xc^T Xc + reg_lambda * I) w = Xc^T (y - mean(y))`` and the
    bias is recovered from the means.

    Args:
        model: Object with a ``params`` dict; its ``'w'`` and ``'b'``
            entries are overwritten.
        X: Feature tensor of shape ``(N, D)``.
        y: Target tensor with ``N`` samples (``(N, 1)`` or ``(N,)``).
        reg_lambda: L2 regularisation coefficient. Defaults to 0.

    Returns:
        The same ``model`` object, with ``params['w']`` of shape ``(D,)``
        (the trailing dimension is squeezed) and ``params['b']`` set.
    """
    _, D = X.shape

    # Per-feature mean. A 1-D tensor needs no transpose; ``.T`` on a tensor
    # that is not 2-D is deprecated in PyTorch.
    x_bar = torch.mean(X, 0)
    y_bar = torch.mean(y)
    x_sub = X - x_bar

    # Every feature is constant: the best predictor is the mean target.
    if torch.all(x_sub == 0):
        model.params['b'] = y_bar
        model.params['w'] = torch.zeros([D])
        return model

    gram = torch.matmul(x_sub.T, x_sub) + reg_lambda * torch.eye(D, dtype=X.dtype, device=X.device)
    # Solve the linear system directly instead of forming the explicit
    # inverse of the Gram matrix.
    w = torch.linalg.solve(gram, torch.matmul(x_sub.T, y - y_bar))
    b = y_bar - torch.matmul(x_bar, w)

    model.params['b'] = b
    model.params['w'] = torch.squeeze(w, -1)
    return model
思考1. 为什么省略了 $\frac{1}{N}$ 不影响效果?
这里的1是N维的全1向量。
思考 2. 什么是最小二乘法 ( Least Square Method , LSM )
最小二乘法是一种在误差估计、不确定度、系统辨识及预测、预报等数据处理诸多学科领域得到广泛应用的数学工具。
回答以上问题,写到实验报告。
2.2.5 模型训练
在准备了数据、模型、损失函数和参数学习的实现之后,开始模型的训练。
在回归任务中,模型的评价指标和损失函数一致,都为均方误差。
通过上文实现的线性回归类来拟合训练数据,并输出模型在训练集上的损失。
# Fit a one-feature linear model on the training data and report the training MSE.
input_size = 1
model = Linear(input_size)
# The data tensors are 1-D, so reshape them to column vectors of shape (N, 1).
model = optimizer_lsm(model,X_train.reshape([-1,1]),y_train.reshape([-1,1]))
print("w_pred:", model.params['w'].item(), "b_pred: ", model.params['b'].item())
# squeeze() flattens the prediction so it lines up with the 1-D targets.
y_train_pred = model(X_train.reshape([-1,1])).squeeze()
train_error = mean_squared_error(y_true=y_train, y_pred=y_train_pred).item()
print("train error: ", train_error)
w_pred: 1.2271720170974731 b_pred: 0.37986236810684204
train error: 3.632181406021118
2.2.6 模型评估
用训练好的模型预测一下测试集的标签,并计算在测试集上的损失。
# Predict on the test inputs with the fitted model and report the test MSE.
y_test_pred = model(X_test.reshape([-1,1])).squeeze()
test_error = mean_squared_error(y_true=y_test, y_pred=y_test_pred).item()
print("test error: ",test_error)
test error: 4.306798458099365
2.2.7 样本数量 & 正则化系数
(1) 调整训练数据的样本数量,由 100 调整到 5000,观察对模型性能的影响。
(2) 调整正则化系数,观察对模型性能的影响。
2.3 多项式回归
2.3.1 数据集构建
构建训练和测试数据,其中:
训练样本 15 个,测试样本 10 个,高斯噪声标准差为 0.1,自变量范围为 (0,1)。
from matplotlib import pyplot as plt
import torch
import math
def sin(x):
    """Return ``sin(2 * pi * x)`` element-wise.

    Args:
        x: Input tensor.

    Returns:
        Tensor of the same shape as ``x``; one full sine period spans an
        interval of length 1 in ``x``.
    """
    return torch.sin(2 * math.pi * x)
def create_toy_data(func, interval, sample_num, noise=0.0, add_outlier=False, outlier_ratio=0.001):
    """Sample a noisy 1-D toy dataset from ``func``.

    Args:
        func: Callable mapping a tensor of inputs to a tensor of targets.
        interval: Pair ``(low, high)``; inputs are drawn uniformly from it.
        sample_num: Number of samples to draw.
        noise: Standard deviation of the zero-mean Gaussian noise added to
            the targets.
        add_outlier: When true, a fraction of the targets is multiplied by 5.
        outlier_ratio: Fraction of samples turned into outliers; the count is
            ``int(sample_num * outlier_ratio)``.

    Returns:
        Tuple ``(X, y)`` of 1-D tensors with ``sample_num`` elements each.
    """
    low, high = interval[0], interval[1]
    X = torch.rand(size=[sample_num]) * (high - low) + low
    y = func(X)
    y = y + torch.normal(0, noise, y.shape)
    if add_outlier:
        outlier_num = int(len(y) * outlier_ratio)
        if outlier_num != 0:
            # Draw the indices without replacement so that exactly
            # ``outlier_num`` distinct samples are perturbed (sampling with
            # replacement could pick the same index more than once).
            outlier_idx = torch.randperm(len(y))[:outlier_num]
            y[outlier_idx] = y[outlier_idx] * 5
    return X, y
# Build the polynomial-regression toy data from sin and plot it.
func = sin
interval = (0, 1)
train_num = 15
test_num = 10
noise = 0.5 # the section text specifies 0.1; this script uses 0.5
X_train, y_train = create_toy_data(func=func, interval=interval, sample_num=train_num, noise=noise)
X_test, y_test = create_toy_data(func=func, interval=interval, sample_num=test_num, noise=noise)
# Noise-free reference curve on 100 evenly spaced points.
X_underlying = torch.linspace(interval[0], interval[1], 100)
y_underlying = sin(X_underlying)
plt.rcParams['figure.figsize'] = (8.0, 6.0)
plt.scatter(X_train, y_train, facecolor="none", edgecolor='#e4007f', s=50, label="train data")
plt.scatter(X_test, y_test, facecolor="none", edgecolor="b", s=50, label="test data")
plt.plot(X_underlying, y_underlying, c='#000000', label=r"$\sin(2\pi x)$")
plt.legend(fontsize='x-large')
# Write the figure to disk before showing it.
plt.savefig('ml-vis2.pdf')
plt.show()
2.3.2 模型构建
套用求解线性回归参数的方法来求解多项式回归参数
def polynomial_basis_function(x, degree=2):
    """Expand ``x`` into the polynomial features ``[x, x^2, ..., x^degree]``.

    Args:
        x: Input tensor; the powers are concatenated along its last
            dimension (the callers in this file pass shape ``(N, 1)``).
        degree: Highest power to generate. Defaults to 2.

    Returns:
        For ``degree == 0`` a tensor of ones with the shape of ``x``; for
        ``degree == 1`` ``x`` itself; otherwise the powers ``1..degree``
        concatenated along the last dimension.
    """
    if degree == 0:
        return torch.ones(x.shape)
    if degree < 2:
        # Degree 1 needs no expansion. Negative degrees also fall through
        # to here and return x unchanged, as before.
        return x

    # Build each power from the previous one, then concatenate once instead
    # of re-concatenating the growing result on every iteration.
    powers = [x]
    for _ in range(2, degree + 1):
        powers.append(torch.multiply(powers[-1], x))
    return torch.cat(powers, dim=-1)
# Show the polynomial expansion on a tiny three-sample example.
data = [[2], [3], [4]]
X = torch.tensor(data=data)
# The literal holds Python ints; convert the tensor to float32.
X = X.to(torch.float32)
degree = 3
transformed_X = polynomial_basis_function(X, degree=degree)
print("转换前:", X)
print("阶数为", degree, "转换后:", transformed_X)
2.3.3 模型训练
对于多项式回归,我们可以同样使用前面线性回归中定义的 Linear 算子、参数优化函数 optimizer_lsm 和均方误差函数 mean_squared_error。
plt.rcParams['figure.figsize'] = (12.0, 8.0)

# Fit one polynomial model per degree and draw each fit in its own cell of a
# 2x2 grid, together with the training points and the noise-free sine curve.
for i, degree in enumerate([0, 1, 3, 8]):
    model = Linear(degree)
    # Expand the 1-D inputs into polynomial features of the current degree.
    X_train_transformed = polynomial_basis_function(X_train.reshape([-1, 1]), degree)
    X_underlying_transformed = polynomial_basis_function(X_underlying.reshape([-1, 1]), degree)

    model = optimizer_lsm(model, X_train_transformed, y_train.reshape([-1, 1]))

    y_underlying_pred = model(X_underlying_transformed).squeeze()
    print(model.params)

    plt.subplot(2, 2, i + 1)
    plt.scatter(X_train, y_train, facecolor="none", edgecolor='#e4007f', s=50, label="train data")
    plt.plot(X_underlying, y_underlying, c='#000000', label=r"$\sin(2\pi x)$")
    plt.plot(X_underlying, y_underlying_pred, c='#f19ec2', label="predicted function")
    plt.ylim(-2, 1.5)
    plt.annotate("M={}".format(degree), xy=(0.95, -1.4))
    plt.legend(loc='lower left', fontsize='x-large')

# Save and show the figure once all four subplots are drawn.
plt.savefig('ml-vis3.pdf')
plt.show()
2.3.4 模型评估
通过均方误差来衡量训练误差、测试误差,以及在不加入噪音的情况下 sin 函数值与多项式回归预测值之间的误差,从而更加真实地反映拟合结果。多项式阶数从 0 到 8 进行遍历。
对于模型过拟合的情况,可以引入正则化方法,通过向误差函数中添加一个惩罚项来避免系数倾向于较大的取值。
# MSE per polynomial degree on the training set, the test set and the
# noise-free underlying curve.
training_errors = []
test_errors = []
distribution_errors = []

for i in range(9):
    model = Linear(i)

    # Expand all three input sets into polynomial features of degree i.
    X_train_transformed = polynomial_basis_function(X_train.reshape([-1, 1]), i)
    X_test_transformed = polynomial_basis_function(X_test.reshape([-1, 1]), i)
    X_underlying_transformed = polynomial_basis_function(X_underlying.reshape([-1, 1]), i)

    optimizer_lsm(model, X_train_transformed, y_train.reshape([-1, 1]))

    y_train_pred = model(X_train_transformed).squeeze()
    y_test_pred = model(X_test_transformed).squeeze()
    y_underlying_pred = model(X_underlying_transformed).squeeze()

    train_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred).item()
    training_errors.append(train_mse)

    test_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred).item()
    test_errors.append(test_mse)

    # Error against the noise-free curve. The head of this statement was
    # missing (only its continuation line survived), so it is restored here.
    distribution_mse = mean_squared_error(y_true=y_underlying,
                                          y_pred=y_underlying_pred).item()
    distribution_errors.append(distribution_mse)

print("train errors: \n", training_errors)
print("test errors: \n", test_errors)

plt.rcParams['figure.figsize'] = (8.0, 6.0)
plt.plot(training_errors, '-.', mfc="none", mec='#e4007f', ms=10, c='#e4007f', label="Training")
plt.plot(test_errors, '--', mfc="none", mec='#f19ec2', ms=10, c='#f19ec2', label="Test")
# Only the trailing label argument of this call survived; the line style and
# colour are chosen to stay distinct from the two curves above.
plt.plot(distribution_errors, '-', mfc="none", mec='#3D3D3F', ms=10, c='#3D3D3F',
         label="Distribution")
plt.legend(fontsize='x-large')
plt.xlabel("degree")
plt.ylabel("MSE")
plt.savefig('ml-mse-error.pdf')
plt.show()
2.4 Runner类介绍
机器学习方法流程包括数据集构建、模型构建、损失函数定义、优化器、模型训练、模型评价、模型预测等环节。
为了更方便地将上述环节规范化,我们将机器学习模型的基本要素封装成一个Runner类。
除上述提到的要素外,再加上模型保存、模型加载等功能。
Runner类的成员函数定义如下:
__init__函数:实例化Runner类,需要传入模型、损失函数、优化器和评价指标等;
train函数:模型训练,指定模型训练需要的训练集和验证集;
evaluate函数:通过对训练好的模型进行评价,在验证集或测试集上查看模型训练效果;
predict函数:选取一条数据对训练好的模型进行预测;
save_model函数:模型在训练过程和训练结束后需要进行保存;
load_model函数:调用加载之前保存的模型。
class Runner(object):
    """Skeleton that bundles a model with its optimizer, loss and metric.

    Every method except ``__init__`` is a placeholder that does nothing and
    returns ``None``.
    """

    def __init__(self, model, optimizer, loss_fn, metric):
        """Store the four components on the instance."""
        self.model = model
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.metric = metric

    def train(self, train_dataset, dev_dataset=None, **kwargs):
        """Placeholder for model training."""
        pass

    def evaluate(self, data_set, **kwargs):
        """Placeholder for model evaluation."""
        pass

    def predict(self, x, **kwargs):
        """Placeholder for prediction."""
        pass

    def save_model(self, save_path):
        """Placeholder for saving the model."""
        pass

    def load_model(self, model_path):
        """Placeholder for loading a saved model."""
        pass
2.5 基于线性回归的波士顿房价预测
使用线性回归来对马萨诸塞州波士顿郊区的房屋价格进行预测。
实验流程主要包含如下5个步骤:
数据处理:包括数据清洗(缺失值和异常值处理)、数据集划分,以便数据可以被模型正常读取,并具有良好的泛化性;
模型构建:定义线性回归模型类;
训练配置:训练相关的一些配置,如:优化算法、评价指标等;
组装训练框架Runner:Runner用于管理模型训练和测试过程;
模型训练和测试:利用Runner进行模型训练和测试。
2.5.1 数据处理
2.5.1.2 数据清洗
# NOTE(review): `data` is not defined in the visible part of this file; it is
# presumably a pandas DataFrame loaded in an earlier step - confirm.
# Show the first rows, then the number of missing values per column.
print(data.head())
print(data.isna().sum())
2.5.1.3 数据集划分
# Seed the global RNG so the random permutation drawn in train_test_split
# is reproducible.
torch.manual_seed(10)


def train_test_split(X, y, train_percent=0.8):
    """Randomly split features and targets into train and test parts.

    Args:
        X: Feature table exposing ``.values`` (e.g. a pandas DataFrame).
        y: Targets exposing ``.values`` (e.g. a pandas Series), row-aligned
            with ``X``.
        train_percent: Fraction of rows assigned to the training split; the
            count is ``int(len(X) * train_percent)``. Defaults to 0.8.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of arrays taken from
        ``X.values`` / ``y.values``.
    """
    n = len(X)
    shuffled_indices = torch.randperm(n)
    train_set_size = int(n * train_percent)

    # Convert the index tensors to NumPy explicitly rather than relying on
    # implicit tensor-to-array coercion when indexing the arrays below.
    train_indices = shuffled_indices[:train_set_size].numpy()
    test_indices = shuffled_indices[train_set_size:].numpy()

    X = X.values
    y = y.values

    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]

    return X_train, X_test, y_train, y_test
# Use the MEDV column as the target and every other column as a feature.
X = data.drop(['MEDV'], axis=1)
y = data['MEDV']
# train_test_split is called with its default train_percent of 0.8.
X_train, X_test, y_train, y_test = train_test_split(X, y)
2.5.1.4 特征工程
# Convert every split to a float32 tensor. The earlier version cast X_train
# four times and never cast X_test, y_train or y_test, so those kept the
# dtype of the source arrays.
X_train = torch.tensor(X_train).to(torch.float32)
X_test = torch.tensor(X_test).to(torch.float32)
y_train = torch.tensor(y_train).to(torch.float32)
y_test = torch.tensor(y_test).to(torch.float32)

# Min-max normalisation with per-feature statistics (dim=0) taken from the
# training split only; the same statistics are applied to the test split.
X_min = torch.amin(X_train, dim=0)
X_max = torch.amax(X_train, dim=0)
X_range = X_max - X_min
# A feature that is constant in the training split has zero range; divide by
# 1 instead so it maps to 0 rather than NaN/inf.
X_range[X_range == 0] = 1.0

X_train = (X_train - X_min) / X_range
X_test = (X_test - X_min) / X_range

train_dataset = (X_train, y_train)
test_dataset = (X_test, y_test)
2.5.2 模型构建
# NOTE(review): assumes 12 feature columns remain after dropping MEDV - confirm
# against the loaded dataset.
input_size = 12
model=Linear(input_size)
2.5.3 完善Runner类
class Runner(object):
    """Drive training, evaluation, prediction and persistence for a model.

    The model must be callable and expose a ``params`` attribute; the
    optimizer is called as ``optimizer(model, X, y, reg_lambda)`` and the
    metric as ``metric(y_pred, y)``.
    """

    def __init__(self, model, optimizer, loss_fn, metric):
        """Store the model and its training components.

        Args:
            model: Callable model with a ``params`` attribute.
            optimizer: Callable that fits the model in place.
            loss_fn: Loss function; stored but not used by this class.
            metric: Callable scoring predictions against targets.
        """
        self.model = model
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.metric = metric

    def train(self, dataset, reg_lambda, model_dir):
        """Fit the model on ``dataset`` and save its parameters.

        Args:
            dataset: Pair ``(X, y)``.
            reg_lambda: Regularisation coefficient passed to the optimizer.
            model_dir: Directory the parameters are written to.
        """
        X, y = dataset
        self.optimizer(self.model, X, y, reg_lambda)
        self.save_model(model_dir)

    def evaluate(self, dataset, **kwargs):
        """Return ``metric(model(X), y)`` for ``dataset = (X, y)``."""
        X, y = dataset
        y_pred = self.model(X)
        return self.metric(y_pred, y)

    def predict(self, X, **kwargs):
        """Return the model output for ``X``."""
        return self.model(X)

    def save_model(self, model_dir):
        """Write the model parameters to ``<model_dir>/params.pdtensor``.

        The directory is created if it does not exist.
        """
        os.makedirs(model_dir, exist_ok=True)
        params_saved_path = os.path.join(model_dir, 'params.pdtensor')
        # Save this runner's own model, not whatever module-level ``model``
        # happens to exist.
        torch.save(self.model.params, params_saved_path)

    def load_model(self, model_dir):
        """Replace the model parameters with those stored in ``model_dir``."""
        params_saved_path = os.path.join(model_dir, 'params.pdtensor')
        # NOTE: torch.load deserialises the file; only load parameter files
        # from a trusted source.
        self.model.params = torch.load(params_saved_path)
# Wire the closed-form least-squares optimizer and the MSE metric into the
# Runner. The metric is the mean_squared_error function defined in this
# file; the previously referenced name `mse_loss` is not defined anywhere.
optimizer = optimizer_lsm
runner = Runner(model, optimizer=optimizer, loss_fn=None, metric=mean_squared_error)
2.5.4 模型训练
# Train on the training split; Runner.train also writes the parameters to
# saved_dir.
saved_dir = 'pythonPoject2'
runner.train(train_dataset, reg_lambda=0, model_dir=saved_dir)

# Print each learned weight next to its column name, then the bias.
columns_list = data.columns.to_list()
weights = runner.model.params['w'].tolist()
b = runner.model.params['b'].item()
# zip stops at the shorter sequence, so any column beyond the last weight
# is not printed.
for column, weight in zip(columns_list, weights):
    print(column, "weight:", weight)
print("b:", b)
2.5.5 模型测试
# Reload the saved parameters and compute the metric on the test split.
runner.load_model(saved_dir)
mse = runner.evaluate(test_dataset)
print('MSE:', mse.item())
2.5.6 模型预测
# Reload the saved parameters and predict for the first test sample only.
runner.load_model(saved_dir)
# X_test[:1] keeps the leading dimension, so a single row is passed.
pred = runner.predict(X_test[:1])
print("真实房价:",y_test[:1].item())
print("预测的房价:",pred.item())