以目标函数 $f(x) = 3e^{-x}\sin x$ 为例,采样数据并添加噪声,进行不同阶次的多项式曲线拟合,分析欠拟合和过拟合。
polynomial_feature.py
import numpy as np
class PolynomialFeatureData:
    """
    Build a polynomial feature matrix from one-dimensional sample data.

    Each row corresponds to one sample. Column layout:
      * with_bias=True:  [1, x, x**2, ..., x**degree]  (degree + 1 columns)
      * with_bias=False: [x, x**2, ..., x**degree]     (degree columns)
    """

    def __init__(self, x, degree, with_bias=False):
        """
        Store the samples and allocate the (zero-filled) output matrix.

        :param x: sample data; it is flattened to a vector when features are built
        :param degree: highest polynomial degree, must be non-negative
        :param with_bias: whether to include a leading constant column of ones
        :raises ValueError: if degree is negative
        """
        if degree < 0:
            raise ValueError("degree must be non-negative")
        self.x = np.asarray(x)
        self.degree = degree
        self.with_bias = with_bias
        # One column per power 1..degree, plus one for the bias term if requested.
        n_columns = degree + 1 if with_bias else degree
        self.data = np.zeros((self.x.size, n_columns))

    def fit_transform(self):
        """
        Fill the feature matrix with the powers of the samples.

        :return: the feature matrix (the same array object stored in self.data)
        """
        flat = self.x.reshape(-1)
        col = 0
        if self.with_bias:
            self.data[:, 0] = 1.0
            col = 1
        # Looping over powers 1..degree means degree == 0 simply yields the
        # bias-only (or empty) matrix instead of indexing a missing column.
        for power in range(1, self.degree + 1):
            self.data[:, col] = flat ** power
            col += 1
        return self.data
if __name__ == '__main__':
    # Quick demo: expand five standard-normal samples into a degree-5
    # feature matrix (including the bias column) and print it.
    samples = np.random.randn(5)
    demo_features = PolynomialFeatureData(samples, 5, with_bias=True).fit_transform()
    print(demo_features)
polynomial_regression_curve.py
import numpy as np
from polynomial_feature import PolynomialFeatureData
class PolynomialRegressionCurve:
    """
    Polynomial curve fitting via linear regression, solved in closed form.

    A small ridge term is added to X^T X so the linear system stays solvable.
    """

    def __init__(self, X, y, fit_intercept=False, alpha=0.01):
        """
        Store the training data and settings.

        :param X: sample data as a 2-D feature matrix (one row per sample)
        :param y: target values, vector
        :param fit_intercept: whether X contains a leading bias column; only used
            by predict() to decide how to expand raw test samples
        :param alpha: weight of the ridge term added to the diagonal of X^T X
        """
        self.X, self.y = np.asarray(X), np.asarray(y)
        self.fit_intercept = fit_intercept
        self.alpha = alpha
        self.theta = None  # fitted parameters, set by fit()

    def fit(self):
        """
        Solve the regularised normal equations for the parameters.

        :return: the fitted parameter vector theta
        """
        # The ridge term keeps the matrix invertible.
        gram = np.dot(self.X.T, self.X) + self.alpha * np.eye(self.X.shape[1])
        # Solve the linear system directly instead of forming an explicit inverse.
        self.theta = np.linalg.solve(gram, np.dot(self.X.T, self.y))
        return self.theta

    def predict(self, x_test):
        """
        Predict target values for test samples.

        :param x_test: either raw samples (1-D, or a single column) that are
            expanded into polynomial features, or a 2-D feature matrix whose
            column count already matches the training matrix
        :return: predicted values as a 1-D array
        """
        x_test = np.asarray(x_test)
        n_features = self.X.shape[1]
        if x_test.ndim == 1:
            # Raw sample vector -> column vector.
            x_test = x_test[:, np.newaxis]
        # NOTE: an input whose column count already equals the training matrix's
        # is used as-is (this includes a single column when the model has
        # exactly one feature column).
        if x_test.shape[1] != n_features:
            if self.fit_intercept:
                feat_obj = PolynomialFeatureData(x_test, n_features - 1, with_bias=True)
            else:
                feat_obj = PolynomialFeatureData(x_test, n_features)
            x_test = feat_obj.fit_transform()
        if self.theta is None:
            self.fit()
        y_pred = np.dot(self.theta, x_test.T)
        return y_pred.reshape(-1)
test_poly_regression.py
import matplotlib.pyplot as plt
import numpy as np
from polynomial_feature import PolynomialFeatureData
from polynomial_regression_curve import PolynomialRegressionCurve
def objective_function(x):
    """Objective function: 3 * exp(-x) * sin(x)."""
    damped = np.exp(-x)
    return 3 * damped * np.sin(x)
np.random.seed(0)  # fixed seed so the results are reproducible
n = 10  # number of training samples
raw_x = np.linspace(0, 6, n)
# Target values plus Gaussian noise, simulating real sampled data.
raw_y = objective_function(raw_x) + 0.1 * np.random.randn(n)
degrees = [1, 3, 5, 7, 10, 12]  # polynomial degrees to compare
# The test grid and the true curve do not depend on the degree, so compute them once.
x_test = np.linspace(0, 6, 150)  # test samples
y_true = objective_function(x_test)
plt.figure(figsize=(15, 7))
for i, degree in enumerate(degrees):
    feat_data = PolynomialFeatureData(raw_x, degree, with_bias=True)  # features for this degree
    X_sample = feat_data.fit_transform()
    poly_obj = PolynomialRegressionCurve(X_sample, raw_y, fit_intercept=True)
    theta = poly_obj.fit()  # closed-form solution for the parameters
    print("degree: %d, theta is " % degree, theta)
    y_pred = poly_obj.predict(x_test)  # predictions on the test grid
    # Visualisation: sampled points, true objective function, fitted model.
    plt.subplot(2, 3, i + 1)
    plt.scatter(raw_x, raw_y, edgecolors="k", s=16, label="Raw Data")  # sampled data
    plt.plot(x_test, y_true, "k-", lw=1, label="Objective Fun")  # true objective curve
    plt.plot(x_test, y_pred, "r--", lw=1.5, label="Model Fitting")
    plt.legend(frameon=False)  # legend without a frame
    plt.grid(ls=":")
    plt.xlabel("$x$", fontdict={"fontsize": 12})
    plt.ylabel("$y$", fontdict={"fontsize": 12})
    test_ess = (y_pred - y_true) ** 2  # squared error of each test sample
    mse_score, mse_std = np.mean(test_ess), np.std(test_ess)
    # Training MSE: predict on the training inputs raw_x (not on the targets raw_y).
    train_mse = ((raw_y - poly_obj.predict(raw_x)) ** 2).mean()
    plt.title("Degree {} Test_MSE = {:.2e}(+/-{:.2e}) \n Train_MSE = {:.2e}".
              format(degree, mse_score, mse_std, train_mse), fontdict={"fontsize": 12})
plt.tight_layout()
plt.show()