Assume the linear regression hypothesis is:
$$h(x)=h_\theta(x)=\theta_0+\theta_1x_1+\theta_2x_2$$
The loss function is:
$$J(\theta)=\frac{1}{2}\sum_{i=1}^{m}\big(h_\theta(x^i)-y^i\big)^2$$
The goal is to find the optimal $\theta$ that minimizes the loss function $J(\theta)$.
The update rule for a single training sample is:
$$\theta_j:=\theta_j+\alpha\big(y^i-h_\theta(x^i)\big)x_j^i$$
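This single-sample rule is simply a gradient step on that sample's squared error: differentiating $\frac{1}{2}\big(h_\theta(x^i)-y^i\big)^2$ with respect to $\theta_j$ gives
$$\frac{\partial}{\partial\theta_j}\,\frac{1}{2}\big(h_\theta(x^i)-y^i\big)^2=\big(h_\theta(x^i)-y^i\big)x_j^i,$$
so stepping against the gradient, $\theta_j:=\theta_j-\alpha\big(h_\theta(x^i)-y^i\big)x_j^i$, is exactly the update above.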
Extending this to all training samples gives the batch update:
Repeat until convergence {
$$\theta_j:=\theta_j+\alpha\sum_{i=1}^{m}\big(y^i-h_\theta(x^i)\big)x_j^i$$
}
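For reference, the same batch rule can be written in matrix form, with $X$ the $m\times n$ design matrix (first column all ones) and $\vec{y}$ the label vector; the extra $1/m$ factor matches the division by training_num in the implementation below and only rescales the effective learning rate:
$$\theta:=\theta+\frac{\alpha}{m}X^{T}\big(\vec{y}-X\theta\big)$$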
α is the step size (learning rate); it controls how far θ moves on each iteration in the direction that decreases J(θ).
Because every step of this update uses the entire training set, the method is called batch gradient descent.
Note that gradient descent can in general end up in a local optimum, but for linear regression it is known from optimization theory that there is only one optimum: the loss function J(θ) is a convex quadratic, so no local optima arise (assuming the learning rate α is not too large).
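Concretely, writing the loss in matrix form as $J(\theta)=\frac{1}{2}\lVert X\theta-\vec{y}\rVert^2$, its Hessian is $X^{T}X$, which is positive semidefinite; hence $J(\theta)$ is convex and any stationary point is a global minimum.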
Stochastic gradient descent computes each update from a single randomly chosen sample instead of scanning the entire training set, which makes each iteration much faster. It does not move along the direction in which J(θ) decreases fastest; instead it oscillates its way toward the minimum.
Loop {
for i=1 to m {
$$\theta_j:=\theta_j+\alpha\big(y^i-h_\theta(x^i)\big)x_j^i$$
}
}
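The implementation below only covers batch gradient descent. As a point of comparison, here is a minimal stochastic-gradient-descent sketch under the same assumptions as that code (training_set[i][0..feature_num-1] are the features with column 0 fixed to 1, training_set[i][feature_num] is the label, and predict is the function declared in linear_regression.h); the function name sgd and the epoch parameter are illustrative and not part of the original code:
// Stochastic gradient descent: update w immediately after every single sample
void sgd(double** training_set, int feature_num, int training_num, double* w, double a, int epoch) {
    while (epoch--) {
        for (int i = 0; i < training_num; i++) {
            // error of the current sample: h_theta(x^i) - y^i
            double err = predict(w, training_set[i], feature_num) - training_set[i][feature_num];
            // theta_j := theta_j - a * err * x_j^i   (single-sample gradient step)
            for (int j = 0; j < feature_num; j++)
                w[j] -= a * err * training_set[i][j];
        }
    }
}
Because w changes after each sample, the loss is not guaranteed to decrease monotonically; it fluctuates while trending toward the minimum.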
linear_regression.h code snippet
// Linear regression solved with batch gradient descent
#include <iostream>
#include <cstdio>
#include <cmath>
using namespace std;
// Hypothesis: predict h(x) = w . x for one sample
double predict(double* w, double* data, int feature_num);
// Loss function J(theta)
double Theta(double **training_set, int feature_num, int training_num, double* w);
// Batch gradient descent
void gradient_descent(double** training_set, int feature_num, int training_num, double* w, double a, int iterator_time);
// Feature normalization
void feature_normalize(double **feature_set, int feature_num, int training_num);
// Evaluate the model's prediction error on one sample
void forecast(double *forecast_set, double* w, int feature_num);
linear_regression.cpp code snippet
#include "pch.h"
#include "gradient_descent.h"
// Hypothesis h_theta(x) = sum_i w[i] * data[i]
// data[0] is expected to be the constant 1, so w[0] acts as the bias term theta_0
double predict(double* w, double* data, int feature_num) {
    double sum = 0;
    for (int i = 0; i < feature_num; i++) {
        sum += w[i] * data[i];
    }
    return sum;
}
// Loss function J(theta): sum of squared errors over the training set, divided by 2m
double Theta(double **training_set, int feature_num, int training_num, double* w) {
    double sum = 0;
    for (int i = 0; i < training_num; i++) {
        double diff = training_set[i][feature_num] - predict(w, training_set[i], feature_num);
        sum += diff * diff;
    }
    return sum / (2 * training_num);
}
// Batch gradient descent: every iteration uses the whole training set
void gradient_descent(double** training_set, int feature_num, int training_num, double* w, double a, int iterator_time) {
    while (iterator_time--) {
        double* del_theta = new double[feature_num];
        for (int i = 0; i < feature_num; i++) {
            del_theta[i] = 0;
            for (int j = 0; j < training_num; j++) {
                del_theta[i] += (predict(w, training_set[j], feature_num) - training_set[j][feature_num]) * training_set[j][i];
            }
        }
        // w[i] must only be updated after all del_theta values have been computed;
        // otherwise already-updated components would contaminate the remaining gradients.
        // That is why this loop cannot be merged with the one above.
        for (int i = 0; i < feature_num; i++)
            w[i] -= a * del_theta[i] / (double)training_num;
        //printf("%.3lf\n", Theta(training_set, feature_num, training_num, w));
        delete[] del_theta;
    }
    printf("Result:\n");
    for (int i = 0; i < feature_num - 1; i++) {
        printf("%.3lf ", w[i]);
    }
    printf("%.3lf\n", w[feature_num - 1]);
    return;
}
// Predict one sample with the learned w and report the error against its actual value
// forecast_set[0] is expected to be 1; forecast_set[feature_num] holds the actual value
void forecast(double *forecast_set, double* w, int feature_num) {
    double y = w[0];
    for (int i = 1; i < feature_num; i++) {
        printf("%.3lf ", w[i]);
        printf("\t%.3lf\n", forecast_set[i]);
        y = y + w[i] * forecast_set[i];
    }
    printf("------------------------------------------\n");
    printf("Predicted: %.3lf\n", y);
    printf("Actual:    %.3lf\n", forecast_set[feature_num]);
    printf("Error:     %.3lf%%\n", fabs(y - forecast_set[feature_num]) * 100 / fabs(forecast_set[feature_num]));
}
void feature_normalize(double **feature_set, int feature_num, int training_num) {
    // Feature normalization (z-score standardization)
    // For a feature X: x(i) = (x(i) - average(X)) / standard_deviation(X)
    // 1. Compute the mean average(X) of X over the n samples
    // 2. Compute the standard deviation standard_deviation(X) of X over the n samples
    // 3. Normalize each value x(i) of X with the formula above
    // Column 0 is the constant bias feature and is left untouched
    double *average = new double[feature_num];
    double *standard_deviation = new double[feature_num];
    for (int i = 1; i < feature_num; i++) {
        double sum = 0;
        for (int j = 0; j < training_num; j++) {
            sum += feature_set[j][i];
        }
        average[i] = sum / training_num;
    }
    for (int i = 1; i < feature_num; i++) {
        double sum = 0;
        for (int j = 0; j < training_num; j++) {
            sum += (feature_set[j][i] - average[i]) * (feature_set[j][i] - average[i]);
        }
        standard_deviation[i] = sqrt(sum / (training_num - 1));
    }
    for (int i = 1; i < feature_num; i++)
        for (int j = 0; j < training_num; j++) {
            feature_set[j][i] = (feature_set[j][i] - average[i]) / standard_deviation[i];
        }
    delete[] standard_deviation;
    delete[] average;
}
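Neither file includes a driver, so here is a hedged sketch of how the pieces might be wired together; the sample data, learning rate, and iteration count are made up for illustration and are not part of the original code:
// main.cpp -- illustrative driver (assumed, not from the original files)
#include "linear_regression.h"

int main() {
    const int feature_num = 3;   // column 0 is the constant 1 (bias), columns 1..2 are real features
    const int training_num = 4;  // column feature_num (index 3) holds the label y
    double raw[4][4] = {
        {1, 2104, 3, 400},
        {1, 1600, 3, 330},
        {1, 2400, 3, 369},
        {1, 1416, 2, 232},
    };
    double* training_set[4];
    for (int i = 0; i < training_num; i++) training_set[i] = raw[i];

    feature_normalize(training_set, feature_num, training_num);               // normalize columns 1..2 in place
    double w[3] = { 0, 0, 0 };                                                // initial parameters theta
    gradient_descent(training_set, feature_num, training_num, w, 0.01, 1000); // learn w
    forecast(training_set[0], w, feature_num);                                // check the fit on one (normalized) sample
    return 0;
}
If the project uses Visual Studio precompiled headers (as the #include "pch.h" above suggests), this driver would also start with that include.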