batch gradient descent and stochastic/incremental gradient descent

        Given the influence of Stanford's machine learning open course, there are already plenty of articles online explaining the material, but when I went looking for an actual program I could not find one. So instead of repeating the theory here, I only give the MATLAB code I wrote myself.

        Below is the code for the batch gradient descent and stochastic/incremental gradient descent algorithms, corresponding to the first 7 pages of notes1.pdf in the "lecture notes" folder of the course materials.
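        For reference, with the hypothesis h_theta(x) = theta0 + theta1*x1 (+ theta2*x2) and the convention x0 = 1, the two update rules implemented in the listings below are, for each parameter theta_j:

        batch gradient descent (one update uses all m training examples):
        \theta_j := \theta_j + \alpha \sum_{i=1}^{m} \bigl( y^{(i)} - h_\theta(x^{(i)}) \bigr)\, x_j^{(i)}

        stochastic/incremental gradient descent (one update per training example i):
        \theta_j := \theta_j + \alpha \bigl( y^{(i)} - h_\theta(x^{(i)}) \bigr)\, x_j^{(i)}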

1. batch gradient descent (one variable)

%batch gradient descent with one variable
%h_theta(x) = theta0 + theta1*x1
clc;clear all;close all;
%initialize the training set
x = 1:10;
m = length(x);%number of training examples in the training set
y = x + 0.5*randn(1,m);
%initialize the learning rate; this value must not be too large
alpha = 0.0005;
%initialize the parameters (also called weights)
theta0 = 0;
theta1 = 0;
%plot the training set
figure;plot(x,y,'x');
%plot the training set again in a new figure and hold it to overlay the fitted lines
figure;plot(x,y,'kx','linewidth',2);hold on;
kk = 0;
while 1
    htheta = theta0 + theta1*x;%y values predicted by the linear hypothesis h
    plot(x,htheta);
    %accumulate the summation terms of the update rule
    sum_t0 = 0;
    sum_t1 = 0;
    for ii = 1:m
        sum_t0 = sum_t0 + (y(ii)-htheta(ii))*1;%letting x0=1
        sum_t1 = sum_t1 + (y(ii)-htheta(ii))*x(ii);
    end
    %compute the new theta0 and theta1
    theta0_t = theta0 + alpha * sum_t0;
    theta1_t = theta1 + alpha * sum_t1;
    %update theta0 and theta1
    theta0 = theta0_t;
    theta1 = theta1_t;
    %convergence check
    kk = kk + 1;
    htheta_n = theta0 + theta1*x;
    h_err = (htheta_n - htheta)*(htheta_n - htheta)';
    %stop when two consecutive iterations differ very little
    if h_err<1e-6
        break;
    end
    %guard against an infinite loop if the iteration does not converge
    if kk>1000 %break out after 1000 iterations without convergence
        break;
    end
end
htheta = theta0 + theta1*x;
plot(x,htheta,'r','linewidth',3);
xlim([1 10]);ylim([1 10]);
hold off;
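As a side note, the inner for-loop of the batch update can also be written in vectorized MATLAB. A minimal sketch of one equivalent update step (my addition, not part of the listing above):

%vectorized form of one batch update, equivalent to the for-loop above
htheta = theta0 + theta1*x;          %predictions for all m examples
err    = y - htheta;                 %errors, 1-by-m row vector
theta0 = theta0 + alpha * sum(err);  %sum of err*1 (x0 = 1 for every example)
theta1 = theta1 + alpha * (err*x');  %sum of err(ii)*x(ii) via an inner product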

2. stochastic/incremental gradient descent (one variable)

%stochastic/incremental gradient descent with one variable
%h_theta(x) = theta0 + theta1*x1
clc;clear all;close all;
%initialize the training set
x = 1:10;
m = length(x);%number of training examples in the training set
y = x + 0.5*randn(1,m);
%initialize the learning rate; this value must not be too large
alpha = 0.001;
%initialize the parameters (also called weights)
theta0 = 0;
theta1 = 0;
%plot the training set
figure;plot(x,y,'x');
%plot the training set again in a new figure and hold it to overlay the fitted lines
figure;plot(x,y,'kx','linewidth',2);hold on;
kk = 0;
while 1
    htheta = theta0 + theta1*x;
    plot(x,htheta);
    for ii = 1:m
        htheta_ii = theta0 + theta1*x(ii);%y value predicted by the linear hypothesis h
        theta0 = theta0 + alpha * (y(ii)-htheta_ii)*1;%letting x0=1
        theta1 = theta1 + alpha * (y(ii)-htheta_ii)*x(ii);
    end
    %convergence check
    kk = kk + 1;
    htheta_n = theta0 + theta1*x;
    h_err = (htheta_n - htheta)*(htheta_n - htheta)';
    %stop when two consecutive iterations differ very little
    if h_err<1e-6
        break;
    end
    %guard against an infinite loop if the iteration does not converge
    if kk>1000 %break out after 1000 iterations without convergence
        break;
    end
end
htheta = theta0 + theta1*x;
plot(x,htheta,'r','linewidth',3);
hold off;
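One remark: the loop above always visits the training examples in the same fixed order, which is why this variant is also called "incremental" gradient descent in the notes. If a random visiting order is preferred, a small variation (my addition, not in the original listing) is to permute the indices on each pass:

    %visit the training examples in a random order on each pass
    for ii = randperm(m)
        htheta_ii = theta0 + theta1*x(ii);
        theta0 = theta0 + alpha * (y(ii)-htheta_ii)*1;%letting x0=1
        theta1 = theta1 + alpha * (y(ii)-htheta_ii)*x(ii);
    end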

3. batch gradient descent (two variables)

%batch gradient descent with two variables
%h_theta(x) = theta0 + theta1*x1 + theta2*x2
clc;clear all;close all;
%initialize the training set
x1 = 1:10;x2=x1;
m = length(x1);%number of training examples in the training set
y = x1 + 0.5*randn(1,m);
%initialize the learning rate; this value must not be too large
alpha = 0.001;
%initialize the parameters (also called weights)
theta0 = 0;
theta1 = 0;
theta2 = 0;
%plot the training set
figure;plot3(x1,x2,y,'x');xlabel('x1');ylabel('x2');grid;
%plot the training set again in a new figure and hold it to overlay the fitted lines
figure;plot3(x1,x2,y,'x','linewidth',2);hold on;
kk = 0;
while 1
    htheta = theta0 + theta1*x1 + theta2*x2;%y values predicted by the linear hypothesis h
    plot3(x1,x2,htheta);
    %accumulate the summation terms of the update rule
    sum_t0 = 0;
    sum_t1 = 0;
    sum_t2 = 0;
    for ii = 1:m
        sum_t0 = sum_t0 + (y(ii)-htheta(ii))*1;%letting x0=1
        sum_t1 = sum_t1 + (y(ii)-htheta(ii))*x1(ii);
        sum_t2 = sum_t2 + (y(ii)-htheta(ii))*x2(ii);
    end
    %compute the new theta0, theta1 and theta2
    theta0_t = theta0 + alpha * sum_t0;
    theta1_t = theta1 + alpha * sum_t1;
    theta2_t = theta2 + alpha * sum_t2;
    %update theta0, theta1 and theta2
    theta0 = theta0_t;
    theta1 = theta1_t;
    theta2 = theta2_t;
    %convergence check
    kk = kk + 1;
    htheta_n = theta0 + theta1*x1 + theta2*x2;
    h_err = (htheta_n - htheta)*(htheta_n - htheta)';
    %stop when two consecutive iterations differ very little
    if h_err<1e-6
        break;
    end
    %guard against an infinite loop if the iteration does not converge
    if kk>1000 %break out after 1000 iterations without convergence
        break;
    end
end
htheta = theta0 + theta1*x1 + theta2*x2;
plot3(x1,x2,htheta,'r','linewidth',3);
xlabel('x1');ylabel('x2');grid;
hold off;
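With more than one input variable it can be convenient to collect the parameters into a vector and express the batch update in matrix form. A minimal sketch of a single update step in that style (my rewrite, not the listing above):

%matrix form of one batch update: X is m-by-3 with a column of ones for x0
X = [ones(m,1) x1' x2'];
theta = zeros(3,1);                            %[theta0; theta1; theta2]
theta = theta + alpha * X' * (y' - X*theta);   %one batch gradient descent step

Note that in the toy data above x2 is set equal to x1, so the two feature columns are identical and the data only determines the sum theta1 + theta2, not the two parameters individually.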

4. stochastic/incremental gradient descent (two variables)

%stochastic/incremental gradient descent with two variables
%h_theta(x) = theta0 + theta1*x1 + theta2*x2
clc;clear all;close all;
%initialize the training set
x1 = 1:10;x2=x1;
m = length(x1);%number of training examples in the training set
y = x1 + 0.5*randn(1,m);
%initialize the learning rate; this value must not be too large
alpha = 0.001;
%initialize the parameters (also called weights)
theta0 = 0;
theta1 = 0;
theta2 = 0;
%plot the training set
figure;plot3(x1,x2,y,'x');xlabel('x1');ylabel('x2');grid;
%plot the training set again in a new figure and hold it to overlay the fitted lines
figure;plot3(x1,x2,y,'x','linewidth',2);hold on;
kk = 0;
while 1
    htheta = theta0 + theta1*x1 + theta2*x2;%y values predicted by the linear hypothesis h
    plot3(x1,x2,htheta);
    for ii = 1:m
        htheta_ii = theta0 + theta1*x1(ii) + theta2*x2(ii);%y value predicted by the linear hypothesis h
        theta0 = theta0 + alpha * (y(ii)-htheta_ii)*1;%letting x0=1
        theta1 = theta1 + alpha * (y(ii)-htheta_ii)*x1(ii);
        theta2 = theta2 + alpha * (y(ii)-htheta_ii)*x2(ii);
    end
    %convergence check
    kk = kk + 1;
    htheta_n = theta0 + theta1*x1 + theta2*x2;
    h_err = (htheta_n - htheta)*(htheta_n - htheta)';
    %stop when two consecutive iterations differ very little
    if h_err<1e-6
        break;
    end
    %guard against an infinite loop if the iteration does not converge
    if kk>1000 %break out after 1000 iterations without convergence
        break;
    end
end
htheta = theta0 + theta1*x1 + theta2*x2;
plot3(x1,x2,htheta,'r','linewidth',3);
xlabel('x1');ylabel('x2');grid;
hold off;

5. batch gradient descent on the data from page 1 of notes1.pdf

[Figure: the training data from page 1 of notes1.pdf, used in the listing below]

%batch gradient descent with one variable
%uses the data from page 1 of the lecture notes
%h_theta(x) = theta0 + theta1*x1
clc;clear all;close all;
%initialize the training set
%x = 1:10;
x=[2104 1600 2400 1416 3000];
m = length(x);%number of training examples in the training set
%y = x + 0.5*randn(1,m);
y=[400 330 369 232 540];
%initialize the learning rate; this value must not be too large
%alpha = 0.0005;
alpha = 0.00000001;
%initialize the parameters (also called weights)
theta0 = 0;
theta1 = 0;
%plot the training set
figure;plot(x,y,'x');
%plot the training set again in a new figure and hold it to overlay the fitted lines
figure;plot(x,y,'kx','linewidth',2);hold on;
kk = 0;
while 1
    htheta = theta0 + theta1*x;%y values predicted by the linear hypothesis h
    plot(x,htheta);
    %accumulate the summation terms of the update rule
    sum_t0 = 0;
    sum_t1 = 0;
    for ii = 1:m
        sum_t0 = sum_t0 + (y(ii)-htheta(ii))*1;%letting x0=1
        sum_t1 = sum_t1 + (y(ii)-htheta(ii))*x(ii);
    end
    %compute the new theta0 and theta1
    theta0_t = theta0 + alpha * sum_t0;
    theta1_t = theta1 + alpha * sum_t1;
    %update theta0 and theta1
    theta0 = theta0_t;
    theta1 = theta1_t;
    %convergence check
    kk = kk + 1;
    htheta_n = theta0 + theta1*x;
    h_err = (htheta_n - htheta)*(htheta_n - htheta)';
    %stop when two consecutive iterations differ very little
    if h_err<1e-6
        break;
    end
    %guard against an infinite loop if the iteration does not converge
    if kk>1000 %break out after 1000 iterations without convergence
        break;
    end
end
htheta = theta0 + theta1*x;
plot(x,htheta,'r','linewidth',3);
%xlim([1 10]);ylim([1 10]);
hold off;
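When tuning alpha on data like this, it also helps to watch the squared-error cost from pass to pass; if it grows, alpha is too large. A small check that could be dropped inside the loop above (my addition, not in the listing):

    %monitor the cost J(theta) = 1/2 * sum((h - y).^2) each iteration
    J = 0.5 * sum((theta0 + theta1*x - y).^2);
    fprintf('iteration %d: J = %.4f\n', kk, J);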

6. Summary

        The reason for including that last listing is that when I ran the program on the data from page 1 of the lecture notes, the result did not converge. After many tries I realized the problem was the learning rate: only after shrinking alpha again and again did I finally get a convergent result. This is presumably because the inputs here are on the order of several thousand, so the summation terms in the update become very large and a step size that worked for the toy data overshoots badly. So when applying the code to different data, pay close attention to how alpha is set.
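        A common alternative to shrinking alpha, which I did not use in the listings above but sketch here for completeness, is to rescale the inputs to be of order one before running gradient descent:

%hypothetical preprocessing step, not used in the listings above
xs = (x - mean(x)) / std(x);     %standardize the living areas
alpha = 0.01;                    %a much larger learning rate can then work
%...run the same batch gradient descent loop on xs instead of x...
%when predicting for a new input xnew, apply the same scaling first:
%hnew = theta0 + theta1*(xnew - mean(x))/std(x);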

        I cannot guarantee the program is entirely correct, but it handles the data I generated myself quite well.
