Topic: batch gradient descent and stochastic/incremental gradient descent
Thanks to the influence of Stanford's machine learning open course, there are already plenty of articles online analyzing these algorithms, yet when I went looking for an actual program I could not find one. So instead of repeating the theory here, I simply give my own MATLAB code.
Below is code for the batch gradient descent and stochastic/incremental gradient descent algorithms; it corresponds to the first 7 pages of notes1.pdf in the "讲义" (lecture notes) folder of the course materials.
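For reference, the two update rules implemented below (the LMS rules derived in those pages) are, for each parameter $\theta_j$ with the convention $x_0^{(i)} = 1$:

$$\text{batch:}\quad \theta_j := \theta_j + \alpha \sum_{i=1}^{m} \left( y^{(i)} - h_\theta(x^{(i)}) \right) x_j^{(i)}$$

$$\text{stochastic/incremental:}\quad \theta_j := \theta_j + \alpha \left( y^{(i)} - h_\theta(x^{(i)}) \right) x_j^{(i)} \quad \text{for each example } i \text{ in turn}$$

where $h_\theta(x) = \theta_0 + \theta_1 x_1$ in the one-variable examples and $h_\theta(x) = \theta_0 + \theta_1 x_1 + \theta_2 x_2$ in the two-variable ones. The batch rule sums over all $m$ training examples before each parameter update, while the stochastic rule updates the parameters once per training example.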
1. batch gradient descent (one variable only)
%batch gradient descent with one variable
%h_theta(x) = theta0 + theta1*x1
clc;clear all;close all;

%initialize the training set
x = 1:10;
m = length(x);%number of training examples in the training set
y = x + 0.5*randn(1,m);

%initialize the learning rate; this value must not be too large
alpha = 0.0005;

%initialize the parameters (also called weights)
theta0 = 0;
theta1 = 0;

%plot the training set
figure;plot(x,y,'x');
%plot the training set
figure;plot(x,y,'kx','linewidth',2);hold on;

kk = 0;
while 1
    htheta = theta0 + theta1*x;%y predicted by the linear hypothesis h
    plot(x,htheta);
    %accumulate the summation terms of the update rule
    sum_t0 = 0;
    sum_t1 = 0;
    for ii = 1:m
        sum_t0 = sum_t0 + (y(ii)-htheta(ii))*1;%letting x0=1
        sum_t1 = sum_t1 + (y(ii)-htheta(ii))*x(ii);
    end
    %compute the new theta0 and theta1
    theta0_t = theta0 + alpha * sum_t0;
    theta1_t = theta1 + alpha * sum_t1;
    %update theta0 and theta1
    theta0 = theta0_t;
    theta1 = theta1_t;
    %convergence check
    kk = kk + 1;
    htheta_n = theta0 + theta1*x;
    h_err = (htheta_n - htheta)*(htheta_n - htheta)';
    %stop when two successive iterations differ very little
    if h_err<1e-6
        break;
    end
    %guard against an infinite loop when the iteration does not converge
    if kk>1000 %break out if 1000 iterations still have not converged
        break;
    end
end
htheta = theta0 + theta1*x;
plot(x,htheta,'r','linewidth',3);
xlim([1 10]);ylim([1 10]);
hold off;
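The inner for-loop over ii simply accumulates the summation term of the batch update rule. Since x, y and htheta are row vectors, the same two sums can also be written in vectorized MATLAB; a minimal sketch reusing the variable names above (not part of the original code):

%vectorized equivalent of the for-loop that accumulates sum_t0 and sum_t1
htheta = theta0 + theta1*x;            %predictions for all m examples at once
sum_t0 = sum(y - htheta);              %x0 = 1 for every example
sum_t1 = sum((y - htheta).*x);
theta0 = theta0 + alpha * sum_t0;
theta1 = theta1 + alpha * sum_t1;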
2. stochastic/incremental gradient descent (one variable only)

%stochastic/incremental gradient descent with one variable
%h_theta(x) = theta0 + theta1*x1
clc;clear all;close all;

%initialize the training set
x = 1:10;
m = length(x);%number of training examples in the training set
y = x + 0.5*randn(1,m);

%initialize the learning rate; this value must not be too large
alpha = 0.001;

%initialize the parameters (also called weights)
theta0 = 0;
theta1 = 0;

%plot the training set
figure;plot(x,y,'x');
%plot the training set
figure;plot(x,y,'kx','linewidth',2);hold on;

kk = 0;
while 1
    htheta = theta0 + theta1*x;
    plot(x,htheta);
    for ii = 1:m
        htheta_ii = theta0 + theta1*x(ii);%y predicted by the linear hypothesis h
        theta0 = theta0 + alpha * (y(ii)-htheta_ii)*1;%letting x0=1
        theta1 = theta1 + alpha * (y(ii)-htheta_ii)*x(ii);
    end
    %convergence check
    kk = kk + 1;
    htheta_n = theta0 + theta1*x;
    h_err = (htheta_n - htheta)*(htheta_n - htheta)';
    %stop when two successive iterations differ very little
    if h_err<1e-6
        break;
    end
    %guard against an infinite loop when the iteration does not converge
    if kk>1000 %break out if 1000 iterations still have not converged
        break;
    end
end
htheta = theta0 + theta1*x;
plot(x,htheta,'r','linewidth',3);
hold off;
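Note that this stochastic version always sweeps the training examples in the fixed order 1..m. A common variant (not used in the code above) is to visit the examples in a random order on every pass, e.g. with randperm; a sketch with the same variable names:

%one pass over the training set in random order (a variant, not in the original code)
for ii = randperm(m)
    htheta_ii = theta0 + theta1*x(ii);                  %prediction for example ii
    theta0 = theta0 + alpha * (y(ii)-htheta_ii)*1;      %letting x0=1
    theta1 = theta1 + alpha * (y(ii)-htheta_ii)*x(ii);
end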
3. batch gradient descent (two variables)

%batch gradient descent with two variables
%h_theta(x) = theta0 + theta1*x1 + theta2*x2
clc;clear all;close all;

%initialize the training set
x1 = 1:10;x2=x1;
m = length(x1);%number of training examples in the training set
y = x1 + 0.5*randn(1,m);

%initialize the learning rate; this value must not be too large
alpha = 0.001;

%initialize the parameters (also called weights)
theta0 = 0;
theta1 = 0;
theta2 = 0;

%plot the training set
figure;plot3(x1,x2,y,'x');xlabel('x1');ylabel('x2');grid;
%plot the training set
figure;plot3(x1,x2,y,'x','linewidth',2);hold on;

kk = 0;
while 1
    htheta = theta0 + theta1*x1 + theta2*x2;%y predicted by the linear hypothesis h
    plot3(x1,x2,htheta);
    %accumulate the summation terms of the update rule
    sum_t0 = 0;
    sum_t1 = 0;
    sum_t2 = 0;
    for ii = 1:m
        sum_t0 = sum_t0 + (y(ii)-htheta(ii))*1;%letting x0=1
        sum_t1 = sum_t1 + (y(ii)-htheta(ii))*x1(ii);
        sum_t2 = sum_t2 + (y(ii)-htheta(ii))*x2(ii);
    end
    %compute the new theta0, theta1 and theta2
    theta0_t = theta0 + alpha * sum_t0;
    theta1_t = theta1 + alpha * sum_t1;
    theta2_t = theta2 + alpha * sum_t2;
    %update theta0, theta1 and theta2
    theta0 = theta0_t;
    theta1 = theta1_t;
    theta2 = theta2_t;
    %convergence check
    kk = kk + 1;
    htheta_n = theta0 + theta1*x1 + theta2*x2;
    h_err = (htheta_n - htheta)*(htheta_n - htheta)';
    %stop when two successive iterations differ very little
    if h_err<1e-6
        break;
    end
    %guard against an infinite loop when the iteration does not converge
    if kk>1000 %break out if 1000 iterations still have not converged
        break;
    end
end
htheta = theta0 + theta1*x1 + theta2*x2;
plot3(x1,x2,htheta,'r','linewidth',3);
xlabel('x1');ylabel('x2');grid;
hold off;
4. stochastic/incremental gradient descent (two variables)

%stochastic/incremental gradient descent with two variables
%h_theta(x) = theta0 + theta1*x1 + theta2*x2
clc;clear all;close all;

%initialize the training set
x1 = 1:10;x2=x1;
m = length(x1);%number of training examples in the training set
y = x1 + 0.5*randn(1,m);

%initialize the learning rate; this value must not be too large
alpha = 0.001;

%initialize the parameters (also called weights)
theta0 = 0;
theta1 = 0;
theta2 = 0;

%plot the training set
figure;plot3(x1,x2,y,'x');xlabel('x1');ylabel('x2');grid;
%plot the training set
figure;plot3(x1,x2,y,'x','linewidth',2);hold on;

kk = 0;
while 1
    htheta = theta0 + theta1*x1 + theta2*x2;%y predicted by the linear hypothesis h
    plot3(x1,x2,htheta);
    for ii = 1:m
        htheta_ii = theta0 + theta1*x1(ii) + theta2*x2(ii);%y predicted by the linear hypothesis h
        theta0 = theta0 + alpha * (y(ii)-htheta_ii)*1;%letting x0=1
        theta1 = theta1 + alpha * (y(ii)-htheta_ii)*x1(ii);
        theta2 = theta2 + alpha * (y(ii)-htheta_ii)*x2(ii);
    end
    %convergence check
    kk = kk + 1;
    htheta_n = theta0 + theta1*x1 + theta2*x2;
    h_err = (htheta_n - htheta)*(htheta_n - htheta)';
    %stop when two successive iterations differ very little
    if h_err<1e-6
        break;
    end
    %guard against an infinite loop when the iteration does not converge
    if kk>1000 %break out if 1000 iterations still have not converged
        break;
    end
end
htheta = theta0 + theta1*x1 + theta2*x2;
plot3(x1,x2,htheta,'r','linewidth',3);
xlabel('x1');ylabel('x2');grid;
hold off;
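One caveat about both two-variable examples: since x2 is an exact copy of x1, the two features are perfectly collinear and the hypothesis collapses to theta0 + (theta1+theta2)*x1, so only the sum theta1+theta2 is pinned down by the data (the individual values depend on the initialization), even though the fitted predictions still converge. To exercise the two-variable code on a genuinely two-dimensional problem, an independent second feature could be generated instead; a small sketch (an assumption, not part of the original post):

%hypothetical training set with a second feature that is not collinear with x1
x1 = 1:10;
x2 = 2 + 0.5*randn(1,length(x1));    %independent second feature
m  = length(x1);
y  = x1 + 2*x2 + 0.5*randn(1,m);     %assumed true model: y = x1 + 2*x2 + noise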
5. batch gradient descent (one variable, using the data from page 1 of the lecture notes)

%batch gradient descent with one variable
%uses the data from page 1 of the lecture notes
%h_theta(x) = theta0 + theta1*x1
clc;clear all;close all;

%initialize the training set
%x = 1:10;
x=[2104 1600 2400 1416 3000];
m = length(x);%number of training examples in the training set
%y = x + 0.5*randn(1,m);
y=[400 330 369 232 540];

%initialize the learning rate; this value must not be too large
%alpha = 0.0005;
alpha = 0.00000001;

%initialize the parameters (also called weights)
theta0 = 0;
theta1 = 0;

%plot the training set
figure;plot(x,y,'x');
%plot the training set
figure;plot(x,y,'kx','linewidth',2);hold on;

kk = 0;
while 1
    htheta = theta0 + theta1*x;%y predicted by the linear hypothesis h
    plot(x,htheta);
    %accumulate the summation terms of the update rule
    sum_t0 = 0;
    sum_t1 = 0;
    for ii = 1:m
        sum_t0 = sum_t0 + (y(ii)-htheta(ii))*1;%letting x0=1
        sum_t1 = sum_t1 + (y(ii)-htheta(ii))*x(ii);
    end
    %compute the new theta0 and theta1
    theta0_t = theta0 + alpha * sum_t0;
    theta1_t = theta1 + alpha * sum_t1;
    %update theta0 and theta1
    theta0 = theta0_t;
    theta1 = theta1_t;
    %convergence check
    kk = kk + 1;
    htheta_n = theta0 + theta1*x;
    h_err = (htheta_n - htheta)*(htheta_n - htheta)';
    %stop when two successive iterations differ very little
    if h_err<1e-6
        break;
    end
    %guard against an infinite loop when the iteration does not converge
    if kk>1000 %break out if 1000 iterations still have not converged
        break;
    end
end
htheta = theta0 + theta1*x;
plot(x,htheta,'r','linewidth',3);
%xlim([1 10]);ylim([1 10]);
hold off;
The reason I included this last piece of code is that when I ran the program on the data from page 1 of the lecture notes, the result did not converge. After many attempts I realized the problem was probably the learning rate: only after reducing alpha again and again did I finally get a convergent result. So when applying the code to different data, pay close attention to the choice of alpha.
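A rough back-of-the-envelope calculation shows why: with the housing data, x is on the order of 2000-3000 and y on the order of a few hundred, so starting from theta0 = theta1 = 0 each term (y(ii)-htheta(ii))*x(ii) is on the order of 10^5-10^6. With the original alpha = 0.0005 the very first update already pushes theta1 to the order of 10^3, the predictions then overshoot y by several orders of magnitude, and the iteration blows up; an alpha around 1e-8, as used above, keeps the steps small enough. An alternative to shrinking alpha (not used in this post) is to rescale the input before running gradient descent; a sketch under that assumption:

%sketch of input rescaling (an assumption, not part of the original code):
%dividing x by its maximum keeps the feature in (0,1], so a much larger
%learning rate can be used without the iteration blowing up
x_raw = [2104 1600 2400 1416 3000];
x = x_raw / max(x_raw);              %rescaled living areas in (0,1]
y = [400 330 369 232 540];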
I am not sure whether the programs are entirely correct, but at least they handle the data I generated myself quite well.