This post provides MATLAB code for the nine learners below (mostly collected from earlier work, plus some of my own), example scripts (with the iris dataset included), and links to good explanations of each algorithm's principles.
Code package
Link: https://pan.baidu.com/s/1M_lzlJ5LqyGY96KTAYDakw
Extraction code: oom7
%% Machine learning - example script
%% Brief introduction
%
%
% Purpose: build a predictive model from training data with a chosen learner.
%
% Input: training data (features + labels), test data (features + labels).
%
% Output: predicted labels.
%
% Choosing a learner: among stable learners, SVM is a good first choice; among unstable learners, ELM. (You can also visualize the data and pick a classifier that suits its distribution; see the sketch after the data preparation below.)
%
%
% Note: in the examples below the test data is 'features + labels', and the outputs are test accuracy and predicted labels. In a real application you can simply set the test-sample labels to 0 and use only the predicted labels.
%% Data preparation
load iris.mat % load the dataset: each row is a sample, each column a feature, label in the last column
tr=iris(1:100,:); % first 100 rows of iris as training samples
te=iris(101:end,:); % remaining 50 rows as test samples
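% Note: if your iris.mat is sorted by class (the original UCI file is), a
% sequential split like the one above can leave a class out of the training
% set. A minimal sketch of a shuffled split, plus a quick look at the class
% structure with gscatter (Statistics and Machine Learning Toolbox); the
% feature columns plotted are an arbitrary example:
% rng(1);                        % fix the random seed for reproducibility
% idx = randperm(size(iris,1));  % random permutation of the row indices
% tr  = iris(idx(1:100),:);      % 100 random rows for training
% te  = iris(idx(101:end),:);    % the rest for testing
% gscatter(iris(:,1), iris(:,2), iris(:,end)); % feature 1 vs. 2, colored by class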
%% Learners
% 1. SVM: support vector machine
[A_SVM,SVM_label] = f_SVM(tr,te);
% Requires libsvm to be installed
% Theory: https://zhuanlan.zhihu.com/p/77750026
% 2. kNN: k-nearest neighbors
k = 3;
[A_kNN,label_knn] = f_knn(tr,te,k);
[A_kNN_1,label_knn_1,Probability] = knn_ave(tr,te,k); % improved kNN (average distance per class)
% Theory: https://zhuanlan.zhihu.com/p/61341071
% 3. LDA: linear discriminant analysis
[A_LDA,label_LDA] = f_LDA(tr,te);
% Theory: https://zhuanlan.zhihu.com/p/32658341
% 4. RF: random forest
NTrees = 5;
[A_RF,label_RF]=f_RF(tr,te,NTrees);
% Theory: https://blog.csdn.net/qq_34106574/article/details/82016442
% 5. RSS: random subspace method
[A_RSS,label_RSS]=f_RSS(tr,te);
% 6. Bagging
% Ensemble built by perturbing the training samples
NTrees = 5;
[A_Bagging,label_Bagging]=f_Bagging(tr,te,NTrees);
% Theory: https://www.cnblogs.com/cgmcoding/p/13567288.html
% 7. Boosting
% Ensemble built by adaptively reweighting the training samples
NTrees = 5;
[A_Boosting,label_Boosting]=f_Boosting(tr,te,NTrees);
% Theory: https://www.cnblogs.com/lyfruit/articles/3011429.html
% 8. LR: logistic regression
[A_LR,label_LR]=f_LR(tr,te);
% Theory: https://zhuanlan.zhihu.com/p/74874291
% 9. NB: naive Bayes
[A_NB,label_NB] = f_NB(tr,te);
% Theory: https://zhuanlan.zhihu.com/p/26262151
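% Optional: a minimal sketch that gathers the accuracies computed above for a
% side-by-side comparison (all variable names come from this script):
names = {'SVM','kNN','kNN-avg','LDA','RF','RSS','Bagging','Boosting','LR','NB'};
Acc   = [A_SVM A_kNN A_kNN_1 A_LDA A_RF A_RSS A_Bagging A_Boosting A_LR A_NB];
for ii = 1:numel(names)
    fprintf('%-8s accuracy = %.4f\n', names{ii}, Acc(ii));
end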
function [A,svmlabel]=f_SVM(tr,te)
Trd = tr(:,1:end-1);
Trl = tr(:,end);   % training samples and labels
Ted = te(:,1:end-1);
Tel = te(:,end);   % test samples and labels
model = svmtrain(Trl, Trd, '-c 1 -g 0.07 -t 0 -q');
[svmlabel, TestingAccuracy, ~] = svmpredict(Tel, Ted, model);
A = TestingAccuracy(1)/100; % first element of the accuracy vector, as a fraction
end
%% svmtrain option reference
% -s svm_type: type of SVM (default 0)
%    0 -- C-SVC; 1 -- nu-SVC; 2 -- one-class SVM; 3 -- epsilon-SVR; 4 -- nu-SVR
% -t kernel_type: type of kernel function (default 2)
%    0 -- linear: u'*v
%    1 -- polynomial: (gamma*u'*v + coef0)^degree
%    2 -- RBF (radial basis function): exp(-gamma*|u-v|^2)
%    3 -- sigmoid: tanh(gamma*u'*v + coef0)
% -d degree: degree in the kernel function (polynomial kernel) (default 3)
% -g gamma: gamma in the kernel function (polynomial/RBF/sigmoid kernels) (default 1/k, k = number of features)
% -r coef0: coef0 in the kernel function (polynomial/sigmoid kernels) (default 0)
% -c cost: parameter C of C-SVC, epsilon-SVR and nu-SVR (default 1)
% -n nu: parameter nu of nu-SVC, one-class SVM and nu-SVR (default 0.5)
% -p epsilon: epsilon in the loss function of epsilon-SVR (default 0.1)
% -m cachesize: cache memory size in MB (default 40)
% -e eps: tolerance of the termination criterion (default 0.001)
% -h shrinking: whether to use the shrinking heuristics, 0 or 1 (default 1)
% -wi weight: set the parameter C of class i to weight*C (for C-SVC) (default 1)
% -v n: n-fold cross-validation mode; n must be >= 2
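% A minimal grid-search sketch using the -v option above (assumes libsvm is on
% the path, and that Trd/Trl hold training features/labels as inside f_SVM):
% best = 0;
% for c = [0.1 1 10 100]
%     for g = [0.01 0.07 0.5]
%         acc = svmtrain(Trl, Trd, sprintf('-c %g -g %g -t 2 -v 5 -q', c, g));
%         if acc > best, best = acc; bestc = c; bestg = g; end
%     end
% end
% With -v set, svmtrain returns the cross-validation accuracy instead of a model.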
function [A,knnlabel]=f_knn(tr,te,k)
%knn : find the k training samples nearest to each test sample; the majority
%      label among those k samples is the predicted label.
%Input:  tr: training data (label in the last column)
%        te: test data (label in the last column)
%        k:  number of neighbors
%Output: A: test accuracy
%        knnlabel: predicted labels
if ~exist('k', 'var')
    k = 3;
end % default to k=3 if no k is given
n=size(tr,2);
m1=size(tr,1);
m2=size(te,1); % m1 = number of training samples, m2 = number of test samples
trd=tr(:,1:n-1);
trl=tr(:,n);
ted=te(:,1:n-1);
tel=te(:,n); % *d = features, *l = labels
knnlabel=zeros(m2,1);
for j=1:m2
    distance=zeros(m1,1);
    for i=1:m1
        distance(i)=norm(ted(j,:)-trd(i,:)); % Euclidean distance to each training sample
    end
    [~,index]=sort(distance);
    label=trl(index(1:k)); % labels of the k nearest training samples
    knnlabel(j,1)=mode(label); % the most frequent label is the prediction
end
bj=(knnlabel==tel);
a=nnz(bj);
A=a/m2; % test accuracy
end
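% The double loop in f_knn can be vectorized with pdist2 (Statistics and
% Machine Learning Toolbox); a sketch using the same variable names:
% D = pdist2(ted, trd);                % m2-by-m1 Euclidean distance matrix
% [~, idx] = sort(D, 2);               % sort the neighbors of each test sample
% knnlabel = mode(trl(idx(:,1:k)), 2); % majority vote over the k nearest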
function [A_1,knnlabel_1,Probability]=knn_ave(tr,te,k)
%knn_ave : for each class, find its k training samples nearest to the test
%          sample; the class with the smallest average distance is predicted.
%          Assumes labels are 1..num_label and every class has at least k
%          training samples.
%Input:  tr: training data (label in the last column)
%        te: test data (label in the last column)
%        k:  number of neighbors
%Output: A_1: test accuracy
%        knnlabel_1: predicted labels
%        Probability: per-class scores plus the predicted label
if ~exist('k', 'var')
    k = 3;
end % default to k=3 if no k is given
n=size(tr,2);
m1=size(tr,1);
m2=size(te,1); % m1 = number of training samples, m2 = number of test samples
trd=tr(:,1:n-1);
trl=tr(:,n);
ted=te(:,1:n-1);
tel=te(:,n); % *d = features, *l = labels
num_label = size(unique(trl),1);
probability=zeros(m2,num_label);
knnlabel_1=zeros(m2,1);
for j=1:m2
    distance=zeros(m1,1);
    for i=1:m1
        distance(i)=norm(ted(j,:)-trd(i,:)); % Euclidean distance to each training sample
    end
    [distance1,index]=sort(distance);
    x1=trl(index);
    distance1(:,2)=x1; % column 1 of distance1 is the distance, column 2 the label
    di=zeros(num_label,2);
    for w=1:num_label
        x2=find(distance1(:,2)==w);
        x2=x2(1:k,:);
        dis=distance1(x2,1);
        dis=sum(dis)/k;
        di(w,1)=dis;
        di(w,2)=w;
    end % for each class, take its k nearest samples and average their distances
    c=sum(di(:,1))./di(:,1)';
    c=c/max(c,[],2);
    probability(j,:)=c; % class scores: total distance divided by each class distance, normalized by the maximum
    b=sortrows(di,1);
    knnlabel_1(j,1)=b(1,2); % the class with the smallest average distance is the prediction
end
Probability=[probability,knnlabel_1];
bj=(knnlabel_1==tel);
a=nnz(bj);
A_1=a/m2; % test accuracy
end
function [A,predict_label]=f_LDA(tr,te)
%Input:  tr: training set
%        te: test set
%        Note: each row is an instance; the last column is the label, starting from 1
%Output: A: test accuracy
%        predict_label: labels predicted by LDA for the test data
n=size(tr,2);
m2=size(te,1);
trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);
obj = ClassificationDiscriminant.fit(trd, trl);
predict_label = predict(obj, ted);
bj=(predict_label==tel);a=nnz(bj);
A=a/m2; % test accuracy
end
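% ClassificationDiscriminant.fit is the pre-R2014a API; on newer MATLAB
% releases the equivalent call is:
% obj = fitcdiscr(trd, trl);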
function [A,Predict_label]=f_RF(tr,te,NTrees)
%Input:  tr: training set
%        te: test set
%        Note: each row is an instance; the last column is the label, starting from 1
%        NTrees: number of decision trees
%Output: A: test accuracy
%        Predict_label: labels predicted by the random forest for the test data
if ~exist('NTrees', 'var')
    NTrees = 50;
end
n=size(tr,2);
m2=size(te,1);
trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);
Factor = TreeBagger(NTrees, trd, trl);
[Predict_label,Scores] = predict(Factor, ted); % TreeBagger returns labels as a cell array of strings
ssr=zeros(m2,1);
a=0;
for i=1:m2
    cla=str2double(Predict_label{i,1}); % convert the string label back to a number
    ssr(i,1)=cla;
    if cla==tel(i)
        a=a+1;
    end
end
A=a/m2; % test accuracy
Predict_label=ssr;
end
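% A minimal sketch for checking whether NTrees is large enough, using
% TreeBagger's out-of-bag error (option and function names as documented for
% TreeBagger):
% Factor = TreeBagger(NTrees, trd, trl, 'OOBPrediction','on');
% plot(oobError(Factor)); xlabel('number of trees'); ylabel('OOB error');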
function [A,predict_label]=f_RSS(tr,te)
% Random subspace method: an ensemble of discriminant learners, each trained
% on a subset of the features
%Input:  tr: training set
%        te: test set
%        Note: each row is an instance; the last column is the label, starting from 1
%Output: A: test accuracy
%        predict_label: labels predicted by the ensemble for the test data
n=size(tr,2);
m2=size(te,1);
trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);
ens = fitensemble(trd,trl,'Subspace','AllPredictorCombinations','Discriminant','type','classification');
predict_label = predict(ens, ted);
bj=(predict_label==tel);a=nnz(bj);
A=a/m2; % test accuracy
end
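% 'AllPredictorCombinations' trains one learner per combination of predictors,
% which is fine for iris (4 features) but grows combinatorially; for wider
% data a fixed number of random subspaces is cheaper (a sketch, 50 learners
% chosen arbitrarily):
% ens = fitensemble(trd,trl,'Subspace',50,'Discriminant','type','classification');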
function [A,predict_label]=f_Bagging(tr,te,NTrees)
%Input:  tr: training set
%        te: test set
%        Note: each row is an instance; the last column is the label, starting from 1
%        NTrees: number of decision trees
%Output: A: test accuracy
%        predict_label: labels predicted by the bagged ensemble for the test data
if ~exist('NTrees', 'var')
    NTrees = 50;
end
n=size(tr,2);
m2=size(te,1);
trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);
ens = fitensemble(trd,trl,'Bag',NTrees,'tree','type','classification');
predict_label = predict(ens, ted);
bj=(predict_label==tel);a=nnz(bj);
A=a/m2; % test accuracy
end
function [A,predict_label]=f_Boosting(tr,te,NTrees)
%Input:  tr: training set
%        te: test set
%        Note: each row is an instance; the last column is the label, starting from 1
%        NTrees: number of decision trees
%Output: A: test accuracy
%        predict_label: labels predicted by Boosting for the test data
if ~exist('NTrees', 'var')
    NTrees = 50;
end
n=size(tr,2);
m2=size(te,1);
trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);
L=unique([trl;tel]); % distinct class labels
ls=length(L(:)); % number of classes
if ls==2
    str='AdaBoostM1'; % binary boosting
else
    str='AdaBoostM2'; % multiclass boosting
end
ens = fitensemble(trd,trl,str,NTrees,'tree','type','classification');
predict_label = predict(ens, ted);
bj=(predict_label==tel);a=nnz(bj);
A=a/m2; % test accuracy
end
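% A minimal sketch for choosing NTrees: plot the cumulative test error of the
% boosted ensemble after each iteration (loss is a standard method of
% classification ensembles):
% plot(loss(ens, ted, tel, 'mode', 'cumulative'));
% xlabel('number of trees'); ylabel('test classification error');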
function [A,Predict_label]=f_LR(tr,te)
%Input:  tr: training set
%        te: test set
%        Note: each row is an instance; the last column is the label, starting from 1
%Output: A: test accuracy
%        Predict_label: labels predicted by logistic regression for the test data
n=size(tr,2);
m2=size(te,1);
trd=tr(:,1:n-1);trl=tr(:,n);
tr_l=dummyvar(trl); % one-hot label matrix, as mnrfit expects category counts
ted=te(:,1:n-1);tel=te(:,n);
[B1,dev1,stats1] = mnrfit(trd,tr_l); % multinomial logistic regression
pihat1 = mnrval(B1,ted); % predicted class probabilities
[~,Predict_label] = max(pihat1,[],2); % pick the most probable class
bj=(Predict_label==tel);a=nnz(bj);
A=a/m2; % test accuracy
end
function [accuracy,nblabel] = f_NB(Train, Test)
%Input:  Train: training set
%        Test: test set
%        Note: each row is an instance; the last column is the label, starting from 1
%Output: accuracy: test accuracy
%        nblabel: labels predicted by naive Bayes for the test data
Train_sample = Train(:,1:end-1);
Train_label = Train(:,end);
Test_sample = Test(:,1:end-1);
Test_label = Test(:,end);
Class_num = length(unique(Train_label));
Feature_num = size(Train_sample,2);
Para_mean = cell(1,Class_num); % mean of each feature per class
Para_dev = cell(1,Class_num); % standard deviation of each feature per class
Sample_byclass = cell(1,Class_num); % training samples grouped by class
Prior_prob = zeros(1,Class_num); % prior probability of each class
for i=1:size(Train_sample,1)
    Sample_byclass{1,Train_label(i,1)} = [Sample_byclass{1,Train_label(i,1)}; Train_sample(i,:)];
    Prior_prob(1,Train_label(i,1)) = Prior_prob(1,Train_label(i,1)) + 1;
end
Prior_prob = Prior_prob/size(Train_sample,1);
for i=1:Class_num
    Para_mean{1,i} = mean(Sample_byclass{1,i});
    Para_dev{1,i} = std(Sample_byclass{1,i});
end
nblabel = zeros(size(Test_sample,1),1);
for i=1:size(Test_sample,1)
    prob = log(Prior_prob); % log-posterior up to a constant, one entry per class
    for j=1:Class_num
        for k=1:Feature_num
            if Para_dev{1,j}(1,k) == 0
                Para_dev{1,j}(1,k) = 0.1667; % replace a zero deviation with a small constant to avoid division by zero
            end
            % add the Gaussian log-likelihood of feature k under class j
            prob(1,j) = prob(1,j) - (Test_sample(i,k)-Para_mean{1,j}(1,k))^2/(2*Para_dev{1,j}(1,k)^2) - log(Para_dev{1,j}(1,k));
        end
    end
    [~,index] = max(prob);
    nblabel(i,1) = index; % the class with the largest log-posterior is the prediction
end
accuracy = nnz(nblabel == Test_label)/length(Test_label);
end
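% For comparison, the built-in fitcnb (Statistics and Machine Learning
% Toolbox, R2014b+) fits the same per-feature Gaussian model by default:
% Mdl = fitcnb(Train(:,1:end-1), Train(:,end));
% nblabel2 = predict(Mdl, Test(:,1:end-1));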