MATLAB Code for Machine Learning: Non-Neural-Network Learners

Overview

This post collects MATLAB code for the nine learners below (mostly gathered from earlier work, plus some code of my own), an application example (with the iris dataset attached), and links to good explanations of each algorithm.

  1. Support vector machine (SVM)
  2. k-nearest neighbors (kNN)
  3. Linear discriminant analysis (LDA)
  4. Random forest (RF)
  5. Random subspace (RSS)
  6. Bagging
  7. Boosting
  8. Logistic regression (LR)
  9. Naive Bayes (NB)

Code package
Link: https://pan.baidu.com/s/1M_lzlJ5LqyGY96KTAYDakw
Extraction code: oom7

Application example

%% Machine learning - example routine
%% Brief introduction
% 
% 
% Purpose: build a prediction model from training data with a chosen learner.
% 
% Input: training data (features + labels), test data (features + labels).
% 
% Output: predicted labels.
% 
% Choosing a learner: among stable learners prefer SVM; among unstable learners prefer ELM. (You can also visualize the data and pick the classifier whose bias suits the class distribution.)
% 
% 
% Note: in the examples below, the test data carries 'features + labels', and each call returns the test accuracy and the predicted labels. In a real application, simply set the test labels to 0 before predicting, as sketched below.
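% 
% A minimal sketch of that real-world use (assuming the test labels are unknown;
% f_SVM is used as an illustrative example, any learner below works the same way):
% 
%     te(:,end) = 0;               % placeholder labels for unseen samples
%     [~, pred] = f_SVM(tr, te);   % ignore the accuracy output; keep the predictions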
%% Data preparation

    load iris.mat       % load the dataset: each row is a sample, each column a feature, label in the last column
    tr=iris(1:100,:);   % first 100 rows of iris as training samples
    te=iris(101:end,:); % remaining 50 rows as test samples
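
    % If iris.mat stores the samples ordered by class (50 per class), the fixed
    % split above trains on only two of the three classes. A shuffled split is
    % safer; a minimal sketch:
    %     idx = randperm(size(iris,1));
    %     tr  = iris(idx(1:100),:);
    %     te  = iris(idx(101:end),:);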
%% Learners
% 1. SVM: support vector machine

    [A_SVM,SVM_label] = f_SVM(tr,te);
    % this routine requires libsvm to be installed
    % theory: https://zhuanlan.zhihu.com/p/77750026

% 2. kNN: k-nearest neighbors

    k = 3;
    [A_kNN,label_knn] = f_knn(tr,te,k);
    [A_kNN_1,label_knn_1,Probability] = knn_ave(tr,te,k); % improved kNN (average-distance variant)
    % theory: https://zhuanlan.zhihu.com/p/61341071
    
% 3. LDA: linear discriminant analysis

    [A_LDA,label_LDA] = f_LDA(tr,te);
    % theory: https://zhuanlan.zhihu.com/p/32658341

% 4. RF: random forest

    NTrees = 5;
    [A_RF,label_RF]=f_RF(tr,te,NTrees);
    % theory: https://blog.csdn.net/qq_34106574/article/details/82016442
    
% 5. RSS: random subspace

    [A_RSS,label_RSS]=f_RSS(tr,te);

% 6. Bagging
    % ensemble method: perturbs the training samples (bootstrap resampling)

    NTrees = 5;
    [A_Bagging,label_Bagging]=f_Bagging(tr,te,NTrees);
    % theory: https://www.cnblogs.com/cgmcoding/p/13567288.html

% 7. Boosting
    % ensemble method: iteratively reweights the training samples

    NTrees = 5;
    [A_Boosting,label_Boosting]=f_Boosting(tr,te,NTrees);
    % theory: https://www.cnblogs.com/lyfruit/articles/3011429.html
    
% 8. LR: logistic regression

    [A_LR,label_LR]=f_LR(tr,te);
    % theory: https://zhuanlan.zhihu.com/p/74874291

% 9. NB: naive Bayes

    [A_NB,label_NB] = f_NB(tr,te);
    % theory: https://zhuanlan.zhihu.com/p/26262151

Subroutines

function [A,svmlabel]=f_SVM(tr,te)

Trd = tr(:,1:end-1);
Trl = tr(:,end);    % training features and labels
Ted = te(:,1:end-1);
Tel = te(:,end);    % test features and labels
model = svmtrain(Trl, Trd, '-c 1 -g 0.07 -t 0 -q');
[svmlabel, TestingAccuracy,~] = svmpredict(Tel, Ted, model);
A=TestingAccuracy(1)/100;   % libsvm reports accuracy in percent
end
%% svmtrain parameter reference
% -s svm_type: type of SVM (default 0)
%     0 -- C-SVC;  1 -- nu-SVC;  2 -- one-class SVM;  3 -- epsilon-SVR;  4 -- nu-SVR
% -t kernel_type: type of kernel function (default 2)
%     0 -- linear: u'*v
%     1 -- polynomial: (gamma*u'*v + coef0)^degree
%     2 -- RBF (radial basis): exp(-gamma*|u-v|^2)
%     3 -- sigmoid: tanh(gamma*u'*v + coef0)
% -d degree: degree in the kernel function (polynomial kernel only) (default 3)
% -g gamma: gamma in the kernel function (polynomial/RBF/sigmoid kernels) (default 1/num_features)
% -r coef0: coef0 in the kernel function (polynomial/sigmoid kernels) (default 0)
% -c cost: parameter C of C-SVC, epsilon-SVR, and nu-SVR (default 1)
% -n nu: parameter nu of nu-SVC, one-class SVM, and nu-SVR (default 0.5)
% -p epsilon: epsilon in the loss function of epsilon-SVR (default 0.1)
% -m cachesize: cache memory size in MB (default 100)
% -e epsilon: tolerance of the termination criterion (default 0.001)
% -h shrinking: whether to use the shrinking heuristics, 0 or 1 (default 1)
% -wi weight: set the parameter C of class i to weight*C in C-SVC (default 1)
% -v n: n-fold cross-validation mode; n must be >= 2
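% A usage sketch (illustrative parameter values, not tuned): the example above
% uses a linear kernel (-t 0); an RBF kernel with 5-fold cross-validation on the
% training set could be tried as
%     cv_acc = svmtrain(Trl, Trd, '-c 1 -g 0.07 -t 2 -v 5 -q');   % returns CV accuracy, not a model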

function [A,knnlabel]=f_knn(tr,te,k)

%knn:       find the k training samples nearest to the test sample; the most
%           frequent label among those k becomes the predicted label.
%Input:     tr: training data (label in the last column)
%           te: test data (label in the last column)
%           k:  number of neighbors

%Output:    A: test accuracy
%           knnlabel: predicted labels


if ~exist('k', 'var')
     k = 3;
end               % default to k = 3 when no k is given
    
data=[tr;te];
n=size(data,2);
m1=size(tr,1);
m2=size(te,1);    % m1: number of training samples; m2: number of test samples

trd=tr(:,1:n-1);
trl=tr(:,n);
ted=te(:,1:n-1);
tel=te(:,n);      % *d holds the features, *l the labels

knnlabel=zeros(m2,1);
for j=1:m2
    distance=zeros(m1,1);
    for i=1:m1
        distance(i)=norm(ted(j,:)-trd(i,:));    % Euclidean distance to each training sample
    end

    [~,index]=sort(distance); 
    label=trl(index(1:k));      % labels of the k nearest training samples
    knnlabel(j,1)=mode(label);  % the most frequent label is the prediction
end

bj=(knnlabel==tel);
a=nnz(bj);
A=a/m2; % classification accuracy
end
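% The double loop above can be vectorized with pdist2 (a sketch, assuming the
% Statistics and Machine Learning Toolbox is available):
%     D = pdist2(ted, trd);                 % m2-by-m1 Euclidean distance matrix
%     [~, idx] = sort(D, 2);                % neighbors sorted per test sample
%     knnlabel = mode(trl(idx(:,1:k)), 2);  % majority vote over the k nearest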
function [A_1,knnlabel_1,Probability]=knn_ave(tr,te,k)
%knn_ave:   for each class, find the k training samples of that class that are
%           nearest to the test sample; the class with the smallest average
%           distance becomes the predicted label.
%Input:     tr: training data (label in the last column)
%           te: test data (label in the last column)
%           k:  number of neighbors

%Output:    A_1: test accuracy
%           knnlabel_1: predicted labels
%           Probability: per-class scores, with the predicted label appended
if ~exist('k', 'var')
     k = 3;
end               % default to k = 3 when no k is given


    
data=[tr;te];
n=size(data,2);
m1=size(tr,1);
m2=size(te,1);    % m1: number of training samples; m2: number of test samples

trd=tr(:,1:n-1);
trl=tr(:,n);
ted=te(:,1:n-1);
tel=te(:,n);      % *d holds the features, *l the labels
num_label = size(unique(trl),1);   % number of classes

probability=zeros(size(te,1),num_label);

knnlabel_1=zeros(m2,1);
for j=1:m2
    distance=zeros(m1,1);
    for i=1:m1
        distance(i)=norm(ted(j,:)-trd(i,:));    % Euclidean distance to each training sample
    end

 [distance1,index]=sort(distance);
 x1=trl(index,end);
 distance1(:,2)=x1;      % column 1: sorted distances; column 2: corresponding labels
 di=zeros(num_label,2);
for w=1:num_label
    x2=find(distance1(:,2)==w);
    x2=x2(1:k,:);        % assumes every class has at least k training samples
    dis=distance1(x2,1);
    dis=sum(dis)/k;
    di(w,1)=dis;
    di(w,2)=w;
end                      % for each class, take its k nearest samples and average their distances

c=sum(di(:,1))./di(:,1)';
c=c/max(c,[],2);
probability(j,:)=c;      % class scores: total distance divided by each class's average distance, normalized by its maximum

b=sortrows(di,1);
knnlabel_1(j,1)=b(1,2);  % the class with the smallest average distance is the prediction
end
Probability=[probability,knnlabel_1];   % class scores with the predicted label appended
bj=(knnlabel_1==tel);
a=nnz(bj);
A_1=a/m2;                % classification accuracy
end
function [A,predict_label]=f_LDA(tr,te)

%Input:      tr: training set
%            te: testing set
%            Note: each row is an instance; the last column is the label, starting from 1
%Output:     A: testing accuracy
%            predict_label: labels predicted by LDA for the test data

data=[tr;te];
n=size(data,2);
m1=size(tr,1);m2=size(te,1);

trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);

obj = ClassificationDiscriminant.fit(trd, trl);  
predict_label   =       predict(obj, ted);  

bj=(predict_label==tel);a=nnz(bj);
A=a/m2; % classification accuracy

end
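% Note: ClassificationDiscriminant.fit is the older calling form; in newer
% MATLAB releases the equivalent call (a sketch) is
%     obj = fitcdiscr(trd, trl);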
function [A,Predict_label]=f_RF(tr,te,NTrees)

%Input:      tr: training set
%            te: testing set
%            Note: each row is an instance; the last column is the label, starting from 1
%            NTrees: number of decision trees
%Output:     A: testing accuracy
%            Predict_label: labels predicted by the random forest for the test data
    if ~exist('NTrees', 'var')
        NTrees = 50;
    end

data=[tr;te];
n=size(data,2);
m1=size(tr,1);m2=size(te,1);

trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);

Factor = TreeBagger(NTrees, trd, trl);
[Predict_label,Scores] = predict(Factor, ted);
a=0;
ssr=zeros(m2,1);                       % preallocate numeric labels
for i=1:m2
    cla=str2num(Predict_label{i,1});   % TreeBagger returns labels as strings
    ssr(i,1)=cla;
    if cla==tel(i)
        a=a+1;
    end
end
A=a/m2; % classification accuracy
Predict_label=ssr;
end
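% The conversion loop above can be shortened: TreeBagger returns labels as a
% cell array of character vectors, so (a sketch, valid for numeric class labels)
%     Predict_label = str2double(Predict_label);   % cellstr -> numeric column
%     A = nnz(Predict_label == tel) / m2;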
function [A,predict_label]=f_RSS(tr,te)

% Random subspace method (RSS) classification
%Input:      tr: training set
%            te: testing set
%            Note: each row is an instance; the last column is the label, starting from 1
%Output:     A: testing accuracy
%            predict_label: labels predicted by the subspace ensemble for the test data

data=[tr;te];
n=size(data,2);
m1=size(tr,1);m2=size(te,1);

trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);

ens = fitensemble(trd,trl,'Subspace' ,'AllPredictorCombinations','Discriminant','type','classification');  
predict_label   =       predict(ens, ted);  

bj=(predict_label==tel);a=nnz(bj);
A=a/m2; % classification accuracy

end 
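% Note: fitensemble still works, but newer MATLAB releases recommend
% fitcensemble; an equivalent call (a sketch, assuming R2016b or later) is
%     ens = fitcensemble(trd, trl, 'Method', 'Subspace', ...
%         'NumLearningCycles', 'AllPredictorCombinations', ...
%         'Learners', 'discriminant');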
function [A,predict_label]=f_Bagging(tr,te,NTrees)

%Input:      tr: training set
%            te: testing set
%            Note: each row is an instance; the last column is the label, starting from 1
%            NTrees: number of decision trees
%Output:     A: testing accuracy
%            predict_label: labels predicted by the bagged ensemble for the test data

    if ~exist('NTrees', 'var')
        NTrees = 50;
    end


data=[tr;te];
n=size(data,2);
m1=size(tr,1);m2=size(te,1);

trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);

ens = fitensemble(trd,trl,'Bag' ,NTrees,'tree','type','classification');  
predict_label   =       predict(ens, ted);  

bj=(predict_label==tel);a=nnz(bj);
A=a/m2; % classification accuracy

end
function [A,predict_label]=f_Boosting(tr,te,NTrees)

%Input:      tr: training set
%            te: testing set
%            Note: each row is an instance; the last column is the label, starting from 1
%            NTrees: number of decision trees
%Output:     A: testing accuracy
%            predict_label: labels predicted by boosting for the test data

    if ~exist('NTrees', 'var')
        NTrees = 50;
    end

data=[tr;te];
n=size(data,2);
m1=size(tr,1);m2=size(te,1);

trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);

L=unique(data(:,end));      % distinct class labels
ls=length(L(:));            % number of classes
if ls==2
    str='AdaBoostM1';       % binary classification
else
    str='AdaBoostM2';       % multiclass classification
end

ens = fitensemble(trd,trl,str ,NTrees,'tree','type','classification');  
predict_label   =       predict(ens, ted);  

bj=(predict_label==tel);a=nnz(bj);
A=a/m2; % classification accuracy

end
function [A,Predict_label]=f_LR(tr,te)

%Input:      tr: training set
%            te: testing set
%            Note: each row is an instance; the last column is the label, starting from 1
%Output:     A: testing accuracy
%            Predict_label: labels predicted by logistic regression for the test data

data=[tr;te];
n=size(data,2);
m1=size(tr,1);m2=size(te,1);

trd=tr(:,1:n-1);trl=tr(:,n);    tr_l=dummyvar(trl);   % one-hot encode the labels for mnrfit
ted=te(:,1:n-1);tel=te(:,n);

[B1,dev1,stats1] = mnrfit(trd,tr_l);    % fit a multinomial logistic regression
pihat1 = mnrval(B1,ted);                % predicted class probabilities
Predict_label=zeros(m2,1);
for i2=1:m2
    [q1,v1]=max(pihat1(i2,:));          % most probable class wins
    Predict_label(i2,1)=v1;
end

bj=(Predict_label==tel);a=nnz(bj);
A=a/m2; % classification accuracy

end
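% The argmax loop above can also be written as a single vectorized call with
% the same result:
%     [~, Predict_label] = max(pihat1, [], 2);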
function [accuracy,nblabel] = f_NB(Train, Test)

%Input:      Train: training set
%            Test: testing set
%            Note: each row is an instance; the last column is the label, starting from 1
%Output:     accuracy: testing accuracy
%            nblabel: labels predicted by naive Bayes for the test data

Train_sample = Train(:,1:end-1);
Train_label = Train(:,end);
Test_sample = Test(:,1:end-1);
Test_label = Test(:,end);
Class_num = length(unique(Train_label));
Feature_num = size(Train_sample,2);
Para_mean = cell(1,Class_num);      % per-class mean of each feature
Para_dev = cell(1,Class_num);       % per-class standard deviation of each feature
Sample_byclass = cell(1,Class_num); % training samples regrouped by class
Prior_prob = zeros(1,Class_num);    % prior probability of each class
for i=1:1:size(Train_sample,1)
    Sample_byclass{1,Train_label(i,1)} = [Sample_byclass{1,Train_label(i,1)}; Train_sample(i,:)];
    Prior_prob(1,Train_label(i,1)) = Prior_prob(1,Train_label(i,1)) + 1;
end
Prior_prob = Prior_prob/size(Train_sample,1);
for i=1:1:Class_num
     miu = mean(Sample_byclass{1,i});
     delta = std(Sample_byclass{1,i});   
     Para_mean{1,i} = miu;
     Para_dev{1,i} = delta;
end
nblabel = [];
for i=1:1:size(Test_sample,1)
     prob = log(Prior_prob);    % start from the log prior of each class
     for j=1:1:Class_num
         for k=1:1:Feature_num
             if Para_dev{1,j}(1,k) == 0
                 Para_dev{1,j}(1,k) = 0.1667;   % guard against zero std with a small smoothing constant
             end
             prob(1,j) = prob(1,j) - (Test_sample(i,k)-Para_mean{1,j}(1,k))^2/(2*Para_dev{1,j}(1,k)^2) - log(Para_dev{1,j}(1,k));   % Gaussian log-likelihood up to a constant
         end
     end
     [value,index] = max(prob);
     nblabel = [nblabel ; index];
end
accuracy = length(find(nblabel - Test_label ==0))/length(Test_label);
end
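% Math note: for each class j the loop accumulates the Gaussian naive Bayes
% log posterior up to an additive constant,
%     log p(j|x) ~ log P(j) - sum_k [ (x_k - mu_jk)^2 / (2*sigma_jk^2) + log(sigma_jk) ],
% dropping the per-feature -log(sqrt(2*pi)) term, which is identical for every
% class and does not change the argmax.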
