随机森林、决策树的MATLAB及Python完整代码实现

## 随机森林、决策树的MATLAB及Python完整代码实现

代码1:自编程序csdn1.m

% Driver script: train the hand-written random forest on the training sheet
% and print its classification accuracy on the test sheet.

% Load both spreadsheets as numeric matrices. The helper functions below
% read the class label from the last column of each matrix.
dataTrain = xlsread('NSL_train.xlsx');
dataTest  = xlsread('NSL_test.xlsx');

% Forest hyper-parameters.
treeNum    = 10;     % number of trees in the forest
featureNum = 10;     % feature columns drawn at random for each tree
dataNum    = 1000;   % training rows drawn at random for each tree

y = RF(treeNum, featureNum, dataNum, dataTrain, dataTest);
fprintf('\n*****随机森林分类准确率为:%f***\n', y);
function [accuracy] = RF(treeNum ,featureNum,dataNum ,dataTrain,dataTest)
% RF  Train a random forest and score it on a test set.
%   treeNum    - number of trees to grow
%   featureNum - number of feature columns sampled per tree
%   dataNum    - number of training rows sampled per tree
%   dataTrain  - training matrix, label in the last column
%   dataTest   - test matrix, label in the last column
%   accuracy   - fraction of test rows whose voted label equals the true label

    % One random row/column subset per tree, plus the columns each tree saw.
    [treeSamples, treeFeatures] = dataSet(dataTrain, treeNum, featureNum, dataNum);

    % Grow the trees, let them vote on the test rows, then score the votes.
    forest      = buildRandForest(treeSamples, treeNum);
    votedLabels = RFprection(forest, treeFeatures, dataTest);
    accuracy    = calAccuracy(dataTest, votedLabels);
end

function RF = buildRandForest(dataTrain,treeNum)
% BUILDRANDFOREST  Grow one CART tree per slice of a 3-D training array.
%   dataTrain - rows x columns x treeNum array; slice a is the sample for tree a
%   treeNum   - number of trees to grow
%   RF        - 1 x treeNum struct array holding the root node of every tree
%
% Struct arrays can only be concatenated when every element has the same set
% of fields. A tree whose root is a leaf carries fewer fields than a tree
% whose root is a split, so each root is padded with any missing node field
% (set to []) before it is appended.

     nodeFields = {'bestFeature','value','rightBranch','leftBranch','leaf'};
     RF = [];

     fprintf('*********正在训练随机森林,共%d棵树**********\n',treeNum);
     for a = 1:treeNum
         root = buildCartTree(dataTrain(:,:,a),0);

         % Pad missing fields so roots of different shapes can share one array.
         for f = 1:numel(nodeFields)
             if ~isfield(root,nodeFields{f})
                 root.(nodeFields{f}) = [];
             end
         end

         fprintf('++++++第%d棵树训练完成\n',a);
         RF = [RF,root]; %#ok<AGROW> treeNum is small; growth cost is negligible
         fprintf('===============================\n');
     end
     fprintf('************随机森林训练完成!*******\n');
end

function  note = buildCartTree(data,k)
% BUILDCARTTREE  Recursively grow a CART classification tree with Gini gain.
%   data - sample matrix; columns 1..n-1 are features, column n is the label
%   k    - depth of the parent call (the top-level caller passes 0)
%   note - node struct with fields
%            bestFeature : column index used for the split ([] on a leaf)
%            value       : split threshold ([] on a leaf)
%            rightBranch : subtree for rows with feature >= value
%            leftBranch  : subtree for rows with feature <  value
%            leaf        : predicted label on a leaf, [] on a split node
%
% Every node carries the same five fields so that nodes of different kinds
% can be stored together in one struct array. A leaf predicts the majority
% label of the rows that reached it (mode; ties resolve to the smallest
% label), because a node with no useful split is not necessarily pure.

   k = k + 1;
   [m,n] = size(data);
   nodeTemplate = struct('bestFeature',[],'value',[], ...
                         'rightBranch',[],'leftBranch',[],'leaf',[]);

   % No samples at all: return a leaf that predicts NaN ("unknown").
   if m == 0
      note = nodeTemplate;
      note.leaf = NaN;
      return;
   end

   currentGini = calGiniIndex(data);
   bestGini = 0;
   bestFeature = [];
   rightData = [];
   leftData = [];
   featureNum = n - 1;

   % Try every distinct value of every feature as a ">= value" threshold.
   for a = 1:featureNum
       feature_values = unique(data(:,a));
       for b = 1:numel(feature_values)
           threshold = feature_values(b);
           [D1,D2] = splitData(data,a,threshold);
           m2 = size(D1,1);
           m3 = size(D2,1);

           % A split that leaves one side empty separates nothing.
           if m2 == 0 || m3 == 0
               continue;
           end

           % Size-weighted impurity of the children and the resulting gain.
           nowGini = (m2*calGiniIndex(D1) + m3*calGiniIndex(D2))/m;
           gGini = currentGini - nowGini;

           if gGini > bestGini
               bestGini = gGini;
               bestFeature = [a,threshold];
               rightData = D1;
               leftData = D2;
           end
       end
   end

   note = nodeTemplate;
   if bestGini > 0
       % Split node: recurse into both halves.
       note.bestFeature = bestFeature(1,1);
       note.value = bestFeature(1,2);
       note.rightBranch = buildCartTree(rightData,k);
       note.leftBranch = buildCartTree(leftData,k);
   else
       % No split reduces impurity: predict the majority label.
       note.leaf = mode(data(:,n));
   end
end
function  [dataAll,featureAll] = dataSet(dataTrain,treeNum,featureNum,dataNum)
% DATASET  Draw one random training subset per tree.
%   dataAll    - dataNum x (featureNum+1) x treeNum; slice t is the sample
%                for tree t, with the label in its last column
%   featureAll - featureNum x 1 x treeNum; slice t lists the dataTrain
%                columns that tree t was given

   featureAll = zeros(featureNum,1,treeNum);
   dataAll = zeros(dataNum,featureNum+1,treeNum);

   for treeIdx = 1:treeNum
       [subset,pickedCols] = chooseSample(dataTrain,featureNum,dataNum);
       % chooseSample returns the column list as a row; store it as a column.
       featureAll(:,:,treeIdx) = pickedCols';
       dataAll(:,:,treeIdx) = subset;
   end
end
function RF_prection_ = RFprection(RF,featureGrounp,dataTrain)
% RFPRECTION  Predict labels by majority vote over all trees of the forest.
%   RF            - 1 x treeNum struct array of tree roots
%   featureGrounp - featureNum x 1 x treeNum; slice t lists the columns tree t uses
%   dataTrain     - matrix of samples to classify (despite its name, the
%                   caller in this file passes the test set)
%   RF_prection_  - column vector, one voted label per sample row
%
% The vote matrix is preallocated instead of being grown one element at a
% time, which avoids quadratic copying on large sample counts.

     treeCount = size(RF,2);
     sampleCount = size(dataTrain,1);
     votes = zeros(sampleCount,treeCount);

     for a = 1:treeCount
         tree = RF(:,a);
         feature = featureGrounp(:,:,a);
         % Keep only the columns this tree was trained on, in training order.
         data = splitData2(dataTrain,feature);
         for b = 1:sampleCount
             votes(b,a) = prection(tree,data(b,:));
         end
     end

     % Row-wise mode = most frequent label across the trees.
     RF_prection_ = mode(votes,2);
end
function [Data1,Data2] = splitData(data,fea,value)
% SPLITDATA  Partition the rows of data on one feature threshold.
%   data  - sample matrix
%   fea   - index of the column to test
%   value - threshold
%   Data1 - rows where data(:,fea) >= value
%   Data2 - rows where data(:,fea) <  value
%
% Input with zero rows yields two empty outputs with the same column count.
% (Both outputs were previously left unassigned in that case, which raised
% an error in the caller.) Rows whose tested value is NaN satisfy neither
% comparison and appear in neither output.

     if size(data,1) == 0
         Data1 = data;
         Data2 = data;
         return;
     end

     column = data(:,fea);
     Data1 = data(column >= value,:);
     Data2 = data(column < value,:);
end
function data = splitData2(dataTrain,feature)
% SPLITDATA2  Extract the listed columns of dataTrain, in the listed order.
%   dataTrain - sample matrix
%   feature   - column vector of column indices
%   data      - size(dataTrain,1) x size(feature,1) matrix of the chosen columns

   rowCount = size(dataTrain,1);
   pickCount = size(feature,1);

   % Fixed-size buffer first; the assignment then requires a matching shape.
   data = zeros(rowCount,pickCount);
   data(:,:) = dataTrain(:,feature);
end
function [data,feature] = chooseSample(data1,featureNum,dataNum)
% CHOOSESAMPLE  Draw a random subset of rows and feature columns.
%   data1      - full matrix; the last column holds the label
%   featureNum - number of feature columns to draw (without repetition)
%   dataNum    - number of rows to draw (without repetition)
%   data       - dataNum x (featureNum+1): chosen features followed by the label
%   feature    - 1 x featureNum row of the chosen column indices

   [rowCount,colCount] = size(data1);

   % Shuffle the feature columns (label column excluded), keep the first few.
   colOrder = randperm(colCount-1);
   feature = colOrder(1,1:featureNum);

   % Shuffle the row indices and keep the first dataNum of them.
   rowOrder = randperm(rowCount);
   pickedRows = zeros(1,dataNum);
   pickedRows(1,:) = rowOrder(1,1:dataNum);

   % Chosen features with the label column appended on the right.
   data = [data1(pickedRows,feature),data1(pickedRows,colCount)];
end
function Gini = calGiniIndex(data)
% CALGINIINDEX  Gini impurity of the labels in the last column of data.
%   Returns 1 - sum(count_c^2) / m^2 over the distinct labels c, where m is
%   the number of rows. Returns 0 when data has no rows.

    sampleCount = size(data,1);
    Gini = 0;
    if sampleCount ~= 0
        classTable = labels_num2(data);
        % The per-class counts sit in the last column of the table.
        counts = classTable(:,size(classTable,2));
        Gini = 1 - sum(counts.^2)/(sampleCount^2);
    end
end

% Count how many samples carry each distinct label.
function labelsNum = labels_num2(data)
% LABELS_NUM2  Tabulate the labels found in the last column of data.
%   labelsNum - one row per distinct label, sorted ascending:
%               column 1 is the label, column 2 its number of occurrences.
%               Returns the scalar 0 when data has no rows.

    [rowCount,colCount] = size(data);
    if rowCount == 0
        labelsNum = 0;
        return;
    end

    labelColumn = data(:,colCount);
    classes = unique(labelColumn,'sorted');
    classCount = size(classes,1);

    labelsNum = zeros(classCount,2);
    labelsNum(:,1) = classes(:,1);
    for idx = 1:classCount
        labelsNum(idx,2) = nnz(labelColumn == classes(idx,1));
    end
end

function A = prection(RF_single,sample)
% PRECTION  Classify one sample by walking a single tree from root to leaf.
%   RF_single - root node struct of the tree
%   sample    - row vector of feature values, indexed by node.bestFeature
%   A         - label stored in the leaf that the sample reaches

    node = RF_single;

    % A node with an empty 'leaf' field is a split node: keep descending.
    while isempty(node.leaf)
        testedValue = sample(1,node.bestFeature);
        if testedValue >= node.value
            node = node.rightBranch;
        else
            node = node.leftBranch;
        end
    end

    A = node.leaf;
end
function accuracy = calAccuracy(dataTest,RF_prection)
% CALACCURACY  Fraction of rows whose predicted label equals the true label.
%   dataTest    - test matrix; the true label is its last column
%   RF_prection - predicted labels, compared element-wise with the true ones
%   accuracy    - number of matches divided by the number of rows

      sampleCount = size(dataTest,1);
      trueLabels = dataTest(:,size(dataTest,2));
      hits = (trueLabels == RF_prection);
      accuracy = sum(hits)/sampleCount;
end

代码2:TreeBagger

% Train the model: a bagged classification ensemble of treeNumber trees that
% samples featureNum predictors per split, with out-of-bag predictor
% importance enabled.
Factor = TreeBagger(treeNumber, train, trainLabel, ...
    'Method', 'classification', ...
    'NumPredictorsToSample', featureNum, ...
    'OOBpredictorImportance', 'on');

% Predict labels and scores for the test set.
% NOTE(review): the original comment mentioned k-fold cross-validation, but
% no cross-validation is performed in the lines shown here.
[Predict_label, Scores] = predict(Factor, test);

代码3:Sklearn

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn import metrics  # 分类结果评价函数

# Dataset loading, cross-validation, etc. are omitted here; x_train, y_train,
# x_selected_test_3 and y_test must already be defined by that omitted code.
forest_1 = RandomForestClassifier(n_estimators=2000, random_state=10, n_jobs=-1, oob_score=True)
forest_1.fit(x_train, y_train)

expected = y_test  # ground-truth labels of the test samples
# NOTE(review): the model is fitted on x_train but evaluated on
# x_selected_test_3 -- confirm both hold the same feature columns.
predicted = forest_1.predict(x_selected_test_3)  # predicted test labels

# Per-class precision, recall and F1, followed by the confusion matrix.
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

# ROC AUC needs a continuous score, not hard 0/1 predictions; use the
# predicted probability of the second class.
# NOTE(review): assumes a binary problem whose positive class is classes_[1].
positive_scores = forest_1.predict_proba(x_selected_test_3)[:, 1]
auc = metrics.roc_auc_score(expected, positive_scores)
accuracy = metrics.accuracy_score(expected, predicted)  # overall accuracy
print("RF_AUC: %.4f" % auc)
print("RF_Accuracy: %.2f%%" % (accuracy * 100.0))

你可能感兴趣的:(机器学习入门,python,决策树,随机森林,matlab)