Shark Source Code Analysis (11): The Random Forest Algorithm

For background on the algorithm itself, see my earlier post on ensemble methods. Since CART is the only decision-tree algorithm implemented in Shark, the random forest implementation is built on CART as well. If you have already read my post analyzing the CART source code, you will notice that the random forest code is largely the same; the main addition is that when searching for the best splitting attribute, a random candidate subset of attributes is drawn first, which is one of the defining features of random forests. Because CART handles both classification and regression, the CART-based random forest supports both kinds of task; here we only walk through the classification code.

The MeanModel Class

This class can be regarded as the base class of the ensemble methods: it specifies how the outputs of several base learners are combined. It is defined in <shark/Models/MeanModel.h>.

template<class ModelType> // ModelType is the type of the base learners
class MeanModel : public AbstractModel<typename ModelType::InputType, typename ModelType::OutputType>
{
private:
    typedef AbstractModel<typename ModelType::InputType, typename ModelType::OutputType> base_type;
public:

    MeanModel():m_weightSum(0){}

    std::string name() const
    { return "MeanModel"; }

    using base_type::eval;
    // Output the ensemble's result; like a single decision tree, it is a vector of per-class membership probabilities
    void eval(typename base_type::BatchInputType const& patterns, typename base_type::BatchOutputType& outputs)const{
        m_models[0].eval(patterns,outputs);
        outputs *=m_weight[0];
        for(std::size_t i = 1; i != m_models.size(); i++) 
            noalias(outputs) += m_weight[i] * m_models[i](patterns);
        outputs /= m_weightSum;
    }

    void eval(typename base_type::BatchInputType const& patterns, typename base_type::BatchOutputType& outputs, State& state)const{
        eval(patterns,outputs);
    }

    RealVector parameterVector() const {
        return RealVector();
    }

    void setParameterVector(const RealVector& param) {
        SHARK_ASSERT(param.size() == 0);
    }

    void read(InArchive& archive){
        archive >> m_models;
        archive >> m_weight;
        archive >> m_weightSum;
    }

    void write(OutArchive& archive)const{
        archive << m_models;
        archive << m_weight;
        archive << m_weightSum;
    }

    void clearModels(){
        m_models.clear();
        m_weight.clear();
        m_weightSum = 0.0;
    }

    // Add a base learner
    void addModel(ModelType const& model, double weight = 1.0){
        SHARK_CHECK(weight > 0, "Weights must be positive");
        m_models.push_back(model);
        m_weight.push_back(weight);
        m_weightSum+=weight;
    }

    double const& weight(std::size_t i)const{
        return m_weight[i];
    }

    void setWeight(std::size_t i, double newWeight){
        m_weightSum += newWeight - m_weight[i]; // keep the running sum consistent (a plain assignment here would drop the other weights)
        m_weight[i] = newWeight;
    }

    std::size_t numberOfModels()const{
        return m_models.size();
    }

protected:
    // All the base learners; this class requires them to share one type, although in general an ensemble may mix different learner types
    std::vector<ModelType> m_models;

    // Weights of the individual base learners
    std::vector<double> m_weight;

    // Sum of all weights
    double m_weightSum;
};
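
To make the weighted averaging in eval concrete, here is a small self-contained sketch (plain STL, not Shark code) that reproduces the computation for two base learners whose outputs on one sample are assumed to be (0.8, 0.2) and (0.3, 0.7), added with weights 1 and 3:

#include <cstddef>
#include <iostream>
#include <vector>

int main(){
    // Outputs of two base learners for a single sample (illustrative values)
    std::vector<std::vector<double> > outputs = {{0.8, 0.2}, {0.3, 0.7}};
    std::vector<double> weight = {1.0, 3.0}; // as registered via addModel
    double weightSum = 4.0;                  // corresponds to m_weightSum

    // The same weighted mean that MeanModel::eval computes
    std::vector<double> mean(2, 0.0);
    for(std::size_t i = 0; i != outputs.size(); ++i)
        for(std::size_t k = 0; k != mean.size(); ++k)
            mean[k] += weight[i] * outputs[i][k];
    for(std::size_t k = 0; k != mean.size(); ++k)
        mean[k] /= weightSum;

    std::cout << mean[0] << " " << mean[1] << std::endl; // prints 0.425 0.575
}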

The RFClassifier Class

This class represents a random forest; it is defined in <shark/Models/Trees/RFClassifier.h>.

class RFClassifier : public MeanModel<CARTClassifier<RealVector> >
{
public:
    std::string name() const
    { return "RFClassifier"; }

    // Compute the model's average OOB error: sum the base learners' OOB errors and divide by the number of base learners
    void computeOOBerror(){
        std::size_t n_trees = numberOfModels();
        m_OOBerror = 0;
        for(std::size_t j=0;j!=n_trees;++j){
            m_OOBerror += m_models[j].OOBerror();
        }
        m_OOBerror /= n_trees;
    }

    // Combine the per-dimension importances of the base learners into the per-dimension importances of the ensemble
    void computeFeatureImportances(){
        m_featureImportances.resize(m_inputDimension);
        std::size_t n_trees = numberOfModels();

        for(std::size_t i=0;i!=m_inputDimension;++i){
            m_featureImportances[i] = 0;
            for(std::size_t j=0;j!=n_trees;++j){
                m_featureImportances[i] += m_models[j].featureImportances()[i];
            }
            m_featureImportances[i] /= n_trees;
        }
    }

    double const OOBerror() const {
        return m_OOBerror;
    }

    RealVector const& featureImportances() const {
        return m_featureImportances;
    }

    // Count, over all base learners, how often each feature was used as a splitting attribute
    UIntVector countAttributes() const {
        std::size_t n = m_models.size();
        if(!n) return UIntVector();
        UIntVector r = m_models[0].countAttributes();
        for(std::size_t i=1; i< n; i++ ) {
            noalias(r) += m_models[i].countAttributes();
        }
        return r;
    }

    void setLabelDimension(std::size_t in){
        m_labelDimension = in;
    }

    void setInputDimension(std::size_t in){
        m_inputDimension = in;
    }

    typedef CARTClassifier<RealVector>::TreeType TreeType;
    typedef std::vector<TreeType> ForestInfo; // all trees of one forest

    // Collect all decision trees of the forest
    ForestInfo getForestInfo() const {
        ForestInfo finfo(m_models.size());
        for (std::size_t i=0; i<m_models.size(); ++i)
            finfo[i] = m_models[i].getTree();
        return finfo;
    }

    // Reset m_models from finfo, and m_weight from weights
    void setForestInfo(ForestInfo const& finfo, std::vector<double> const& weights = std::vector<double>()) {
        std::size_t n_tree = finfo.size();
        std::vector<double> we(weights);
        m_models.resize(n_tree);
        if (weights.empty()) // set default weights to 1
            we.resize(n_tree, 1);
        else if (weights.size() != n_tree)
            throw SHARKEXCEPTION("Weights must be the same number as trees");

        for (std::size_t i=0; i<n_tree; ++i)
            m_models[i].setTree(finfo[i]);
        m_weight = we;
        m_weightSum = std::accumulate(we.begin(), we.end(), 0.0);
    }

protected:
    // In classification, the number of classes; in regression, the dimension of the labels
    std::size_t m_labelDimension;

    std::size_t m_inputDimension;

    // out-of-bag (OOB) error
    double m_OOBerror;

    // importance of each input dimension
    RealVector m_featureImportances;

};
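
Together, getForestInfo and setForestInfo allow the raw trees to be pulled out of one forest and used to rebuild another. A minimal sketch (the function name is made up; it assumes a model trained as in the example at the end of this post):

// Copy a trained forest through its ForestInfo; each tree gets weight 1 again.
RFClassifier cloneForest(RFClassifier const& model){
    RFClassifier::ForestInfo finfo = model.getForestInfo();
    RFClassifier copy;
    copy.setForestInfo(finfo);
    return copy;
}

Note that setForestInfo only restores the trees and their weights; members such as m_inputDimension and m_labelDimension have to be set separately via the setters above.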

The RFTrainer Class

When building a random forest, one typically grows at least 100 decision trees. To combine the outputs of the individual trees, classification problems commonly use majority voting, while regression problems use averaging. In Shark the vote is "soft": MeanModel averages the trees' class-probability histograms, and the predicted class is the arg max of that average.
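
The following toy program (plain STL, illustrative numbers) shows that this soft vote can disagree with a hard majority vote when one tree is much more confident than the others:

#include <cstddef>
#include <cstdio>
#include <vector>

int main(){
    // Per-class probability outputs of three hypothetical trees
    std::vector<std::vector<double> > trees = {{0.9, 0.1}, {0.4, 0.6}, {0.3, 0.7}};
    std::vector<double> mean(2, 0.0);
    int hardVotes[2] = {0, 0};
    for(std::size_t i = 0; i != trees.size(); ++i){
        for(std::size_t k = 0; k != mean.size(); ++k)
            mean[k] += trees[i][k] / trees.size();
        ++hardVotes[trees[i][1] > trees[i][0] ? 1 : 0];
    }
    // Two of the three trees prefer class 1, but the confident first tree tips the average.
    std::printf("hard majority vote: class %d\n", hardVotes[1] > hardVotes[0] ? 1 : 0); // class 1
    std::printf("soft (averaged) vote: class %d\n", mean[1] > mean[0] ? 1 : 0);         // class 0
}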

Tree construction stops once the stopping condition is met, and the trees are not pruned afterwards, since the Bagging ensemble already keeps the variance well under control.

The class is declared in <shark/Algorithms/Trainers/RFTrainer.h> and implemented in the corresponding RFTrainer.cpp. It contains code for both classification and regression; here we only discuss the classification-related parts.

class RFTrainer
: public AbstractTrainer<RFClassifier, unsigned int>
, public AbstractTrainer<RFClassifier>
, public IParameterizable
{

public:
    SHARK_EXPORT_SYMBOL RFTrainer(bool computeFeatureImportances = false, bool computeOOBerror = false){
        m_try = 0;
        m_B = 0;
        m_nodeSize = 0;
        m_OOBratio = 0;
        m_regressionLearner = false;
        m_computeFeatureImportances = computeFeatureImportances;
        m_computeOOBerror = computeOOBerror;
    }

    std::string name() const
    { return "RFTrainer"; }

    // Train a random forest for a classification task
    SHARK_EXPORT_SYMBOL void train(RFClassifier& model, ClassificationDataset const& dataset){
        model.clearModels();

        m_inputDimension = inputDimension(dataset);

        model.setInputDimension(m_inputDimension);
        model.setLabelDimension(numberOfClasses(dataset));

        m_maxLabel = static_cast<unsigned int>(numberOfClasses(dataset))-1;

        m_regressionLearner = false;
        setDefaults();

        std::size_t subsetSize = static_cast<std::size_t>(dataset.numberOfElements()*m_OOBratio);
        DataView<ClassificationDataset const> elements(dataset);

        //Generate m_B trees
        SHARK_PARALLEL_FOR(int i = 0; i < (int)m_B; ++i){
            // For every new tree, shuffle the order of the training data before drawing that tree's training subset
            std::vector<std::size_t> subsetIndices(dataset.numberOfElements());
            boost::iota(subsetIndices,0);
            boost::random_shuffle(subsetIndices);

            // create oob indices
            std::vector<std::size_t>::iterator oobStart = subsetIndices.begin() + subsetSize;
            std::vector<std::size_t>::iterator oobEnd   = subsetIndices.end();

            // Assemble the training set for this particular tree
            subsetIndices.erase(oobStart, oobEnd);
            ClassificationDataset dataTrain = toDataset(subset(elements,subsetIndices));

            //Create attribute tables
            boost::unordered_map<std::size_t, std::size_t> cAbove;
            AttributeTables tables;
            createAttributeTables(dataTrain.inputs(), tables);
            createCountMatrix(dataTrain, cAbove);

            CARTClassifier<RealVector>::TreeType tree = buildTree(tables, dataTrain, cAbove, 0);
            CARTClassifier<RealVector> cart(tree, m_inputDimension); // save this decision tree

            // if oob error or importances have to be computed, create an oob sample
            if(m_computeOOBerror || m_computeFeatureImportances){
                std::vector<std::size_t> subsetIndicesOOB(oobStart, oobEnd);
                ClassificationDataset dataOOB = toDataset(subset(elements, subsetIndicesOOB));

                // If either quantity is requested, it is first computed per tree here; the ensemble-level
                // values are aggregated afterwards. Computing the feature importances also computes the
                // tree's OOB error, which is why the else branch below suffices.
                if(m_computeFeatureImportances){
                    cart.computeFeatureImportances(dataOOB);
                }
                else{
                    cart.computeOOBerror(dataOOB);
                }
            }

            SHARK_CRITICAL_REGION{
                model.addModel(cart); // add this tree to the ensemble
            }
        }

        if(m_computeOOBerror){
            model.computeOOBerror();
        }

        if(m_computeFeatureImportances){
            model.computeFeatureImportances();
        }
    }

    /// Train a random forest for regression.
    SHARK_EXPORT_SYMBOL void train(RFClassifier& model, RegressionDataset const& dataset);

    /// Set the number of random attributes to investigate at each node.
    SHARK_EXPORT_SYMBOL void setMTry(std::size_t mtry);

    /// Set the number of trees to grow.
    SHARK_EXPORT_SYMBOL void setNTrees(std::size_t nTrees);

    /// Controls when a node is considered pure. If set to 1, a node is pure
    /// when it only consists of a single sample.
    SHARK_EXPORT_SYMBOL void setNodeSize(std::size_t nTrees);

    /// Set the fraction of the original training dataset to use as the
    /// out of bag sample. The default value is 0.66.
    SHARK_EXPORT_SYMBOL void setOOBratio(double ratio);

    /// Return the parameter vector.
    RealVector parameterVector() const
    {
        RealVector ret(1); // number of trees
        init(ret) << (double)m_B;
        return ret;
    }

    /// Set the parameter vector.
    void setParameterVector(RealVector const& newParameters)
    {
        SHARK_ASSERT(newParameters.size() == numberOfParameters());
        setNTrees((size_t) newParameters[0]);
    }

protected:
    struct RFAttribute {
        double value; // value of one data point in one dimension
        std::size_t id; // index of that data point in the dataset
    };

    // the values of all data points in one dimension
    typedef std::vector < RFAttribute > AttributeTable;
    // the values of all data points in every dimension
    typedef std::vector < AttributeTable > AttributeTables;

    // Tabulate the values of all data points in every dimension
    SHARK_EXPORT_SYMBOL void createAttributeTables(Data<RealVector> const& dataset, AttributeTables& tables){
        std::size_t elements = dataset.numberOfElements();
        //Each entry in the outer vector is an attribute table
        AttributeTable table;
        RFAttribute a;
        for(std::size_t j=0; j<m_inputDimension; j++){ // for each attribute
            table.clear();
            for(std::size_t i=0; i<elements; i++){ // for each data point
                //Store attribute value and row id
                a.value = dataset.element(i)[j];
                a.id = i;
                table.push_back(a);
            }
            std::sort(table.begin(), table.end(), tableSort); // keeping each table sorted speeds up the later search for the best split
            tables.push_back(table);
        }
    }
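
    // Toy illustration (hypothetical data): for the 2-D dataset {(2,9), (5,1), (3,7)}
    // the two sorted attribute tables come out as
    //   attribute 0: (2, id 0) (3, id 2) (5, id 1)
    //   attribute 1: (1, id 1) (7, id 2) (9, id 0)
    // where id records which data point each value belongs to.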

    // Count how often each class label occurs, stored in cAbove
    SHARK_EXPORT_SYMBOL void createCountMatrix(ClassificationDataset const& dataset, boost::unordered_map<std::size_t, std::size_t>& cAbove){
        std::size_t elements = dataset.numberOfElements();
        for(std::size_t i = 0 ; i < elements; i++){
            cAbove[dataset.element(i).label]++;
        }
    }

    // Split attribute tables into left and right parts.
    SHARK_EXPORT_SYMBOL void splitAttributeTables(AttributeTables const& tables, std::size_t index, std::size_t valIndex, AttributeTables& LAttributeTables, AttributeTables& RAttributeTables){
        AttributeTable table;

        // Partition the data in this node according to the best splitting attribute and split point
        boost::unordered_map<std::size_t, bool> hash; // fast lookup of whether a data point goes to the left or the right child;
                                                      // since every table is scanned in order, the left and right tables stay sorted
        for(std::size_t i = 0; i< tables[index].size(); i++){
            hash[tables[index][i].id] = (i<=valIndex);
        }

        for(std::size_t i = 0; i < tables.size(); i++){
            LAttributeTables.push_back(table);
            RAttributeTables.push_back(table);
            for(std::size_t j = 0; j < tables[i].size(); j++){
                if(hash[tables[i][j].id]){
                    //Left
                    LAttributeTables[i].push_back(tables[i][j]);
                }else{
                    //Right
                    RAttributeTables[i].push_back(tables[i][j]);
                }
            }
        }
    }

    // Build one decision tree
    SHARK_EXPORT_SYMBOL CARTClassifier<RealVector>::TreeType buildTree(AttributeTables& tables, ClassificationDataset const& dataset, boost::unordered_map<std::size_t, std::size_t>& cAbove, std::size_t nodeId){
        CARTClassifier<RealVector>::TreeType lTree, rTree; // left and right subtrees of this node

        CARTClassifier<RealVector>::NodeInfo nodeInfo; // information for a single node of the tree

        nodeInfo.nodeId = nodeId;
        nodeInfo.attributeIndex = 0;
        nodeInfo.attributeValue = 0.0;
        nodeInfo.leftNodeId = 0;
        nodeInfo.rightNodeId = 0;
        nodeInfo.misclassProp = 0.0;
        nodeInfo.r = 0;
        nodeInfo.g = 0.0;

        std::size_t n = tables[0].size(); // number of data points available at this node

        bool isLeaf = false;
        if(gini(cAbove,tables[0].size())==0 || n <= m_nodeSize){
            isLeaf = true;
        }else{
            boost::unordered_map<std::size_t, std::size_t> cBelow, cBestBelow, cBestAbove;

            std::set<std::size_t> tableIndicies; // random candidate attribute set for this split
            generateRandomTableIndicies(tableIndicies);

            std::size_t bestAttributeIndex, bestAttributeValIndex;

            double bestAttributeVal;
            double bestImpurity = n+1.0;

            // Search for the best splitting attribute
            for (std::set<std::size_t>::iterator it=tableIndicies.begin() ; it != tableIndicies.end(); it++ ){
                std::size_t attributeIndex = *it;
                boost::unordered_map<std::size_t, std::size_t> cTmpAbove = cAbove;
                cBelow.clear();

                // For this attribute, search for the best split point
                for(std::size_t i=1; i<n; i++){
                    std::size_t prev = i-1;

                    //Update the count of the label
                    cBelow[dataset.element(tables[attributeIndex][prev].id).label]++;
                    cTmpAbove[dataset.element(tables[attributeIndex][prev].id).label]--;

                    // Never split between equal attribute values, which would send identical values to different children
                    if(tables[attributeIndex][prev].value!=tables[attributeIndex][i].value){
                        std::size_t n1 = i;
                        std::size_t n2 = n-n1;

                        //Calculate the Gini impurity of the split
                        double impurity = n1*gini(cBelow,n1)+n2*gini(cTmpAbove,n2);
                        if(impurity < bestImpurity){
                            //Found a purer split, store the attribute index and value
                            bestImpurity = impurity;
                            bestAttributeIndex = attributeIndex;
                            bestAttributeValIndex = prev;
                            bestAttributeVal = tables[attributeIndex][bestAttributeValIndex].value;
                            cBestAbove = cTmpAbove;
                            cBestBelow = cBelow;
                        }
                    }
                }
            }

            // If a best splitting attribute and split point were found, construct an internal node
            if(bestImpurity < n+1){
                AttributeTables rTables, lTables;
                splitAttributeTables(tables, bestAttributeIndex, bestAttributeValIndex, lTables, rTables);
                tables.clear();

                nodeInfo.attributeIndex = bestAttributeIndex;
                nodeInfo.attributeValue = bestAttributeVal;
                nodeInfo.leftNodeId = 2*nodeId+1; // children are numbered as in a binary heap
                nodeInfo.rightNodeId = 2*nodeId+2;

                lTree = buildTree(lTables, dataset, cBestBelow, nodeInfo.leftNodeId);
                rTree = buildTree(rTables, dataset, cBestAbove, nodeInfo.rightNodeId);
            }else{
                // If no split is possible, mark the node as a leaf
                isLeaf = true;
            }
        }

        CARTClassifier<RealVector>::TreeType tree;

        if(isLeaf){
            // For a leaf, compute the class label (a histogram) and append the node
            nodeInfo.label = hist(cAbove);
            tree.push_back(nodeInfo);
            return tree;
        }

        // For an internal node, append the node itself first, then the flattened left and right subtrees
        tree.push_back(nodeInfo);
        tree.insert(tree.end(), lTree.begin(), lTree.end());
        tree.insert(tree.end(), rTree.begin(), rTree.end());

        return tree;
    }
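
    // Note on the numbering: node ids follow the implicit binary-heap layout
    // (children of node i get ids 2*i+1 and 2*i+2, so node 0 -> 1, 2 and node 1 -> 3, 4),
    // while the returned vector stores the nodes in preorder (node, left subtree,
    // right subtree), so a node's id is in general not its index in the vector.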

    /// Builds a decision tree for regression
    SHARK_EXPORT_SYMBOL CARTClassifier<RealVector>::TreeType buildTree(AttributeTables& tables, RegressionDataset const& dataset, std::vector<RealVector> const& labels, std::size_t nodeId);

    /// comparison function for sorting an attributeTable
    SHARK_EXPORT_SYMBOL static bool tableSort(RFAttribute const& v1, RFAttribute const& v2);

    // Compute the fraction of each class among the data at the current node,
    // e.g. counts {class 0: 3, class 1: 1} yield the histogram (0.75, 0.25)
    SHARK_EXPORT_SYMBOL RealVector hist(boost::unordered_map<std::size_t, std::size_t> countMatrix){

        RealVector histogram(m_maxLabel+1,0.0);

        std::size_t totalElements = 0;

        boost::unordered_map<std::size_t, std::size_t>::iterator it;
        for ( it=countMatrix.begin() ; it != countMatrix.end(); it++ ){
            histogram(it->first) = (double)it->second;
            totalElements += it->second;
        }
        histogram /= totalElements;

        return histogram;
    }

    /// Average label over a vector.
    SHARK_EXPORT_SYMBOL RealVector average(std::vector<RealVector> const& labels);

    // Compute the Gini index of the current data: 1 - sum_k (c_k/n)^2
    SHARK_EXPORT_SYMBOL double gini(boost::unordered_map<std::size_t, std::size_t> & countMatrix, std::size_t n){
        double res = 0;
        boost::unordered_map<std::size_t, std::size_t>::iterator it;
        if(n){
            n = n*n;
            for ( it=countMatrix.begin() ; it != countMatrix.end(); it++ ){
                res += sqr(it->second)/(double)n;
            }
        }
        return 1-res;
    }
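
    // Numeric check (illustrative): for counts {3, 1} and n = 4 the loop computes
    // res = (3*3 + 1*1) / (4*4) = 10/16, so the Gini index is 1 - 0.625 = 0.375.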

    /// Total Sum Of Squares
    SHARK_EXPORT_SYMBOL double totalSumOfSquares(std::vector<RealVector>& labels, std::size_t from, std::size_t to, RealVector const& sumLabel);

    // Generate the random candidate attribute set used for split selection, stored in tableIndicies
    SHARK_EXPORT_SYMBOL void generateRandomTableIndicies(std::set<std::size_t>& tableIndicies){
        while(tableIndicies.size() < m_try){
            tableIndicies.insert(Rng::discrete(0, m_inputDimension-1));
        }
    }

    // Set default values for the algorithm's parameters
    SHARK_EXPORT_SYMBOL void setDefaults(){
        if(!m_try){
            if(m_regressionLearner){
                setMTry(static_cast<std::size_t>(std::ceil(m_inputDimension/3.0)));
            }else{
                // The candidate set for choosing the best splitting attribute defaults to the square root of the input dimension
                setMTry(static_cast<std::size_t>(std::ceil(std::sqrt((double)m_inputDimension))));
            }
        }

        if(!m_B){
            // The forest defaults to 100 trees
            setNTrees(100);
        }

        if(!m_nodeSize){
            if(m_regressionLearner){
                setNodeSize(5);
            }else{
                // Allow as few as one data point per node, i.e. grow each tree out fully
                setNodeSize(1);
            }
        }

        if(m_OOBratio <= 0 || m_OOBratio>1){
            // Each tree is trained on a 0.66 fraction of the training set; this resembles the bootstrap method, except that bootstrap samples with replacement
            setOOBratio(0.66);
        }
    }
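
    // All of these defaults can be overridden before calling train(); an
    // illustrative (hypothetical) configuration:
    //   RFTrainer trainer(true, true); // also compute feature importances and OOB error
    //   trainer.setNTrees(200);        // grow 200 trees instead of 100
    //   trainer.setMTry(4);            // 4 candidate attributes per split
    //   trainer.setNodeSize(5);        // nodes with at most 5 points become leaves
    //   trainer.setOOBratio(0.66);     // fraction of the data used per tree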

    std::size_t m_inputDimension;

    std::size_t m_labelDimension;

    // largest class label
    unsigned int m_maxLabel;

    // size of the candidate attribute set for split selection
    std::size_t m_try;

    // number of decision trees in the forest
    std::size_t m_B;

    // lower bound on the number of data points contained in a leaf
    std::size_t m_nodeSize;

    // fraction of the training set used to build each tree
    double m_OOBratio;

    // whether the trainer is used for a regression task
    bool m_regressionLearner;

    // if true, m_featureImportances is computed
    bool m_computeFeatureImportances;

    // if true, m_OOBerror is computed
    bool m_computeOOBerror;
};

An Example

#include <shark/Data/Csv.h> //importing the file
#include <shark/Algorithms/Trainers/RFTrainer.h> //the random forest trainer
#include <shark/ObjectiveFunctions/Loss/ZeroOneLoss.h> //zero one loss for evaluation

#include <iostream>

using namespace std; 
using namespace shark;

int main() {

    //*****************LOAD AND PREPARE DATA***********************//
    //Read Sample data set C.csv

    ClassificationDataset data;
    importCSV(data, "data/C.csv", LAST_COLUMN, ' ');

    //Split the dataset into a training and a test dataset
    ClassificationDataset dataTest = splitAtElement(data,311);

    cout << "Training set - number of data points: " << data.numberOfElements()
        << " number of classes: " << numberOfClasses(data)
        << " input dimension: " << inputDimension(data) << endl;

    cout << "Test set - number of data points: " << dataTest.numberOfElements()
        << " number of classes: " << numberOfClasses(dataTest)
        << " input dimension: " << inputDimension(dataTest) << endl;

    //Generate a random forest
    RFTrainer trainer;
    RFClassifier model;
    trainer.train(model, data);

    // evaluate Random Forest classifier
    ZeroOneLoss<unsigned int, RealVector> loss;
    Data<RealVector> prediction = model(data.inputs());
    cout << "Random Forest on training set accuracy: " << 1. - loss.eval(data.labels(), prediction) << endl;

    prediction = model(dataTest.inputs());
    cout << "Random Forest on test set accuracy:     " << 1. - loss.eval(dataTest.labels(), prediction) << endl;
}
