updater_basemaker-inl.h

// Annotated outline of BaseMaker, a TreeUpdater subclass that holds the state and helper
// routines listed below: an expansion queue of tree nodes, a per-instance position array,
// a feature-metadata helper and a sketch entry structure.
// NOTE(review): this listing is abridged reading notes rather than compilable code —
// container template arguments, some return/parameter types and most function bodies are
// elided, and the braces are not balanced. Confirm details against the full source.
class BaseMaker: public TreeUpdater {
    TrainParam param; // training parameters
    vector qexpand; // queue of nodes to be expanded
    vector node2workindex; // maps an active node to its working index offset in qexpand; can be -1, which means the node is not actively expanding
    vector position; // position of each instance in the tree; can be negative, which means this position is no longer expanding, see also Decode/EncodePosition

    struct FMetaHelper { // collects per-feature metadata
      vector fminmax; // size is 2*num_feature; per the original note, even slots store a feature's maximum value — NOTE(review): confirm the slot layout against the full source
      int Type(int fid); // returns the data type of the given feature: 0 = empty, 1 = binary, 2 = real
      float MaxValue(int fid); // returns the maximum value of the given feature
      SampleCol(float p, vector *p_findex); // randomly samples p*num_feature features out of all features according to probability p and stores them in p_findex
    }

    // helper for row-based data
    int NextLevel(RowBatch &inst, RegTree &tree, int nid); // nid is the index of a node in the tree and inst is one sample; returns the left or the right child according to the split information stored on that node
    int get_nthread(); // returns the number of threads

    // gpair holds the first- and second-order gradients; fmat is the data
    void InitData(vector &gpair,DMatrix &fmat,RegTree &tree) {
      // set up position; the size of position equals the size of gpair
      // fmat.info.root_index is described as each sample's current index and is used to initialize position
      // mark deleted data
      // NOTE(review): `i` is not declared in this outline; presumably the enclosing per-instance loop was elided
      if (gpair[i].hess < 0.0f) position[i] = ~position[i]; // if the second-order gradient is negative, flip position to its bitwise complement (a negative value)
      // mark subsample: if the data is to be subsampled, sample with a Bernoulli distribution; positions of instances that were not sampled are set negative
      // fill the expand queue: push the tree nodes still to be processed into the queue so they can be handled in parallel
      for (int i = 0; i < tree.param.num_roots; ++i) {
        qexpand.push_back(i);
      }
      this->UpdateNode2WorkIndex(tree); // refresh the mapping from tree node to queue index
    }
    
    // update the expand queue with the new leaves: walk the tree nodes referenced by the queue and, for each one that is not a leaf, add its left and right children to the queue
    void UpdateQueueExpand(RegTree &tree);

    // return the decoded position; ridx is the index of the data row
    int DecodePosition(int ridx) {
      // NOTE(review): `pid` has no declared type in this outline
      pid = position[ridx]; // position of the node in the tree
      return pid < 0 ? ~pid : pid;  // a negative pid means this node is no longer expanding; ~pid recovers the node id
    }
    // set the encoded position value for ridx; nid is a non-negative node id, and if the current position is negative it is stored as ~nid
    void SetEncodePosition(int ridx, int nid);

    // helper that uses the column-based data structure to reset the positions, i.e. re-place the data in the tree according to the tree structure
    // nodes: the set of nodes that contains the split to be used
    // p_fmat: feature matrix needed for tree construction
    void ResetPositionCol(vector &nodes,DMatrix *p_fmat,RegTree &tree) {
      SetNonDefaultPositionCol(nodes, p_fmat, tree);
      SetDefaultPostion(p_fmat, tree);
    }
    // helper function to set the non-leaf positions to the default direction: data that cannot be routed to a leaf (for example missing values) is assigned to the default direction
    void SetDefaultPostion(DMatrix *p_fmat, RegTree &tree);

    // helper function that uses the column-based data structure to CORRECT the positions of non-default directions that WERE set to default before calling this function.
    // batch: the column batch
    // sorted_split_set: the set of indices that contains split solutions.
    void CorrectNonDefaultPositionByBatch(ColBatch& batch,vector &sorted_split_set,RegTree &tree);
    
    // helper function that uses the column-based data structure: collects the set of split feature ids from the given nodes
    // nodes: the set of nodes that contains the split to be used
    // out_split_set: the split index set
    void GetSplitSet(vector &nodes,RegTree &tree, vector* out_split_set);

    // helper function that uses the column-based data structure: update all positions into the non-default branch, if any; ignore the default branch
    // NOTE(review): the body and closing brace of this function are elided in this outline
    void SetNonDefaultPositionCol(vector &nodes,DMatrix *p_fmat,RegTree &tree) {

    // helper function to get statistics from a tree
    // NOTE(review): the declaration below is truncated in this outline (garbled template argument, no terminator)
    void GetNodeStats(vector &gpair, DMatrix &fmat, RegTree &tree, vector> *p_thread_temp, vector *p_node_stats)

    // common helper data structure used to build a sketch
    struct SketchEntry {
      // total sum of amount to be met
      double sum_total;
      // statistics used in the sketch
      double rmin, wmin;
      // last seen feature value
      float last_fvalue;
      // current size of sketch
      double next_goal;
      // pointer to the sketch to put things in
      WXQuantileSketch *sketch;
      
      // initialize the space
      void Init(max_size);

      // push a new element to sketch
      // fvalue: feature value, comes in sorted ascending order
      // w: weight
      void Push(float fvalue, float w, unsigned max_size);
      // update node2workindex
      // NOTE(review): the nesting here is ambiguous because of the unbalanced braces above; InitData calls this via this->UpdateNode2WorkIndex(tree)
      void UpdateNode2WorkIndex(RegTree &tree) {
    }
}


你可能感兴趣的:(updater_basemaker-inl.h)