Hive ColumnPruner

Optimizer:

  public void initialize(HiveConf hiveConf) {
    if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTCP)) {
      transformations.add(new ColumnPruner());
    }
  }
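The HIVEOPTCP flag read above presumably corresponds to the hive.optimize.cp property (that mapping, and its default of true, are stated here as assumptions; check ConfVars in the HiveConf of your build). If so, pruning can be toggled per session, which is handy for comparing explain plans with and without the ColumnPruner:

set hive.optimize.cp=false;   -- disable the ColumnPruner transform
set hive.optimize.cp=true;    -- re-enable it (the assumed default)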


create table tab1 (col1 string, col2 string, col3 int, col4 string, col5 string, col6 string, col7 string);
explain select col1, col2 from tab1 where col3>5;


hive> explain select col1, col2 from tab1 where col3>5;
OK
ABSTRACT SYNTAX TREE:
  (TOK_QUERY (TOK_FROM (TOK_TABREF tab1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL col1)) (TOK_SELEXPR (TOK_TABLE_OR_COL col2))) (TOK_WHERE (> (TOK_TABLE_OR_COL col3) 5))))

STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 is a root stage

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Alias -> Map Operator Tree:
        tab1
          TableScan
            alias: tab1
            Filter Operator
              predicate:
                  expr: (col3 > 5)
                  type: boolean
              Filter Operator
                predicate:
                    expr: (col3 > 5)
                    type: boolean
                Select Operator
                  expressions:
                        expr: col1
                        type: string
                        expr: col2
                        type: string
                  outputColumnNames: _col0, _col1
                  File Output Operator
                    compressed: false
                    GlobalTableId: 0
                    table:
                        input format: org.apache.hadoop.mapred.TextInputFormat
                        output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat

  Stage: Stage-0
    Fetch Operator
      limit: -1


Time taken: 0.379 seconds





public class ColumnPruner implements Transform {

  public ParseContext transform(ParseContext pactx) throws SemanticException {
    pGraphContext = pactx;
    opToParseCtxMap = pGraphContext.getOpParseCtx();

    // generate pruned column list for all relevant operators
    ColumnPrunerProcCtx cppCtx = new ColumnPrunerProcCtx(opToParseCtxMap);

    // create a walker which walks the tree in a DFS manner while maintaining
    // the operator stack. The dispatcher
    // generates the plan from the operator tree
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("R1", "FIL%"), ColumnPrunerProcFactory
        .getFilterProc());
    opRules.put(new RuleRegExp("R2", "GBY%"), ColumnPrunerProcFactory
        .getGroupByProc());
    opRules.put(new RuleRegExp("R3", "RS%"), ColumnPrunerProcFactory
        .getReduceSinkProc());
    opRules.put(new RuleRegExp("R4", "SEL%"), ColumnPrunerProcFactory
        .getSelectProc());
    opRules.put(new RuleRegExp("R5", "JOIN%"), ColumnPrunerProcFactory
        .getJoinProc());
    opRules.put(new RuleRegExp("R6", "MAPJOIN%"), ColumnPrunerProcFactory
        .getMapJoinProc());
    opRules.put(new RuleRegExp("R7", "TS%"), ColumnPrunerProcFactory
        .getTableScanProc());
    opRules.put(new RuleRegExp("R8", "LVJ%"), ColumnPrunerProcFactory
        .getLateralViewJoinProc());
    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(ColumnPrunerProcFactory
        .getDefaultProc(), opRules, cppCtx);
    GraphWalker ogw = new ColumnPrunerWalker(disp);

    // Create a list of topop nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pGraphContext.getTopOps().values());
    ogw.startWalking(topNodes, null);
    return pGraphContext;
  }

}
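For context, the Optimizer drives every registered Transform over the same ParseContext. A minimal sketch of that driver loop, written as an approximation of Optimizer.optimize() rather than a verbatim copy:

  public ParseContext optimize() throws SemanticException {
    // each transformation (including ColumnPruner) rewrites the ParseContext in turn
    for (Transform t : transformations) {
      pctx = t.transform(pctx);
    }
    return pctx;
  }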




The rules registered above boil down to the following per-operator definition of "required columns" (a toy sketch of this bottom-up propagation follows the list):

1) FilterOperator (ColumnPrunerFilterProc): the columns referenced by the filter predicate, plus the columns required by its children.
2) GroupByOperator (ColumnPrunerGroupByProc): the columns appearing in the group-by keys, plus the columns appearing in the aggregation functions.
3) ReduceSinkOperator (ColumnPrunerReduceSinkProc): two cases: (1) the child is a JoinOperator; (2) the child is not a JoinOperator, in which case the required columns are those appearing in the key plus those appearing in the value.
4) SelectOperator (ColumnPrunerSelectProc): 4.1) if any child is a FileSinkOperator, ScriptOperator, UDTFOperator, LimitOperator, or UnionOperator, the required columns are taken from the SelectOperator's own expressions. 4.2) otherwise, the required columns are derived from the columns the children need, mapped through the select expressions (getSelectColsFromChildren in the code below).
5) JoinOperator (ColumnPrunerJoinProc): if any child is a FileSinkOperator, nothing is done; the other cases:
6) MapJoinOperator (ColumnPrunerMapJoinProc)
7) TableScanOperator (ColumnPrunerTableScanProc): the columns required by its children.
8) LateralViewJoinOperator (ColumnPrunerLateralViewJoinProc)
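To make the bottom-up propagation concrete, here is a small self-contained toy (not Hive code; all names are illustrative) that mirrors the rules above for the example plan FileSink <- Select(col1, col2) <- Filter(col3 > 5) <- TableScan(tab1):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class ColumnPruningSketch {

  // Same contract as Utilities.mergeUniqElems: append elements of dest not already in src.
  static List<String> mergeUniqElems(List<String> src, List<String> dest) {
    if (dest == null) {
      return src;
    }
    if (src == null) {
      return dest;
    }
    for (String d : dest) {
      if (!src.contains(d)) {
        src.add(d);
      }
    }
    return src;
  }

  public static void main(String[] args) {
    // Select: its child is a FileSink, so it needs exactly its own expressions.
    List<String> neededBySelect = new ArrayList<String>(Arrays.asList("col1", "col2"));

    // Filter: predicate columns merged with whatever the child (Select) needs.
    List<String> neededByFilter =
        mergeUniqElems(new ArrayList<String>(neededBySelect), Collections.singletonList("col3"));

    // TableScan: simply whatever its child (Filter) needs.
    List<String> neededByScan = neededByFilter;

    System.out.println("Select    " + neededBySelect); // [col1, col2]
    System.out.println("Filter    " + neededByFilter); // [col1, col2, col3]
    System.out.println("TableScan " + neededByScan);   // [col1, col2, col3]
  }
}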

  public static class ColumnPrunerSelectProc implements NodeProcessor {
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
        Object... nodeOutputs) throws SemanticException {
      SelectOperator op = (SelectOperator) nd; //org.apache.hadoop.hive.ql.exec.SelectOperator@347448
      ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; // org.apache.hadoop.hive.ql.optimizer.ColumnPrunerProcCtx@bec43f
      List<String> cols = new ArrayList<String>();

      if (op.getChildOperators() != null) {
        for (Operator<? extends Serializable> child : op.getChildOperators()) {
          // If one of my children is a FileSink or Script, return all columns.
          // Without this break, a bug in ReduceSink to Extract edge column
          // pruning will manifest
          // which should be fixed before remove this
          if ((child instanceof FileSinkOperator)
              || (child instanceof ScriptOperator)
              || (child instanceof UDTFOperator)
              || (child instanceof LimitOperator)
              || (child instanceof UnionOperator)) {
           // child = org.apache.hadoop.hive.ql.exec.FileSinkOperator@de6570
            cppCtx.getPrunedColLists()
                .put(op, cppCtx.getColsFromSelectExpr(op));
            return null;  // return null;
          }
        }
      }
      cols = cppCtx.genColLists(op);

      SelectDesc conf = op.getConf();
      // The input to the select does not matter. Go over the expressions
      // and return the ones which have a marked column
      cppCtx.getPrunedColLists().put(op,
          cppCtx.getSelectColsFromChildren(op, cols));

      if (conf.isSelStarNoCompute()) {
        return null;
      }

      // do we need to prune the select operator?
      List<ExprNodeDesc> originalColList = op.getConf().getColList();
      List<String> columns = new ArrayList<String>();
      for (ExprNodeDesc expr : originalColList) {
        Utilities.mergeUniqElems(columns, expr.getCols());
      }
      // by now, 'cols' holds the columns used by the child operators, and 'columns'
      // holds the columns used by this select operator.
      ArrayList<String> originalOutputColumnNames = conf.getOutputColumnNames();
      if (cols.size() < originalOutputColumnNames.size()) {
        ArrayList<ExprNodeDesc> newColList = new ArrayList<ExprNodeDesc>();
        ArrayList<String> newOutputColumnNames = new ArrayList<String>();
        ArrayList<ColumnInfo> rs_oldsignature = op.getSchema().getSignature();
        ArrayList<ColumnInfo> rs_newsignature = new ArrayList<ColumnInfo>();
        RowResolver old_rr = cppCtx.getOpToParseCtxMap().get(op).getRowResolver();
        RowResolver new_rr = new RowResolver();
        for (String col : cols) {
          int index = originalOutputColumnNames.indexOf(col);
          newOutputColumnNames.add(col);
          newColList.add(originalColList.get(index));
          rs_newsignature.add(rs_oldsignature.get(index));
          String[] tabcol = old_rr.reverseLookup(col);
          ColumnInfo columnInfo = old_rr.get(tabcol[0], tabcol[1]);
          new_rr.put(tabcol[0], tabcol[1], columnInfo);
        }
        cppCtx.getOpToParseCtxMap().get(op).setRowResolver(new_rr);
        op.getSchema().setSignature(rs_newsignature);
        conf.setColList(newColList);
        conf.setOutputColumnNames(newOutputColumnNames);
        handleChildren(op, cols, cppCtx);
      }
      return null;
    }

}


ColumnPrunerProcCtx:
     private final Map<Operator<? extends Serializable>, List<String>> prunedColLists; // {}

ColumnPrunerProcCtx:
  public List<String> getColsFromSelectExpr(SelectOperator op) {
    List<String> cols = new ArrayList<String>();
    SelectDesc conf = op.getConf(); // org.apache.hadoop.hive.ql.plan.SelectDesc@1995c9a
    ArrayList<ExprNodeDesc> exprList = conf.getColList(); //[Column[col1], Column[col2]]
    for (ExprNodeDesc expr : exprList) {
      cols = Utilities.mergeUniqElems(cols, expr.getCols());
    }
    return cols; // [col1, col2]
  }

After ColumnPrunerSelectProc.process has run:
ColumnPrunerProcCtx:
     private final Map<Operator<? extends Serializable>, List<String>> prunedColLists; // {org.apache.hadoop.hive.ql.exec.SelectOperator@347448=[col1, col2]}


  public static class ColumnPrunerFilterProc implements NodeProcessor {
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
        Object... nodeOutputs) throws SemanticException {
      FilterOperator op = (FilterOperator) nd; //org.apache.hadoop.hive.ql.exec.FilterOperator@1bcfbeb
      ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; //org.apache.hadoop.hive.ql.optimizer.ColumnPrunerProcCtx@bec43f
      ExprNodeDesc condn = op.getConf().getPredicate(); // class org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge(Column[col3], Const int 5)
      // the FilterOperator's predicate: the condition used to drop unwanted rows
      // get list of columns used in the filter
      List<String> cl = condn.getCols(); // [col3]  the columns referenced by the predicate
      // merge it with the downstream col list
      cppCtx.getPrunedColLists().put(op,
          Utilities.mergeUniqElems(cppCtx.genColLists(op), cl));

      pruneOperator(cppCtx, op, cppCtx.getPrunedColLists().get(op));

      return null;
    }
  }

  // Collect every column required by curOp's child operators.
  public List<String> genColLists(Operator<? extends Serializable> curOp)
      throws SemanticException {
    // curOp = org.apache.hadoop.hive.ql.exec.FilterOperator@1bcfbeb
    List<String> colList = new ArrayList<String>();   // accumulates all columns the children need
    if (curOp.getChildOperators() != null) {
      for (Operator<? extends Serializable> child : curOp.getChildOperators()) { // walk every child operator
        // child = org.apache.hadoop.hive.ql.exec.SelectOperator@347448
        if (child instanceof CommonJoinOperator) {
          // for a join child, look up the pruned column list kept per parent position (tag)
          int tag = child.getParentOperators().indexOf(curOp);
          List<String> prunList = joinPrunedColLists.get(child).get((byte) tag);
          colList = Utilities.mergeUniqElems(colList, prunList);
        } else {
          colList = Utilities
              .mergeUniqElems(colList, prunedColLists.get(child)); // [col1, col2]  merge the child's required columns into colList
        }
      }
    }
    return colList; // [col1, col2]
  }

  public static List<String> mergeUniqElems(List<String> src, List<String> dest) {
  //  src=[col1, col2]      dest = [col3]
    if (dest == null) {
      return src;
    }
    if (src == null) {
      return dest;
    }
    int pos = 0;

    while (pos < dest.size()) {
      if (!src.contains(dest.get(pos))) {
        src.add(dest.get(pos));
      }
      pos++;
    }

    return src; // [col1, col2, col3]
  }
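A quick driver (hypothetical, not part of Hive; it assumes Utilities is org.apache.hadoop.hive.ql.exec.Utilities as in the snippets above) that reproduces the annotated values and shows that the merge happens in place:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.Utilities;

public class MergeUniqElemsDemo {
  public static void main(String[] args) {
    List<String> src = new ArrayList<String>(Arrays.asList("col1", "col2"));
    List<String> dest = Arrays.asList("col3");
    List<String> merged = Utilities.mergeUniqElems(src, dest);
    System.out.println(merged);          // [col1, col2, col3]
    System.out.println(merged == src);   // true: src is mutated and returned
  }
}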



After ColumnPrunerFilterProc.process has run:
ColumnPrunerProcCtx:
     private final Map<Operator<? extends Serializable>, List<String>> prunedColLists; // {org.apache.hadoop.hive.ql.exec.SelectOperator@347448=[col1, col2], org.apache.hadoop.hive.ql.exec.FilterOperator@1bcfbeb=[col1, col2, col3]}


ColumnPrunerProcFactory:
  private static void pruneOperator(NodeProcessorCtx ctx,
      Operator<? extends Serializable> op,
      List<String> cols)
      throws SemanticException {   // must preserve column order
    // op = org.apache.hadoop.hive.ql.exec.FilterOperator@1bcfbeb
    // cols = [col1, col2, col3]
    // the pruning needs to preserve the order of columns in the input schema
    RowSchema inputSchema = op.getSchema(); // (col1: string, col2: string, col3: int, col4: string, col5: string, col6: string, col7: string)
    if (inputSchema != null) {
      ArrayList<ColumnInfo> rs = new ArrayList<ColumnInfo>();
      ArrayList<ColumnInfo> inputCols = inputSchema.getSignature(); // [col1: string, col2: string, col3: int, col4: string, col5: string, col6: string, col7: string]
      for (ColumnInfo i : inputCols) {
        if (cols.contains(i.getInternalName())) {
          rs.add(i);  // rs = [col1: string, col2: string, col3: int]
        }
      }
      op.getSchema().setSignature(rs);
    }
  }

Operator:
  public RowSchema getSchema() {
    return rowSchema;
  }
RowSchema:
  public void setSignature(ArrayList<ColumnInfo> signature) {
  // this.signature =  [col1: string, col2: string, col3: int, col4: string, col5: string, col6: string, col7: string]
  // signature =  [col1: string, col2: string, col3: int]
    this.signature = signature;
  }





  public static class ColumnPrunerTableScanProc implements NodeProcessor {
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
        Object... nodeOutputs) throws SemanticException {
      TableScanOperator scanOp = (TableScanOperator) nd; // org.apache.hadoop.hive.ql.exec.TableScanOperator@5bda13
      ColumnPrunerProcCtx cppCtx = (ColumnPrunerProcCtx) ctx; // org.apache.hadoop.hive.ql.optimizer.ColumnPrunerProcCtx@bec43f
      List<String> cols = cppCtx
          .genColLists((Operator<? extends Serializable>) nd);  // [col1, col2, col3]
      cppCtx.getPrunedColLists().put((Operator<? extends Serializable>) nd,
          cols);
      ArrayList<Integer> needed_columns = new ArrayList<Integer>();
      RowResolver inputRR = cppCtx.getOpToParseCtxMap().get(scanOp).getRowResolver(); // tab1{(col1,col1: string)(col2,col2: string)(col3,col3: int)(col4,col4: string)(col5,col5: string)(col6,col6: string)(col7,col7: string)}
      for (int i = 0; i < cols.size(); i++) {
        int position = inputRR.getPosition(cols.get(i));
        if (position >=0) {
          needed_columns.add(position); // [0, 1, 2]
        }
      }
      scanOp.setNeededColumnIDs(needed_columns);  // scanOp=org.apache.hadoop.hive.ql.exec.TableScanOperator@5bda13
      return null; //
    }
  }

ColumnPrunerProcCtx:
  public HashMap<Operator<? extends Serializable>, OpParseContext> getOpToParseCtxMap() {
    return opToParseCtxMap; // {org.apache.hadoop.hive.ql.exec.TableScanOperator@5bda13=org.apache.hadoop.hive.ql.parse.OpParseContext@19e3bdd, org.apache.hadoop.hive.ql.exec.FilterOperator@1bcfbeb=org.apache.hadoop.hive.ql.parse.OpParseContext@16c5f50, org.apache.hadoop.hive.ql.exec.SelectOperator@347448=org.apache.hadoop.hive.ql.parse.OpParseContext@1e5a0cb, org.apache.hadoop.hive.ql.exec.FileSinkOperator@de6570=org.apache.hadoop.hive.ql.parse.OpParseContext@9f9761}
  }

TableScanOperator:
  public void setNeededColumnIDs(java.util.ArrayList<Integer> orign_columns) {
    neededColumnIDs = orign_columns; // [0, 1, 2]
  }
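These neededColumnIDs are what eventually let columnar storage formats skip the unused columns: they are pushed into the job configuration (the hive.io.file.readcolumn.ids property) so that, for example, RCFile deserializes only the listed columns. A minimal sketch of that hand-off, assuming the ColumnProjectionUtils API of this Hive generation (verify the exact class and method against your version):

import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

public class NeededColumnsSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // append the pruned column IDs computed by ColumnPrunerTableScanProc for tab1
    ColumnProjectionUtils.appendReadColumnIDs(conf, Arrays.asList(0, 1, 2));
    System.out.println(conf.get("hive.io.file.readcolumn.ids")); // expected: 0,1,2
  }
}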
