在 ExecReducer 中会调用 reducer.startGroup() 和 reducer.endGroup(),这两个方法会递归调用到 GroupByOperator 中,用于设置 firstRowInGroup = true 和执行 keysCurrentGroup.clear()。
进入 processOp 方法,首先判断:如果 hashAggr 没有使 entries 减少一半以上,则禁用 hashAggr。
// Excerpt from processOp: build the key for the incoming row and hand it to
// processKey, once per grouping set when grouping sets are present.
try {
countAfterReport++;
newKeys.getNewKey(row, rowInspector);
if (groupingSetsPresent) {
// Handle grouping sets.
// cols : a b c
// groupset :(a),(a,b),(a,b,c)
// groupingSetsPosition : [a b c]
//
Object[] newKeysArray = newKeys.getKeyArray();
// Snapshot the first groupingSetsPosition key values: newKeysArray is
// overwritten in place for every grouping set below.
Object[] cloneNewKeysArray = new Object[newKeysArray.length];
for (int keyPos = 0; keyPos < groupingSetsPosition; keyPos++) {
cloneNewKeysArray[keyPos] = newKeysArray[keyPos];
}
// Emit one key per grouping set.
for (int groupingSetPos = 0; groupingSetPos < groupingSets.size(); groupingSetPos++) {
// Start from all-null keys ...
for (int keyPos = 0; keyPos < groupingSetsPosition; keyPos++) {
newKeysArray[keyPos] = null;
}
FastBitSet bitset = groupingSetsBitSet[groupingSetPos];
// Some keys need to be left to null corresponding to that grouping set.
// ... and restore only the positions whose bit is set for this grouping set.
for (int keyPos = bitset.nextSetBit(0); keyPos >= 0;
keyPos = bitset.nextSetBit(keyPos+1)) {
newKeysArray[keyPos] = cloneNewKeysArray[keyPos];
}
// The slot at groupingSetsPosition carries the per-grouping-set key value.
newKeysArray[groupingSetsPosition] = newKeysGroupingSets[groupingSetPos];
processKey(row, rowInspector);
}
} else {
// No grouping sets: the row is processed once with its key as-is.
processKey(row, rowInspector);
}
大部分代码都是在处理 groupingSets,这里先从简单的单 group by 处理逻辑 processKey(row, rowInspector) 入手:
// Excerpt from processKey: choose the aggregation path for the current key.
// The hash path calls setHashKey() on the key before processHashAggr;
// otherwise the row goes through processAggr.
if (hashAggr) {
newKeys.setHashKey();
processHashAggr(row, rowInspector, newKeys);
} else {
processAggr(row, rowInspector, newKeys);
}
/**
 * Applies one input row to the hash-based aggregation state.
 *
 * <p>{@code hashAggregations} maps an aggregation key to its array of
 * aggregation buffers (the intermediate UDAF results);
 * {@code keysCurrentGroup} is a set of the keys already added for the
 * current group.
 *
 * @param row the input row
 * @param rowInspector inspector handed on to {@code updateAggregations}
 * @param newKeys key of the input row; a copy is stored on first sight
 * @throws HiveException declared by this method; its source is not visible here
 */
private void processHashAggr(Object row, ObjectInspector rowInspector,
    KeyWrapper newKeys) throws HiveException {
  boolean newEntryForHashAggr = false;

  // Look the key up; on a miss, register fresh buffers under a copy of the key.
  AggregationBuffer[] aggs = hashAggregations.get(newKeys);
  if (aggs == null) {
    KeyWrapper storedKey = newKeys.copyKey();
    aggs = newAggregations();
    hashAggregations.put(storedKey, aggs);
    newEntryForHashAggr = true;
    numRowsHashTbl++; // the hash table gained an entry
  }

  // When the grouping key differs from the reduction key, keysCurrentGroup
  // tracks the grouping keys seen for the current reduction key. The result
  // of add() then decides whether this row counts as a new entry, replacing
  // the value computed from the hash-table lookup above.
  // Author's example (not verified here):
  //   select a, count(distinct b) from tab group by a
  if (groupKeyIsNotReduceKey) {
    newEntryForHashAggr = keysCurrentGroup.add(newKeys.copyKey());
  }

  updateAggregations(aggs, row, rowInspector, true, newEntryForHashAggr, null);

  // Flushing must come after updateAggregations, otherwise the possibly new
  // "aggs" entry could be flushed out of the table before it is updated.
  // If the grouping key is not the reduction key, flushing is only allowed
  // on the first row of a group (firstRowInGroup).
  // NOTE(review): the original note says flushHashTable(false) forwards about
  // 10% of hashAggregations downstream -- confirm against flushHashTable.
  boolean flushPermitted = !groupKeyIsNotReduceKey || firstRowInGroup;
  if (flushPermitted && shouldBeFlushed(newKeys)) {
    flushHashTable(false);
  }
}
iterate 或 merge
方法,更新agg结果
/**
* Feeds one input row into the given aggregation buffers.
*
* Three cases are handled, in this order:
* 1. unionExprEval is null: every aggregation is visited; distinct ones are
*    only updated when the row carries new parameter values (see hashAggr).
* 2. distinctKeyAggrs is non-empty: the union tag of the row selects which
*    aggregations are updated.
* 3. otherwise: every aggregation is updated unconditionally.
*
* @param aggs aggregation buffers, indexed in parallel with aggregationEvaluators
* @param row the input row
* @param rowInspector not referenced in this method body
* @param hashAggr when true, distinct aggregations are updated only if
*        newEntryForHashAggr is set; when false they are compared with lastInvoke
* @param newEntryForHashAggr whether the caller saw a new entry for this row
* @param lastInvoke per-aggregation copy of the parameters last passed to a
*        distinct aggregation; only read and written when hashAggr is false
* @throws HiveException declared by this method; its source is not visible here
*/
protected void updateAggregations(AggregationBuffer[] aggs, Object row,
ObjectInspector rowInspector, boolean hashAggr,
boolean newEntryForHashAggr, Object[][] lastInvoke) throws HiveException {
if (unionExprEval == null) {
for (int ai = 0; ai < aggs.length; ai++) {
// Calculate the parameters
// o holds the parameters passed to the aggregate function;
// aggregationEvaluators[ai] is the UDAF evaluator that consumes them.
Object[] o = new Object[aggregationParameterFields[ai].length];
for (int pi = 0; pi < aggregationParameterFields[ai].length; pi++) {
o[pi] = aggregationParameterFields[ai][pi].evaluate(row);
}
// Update the aggregations.
// With hash aggregation, a distinct aggregate is only updated when a new entry arrives.
if (aggregationIsDistinct[ai]) {
if (hashAggr) {
if (newEntryForHashAggr) {
// Author's note: mainly an optimization for udaf(distinct c1, c2), so that
// aggregate runs once per change of the reduction key.
aggregationEvaluators[ai].aggregate(aggs[ai], o);
}
} else {
// Non-hash path: aggregate only when the parameters differ from the
// ones recorded in lastInvoke, then record a copy of the new ones.
if (lastInvoke[ai] == null) {
lastInvoke[ai] = new Object[o.length];
}
if (ObjectInspectorUtils.compare(o,
aggregationParameterObjectInspectors[ai], lastInvoke[ai],
aggregationParameterStandardObjectInspectors[ai]) != 0) {
aggregationEvaluators[ai].aggregate(aggs[ai], o);
for (int pi = 0; pi < o.length; pi++) {
lastInvoke[ai][pi] = ObjectInspectorUtils.copyToStandardObject(
o[pi], aggregationParameterObjectInspectors[ai][pi],
ObjectInspectorCopyOption.WRITABLE);
}
}
}
} else {
// Regular (non-distinct) aggregation: always update.
aggregationEvaluators[ai].aggregate(aggs[ai], o);
}
}
return;
}
if (distinctKeyAggrs.size() > 0) {
// evaluate union object
UnionObject uo = (UnionObject) (unionExprEval.evaluate(row));
int unionTag = uo.getTag();
// update non-distinct key aggregations : "KEY._colx:t._coly"
if (nonDistinctKeyAggrs.get(unionTag) != null) {
for (int pos : nonDistinctKeyAggrs.get(unionTag)) {
Object[] o = new Object[aggregationParameterFields[pos].length];
for (int pi = 0; pi < aggregationParameterFields[pos].length; pi++) {
o[pi] = aggregationParameterFields[pos][pi].evaluate(row);
}
aggregationEvaluators[pos].aggregate(aggs[pos], o);
}
}
// there may be multi distinct clauses for one column
// update them all.
if (distinctKeyAggrs.get(unionTag) != null) {
for (int i : distinctKeyAggrs.get(unionTag)) {
Object[] o = new Object[aggregationParameterFields[i].length];
for (int pi = 0; pi < aggregationParameterFields[i].length; pi++) {
o[pi] = aggregationParameterFields[i][pi].evaluate(row);
}
// Same de-duplication rule as in the unionExprEval == null branch.
if (hashAggr) {
if (newEntryForHashAggr) {
aggregationEvaluators[i].aggregate(aggs[i], o);
}
} else {
if (lastInvoke[i] == null) {
lastInvoke[i] = new Object[o.length];
}
if (ObjectInspectorUtils.compare(o,
aggregationParameterObjectInspectors[i],
lastInvoke[i],
aggregationParameterStandardObjectInspectors[i]) != 0) {
aggregationEvaluators[i].aggregate(aggs[i], o);
for (int pi = 0; pi < o.length; pi++) {
lastInvoke[i][pi] = ObjectInspectorUtils.copyToStandardObject(
o[pi], aggregationParameterObjectInspectors[i][pi],
ObjectInspectorCopyOption.WRITABLE);
}
}
}
}
}
// update non-distinct groupby key or value aggregations: 'KEY._COLx or VALUE._colx'
// these aggregations should be updated only once.
// Only rows whose union tag is 0 trigger this update.
if (unionTag == 0) {
for (int pos : nonDistinctAggrs) {
Object[] o = new Object[aggregationParameterFields[pos].length];
for (int pi = 0; pi < aggregationParameterFields[pos].length; pi++) {
o[pi] = aggregationParameterFields[pos][pi].evaluate(row);
}
aggregationEvaluators[pos].aggregate(aggs[pos], o);
}
}
} else {
for (int ai = 0; ai < aggs.length; ai++) {
// there is no distinct aggregation,
// update all aggregations
Object[] o = new Object[aggregationParameterFields[ai].length];
for (int pi = 0; pi < aggregationParameterFields[ai].length; pi++) {
o[pi] = aggregationParameterFields[ai][pi].evaluate(row);
}
aggregationEvaluators[ai].aggregate(aggs[ai], o);
}
}
}
terminatePartial 或 terminate 方法,结束当前行的计算。调用入口:
/**
 * Builds one output record from the group keys and the evaluated
 * aggregation buffers, then forwards it.
 *
 * <p>The record array {@code forwardCache} is allocated on first use and
 * reused afterwards: slots {@code [0, outputKeyLength)} hold the keys, the
 * following {@code aggs.length} slots hold the aggregation results.
 *
 * @param keys the keys in the record; the first {@code outputKeyLength} are copied
 * @param aggs the aggregation buffers, evaluated by the evaluator at the same index
 * @throws HiveException declared by this method; its source is not visible here
 */
private void forward(Object[] keys, AggregationBuffer[] aggs) throws HiveException {
  if (forwardCache == null) {
    forwardCache = new Object[outputKeyLength + aggs.length];
  }

  // Key columns come first.
  for (int keyIdx = 0; keyIdx < outputKeyLength; keyIdx++) {
    forwardCache[keyIdx] = keys[keyIdx];
  }

  // Aggregation results follow the keys.
  for (int aggIdx = 0; aggIdx < aggs.length; aggIdx++) {
    Object result = aggregationEvaluators[aggIdx].evaluate(aggs[aggIdx]);
    forwardCache[outputKeyLength + aggIdx] = result;
  }

  forward(forwardCache, outputObjInspector);
}
/**
 * Updates an aggregation buffer with one set of parameters.
 *
 * <p>In {@code PARTIAL1} and {@code COMPLETE} mode the parameters are passed
 * to {@code iterate}; in every other mode exactly one parameter is expected
 * and it is passed to {@code merge}.
 *
 * @param agg the aggregation buffer to update
 * @param parameters the values for {@code iterate}, or a single-element array for {@code merge}
 * @throws HiveException declared by this method; its source is not visible here
 */
public void aggregate(AggregationBuffer agg, Object[] parameters) throws HiveException {
  boolean useIterate = (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE);
  if (useIterate) {
    iterate(agg, parameters);
    return;
  }
  assert (parameters.length == 1);
  merge(agg, parameters[0]);
}
/**
 * Produces the output value for an aggregation buffer.
 *
 * <p>In {@code PARTIAL1} and {@code PARTIAL2} mode the result of
 * {@code terminatePartial} is returned; in every other mode the result of
 * {@code terminate} is returned.
 *
 * @param agg
 *          The object that stores the aggregation result.
 * @return the value returned by {@code terminatePartial} or {@code terminate}
 * @throws HiveException declared by this method; its source is not visible here
 */
public Object evaluate(AggregationBuffer agg) throws HiveException {
  boolean partialOutput = (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2);
  return partialOutput ? terminatePartial(agg) : terminate(agg);
}
// GroupByOperator.Mode
/** Group-by modes; the constant order is unchanged. */
public enum Mode {
  /** Complete 1-phase aggregation: iterate, terminate. */
  COMPLETE,
  /** Partial aggregation, first phase: iterate, terminatePartial. */
  PARTIAL1,
  /** Partial aggregation, second phase: merge, terminatePartial. */
  PARTIAL2,
  /** Same as PARTIAL2 for non-distinct, same as PARTIAL1 for distinct. */
  PARTIALS,
  /** Partial aggregation, final phase: merge, terminate. */
  FINAL,
  /** Same as PARTIAL1 for non-distinct, but uses hash-table-based aggregation. */
  HASH,
  /** FINAL for non-distinct aggregations, COMPLETE for distinct aggregations. */
  MERGEPARTIAL
}
// Mapping from GroupByOperator.Mode to the UDAF GenericUDAFEvaluator.Mode
// (excerpt: only the case labels of the switch are shown).
case COMPLETE:
return GenericUDAFEvaluator.Mode.COMPLETE;
case PARTIAL1:
return GenericUDAFEvaluator.Mode.PARTIAL1;
case PARTIAL2:
return GenericUDAFEvaluator.Mode.PARTIAL2;
case PARTIALS:
// Distinct aggregations map to PARTIAL1, all others to PARTIAL2.
return isDistinct ? GenericUDAFEvaluator.Mode.PARTIAL1
: GenericUDAFEvaluator.Mode.PARTIAL2;
case FINAL:
return GenericUDAFEvaluator.Mode.FINAL;
case HASH:
// HASH maps to PARTIAL1 regardless of isDistinct.
return GenericUDAFEvaluator.Mode.PARTIAL1;
case MERGEPARTIAL:
// Distinct aggregations map to COMPLETE, all others to FINAL.
return isDistinct ? GenericUDAFEvaluator.Mode.COMPLETE
: GenericUDAFEvaluator.Mode.FINAL;
// GroupByOperator.Mode 各模式最终会执行的方法
/**
* Group-by Mode:
* COMPLETE: complete 1-phase aggregation: iterate, terminate # 不经过部分聚合,一次完成全部聚合并直接输出(下文 hive.map.aggr=false 的实验中出现在 Reduce 端)
* PARTIAL1: partial aggregation - first phase: iterate, terminatePartial # 常规Map端调用
* PARTIAL2: partial aggregation - second phase: merge, terminatePartial # 常规Reduce端调用
* PARTIALS: For non-distinct the same as PARTIAL2, for distinct the same as PARTIAL1 # 即上面映射中的 isDistinct ? PARTIAL1 : PARTIAL2
* FINAL: partial aggregation - final phase: merge, terminate # 和 PARTIAL2 一样调用 merge,区别是用 terminate 输出最终结果
* HASH: For non-distinct the same as PARTIAL1 but use hash-table-based aggregation # 对非distinct 使用hash table聚合
* MERGEPARTIAL: FINAL for non-distinct aggregations, COMPLETE for distinct aggregations. # 即上面映射中的 isDistinct ? COMPLETE : FINAL
*/
explain select count(foo) from pokes group by foo;
Stage | Operator.Mode | Evaluator.Mode | Call Function |
---|---|---|---|
Map | HASH | PARTIAL1 | iterate, terminatePartial |
Reduce | MERGEPARTIAL | FINAL | merge, terminate |
// Reduce 中 MERGEPARTIAL 状态对应的 UDAF Evaluator.Mode 取决于有无 distinct:按上面的映射,有 distinct 时为 COMPLETE,否则为 FINAL(本例无 distinct,故为 FINAL)。
set hive.groupby.skewindata=true;
explain select count(foo) from pokes group by foo;
Stage | Operator.Mode | Evaluator.Mode | Call Function |
---|---|---|---|
Map 1 | HASH | PARTIAL1 | iterate, terminatePartial |
Reduce 1 | partials | PARTIAL2 | merge, terminatePartial |
Map 2 | - | - | - |
Reduce 2 | final | FINAL | merge, terminate |
set hive.map.aggr=false;
set hive.groupby.skewindata=false;
explain select count(distinct foo) from pokes group by foo;
Stage | Operator.Mode | Evaluator.Mode | Call Function |
---|---|---|---|
Map | - | - | - |
Reduce | complete | COMPLETE | iterate, terminate |
set hive.map.aggr=false;
set hive.groupby.skewindata=false;
explain select ds,foo,count(bar) from invites group by ds,foo with rollup;
实验失败
FAILED: SemanticException [Error 10209]: Grouping sets aggregations (with rollups or cubes) are not allowed if map-side aggregation is turned off. Set hive.map.aggr=true if you want to use grouping sets
set hive.map.aggr=false;
set hive.groupby.skewindata=true;
explain select count(foo) from pokes group by foo;
Stage | Operator.Mode | Evaluator.Mode | Call Function |
---|---|---|---|
Map 1 | - | - | - |
Reduce 1 | partial1 | PARTIAL1 | iterate, terminatePartial |
Map 2 | - | - | - |
Reduce 2 | final | FINAL | merge, terminate |
set hive.map.aggr=false;
set hive.groupby.skewindata=true;
explain select ds,foo,count(distinct bar) from invites group by ds,foo with rollup;
// 实验失败
FAILED: SemanticException [Error 10209]: Grouping sets aggregations (with rollups or cubes) are not allowed if map-side aggregation is turned off. Set hive.map.aggr=true if you want to use grouping sets
补充实验
set hive.map.aggr=false;
set hive.groupby.skewindata=false;
explain select ds,foo,count(distinct bar) from invites group by ds,foo with rollup;
// 实验失败
FAILED: SemanticException [Error 10209]: Grouping sets aggregations (with rollups or cubes) are not allowed if map-side aggregation is turned off. Set hive.map.aggr=true if you want to use grouping sets
Map Aggr & No Skew:
This plan remains the same, only the implementation of the map-side hash-based aggregation operator was modified to handle the extra rows needed for rollup. The plan is as follows:
Mapper:
*Hash-based group by operator to perform partial aggregations
*Reduce sink operator, performs some partial aggregations
Reducer:
*MergePartial (list-based) group by operator to perform final aggregations
Map Aggr & Skew:
Again, this plan remains the same, only the implementation of the map-side hash-based aggregation operator was modified to handle the extra rows needed for rollup. The plan is as follows:
Mapper 1:
*Hash-based group by operator to perform partial aggregations
*Reduce sink operator to spray by the group by and distinct keys (if there is a distinct key) or a random number otherwise
Reducer 1:
*Partials (list-based) group by operator to perform further partial aggregations
Mapper 2:
*Reduce sink operator, performs some partial aggregations
Reducer 2:
*Final (list-based) group by operator to perform final aggregations
Note that if there are no group by keys or distinct keys, Reducer 1 and Mapper 2 are removed from the plan and the reduce sink operator in Mapper 1 does not spray.
No Map Aggr & No Skew & No Rollup
This plan is the case from the pre-rollup version of group by where there is no Map Aggr and No Skew; it is included for completeness, as it remains an option if rollup is not used. The plan is as follows:
Mapper:
*Reduce sink operator, performs some partial aggregations
Reducer:
*Complete (list-based) group by operator to perform all aggregations
No Map Aggr & No Skew & With Rollup
The plan is as follows:
Mapper 1:
*Reduce sink operator, does not perform any partial aggregations
Reducer 1:
*Hash-based group by operator, much like the one used in the mappers of previous cases
Mapper 2:
*Reduce sink operator, performs some partial aggregations
Reducer 2:
*MergePartial (list-based) group by operator to perform remaining aggregations
No Map Aggr & Skew & (No Distinct or No Rollup)
This plan is the same as was used for the case of No Map Aggr and Skew in the pre-rollup version of group by; it applies when rollup is not used, or when none of the aggregations make use of a distinct key. The implementation of the list-based group by operator was modified to handle the extra rows required for rollup if rollup is being used. The plan is as follows:
Mapper 1:
*Reduce sink operator to spray by the group by and distinct keys (if there is a distinct key) or a random number otherwise
Reducer 1:
*Partial1 (list-based) group by operator to perform partial aggregations, it makes use of the new list-based group by operator implementation for rollup if necessary
Mapper 2:
*Reduce sink operator, performs some partial aggregations
Reducer 2:
*Final (list-based) group by operator to perform remaining aggregations
No Map Aggr & Skew & Distinct & Rollup
This plan is used when there is No Map Aggr and Skew and there is an aggregation that involves a distinct key and rollup is being used. The plan is as follows:
Mapper 1:
*Reduce sink operator to spray by the group by and distinct keys (if there is a distinct key) or a random number otherwise
Reducer 1:
*Hash-based group by operator, much like the one used in the mappers of previous cases
Mapper 2:
*Reduce sink operator to spray by the group by and distinct keys (if there is a distinct key) or a random number otherwise
Reducer 2:
*Partials (list-based) group by operator to perform further partial aggregations
Mapper 3:
*Reduce sink operator, performs some partial aggregations
Reducer 3:
*Final (list-based) group by operator to perform final aggregations
Note that if there are no group by keys or distinct keys, Reducer 2 and Mapper 3 are removed from the plan and the reduce sink operator in Mapper 2 does not spray. Also, note that the reason for Mapper 2 spraying is that if the skew in the data existed in a column that is not immediately nulled by the rollup (e.g. if the group by keys are columns g1, g2, g3 in that order, we are concerned with the case where the skew exists in column g1 or g2), the skew may continue to exist after the hash aggregation, so we spray.
SemanticAnalyzer
private Operator genBodyPlan(QB qb, Operator input, Map<String, Operator> aliasToOpInfo)
throws SemanticException {
QBParseInfo qbp = qb.getParseInfo();
TreeSet<String> ks = new TreeSet<String>(qbp.getClauseNames());
Map<String, Operator extends OperatorDesc>> inputs = createInputForDests(qb, input, ks);
// For multi-group by with the same distinct, we ignore all user hints
// currently. It doesnt matter whether he has asked to do
// map-side aggregation or not. Map side aggregation is turned off
List<ASTNode> commonDistinctExprs = getCommonDistinctExprs(qb, input);
// Consider a query like:
//
// from src
// insert overwrite table dest1 select col1, count(distinct colx) group by col1
// insert overwrite table dest2 select col2, count(distinct colx) group by col2;
//
// With HIVE_OPTIMIZE_MULTI_GROUPBY_COMMON_DISTINCTS set to true, first we spray by the distinct
// value (colx), and then perform the 2 groups bys. This makes sense if map-side aggregation is
// turned off. However, with maps-side aggregation, it might be useful in some cases to treat
// the 2 inserts independently, thereby performing the query above in 2MR jobs instead of 3
// (due to spraying by distinct key first).
boolean optimizeMultiGroupBy = commonDistinctExprs != null &&
conf.getBoolVar(HiveConf.ConfVars.HIVE_OPTIMIZE_MULTI_GROUPBY_COMMON_DISTINCTS);
Operator curr = input;
// if there is a single distinct, optimize that. Spray initially by the
// distinct key,
// no computation at the mapper. Have multiple group by operators at the
// reducer - and then
// proceed
if (optimizeMultiGroupBy) {
curr = createCommonReduceSink(qb, input);
RowResolver currRR = opParseCtx.get(curr).getRowResolver();
// create a forward operator
input = putOpInsertMap(OperatorFactory.getAndMakeChild(new ForwardDesc(),
new RowSchema(currRR.getColumnInfos()), curr), currRR);
for (String dest : ks) {
curr = input;
curr = genGroupByPlan2MRMultiGroupBy(dest, qb, curr);
curr = genSelectPlan(dest, qb, curr, null); // TODO: we may need to pass "input" here instead of null
Integer limit = qbp.getDestLimit(dest);
if (limit != null) {
curr = genLimitMapRedPlan(dest, qb, curr, limit.intValue(), true);
qb.getParseInfo().setOuterQueryLimit(limit.intValue());
}
curr = genFileSinkPlan(dest, qb, curr);
}
} else {
List<List<String>> commonGroupByDestGroups = null;
// If we can put multiple group bys in a single reducer, determine suitable groups of
// expressions, otherwise treat all the expressions as a single group
if (conf.getBoolVar(HiveConf.ConfVars.HIVEMULTIGROUPBYSINGLEREDUCER)) {
try {
commonGroupByDestGroups = getCommonGroupByDestGroups(qb, inputs);
} catch (SemanticException e) {
LOG.error("Failed to group clauses by common spray keys.", e);
}
}
if (commonGroupByDestGroups == null) {
commonGroupByDestGroups = new ArrayList<List<String>>();
commonGroupByDestGroups.add(new ArrayList<String>(ks));
}
if (!commonGroupByDestGroups.isEmpty()) {
// Iterate over each group of subqueries with the same group by/distinct keys
for (List<String> commonGroupByDestGroup : commonGroupByDestGroups) {
if (commonGroupByDestGroup.isEmpty()) {
continue;
}
String firstDest = commonGroupByDestGroup.get(0);
input = inputs.get(firstDest);
// Constructs a standard group by plan if:
// There is no other subquery with the same group by/distinct keys or
// (There are no aggregations in a representative query for the group and
// There is no group by in that representative query) or
// The data is skewed or
// The conf variable used to control combining group bys into a single reducer is false
if (commonGroupByDestGroup.size() == 1 ||
(qbp.getAggregationExprsForClause(firstDest).size() == 0 &&
getGroupByForClause(qbp, firstDest).size() == 0) ||
conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW) ||
!conf.getBoolVar(HiveConf.ConfVars.HIVEMULTIGROUPBYSINGLEREDUCER)) {
// Go over all the destination tables
for (String dest : commonGroupByDestGroup) {
curr = inputs.get(dest);
if (qbp.getWhrForClause(dest) != null) {
ASTNode whereExpr = qb.getParseInfo().getWhrForClause(dest);
curr = genFilterPlan((ASTNode) whereExpr.getChild(0), qb, curr, aliasToOpInfo, false);
}
// Preserve operator before the GBY - we'll use it to resolve '*'
Operator> gbySource = curr;
if (qbp.getAggregationExprsForClause(dest).size() != 0
|| getGroupByForClause(qbp, dest).size() > 0) {
// multiple distincts is not supported with skew in data
if (conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW) &&
qbp.getDistinctFuncExprsForClause(dest).size() > 1) {
throw new SemanticException(ErrorMsg.UNSUPPORTED_MULTIPLE_DISTINCTS.
getMsg());
}
// insert a select operator here used by the ColumnPruner to reduce
// the data to shuffle
curr = genSelectAllDesc(curr);
// Check and transform group by *. This will only happen for select distinct *.
// Here the "genSelectPlan" is being leveraged.
// The main benefits are (1) remove virtual columns that should
// not be included in the group by; (2) add the fully qualified column names to unParseTranslator
// so that view is supported. The drawback is that an additional SEL op is added. If it is
// not necessary, it will be removed by NonBlockingOpDeDupProc Optimizer because it will match
// SEL%SEL% rule.
ASTNode selExprList = qbp.getSelForClause(dest);
if (selExprList.getToken().getType() == HiveParser.TOK_SELECTDI
&& selExprList.getChildCount() == 1 && selExprList.getChild(0).getChildCount() == 1) {
ASTNode node = (ASTNode) selExprList.getChild(0).getChild(0);
if (node.getToken().getType() == HiveParser.TOK_ALLCOLREF) {
curr = genSelectPlan(dest, qb, curr, curr);
RowResolver rr = opParseCtx.get(curr).getRowResolver();
qbp.setSelExprForClause(dest, SemanticAnalyzer.genSelectDIAST(rr));
}
}
if (conf.getBoolVar(HiveConf.ConfVars.HIVEMAPSIDEAGGREGATE)) {
if (!conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) {
curr = genGroupByPlanMapAggrNoSkew(dest, qb, curr); // Map Side Join & No Skew
} else {
curr = genGroupByPlanMapAggr2MR(dest, qb, curr); // Map Side Join & Skew -> 使用2个MR
}
} else if (conf.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) {
curr = genGroupByPlan2MR(dest, qb, curr); // NO Map Side Join & Skew -> 使用2个MR
} else {
curr = genGroupByPlan1MR(dest, qb, curr); // NO Map Side Join & NO Skew -> 使用1个MR
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("RR before GB " + opParseCtx.get(gbySource).getRowResolver()
+ " after GB " + opParseCtx.get(curr).getRowResolver());
}
curr = genPostGroupByBodyPlan(curr, dest, qb, aliasToOpInfo, gbySource);
}
} else {
curr = genGroupByPlan1ReduceMultiGBY(commonGroupByDestGroup, qb, input, aliasToOpInfo);
}
}
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("Created Body Plan for Query Block " + qb.getId());
}
return curr;
}
代码核心就是根据 HIVEMAPSIDEAGGREGATE 和 HIVEGROUPBYSKEW 的不同,采用四种不同的Operator生成策略。