AntDB的集群计划(Cluster Plan)类似并行计划(Parallel Plan),通过序列化(Serialize)和反序列化(Restore)执行计划(Plan Statement),并发送到各个相关节点(Node),以保证各个节点的执行计划一致(基本一致,Restore时可能略微改动)。
AntDB引入Reduce Plan用于动态重分布数据,Reduce Plan的执行完成包括两部分:本地数据(通过eof_underlying标记本地数据是否扫描完毕)+网络数据(通过eof_network标记网络数据是否扫描完毕)。
实际上,尽管各个节点的执行计划一致,但是由于各个节点自身情况的差异(包括机器性能、数据大小、网络等因素),各个节点在执行过程中必然有先后之分。进而我们发现,由于Reduce Plan需要各个节点发送EOF消息来标记网络传输的完成,在一定情况下就会出现Reduce Plan在各个节点之间死锁的问题。因此,合理地驱动Plan Tree中的Reduce Plan发送EOF就成为必要的措施。
以下列举目前已知的死锁用例。
问题:常规死锁是发生在执行计划执行过程中,一定条件下,某个节点的Plan Tree提前退出,导致该Plan Tree的一个或多个Reduce Node没有发送EOF消息通知其余节点而导致的死锁。
方法:当ExecProcNode(PlanState *node)返回的Slot满足TupIsNull(slot)时,驱动以该node为顶点的Plan Tree,确保该Plan Tree下的Reduce Plan完成EOF的发送行为。
问题:Agg Plan在执行过程中,当左树返回NULL时,Agg Plan返回的结果可能不满足TupIsNull(slot),而导致没有驱动。
方法:驱动条件新增一项:当 ((AggState *) node)->agg_done 为真时也进行驱动。
问题:CteScan是比较特殊的一种Plan,其真正执行的plan是其上层的某个Plan的initPlan,由于驱动ClusterReduce目的在于转发不属于本节点的实时数据,丢弃其余数据,那么在CteScan的执行上就不合理了,因为其他Plan可能还会用到该CteScan。例如:
WITH t_onek AS (
SELECT unique1, two, ten, hundred, twothousand
, tenthous, even, stringu2
FROM onek
WHERE odd < 100
)
SELECT *
FROM t_onek
WHERE even = 1000
UNION ALL
SELECT *
FROM t_onek
WHERE even < 100;
方法:为CteScan单独实现驱动程序。该程序通过执行ExecCteScan本身(而不是直接驱动其下的ClusterReduce)来完成驱动,因为其他Plan可能还需要该CteScan的结果:
/*
 * DriveCteScanState
 *
 * Drive a CteScan node and the sub-plans attached to it.
 *
 * The scan is executed through ExecCteScan until it yields an empty slot
 * instead of driving the ClusterReduce nodes beneath it directly, because
 * other plan nodes may still need the results of this CteScan.
 *
 * Returns false immediately when IsThereClusterReduce() reports false for
 * the node.  Otherwise returns true as soon as DriveClusterReduceWalker()
 * returns true for one of the node's subPlan-s or initPlan-s, and false
 * once all of them have been driven.
 */
static bool
DriveCteScanState(CteScanState *node)
{
    TupleTableSlot *tuple;
    ListCell   *cell;

    Assert(node && IsA(node, CteScanState));

    if (!IsThereClusterReduce((PlanState *) node))
        return false;

    /* Run the scan itself until it has no more tuples to return. */
    do
    {
        tuple = ExecCteScan((CteScanState *) node);
    } while (!TupIsNull(tuple));

    /* subPlan-s attached to the scan node must be driven as well. */
    foreach (cell, node->ss.ps.subPlan)
    {
        SubPlanState *subplan = (SubPlanState *) lfirst(cell);

        Assert(IsA(subplan, SubPlanState));
        if (DriveClusterReduceWalker(subplan->planstate))
            return true;
    }

    /* ... and so must its initPlan-s. */
    foreach (cell, node->ss.ps.initPlan)
    {
        SubPlanState *initplan = (SubPlanState *) lfirst(cell);

        Assert(IsA(initplan, SubPlanState));
        if (DriveClusterReduceWalker(initplan->planstate))
            return true;
    }

    return false;
}
问题:PlanState的驱动顺序是按照planstate_tree_walker的顺序驱动的,但这个顺序与实际上PlanState的执行顺序是不匹配的,故而,一定情况下会出现执行与驱动的互锁情况。例如:HashJoin死锁,HashJoin的左树(Left Tree)和右树(Right Tree)在执行时,可能先做左树,也可能先做右树,故导致执行与驱动死锁。planstate_tree_walker按照如下顺序执行:
initPlan-s → lefttree → righttree → special child plans → subPlan-s
/*
* planstate_tree_walker --- walk plan state trees
*
* The walker has already visited the current node, and so we need only
* recurse into any sub-nodes it has.
*
* Sub-nodes are visited in this fixed order: initPlan-s, lefttree,
* righttree, node-type-specific child plans, subPlan-s.
*
* Each sub-node is passed to walker together with context.  Returns true
* as soon as one of the visits reports true (the remaining sub-nodes are
* then skipped), and false once every sub-node has been visited.
*/
bool
planstate_tree_walker(PlanState *planstate,
bool (*walker) (),
void *context)
{
Plan *plan = planstate->plan;
ListCell *lc;
/* initPlan-s */
if (planstate_walk_subplans(planstate->initPlan, walker, context))
return true;
/* lefttree */
if (outerPlanState(planstate))
{
if (walker(outerPlanState(planstate), context))
return true;
}
/* righttree */
if (innerPlanState(planstate))
{
if (walker(innerPlanState(planstate), context))
return true;
}
/* special child plans */
switch (nodeTag(plan))
{
case T_ModifyTable:
if (planstate_walk_members(((ModifyTable *) plan)->plans,
((ModifyTableState *) planstate)->mt_plans,
walker, context))
return true;
break;
case T_Append:
if (planstate_walk_members(((Append *) plan)->appendplans,
((AppendState *) planstate)->appendplans,
walker, context))
return true;
break;
case T_MergeAppend:
if (planstate_walk_members(((MergeAppend *) plan)->mergeplans,
((MergeAppendState *) planstate)->mergeplans,
walker, context))
return true;
break;
case T_BitmapAnd:
if (planstate_walk_members(((BitmapAnd *) plan)->bitmapplans,
((BitmapAndState *) planstate)->bitmapplans,
walker, context))
return true;
break;
case T_BitmapOr:
if (planstate_walk_members(((BitmapOr *) plan)->bitmapplans,
((BitmapOrState *) planstate)->bitmapplans,
walker, context))
return true;
break;
case T_SubqueryScan:
if (walker(((SubqueryScanState *) planstate)->subplan, context))
return true;
break;
case T_CustomScan:
/* every child state in custom_ps is passed to the walker */
foreach(lc, ((CustomScanState *) planstate)->custom_ps)
{
if (walker((PlanState *) lfirst(lc), context))
return true;
}
break;
default:
/* other node types have no extra children beyond left/right trees */
break;
}
/* subPlan-s */
if (planstate_walk_subplans(planstate->subPlan, walker, context))
return true;
return false;
}
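方法:为了使得驱动顺序能与执行顺序保持一致,新增planstate_tree_exec_walker函数,walk顺序为:
子计划(HashJoin交由planstate_exec_walk_hashjoin处理;ModifyTable、Append、MergeAppend、BitmapAnd、BitmapOr、SubqueryScan、CustomScan、CteScan遍历各自特有的子计划;其余节点先lefttree后righttree)→ subPlan-s → initPlan-s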
/*
 * planstate_tree_exec_walker --- walk plan state trees in driving order
 *
 * Like planstate_tree_walker, this only recurses into the sub-nodes of
 * 'planstate'; the node itself is not passed to the walker.  The visiting
 * order is:
 *   1. the node's child plans, selected by node type (HashJoin is
 *      delegated to planstate_exec_walk_hashjoin; nodes without a
 *      dedicated case get lefttree first, then righttree)
 *   2. subPlan-s
 *   3. initPlan-s
 *
 * Each sub-node is passed to walker together with context.  Returns true
 * as soon as one of the visits reports true, false otherwise.
 */
bool
planstate_tree_exec_walker(PlanState *planstate,
                           bool (*walker) (),
                           void *context)
{
    Plan       *plan = planstate->plan;
    ListCell   *cell;
    bool        stop = false;

    /* Step 1: child plans, depending on the node type. */
    switch (nodeTag(plan))
    {
        case T_HashJoin:
            stop = planstate_exec_walk_hashjoin((HashJoinState *) planstate,
                                                walker,
                                                context);
            break;

        case T_ModifyTable:
            stop = planstate_walk_members(((ModifyTable *) plan)->plans,
                                          ((ModifyTableState *) planstate)->mt_plans,
                                          walker, context);
            break;

        case T_Append:
            stop = planstate_walk_members(((Append *) plan)->appendplans,
                                          ((AppendState *) planstate)->appendplans,
                                          walker, context);
            break;

        case T_MergeAppend:
            stop = planstate_walk_members(((MergeAppend *) plan)->mergeplans,
                                          ((MergeAppendState *) planstate)->mergeplans,
                                          walker, context);
            break;

        case T_BitmapAnd:
            stop = planstate_walk_members(((BitmapAnd *) plan)->bitmapplans,
                                          ((BitmapAndState *) planstate)->bitmapplans,
                                          walker, context);
            break;

        case T_BitmapOr:
            stop = planstate_walk_members(((BitmapOr *) plan)->bitmapplans,
                                          ((BitmapOrState *) planstate)->bitmapplans,
                                          walker, context);
            break;

        case T_SubqueryScan:
            stop = walker(((SubqueryScanState *) planstate)->subplan, context);
            break;

        case T_CustomScan:
            foreach(cell, ((CustomScanState *) planstate)->custom_ps)
            {
                if (walker((PlanState *) lfirst(cell), context))
                {
                    stop = true;
                    break;      /* leave the list loop, not the switch */
                }
            }
            break;

        case T_CteScan:
            stop = walker(((CteScanState *) planstate)->cteplanstate, context);
            break;

        default:
            /* lefttree first; righttree only if the left visit did not stop */
            stop = (outerPlanState(planstate) != NULL &&
                    walker(outerPlanState(planstate), context)) ||
                   (innerPlanState(planstate) != NULL &&
                    walker(innerPlanState(planstate), context));
            break;
    }

    if (stop)
        return true;

    /* Step 2: subPlan-s, then step 3: initPlan-s. */
    return planstate_walk_subplans(planstate->subPlan, walker, context) ||
           planstate_walk_subplans(planstate->initPlan, walker, context);
}
static bool DriveClusterReduceWalker(PlanState *node);

/*
 * DriveClusterReduceWalkerAdapter
 *
 * Two-argument shim passed to planstate_tree_exec_walker, which invokes
 * its callback as walker(planstate, context).  DriveClusterReduceWalker
 * itself takes a single argument; calling it directly through the
 * unprototyped callback pointer with two arguments is undefined behavior
 * in C, so the extra argument is absorbed here.
 */
static bool
DriveClusterReduceWalkerAdapter(PlanState *node, void *context)
{
    (void) context;             /* the walk carries no per-call state */
    return DriveClusterReduceWalker(node);
}

/*
 * DriveClusterReduceWalker
 *
 * Drive the plan state tree rooted at 'node', dispatching on the node type:
 *   - ClusterReduceState: handled by DriveClusterReduceState()
 *   - CteScanState:       handled by DriveCteScanState()
 *   - anything else:      recurse into its sub-nodes through
 *                         planstate_tree_exec_walker()
 *
 * A node whose plan id is already in estate->es_reduce_drived_set is
 * skipped; every node processed here is added to that set afterwards, so
 * it is handled at most once.
 *
 * Returns false for a NULL or already-driven node, otherwise the result of
 * the handler chosen above.
 */
static bool
DriveClusterReduceWalker(PlanState *node)
{
    EState     *estate;
    int         planid;
    bool        res;

    if (node == NULL)
        return false;

    estate = node->state;
    planid = PlanNodeID(node->plan);

    /* Already driven earlier: nothing left to do for this node. */
    if (bms_is_member(planid, estate->es_reduce_drived_set))
        return false;

    if (IsA(node, ClusterReduceState))
    {
        ClusterReduceState *crs = (ClusterReduceState *) node;

        Assert(crs->port);

        /* Log only when the local or the network side is not at EOF yet. */
        if (!crs->eof_network || !crs->eof_underlying)
            elog(LOG, "Drive ClusterReduce(%d) to send EOF message", planid);

        /*
         * Drive all ClusterReduce to send slot, discard slot
         * used for local.
         */
        res = DriveClusterReduceState(crs);
    }
    else if (IsA(node, CteScanState))
    {
        res = DriveCteScanState((CteScanState *) node);
    }
    else
    {
        res = planstate_tree_exec_walker(node,
                                         DriveClusterReduceWalkerAdapter,
                                         NULL);
    }

    /* Remember the node so that later walks skip it. */
    estate->es_reduce_drived_set = bms_add_member(estate->es_reduce_drived_set, planid);

    return res;
}
限于目前的认知,驱动程序应该还有尚未覆盖到的情况,后续遇到新的死锁用例时再持续优化。