Presto技术总结 因为内容过长分为了上下两集
1.环境准备
Hadoop环境,Hive环境,mysql环境,ssh环境,presto本机debug环境
推荐hadoop2.2.0、hive1.2.1、mysql5.7、openssh-server&client、presto最新版本
presto本地debug环境搭建参考presto in idea
2.查询入口&流程
所有的查询首先打到StatementResource对应的路径为@Path("/v1/statement")
Query query = Query.create(
sessionContext,
statement, //实际的sql
queryManager,
sessionPropertyManager,
exchangeClient,
responseExecutor,
timeoutExecutor,
blockEncodingSerde);
queries.put(query.getQueryId(), query); //创建query并放入执行队列
Query类中执行
Query result = new Query(sessionContext, query, queryManager, sessionPropertyManager, exchangeClient, dataProcessorExecutor, timeoutExecutor, blockEncodingSerde);
Query类中的queryManager
QueryInfo queryInfo = queryManager.createQuery(sessionContext, query); //其中sessionContext为用户的session信息,query为用户sql
2.1词法语法分析生成AST
queryManager是一个接口目前只有SqlQueryManager的实现类,createQuery方法
private final ConcurrentMap queries = new ConcurrentHashMap<>();
//QueryQueueManager是一个接口,sql相关的实现类SqlQueryQueueManager
private final QueryQueueManager queueManager;
//主要实现的逻辑,词法和语法分析,生成AstNode
Statement wrappedStatement = sqlParser.createStatement(query, createParsingOptions(session));
statement = unwrapExecuteStatement(wrappedStatement, sqlParser, session);
List parameters = wrappedStatement instanceof Execute ? ((Execute) wrappedStatement).getParameters() : emptyList();
//参数校验
validateParameters(statement, parameters);
//获取对应的query执行器工厂
QueryExecutionFactory> queryExecutionFactory = executionFactories.get(statement.getClass());
//query执行器工厂创建query执行器
queryExecution = queryExecutionFactory.createQueryExecution(queryId, query, session, statement, parameters);
//将query执行器和queryId映射到map中
queries.put(queryId, queryExecution);
//将query执行器提交到queueManager
queueManager.submit(statement, queryExecution, queryExecutor);
//返回query信息
return queryInfo;
SqlQueryQueueManager的submit方法
List queues;
try {
//按照配置的规则,选择执行队列
queues = selectQueues(queryExecution.getSession(), executor);
}
catch (PrestoException e) {
queryExecution.fail(e);
return;
}
for (QueryQueue queue : queues) {
if (!queue.reserve(queryExecution)) {
queryExecution.fail(new PrestoException(QUERY_QUEUE_FULL, "Too many queued queries"));
return;
}
}
//如果符合规则则入队
queues.get(0).enqueue(createQueuedExecution(queryExecution, queues.subList(1, queues.size()), executor));
// Select the execution queues for this session by walking the configured rules in order;
// the first rule that matches wins.
// NOTE(review): generic type parameters appear stripped by extraction
// (e.g. List<QueryQueue>, Optional<List<QueryQueueDefinition>>) — confirm against the original source.
private List selectQueues(Session session, Executor executor)
{
for (QueryQueueRule rule : rules) {
Optional> queues = rule.match(session.toSessionRepresentation());
if (queues.isPresent()) {
// Get existing queues, or lazily create them, for the matched definitions.
return getOrCreateQueues(session, executor, queues.get());
}
}
// No rule matched: reject the query outright.
throw new PrestoException(QUERY_REJECTED, "Query did not match any queuing rule");
}
// Get existing QueryQueue instances, or create and register new ones, for the given
// queue definitions; returns them in definition order.
private List getOrCreateQueues(Session session, Executor executor, List definitions)
{
ImmutableList.Builder queues = ImmutableList.builder();
for (QueryQueueDefinition definition : definitions) {
// Expand the queue-name template against the session (e.g. per-user queue names).
String expandedName = definition.getExpandedTemplate(session);
QueueKey key = new QueueKey(definition, expandedName);
if (!queryQueues.containsKey(key)) {
QueryQueue queue = new QueryQueue(executor, definition.getMaxQueued(), definition.getMaxConcurrent());
// putIfAbsent guards against a concurrent creator; only the winning thread exports the mbean.
if (queryQueues.putIfAbsent(key, queue) == null) {
// Export the mbean, after checking for races
String objectName = ObjectNames.builder(QueryQueue.class, definition.getTemplate()).withProperty("expansion", expandedName).build();
mbeanExporter.export(objectName, queue);
}
}
queues.add(queryQueues.get(key));
}
return queues.build();
}
// A queue admitting at most maxConcurrentQueries running queries while holding up to
// maxQueuedQueries waiting ones; total capacity is tracked in queuePermits.
QueryQueue(Executor queryExecutor, int maxQueuedQueries, int maxConcurrentQueries)
{
requireNonNull(queryExecutor, "queryExecutor is null");
checkArgument(maxQueuedQueries > 0, "maxQueuedQueries must be greater than zero");
checkArgument(maxConcurrentQueries > 0, "maxConcurrentQueries must be greater than zero");
int permits = maxQueuedQueries + maxConcurrentQueries;
// Check for overflow
checkArgument(permits > 0, "maxQueuedQueries + maxConcurrentQueries must be less than or equal to %s", Integer.MAX_VALUE);
this.queuePermits = new AtomicInteger(permits);
// The semaphore allows at most maxConcurrentQueries in-flight entries; each released
// entry is dequeued and started, and its completion future gates the next release.
this.asyncSemaphore = new AsyncSemaphore<>(maxConcurrentQueries,
queryExecutor,
queueEntry -> {
QueuedExecution queuedExecution = queueEntry.dequeue();
// dequeue() may return null when the execution was already cancelled/removed.
if (queuedExecution != null) {
queuedExecution.start();
return queuedExecution.getCompletionFuture();
}
return Futures.immediateFuture(null);
});
}
// Advance this queued execution: either launch the query (last queue in the chain)
// or pass it on to the next queue in the chain.
public void start()
{
// Only execute if the query is not already completed (e.g. cancelled)
if (listenableFuture.isDone()) {
return;
}
if (nextQueues.isEmpty()) {
// No more queues to traverse: actually launch the query on the executor.
executor.execute(() -> {
try (SetThreadName ignored = new SetThreadName("Query-%s", queryExecution.getQueryId())) {
// Kicks off analysis/planning: the statement is turned into an analysis (plan) here.
queryExecution.start();
}
});
}
else {
// Re-enqueue into the next queue, carrying the remainder of the queue chain along.
nextQueues.get(0).enqueue(new QueuedExecution(queryExecution, nextQueues.subList(1, nextQueues.size()), executor, listenableFuture));
}
}
2.2语义分析&生成逻辑执行计划
2.2.1语义分析
先看看SqlQueryExecution类的构成
private final QueryStateMachine stateMachine;
private final Statement statement; //词法语法分析生成的astNode
private final Metadata metadata;
private final AccessControl accessControl;
private final SqlParser sqlParser; //sql解析器
private final SplitManager splitManager;
private final NodePartitioningManager nodePartitioningManager;
private final NodeScheduler nodeScheduler; //将task分配给node的核心模块,stage调度的时候会详细说明
private final List planOptimizers;
private final RemoteTaskFactory remoteTaskFactory;
private final LocationFactory locationFactory;
private final int scheduleSplitBatchSize;
private final ExecutorService queryExecutor;
private final ScheduledExecutorService schedulerExecutor;
private final FailureDetector failureDetector;
private final QueryExplainer queryExplainer;
private final PlanFlattener planFlattener;
private final CostCalculator costCalculator;
private final AtomicReference queryScheduler = new AtomicReference<>();
private final AtomicReference queryPlan = new AtomicReference<>();
private final NodeTaskMap nodeTaskMap;
private final ExecutionPolicy executionPolicy;
private final List parameters;
private final SplitSchedulerStats schedulerStats;
SqlQueryExecution类的start方法
PlanRoot plan = analyzeQuery(); //生成逻辑执行计划
//调用栈为
analyzeQuery() -> doAnalyzeQuery()
// Sketch of SqlQueryExecution.doAnalyzeQuery(): analyze -> logical plan -> optimize -> fragment.
doAnalyzeQuery()
{
// Build the semantic analyzer.
Analyzer analyzer = new Analyzer(stateMachine.getSession(), metadata, sqlParser, accessControl, Optional.of(queryExplainer), parameters);
// Run semantic analysis over the AST.
Analysis analysis = analyzer.analyze(statement);
// Create the logical planner.
LogicalPlanner logicalPlanner = new LogicalPlanner(stateMachine.getSession(), planOptimizers, idAllocator, metadata, sqlParser, costCalculator);
// Produce the logical plan; plan optimization also happens inside plan().
Plan plan = logicalPlanner.plan(analysis);
queryPlan.set(plan);
// Fragment the logical plan, preparing for the distributed execution plan.
SubPlan fragmentedPlan = PlanFragmenter.createSubPlans(stateMachine.getSession(), metadata, nodePartitioningManager, plan, false);
return new PlanRoot(fragmentedPlan, !explainAnalyze, extractConnectors(analysis));
}
Analyzer类的analyze方法
//sql重写
Statement rewrittenStatement = StatementRewrite.rewrite(session, metadata, sqlParser, queryExplainer, statement, parameters, accessControl);
//初始化Analysis
Analysis analysis = new Analysis(rewrittenStatement, parameters, isDescribe);
//创建Statement分析器
StatementAnalyzer analyzer = new StatementAnalyzer(analysis, metadata, sqlParser, accessControl, session);
//调用Statement分析器去分析
analyzer.analyze(rewrittenStatement, Optional.empty());
analyze里面的具体实现就是遍历ASTNode对每种类型的Node作分析,主要是获取meta和校验元信息
2.2.2生成逻辑执行计划
LogicalPlanner类的plan方法
PlanNode root = planStatement(analysis, analysis.getStatement());
//检查执行计划的有效性
PlanSanityChecker.validateIntermediatePlan(root, session, metadata, sqlParser, symbolAllocator.getTypes());
//对生成的逻辑执行进行优化
root = optimizer.optimize(root, session, symbolAllocator.getTypes(), symbolAllocator, idAllocator);
LogicalPlanner类的planStatement方法
//对于普通的sql来说,只执行下面
createOutputPlan(planStatementWithoutOutput(analysis, statement), analysis);
LogicalPlanner类的planStatementWithoutOutput方法
// Dispatch on the concrete statement type and build the corresponding relational plan
// (the final OutputNode is added on top by createOutputPlan).
private RelationPlan planStatementWithoutOutput(Analysis analysis, Statement statement)
{
if (statement instanceof CreateTableAsSelect) {
if (analysis.isCreateTableAsSelectNoOp()) {
throw new PrestoException(NOT_SUPPORTED, "CREATE TABLE IF NOT EXISTS is not supported in this context " + statement.getClass().getSimpleName());
}
return createTableCreationPlan(analysis, ((CreateTableAsSelect) statement).getQuery());
}
if (statement instanceof Insert) {
checkState(analysis.getInsert().isPresent(), "Insert handle is missing");
return createInsertPlan(analysis, (Insert) statement);
}
if (statement instanceof Delete) {
return createDeletePlan(analysis, (Delete) statement);
}
if (statement instanceof Query) {
// Ordinary SELECT queries take this path.
return createRelationPlan(analysis, (Query) statement);
}
if (statement instanceof Explain && ((Explain) statement).isAnalyze()) {
return createExplainAnalyzePlan(analysis, (Explain) statement);
}
throw new PrestoException(NOT_SUPPORTED, "Unsupported statement type " + statement.getClass().getSimpleName());
}
LogicalPlanner类的createRelationPlan方法
return new RelationPlanner(analysis, symbolAllocator, idAllocator, buildLambdaDeclarationToSymbolMap(analysis, symbolAllocator), metadata, session)
.process(query, null);
RelationPlanner类具体的实现就是,遍历ASTNode,生成逻辑执行计划里面对应的Node
逻辑执行计划中常见的Node和Visit操作如下面所示:
AggregationNode 聚合操作的节点,有Final、partial、single三种,表示最终聚合、局部聚合和单点聚合,在执行计划优化前,聚合类型都是单点聚合,在优化器中会拆成局部聚合和最终聚合,类似于MR任务中的,map端局部reduce,和reduce端最终reduce
DeleteNode Delete操作的节点
ExchangeNode 逻辑执行计划中,不同Stage之间交换数据的节点
FilterNode 进行Filter过滤操作的节点
JoinNode 执行Join操作的节点
LimitNode 执行limit操作的节点
MarkDistinctNode 处理count distinct
OutputNode 输出Node
ProjectNode 将下层的节点输出列映射成上层节点 例如:select a + 1 from b将TableScanNode的a列 + 1 映射到OutputNode
RemoteSourceNode 类似于ExchangeNode,在分布式执行计划中,不同Stage之间交换数据的节点
SampleNode 抽样函数Node
RowNumberNode 处理窗函数RowNumber
SortNode 排序Node
TableScanNode 读取表的数据
TableWriterNode 写入表的数据
TopNNode order by ... limit 会使用效率更高的TopNNode
UnionNode 处理Union操作
WindowNode 处理窗口函数
RelationPlanner类的visit操作
visitTable 生成TableScanNode
visitAliasedRelation 处理有别名的Relation
visitSampledRelation 添加一个SampleNode,主要处理抽样函数
visitJoin 根据不同的join类型,生成不同的节点结构,一般来说是将左右两边生成对应的queryPlan,然后左右各添加一个ProjectNode,中间添加一个JoinNode相连,让上层添加一个FilterNode,FilterNode为join条件
visitQuery 使用QueryPlanner处理Query,并返回生成的执行计划
visitQuerySpecification 使用QueryPlanner处理QueryBody,并返回生成的执行计划
QueryPlanner类的plan操作(queryBody的定义就是指一个完整的sql,可以嵌套,例如select a from QueryBody b,通常来说里面的这个QueryBody会被当做AliasedRelation继续plan)
Query和QuerySpecification相比,QuerySpecification代表完整的QueryBody,而Query则包含了QueryBody--QueryBody是一个抽象类,QuerySpecification继承了QueryBody
plan(Query query) 首先取出Query中的queryBody,然后调用RelationPlanner进行分析,调用其visitQuerySpecification然后RelationPlanner调用QueryPlanner的plan方法
plan(QuerySpecification query) 生成一个queryBody中所有的组件Node
下面是最主要的 plan(QuerySpecification query)的plan过程
PlanBuilder builder = planFrom(node); //builder的root即为生成的NodeTree
RelationPlan fromRelationPlan = builder.getRelationPlan(); //生成TableScanNode
builder = filter(builder, analysis.getWhere(node), node); //生成FilterNode
builder = aggregate(builder, node); //生成AggregateNode
builder = filter(builder, analysis.getHaving(node), node); //如果有Having则生成FilterNode
builder = window(builder, node); //生成windowNode
List outputs = analysis.getOutputExpressions(node);
builder = handleSubqueries(builder, node, outputs);
if (node.getOrderBy().isPresent() && !SystemSessionProperties.isLegacyOrderByEnabled(session)) {
if (analysis.getGroupingSets(node).isEmpty()) {
builder = project(builder, outputs, fromRelationPlan);
outputs = toSymbolReferences(computeOutputs(builder, outputs));
builder = planBuilderFor(builder, analysis.getScope(node.getOrderBy().get()));
}
else {
List orderByAggregates = analysis.getOrderByAggregates(node.getOrderBy().get());
builder = project(builder, Iterables.concat(outputs, orderByAggregates));
outputs = toSymbolReferences(computeOutputs(builder, outputs));
List complexOrderByAggregatesToRemap = orderByAggregates.stream()
.filter(expression -> !analysis.isColumnReference(expression))
.collect(toImmutableList());
builder = planBuilderFor(builder, analysis.getScope(node.getOrderBy().get()), complexOrderByAggregatesToRemap);
}
builder = window(builder, node.getOrderBy().get());
}
List orderBy = analysis.getOrderByExpressions(node);
builder = handleSubqueries(builder, node, orderBy);
builder = project(builder, Iterables.concat(orderBy, outputs));
builder = distinct(builder, node);
builder = sort(builder, node);
builder = project(builder, outputs);
builder = limit(builder, node);
return new RelationPlan(
builder.getRoot(),
analysis.getScope(node),
computeOutputs(builder, outputs));
2.2.3逻辑执行计划优化
LogicalPlanner类的plan方法
root = optimizer.optimize(root, session, symbolAllocator.getTypes(), symbolAllocator, idAllocator);
optimizer优化器的具体实现就是,调用各个具体实现的优化器,对上一步生成的NodeTree(逻辑执行计划)进行逐个优化
具体的优化方法
AddExchanges //生成分布式执行计划,例如添加局部聚合和最终聚合
AddLocalExchanges
BeginTableWrite
CanonicalizeExpressions //将执行计划中的表达式标准化,比如将is not null 改写为not(is null),将if语句改写为case when
CheckSubqueryNodesAreRewritten
CountConstantOptimizer //将count(a)改写为count(*)提高不同数据源的兼容性
DesugaringOptimizer
DetermineJoinDistributionType
EliminateCrossJoins
EmptyDeleteOptimizer
HashGenerationOptimizer //提前进行hash计算
ImplementIntersectAndExceptAsUnion
IndexJoinOptimizer //将Join优化为IndexJoin,获取Join表的索引,提升速度
IterativeOptimizer
LimitPushDown //limit条件下推,减小下层节点的数据量
MetadataDeleteOptimizer
MetadataQueryOptimizer //将对表的分区字段进行的聚合操作,改写为针对表元数据的查询,减少读取表的操作
OptimizeMixedDistinctAggregations
PickLayout
PredicatePushDown //谓词(过滤条件)下推,减小下层节点的数据量
ProjectionPushDown //ProjectNode下推,减少Union节点的数据量
PruneUnreferencedOutputs //去除ProjectNode中不在最终输出中的列,减小计算量
PruneRedundantProjections //去除多余的projectNode,如果上下节点全都直接映射,则去掉该层projectNode
PushTableWriteThroughUnion
RemoveUnreferencedScalarLateralNodes
SetFlatteningOptimizer //合并能够合并的Union语句
SimplifyExpressions //对执行计划中涉及到的表达式进行化简和优化
TransformCorrelatedNoAggregationSubqueryToJoin
TransformCorrelatedNoAggregationSubqueryToJoin
TransformCorrelatedScalarAggregationToJoin
TransformCorrelatedSingleRowSubqueryToProject
TransformQuantifiedComparisonApplyToLateralJoin
TransformUncorrelatedInPredicateSubqueryToSemiJoin
TransformUncorrelatedLateralToJoin
UnaliasSymbolReferences //去除执行计划中projectNode无意义的映射,如果列直接相对而没有带表达式则直接映射到上层节点
WindowFilterPushDown
2.3生成分布式执行计划
2.3.1逻辑执行计划分段
这个阶段会将上面生成的逻辑执行计划切分为多个Stage,其中Stage按类型分为四种:
Source、Fixed、Single、Coordinator_only
Source:一般是TableScanNode、ProjectNode、FilterNode,一般是最下游的取数的Stage
Fixed:一般在Source之后,将Source阶段获取的数据分散到多个节点上处理,类似于Map端reduce操作,包括局部聚合、局部Join、局部数据写入
Single:一般在Fixed之后,只在一台机器上进行,汇总所有的结果、做最终聚合、全局排序,并将结果传输给Coordinator
Coordinator_only:只在Coordinator上
SqlQueryExecution类doAnalyzeQuery方法
SubPlan fragmentedPlan = PlanFragmenter.createSubPlans(stateMachine.getSession(), metadata, nodePartitioningManager, plan, false);
//SubPlan类的构造
private final PlanFragment fragment;
private final List children;
可以看出来,内部是类似于B树的树形结构,这样就将逻辑执行计划切分为了若干个段
2.3.2生成分布式执行计划
2.3.2.1获取SplitSource分片
SqlQueryExecution类start方法
//生成分段的执行计划
PlanRoot plan = analyzeQuery();
//生成分布式执行计划
planDistribution(plan);
SqlQueryExecution类planDistribution方法
//获取stage的执行计划
StageExecutionPlan outputStageExecutionPlan = distributedPlanner.plan(plan.getRoot(), stateMachine.getSession());
//创建SqlQuery调度
SqlQueryScheduler scheduler = new SqlQueryScheduler(
//状态监听器
stateMachine,
locationFactory,
outputStageExecutionPlan,
nodePartitioningManager,
//将task分配给node的核心模块
nodeScheduler,
remoteTaskFactory,
stateMachine.getSession(),
plan.isSummarizeTaskInfos(),
scheduleSplitBatchSize,
queryExecutor,
schedulerExecutor,
failureDetector,
rootOutputBuffers,
//保存了当前stage分配的task和node的映射关系
nodeTaskMap,
executionPolicy,
schedulerStats);
DistributedExecutionPlanner类plan方法
调用栈为plan -> doPlan
// Recursively convert a fragmented SubPlan tree into a StageExecutionPlan tree,
// resolving split sources for each fragment's table scans along the way.
private StageExecutionPlan doPlan(SubPlan root, Session session, ImmutableList.Builder allSplitSources)
{
PlanFragment currentFragment = root.getFragment();
// Visitor-pattern walk of the fragment: each TableScanNode triggers splitManager.getSplits()
// to obtain splits (Hive implementation: HiveSplitManager, which internally drives
// HiveSplitLoader.start() — detailed below).
// NOTE(review): the result maps plan-node id -> split source, which suggests a single
// table scan per fragment — confirm against the Visitor implementation.
Map splitSources = currentFragment.getRoot().accept(new Visitor(session, currentFragment.getPipelineExecutionStrategy(), allSplitSources), null);
ImmutableList.Builder dependencies = ImmutableList.builder();
for (SubPlan childPlan : root.getChildren()) {
// Recurse so every child sub-plan becomes a nested StageExecutionPlan with its own dependencies.
dependencies.add(doPlan(childPlan, session, allSplitSources));
}
return new StageExecutionPlan(
currentFragment,
splitSources,
dependencies.build());
}
对TableScanNode进行splitManager.getSplits()操作来获取分片,并将结果保存在Map
// For a TableScanNode, ask the SplitManager for the table's splits and record the
// split source keyed by the plan-node id.
public Map visitTableScan(TableScanNode node, Void context)
{
// get dataSource for table
SplitSource splitSource = splitManager.getSplits(
session,
node.getLayout().get(),
// Grouped execution requires bucket-by-bucket (grouped) split scheduling.
pipelineExecutionStrategy == GROUPED_EXECUTION ? GROUPED_SCHEDULING : UNGROUPED_SCHEDULING);
splitSources.add(splitSource);
return ImmutableMap.of(node.getId(), splitSource);
}
HiveSplitManager实现ConnectorSplitManager(Presto SPI接口)
// HiveSplitManager's SPI entry point: build a ConnectorSplitSource for the given table
// layout, honoring the requested split-scheduling strategy (grouped vs ungrouped).
public ConnectorSplitSource getSplits(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorTableLayoutHandle layoutHandle, SplitSchedulingStrategy splitSchedulingStrategy)
{
HiveTableLayoutHandle layout = (HiveTableLayoutHandle) layoutHandle;
SchemaTableName tableName = layout.getSchemaTableName();
// get table metadata
SemiTransactionalHiveMetastore metastore = metastoreProvider.apply((HiveTransactionHandle) transaction);
Table table = metastore.getTable(tableName.getSchemaName(), tableName.getTableName())
.orElseThrow(() -> new TableNotFoundException(tableName));
// verify table is not marked as non-readable
String tableNotReadable = table.getParameters().get(OBJECT_NOT_READABLE);
if (!isNullOrEmpty(tableNotReadable)) {
throw new HiveNotReadableException(tableName, Optional.empty(), tableNotReadable);
}
// fetch the Hive partitions that the layout resolved for this query
List partitions = layout.getPartitions()
.orElseThrow(() -> new PrestoException(GENERIC_INTERNAL_ERROR, "Layout does not contain partitions"));
// short circuit if we don't have any partitions
HivePartition partition = Iterables.getFirst(partitions, null);
if (partition == null) {
return new FixedSplitSource(ImmutableList.of());
}
// get buckets from first partition (arbitrary)
List buckets = partition.getBuckets();
// validate bucket bucketed execution
Optional bucketHandle = layout.getBucketHandle();
if ((splitSchedulingStrategy == GROUPED_SCHEDULING) && !bucketHandle.isPresent()) {
throw new PrestoException(GENERIC_INTERNAL_ERROR, "SchedulingPolicy is bucketed, but BucketHandle is not present");
}
// sort partitions
partitions = Ordering.natural().onResultOf(HivePartition::getPartitionId).reverse().sortedCopy(partitions);
Iterable hivePartitions = getPartitionMetadata(metastore, table, tableName, partitions, bucketHandle.map(HiveBucketHandle::toBucketProperty));
// The background loader walks the filesystem and feeds splits into the split source asynchronously.
HiveSplitLoader hiveSplitLoader = new BackgroundHiveSplitLoader(
table,
hivePartitions,
layout.getCompactEffectivePredicate(),
createBucketSplitInfo(bucketHandle, buckets),
session,
hdfsEnvironment,
namenodeStats,
directoryLister,
executor,
splitLoaderConcurrency,
recursiveDfsWalkerEnabled);
HiveSplitSource splitSource;
switch (splitSchedulingStrategy) {
case UNGROUPED_SCHEDULING:
// All splits may be handed out in any order.
splitSource = HiveSplitSource.allAtOnce(
session,
table.getDatabaseName(),
table.getTableName(),
layout.getCompactEffectivePredicate(),
maxInitialSplits,
maxOutstandingSplits,
maxOutstandingSplitsSize,
hiveSplitLoader,
executor,
new CounterStat());
break;
case GROUPED_SCHEDULING:
// Splits are handed out bucket by bucket for grouped execution.
splitSource = HiveSplitSource.bucketed(
session,
table.getDatabaseName(),
table.getTableName(),
layout.getCompactEffectivePredicate(),
maxInitialSplits,
maxOutstandingSplits,
new DataSize(32, MEGABYTE),
hiveSplitLoader,
executor,
new CounterStat());
break;
default:
throw new IllegalArgumentException("Unknown splitSchedulingStrategy: " + splitSchedulingStrategy);
}
hiveSplitLoader.start(splitSource);
return splitSource;
}
2.3.2.2产生stage执行计划
上面产生了一个StageExecutionPlan(stage执行计划),下面看看StageExecutionPlan的结构
private final PlanFragment fragment; //当前执行计划分段
private final Map splitSources; //从HiveSplitManager获取的分片映射关系
private final List subStages; //子执行计划分段
private final Optional> fieldNames; //字段名称
经过planDistribution方法之后,分段的逻辑执行计划就转化成了stage执行计划,而presto对task的调度都是基于stage来调度的,紧接着SqlQueryScheduler会构造SqlStage执行器
SqlQueryScheduler类的构造方法
List stages = createStages(
(fragmentId, tasks, noMoreExchangeLocations) -> updateQueryOutputLocations(queryStateMachine, rootBufferId, tasks, noMoreExchangeLocations),
new AtomicInteger(),
locationFactory,
plan.withBucketToPartition(Optional.of(new int[1])),
nodeScheduler,
remoteTaskFactory,
session,
splitBatchSize,
partitioningHandle -> partitioningCache.computeIfAbsent(partitioningHandle, handle -> nodePartitioningManager.getNodePartitioningMap(session, handle)),
nodePartitioningManager,
queryExecutor,
schedulerExecutor,
failureDetector,
nodeTaskMap,
stageSchedulers,
stageLinkages);
SqlStageExecution rootStage = stages.get(0);
2.3.2.3产生stage执行器
SqlQueryScheduler类的createStages方法
ImmutableList.Builder stages = ImmutableList.builder();
StageId stageId = new StageId(queryStateMachine.getQueryId(), nextStageId.getAndIncrement());
SqlStageExecution stage = new SqlStageExecution( //创建当前的SqlStageExecution
stageId,
locationFactory.createStageLocation(stageId),
plan.getFragment(),
remoteTaskFactory,
session,
summarizeTaskInfo,
nodeTaskMap,
queryExecutor,
failureDetector,
schedulerStats);
stages.add(stage);
...
...
//中间省略创建stage调度器和分配策略的步骤,详情见2.4.3
ImmutableSet.Builder childStagesBuilder = ImmutableSet.builder();
for (StageExecutionPlan subStagePlan : plan.getSubStages()) {
List subTree = createStages( //递归创建所有的子SqlStageExecution
stage::addExchangeLocations,
nextStageId,
locationFactory,
subStagePlan.withBucketToPartition(bucketToPartition),
nodeScheduler,
remoteTaskFactory,
session,
splitBatchSize,
partitioningCache,
nodePartitioningManager,
queryExecutor,
schedulerExecutor,
failureDetector,
nodeTaskMap,
stageSchedulers,
stageLinkages);
stages.addAll(subTree);
SqlStageExecution childStage = subTree.get(0);
childStagesBuilder.add(childStage);
}
Set childStages = childStagesBuilder.build();
至此所有SqlStageExecution生成完毕,下面看一下SqlStageExecution的简化构成
private final StageStateMachine stateMachine; //stage状态监听器
private final RemoteTaskFactory remoteTaskFactory; //生成Task的工厂类
private final NodeTaskMap nodeTaskMap; //保存当前stage分配的task和节点映射列表
private final Map> tasks = new ConcurrentHashMap<>();
private final AtomicInteger nextTaskId = new AtomicInteger();
private final Set allTasks = newConcurrentHashSet();
private final Set finishedTasks = newConcurrentHashSet();
private final Multimap sourceTasks = HashMultimap.create();
2.4生成分布式执行计划调度
2.4.1调度相关的服务类
先介绍一下上文提到的SqlQueryExecution中的NodeScheduler类
主要包括成员
InternalNodeManager nodeManager //获取存活的节点列表,保存在NodeMap里面,定时更新内容,默认5秒
主要包括方法
List selectNodes //选取存活的Node列表
NodeSelector createNodeSelector //提供了NodeSelector,其中包括各个stage中task分配的算法
ResettableRandomizedIterator randomizedNodes //打乱给定的NodeMap
InternalNodeManager接口的定义为
// Tracks the cluster's node membership (workers and coordinators).
public interface InternalNodeManager
{
Set getNodes(NodeState state); // nodes currently in the given state
Set getActiveConnectorNodes(ConnectorId connectorId); // nodes active for the given connector
Node getCurrentNode(); // this node's own descriptor
Set getCoordinators(); // the coordinator nodes
AllNodes getAllNodes(); // every known node
void refreshNodes(); // force-refresh membership information
}
// Known implementation: DiscoveryNodeManager.
NodeSelector接口定义为
// Chooses which cluster nodes a stage's tasks and splits are placed on.
public interface NodeSelector
{
void lockDownNodes();
List allNodes(); // all candidate nodes
Node selectCurrentNode(); // the local node
List selectRandomNodes(int limit) // up to `limit` random nodes
List selectRandomNodes(int limit, Set excludedNodes); // random nodes, excluding the given set
// Assign splits to nodes given the currently running tasks (optionally constrained by a partition map).
SplitPlacementResult computeAssignments(Set splits, List existingTasks);
SplitPlacementResult computeAssignments(Set splits, List existingTasks, NodePartitionMap partitioning);
}
//SimpleNodeSelector和TopologyAwareNodeSelector实现类 Presto会根据不同的网络拓扑结构来选择不同的NodeSelector
//在NodeScheduler的构造方法中,只要不是 LEGACY网络 就认为使用了网络拓扑,LEGACY网络指的是历史的网络,采用了非TCP/IP的网络
this.useNetworkTopology = !config.getNetworkTopology().equals(NetworkTopologyType.LEGACY);
//在createNodeSelector方法中,实例化了NodeSelector
if (useNetworkTopology) {
//所以只要你的网络使用了TCP/IP协议,实例化的NodeSelector都是TopologyAwareNodeSelector
return new TopologyAwareNodeSelector(
nodeManager,
nodeTaskMap,
includeCoordinator,
nodeMap,
minCandidates,
maxSplitsPerNode,
maxPendingSplitsPerTask,
topologicalSplitCounters,
networkLocationSegmentNames,
networkLocationCache);
}
else {
return new SimpleNodeSelector(nodeManager, nodeTaskMap, includeCoordinator, nodeMap, minCandidates, maxSplitsPerNode, maxPendingSplitsPerTask);
}
Node的定义为
// Descriptor of a single cluster node.
public interface Node
{
HostAddress getHostAndPort(); // host and port
URI getHttpUri(); // HTTP endpoint URI
String getNodeIdentifier();
String getVersion(); // node version
boolean isCoordinator(); // whether this node is a coordinator
}
创建createNodeSelector过程
// Build a NodeSelector for the given connector (null = system). The NodeMap snapshot is
// memoized and rebuilt at most every 5 seconds.
public NodeSelector createNodeSelector(ConnectorId connectorId)
{
// Guava memoizing Supplier: the NodeMap is cached and recomputed lazily after 5 seconds.
Supplier nodeMap = Suppliers.memoizeWithExpiration(() -> {
ImmutableSetMultimap.Builder byHostAndPort = ImmutableSetMultimap.builder();
ImmutableSetMultimap.Builder byHost = ImmutableSetMultimap.builder();
ImmutableSetMultimap.Builder workersByNetworkPath = ImmutableSetMultimap.builder();
Set nodes;
if (connectorId != null) {
nodes = nodeManager.getActiveConnectorNodes(connectorId);
}
else {
nodes = nodeManager.getNodes(ACTIVE);
}
Set coordinatorNodeIds = nodeManager.getCoordinators().stream()
.map(Node::getNodeIdentifier)
.collect(toImmutableSet());
for (Node node : nodes) {
// Index workers by every prefix of their network location for topology-aware placement.
if (useNetworkTopology && (includeCoordinator || !coordinatorNodeIds.contains(node.getNodeIdentifier()))) {
NetworkLocation location = networkLocationCache.get(node.getHostAndPort());
for (int i = 0; i <= location.getSegments().size(); i++) {
workersByNetworkPath.put(location.subLocation(0, i), node);
}
}
try {
byHostAndPort.put(node.getHostAndPort(), node);
InetAddress host = InetAddress.getByName(node.getHttpUri().getHost());
byHost.put(host, node);
}
catch (UnknownHostException e) {
// ignore unresolvable hosts — the node stays reachable via host:port
}
}
return new NodeMap(byHostAndPort.build(), byHost.build(), workersByNetworkPath.build(), coordinatorNodeIds);
}, 5, TimeUnit.SECONDS);
if (useNetworkTopology) {
// Non-LEGACY network config: use the topology-aware selector.
return new TopologyAwareNodeSelector(
nodeManager,
nodeTaskMap,
includeCoordinator,
nodeMap,
minCandidates,
maxSplitsPerNode,
maxPendingSplitsPerTask,
topologicalSplitCounters,
networkLocationSegmentNames,
networkLocationCache);
}
else {
return new SimpleNodeSelector(nodeManager, nodeTaskMap, includeCoordinator, nodeMap, minCandidates, maxSplitsPerNode, maxPendingSplitsPerTask);
}
}
2.4.2调度选择策略
Single和Fixed Stage策略,比较简单,均为调用selectRandomNodes
2.4.3生成stage调度器和分配策略
承接2.3.2.3中间的代码
Optional bucketToPartition;
PartitioningHandle partitioningHandle = plan.getFragment().getPartitioning();
// 根据不同的stage类型,创建不同的stage调度器
if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
// nodes are selected dynamically based on the constraints of the splits and the system load
Entry entry = Iterables.getOnlyElement(plan.getSplitSources().entrySet());
PlanNodeId planNodeId = entry.getKey();
SplitSource splitSource = entry.getValue();
ConnectorId connectorId = splitSource.getConnectorId();
if (isInternalSystemConnector(connectorId)) {
connectorId = null;
}
//创建nodeSelector用来选择执行的节点,主要是通过从nodeManager获取
NodeSelector nodeSelector = nodeScheduler.createNodeSelector(connectorId);
//split动态分配策略
SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stage::getAllTasks);
checkArgument(plan.getFragment().getPipelineExecutionStrategy() == UNGROUPED_EXECUTION);
//source阶段的stage选择simpleSourcePartitionedScheduler
stageSchedulers.put(stageId, simpleSourcePartitionedScheduler(stage, planNodeId, splitSource, placementPolicy, splitBatchSize));
bucketToPartition = Optional.of(new int[1]);
}
else if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
bucketToPartition = Optional.of(new int[1]);
}
else {
// nodes are pre determined by the nodePartitionMap
NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
long nodeCount = nodePartitionMap.getPartitionToNode().values().stream().distinct().count();
OptionalInt concurrentLifespansPerTask = getConcurrentLifespansPerNode(session);
Map splitSources = plan.getSplitSources();
//如果fixed阶段的stage 分配到了SplitSource 则创建选择FixedSourcePartitionedScheduler,该调度器里面自己创建了一个FixedSplitPlacementPolicy分配策略
if (!splitSources.isEmpty()) {
List schedulingOrder = plan.getFragment().getPartitionedSources();
List connectorPartitionHandles;
switch (plan.getFragment().getPipelineExecutionStrategy()) {
case GROUPED_EXECUTION:
connectorPartitionHandles = nodePartitioningManager.listPartitionHandles(session, partitioningHandle);
checkState(!ImmutableList.of(NOT_PARTITIONED).equals(connectorPartitionHandles));
break;
case UNGROUPED_EXECUTION:
connectorPartitionHandles = ImmutableList.of(NOT_PARTITIONED);
break;
default:
throw new UnsupportedOperationException();
}
stageSchedulers.put(stageId, new FixedSourcePartitionedScheduler(
stage,
splitSources,
plan.getFragment().getPipelineExecutionStrategy(),
schedulingOrder,
nodePartitionMap,
splitBatchSize,
concurrentLifespansPerTask.isPresent() ? OptionalInt.of(toIntExact(concurrentLifespansPerTask.getAsInt() * nodeCount)) : OptionalInt.empty(),
nodeScheduler.createNodeSelector(null),
connectorPartitionHandles));
bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
}
else {
//存活的node列表
Map partitionToNode = nodePartitionMap.getPartitionToNode();
// todo this should asynchronously wait a standard timeout period before failing
checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
//如果fixed阶段的stage 没有分配到SplitSource,则选择FixedCountScheduler
stageSchedulers.put(stageId, new FixedCountScheduler(stage, partitionToNode));
bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
}
}
2.4.4sqlQuery调度器开始调度
scheduler.start()启动sqlQueryScheduler的调度里面涉及到Task的调度
// Kick off scheduling. The CAS guard guarantees the schedule loop is submitted
// exactly once, no matter how many times start() is called.
public void start()
{
if (!started.compareAndSet(false, true)) {
return;
}
executor.submit(this::schedule);
}
方法引用调用schedule()
// Main scheduling loop: repeatedly asks each schedulable stage's scheduler for work,
// propagates results to linked stages, and waits briefly when all stages are blocked.
private void schedule()
{
try (SetThreadName ignored = new SetThreadName("Query-%s", queryStateMachine.getQueryId())) {
Set completedStages = new HashSet<>();
ExecutionSchedule executionSchedule = executionPolicy.createExecutionSchedule(stages.values());
while (!executionSchedule.isFinished()) {
List> blockedStages = new ArrayList<>();
for (SqlStageExecution stage : executionSchedule.getStagesToSchedule()) {
stage.beginScheduling();
// Delegate to this stage's dedicated scheduler to place tasks/splits.
// perform some scheduling work
ScheduleResult result = stageSchedulers.get(stage.getStageId())
.schedule();
// modify parent and children based on the results of the scheduling
if (result.isFinished()) {
stage.schedulingComplete();
}
else if (!result.getBlocked().isDone()) {
// Stage could not make progress; remember its blocked future so we can wait on it.
blockedStages.add(result.getBlocked());
}
stageLinkages.get(stage.getStageId())
.processScheduleResults(stage.getState(), result.getNewTasks());
schedulerStats.getSplitsScheduledPerIteration().add(result.getSplitsScheduled());
if (result.getBlockedReason().isPresent()) {
// Record per-reason statistics for blocked stages.
switch (result.getBlockedReason().get()) {
case WRITER_SCALING:
// no-op
break;
case WAITING_FOR_SOURCE:
schedulerStats.getWaitingForSource().update(1);
break;
case SPLIT_QUEUES_FULL:
schedulerStats.getSplitQueuesFull().update(1);
break;
case MIXED_SPLIT_QUEUES_FULL_AND_WAITING_FOR_SOURCE:
case NO_ACTIVE_DRIVER_GROUP:
break;
default:
throw new UnsupportedOperationException("Unknown blocked reason: " + result.getBlockedReason().get());
}
}
}
// make sure to update stage linkage at least once per loop to catch async state changes (e.g., partial cancel)
for (SqlStageExecution stage : stages.values()) {
if (!completedStages.contains(stage.getStageId()) && stage.getState().isDone()) {
stageLinkages.get(stage.getStageId())
.processScheduleResults(stage.getState(), ImmutableSet.of());
completedStages.add(stage.getStageId());
}
}
// wait for a state change and then schedule again
if (!blockedStages.isEmpty()) {
try (TimeStat.BlockTimer timer = schedulerStats.getSleepTime().time()) {
// Wait up to 1 second for any blocked stage to unblock before looping again.
tryGetFutureValue(whenAnyComplete(blockedStages), 1, SECONDS);
}
for (ListenableFuture> blockedStage : blockedStages) {
blockedStage.cancel(true);
}
}
}
// Sanity check: once the schedule is finished every stage must be scheduled, running, or done.
for (SqlStageExecution stage : stages.values()) {
StageState state = stage.getState();
if (state != SCHEDULED && state != RUNNING && !state.isDone()) {
throw new PrestoException(GENERIC_INTERNAL_ERROR, format("Scheduling is complete, but stage %s is in state %s", stage.getStageId(), state));
}
}
}
}
未写完善待续…
如有错误请及时指出,共同进步~
每天晚上更新~
如需转载请附上本文链接,原创不易谢谢~