语句的执行流程
- openGauss进程的主函数main.cpp
/*
* Any Postgres server process begins execution here.
*
* Entry point of the openGauss server process. It initializes the instance
* and master-thread state, sets up the locale, handles --help/--version,
* and then dispatches on argv[1]: --boot (bootstrap mode),
* --describe-config (GUC description), --single (single-user mode), or,
* by default, PostmasterMain().
*/
int main(int argc, char* argv[])
{
char* mmap_env = NULL;
syscall_lock_init();
// Read the GAUSS_MMAP_THRESHOLD environment variable; when set, it is
// validated and stored in mmap_threshold
mmap_env = gs_getenv_r("GAUSS_MMAP_THRESHOLD");
if (mmap_env != NULL) {
check_backend_env(mmap_env);
mmap_threshold = (size_t)atol(mmap_env);
}
// Initialize the instance context
knl_instance_init();
g_instance.increCheckPoint_context = AllocSetContextCreate(
INSTANCE_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE),
"IncreCheckPointContext",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE,
SHARED_CONTEXT);
g_instance.account_context = AllocSetContextCreate(g_instance.instance_context,
"StandbyAccontContext",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE,
SHARED_CONTEXT);
/*
* Fire up essential subsystems: error and memory management
*
* Code after this point is allowed to use elog/ereport, though
* localization of messages may not work right away, and messages won't go
* anywhere but stderr until GUC settings get loaded.
*/
// Start the memory-context subsystem
MemoryContextInit();
PmTopMemoryContext = t_thrd.top_mem_cxt;
// Initialize the master thread and attach a fake session to it
knl_thread_init(MASTER_THREAD);
t_thrd.fake_session = create_session_context(t_thrd.top_mem_cxt, 0);
t_thrd.fake_session->status = KNL_SESS_FAKE;
u_sess = t_thrd.fake_session;
SelfMemoryContext = THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_DEFAULT);
MemoryContextSwitchTo(THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_DEFAULT));
progname = get_progname(argv[0]); // get the program name
/*
* Platform-specific startup hacks
*/
startup_hacks(progname);
/*
* If the program was invoked under the name gs_encrypt, run
* encrypte_main() instead and return its result.
*/
if (!strcmp(progname, "gs_encrypt")) {
return encrypte_main(argc, argv);
}
init_plog_global_mem();
/*
* Remember the physical location of the initially given argv[] array for
* possible use by ps display. On some platforms, the argv[] storage must
* be overwritten in order to set the process title for ps. In such cases
* save_ps_display_args makes and returns a new copy of the argv[] array.
*
* save_ps_display_args may also move the environment strings to make
* extra room. Therefore this should be done as early as possible during
* startup, to avoid entanglements with code that might save a getenv()
* result pointer.
*
* In short: save argc and argv.
*/
argv = save_ps_display_args(argc, argv);
/*
* If supported on the current platform, set up a handler to be called if
* the backend/postmaster crashes with a fatal signal or exception.
*/
// Platform-specific: only compiled in when WIN32 and HAVE_MINIDUMP_TYPE are defined
#if defined(WIN32) && defined(HAVE_MINIDUMP_TYPE)
pgwin32_install_crashdump_handler();
#endif
/*
* Set up locale information from environment. Note that LC_CTYPE and
* LC_COLLATE will be overridden later from pg_control if we are in an
* already-initialized database. We set them here so that they will be
* available to fill pg_control during initdb. LC_MESSAGES will get set
* later during GUC option processing, but we set it here to allow startup
* error messages to be localized.
*/
// Set locale information from the environment
set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("gaussdb"));
#ifdef WIN32
/*
* Windows uses codepages rather than the environment, so we work around
* that by querying the environment explicitly first for LC_COLLATE and
* LC_CTYPE. We have to do this because initdb passes those values in the
* environment. If there is nothing there we fall back on the codepage.
*/
{
char* env_locale = NULL;
if ((env_locale = gs_getenv_r("LC_COLLATE")) != NULL) {
check_backend_env(env_locale);
pg_perm_setlocale(LC_COLLATE, env_locale);
} else
pg_perm_setlocale(LC_COLLATE, "");
if ((env_locale = gs_getenv_r("LC_CTYPE")) != NULL) {
check_backend_env(env_locale);
pg_perm_setlocale(LC_CTYPE, env_locale);
} else
pg_perm_setlocale(LC_CTYPE, "");
}
#else
pg_perm_setlocale(LC_COLLATE, "");
pg_perm_setlocale(LC_CTYPE, "");
#endif
/*
* We keep these set to "C" always, except transiently in pg_locale.c; see
* that file for explanations.
*/
pg_perm_setlocale(LC_MONETARY, "C");
pg_perm_setlocale(LC_NUMERIC, "C");
pg_perm_setlocale(LC_TIME, "C");
/*
* Now that we have absorbed as much as we wish to from the locale
* environment, remove any LC_ALL setting, so that the environment
* variables installed by pg_perm_setlocale have force.
*/
(void)unsetenv("LC_ALL");
/*
* Catch standard options before doing much else
*/
if (argc > 1) {
// Print the help text and exit
if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) {
help(progname);
exit(0);
}
// Print the version string and exit
if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) {
puts("gaussdb " DEF_GS_VERSION);
exit(0);
}
}
/*
* Make sure we are not running as root.
*/
check_root(progname);
/*
* Dispatch to one of various subprograms depending on first argument.
*/
#ifdef WIN32
/*
* Start our win32 signal implementation
*
* SubPostmasterMain() will do this for itself, but the remaining modes
* need it here
*/
pgwin32_signal_initialize();
#endif
/* init trace context; register its cleanup only if initialization returned 0 */
if (gstrace_init(getpid()) == 0) {
on_proc_exit(gstrace_destory, 0);
}
t_thrd.mem_cxt.gs_signal_mem_cxt = AllocSetContextCreate(
t_thrd.top_mem_cxt, "gs_signal", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE);
if (NULL == t_thrd.mem_cxt.gs_signal_mem_cxt) {
ereport(LOG, (errmsg("could not start a new thread, because of no enough system resource. ")));
proc_exit(1);
}
/*
* @BuiltinFunc
* Create a global BuiltinFunc object shared among threads
*/
// Initialized only when the first g_sorted_funcs slot is still NULL
if (g_sorted_funcs[0] == NULL) {
initBuiltinFuncs();
}
if (argc > 1 && strcmp(argv[1], "--boot") == 0) {
// initdb-related: bootstrapping mode
IsInitdb = true;
gs_signal_monitor_startup();
gs_signal_slots_init(1);
(void)gs_signal_unblock_sigusr2();
gs_signal_startup_siginfo("AuxiliaryProcessMain");
BootStrapProcessMain(argc, argv); /* does not return */
}
// Print the GUC variables and exit
if (argc > 1 && strcmp(argv[1], "--describe-config") == 0)
exit(GucInfoMain());
if (argc > 1 && strcmp(argv[1], "--single") == 0) {
// initdb-related: single-user mode
IsInitdb = true;
gs_signal_monitor_startup();
gs_signal_slots_init(1);
(void)gs_signal_unblock_sigusr2();
gs_signal_startup_siginfo("PostgresMain");
exit(PostgresMain(argc, argv, NULL, get_current_username(progname)));
}
// Default: start the database (postmaster)
exit(PostmasterMain(argc, argv));
}
查询语句的执行流程
postgres=# create table a(id int);
CREATE TABLE
postgres=# insert into a values(1);
INSERT
postgres=# select * from a;
postgres.cpp的PostgresMain函数中,ReadCommand函数读取客户端命令
简单查询调用exec_simple_query函数
/*
* exec_simple_query (abridged; "..." marks elided code)
*
* Executes a simple-query string: parses it into a list of raw parse trees
* and, for each one, analyzes/rewrites it, plans it, and runs the plan
* through an unnamed portal before finishing the transaction command.
*/
static void exec_simple_query(const char* query_string, MessageType messageType, StringInfo msg = NULL)
{
...
// Report that this backend thread is now running the query
pgstat_report_activity(STATE_RUNNING, query_string);
...
// Start a transaction command
start_xact_command();
...
// Parse the SQL text into a list of raw parse trees
parsetree_list = pg_parse_query(reparsed_query.empty() ?
query_string : reparsed_query.c_str(), &query_string_locationlist);
...
/*
* Run through the raw parsetree(s) and process each one.
*/
foreach (parsetree_item, parsetree_list) {
...
Node* parsetree = (Node*)lfirst(parsetree_item);
...
// Command tag of the statement; "SELECT" for the example query
commandTag = CreateCommandTag(parsetree);
...
/* Make sure we are in a transaction command */
start_xact_command();
...
/*
* Set up a snapshot if parse analysis/planning will need one.
*/
if (analyze_requires_snapshot(parsetree)) {
PushActiveSnapshot(GetTransactionSnapshot());
snapshot_set = true;
}
...
// Analyze the parse tree into a query tree and rewrite the query tree
querytree_list = pg_analyze_and_rewrite(parsetree, query_string, NULL, 0);
...
// Generate the plan tree(s)
plantree_list = pg_plan_queries(querytree_list, 0, NULL);
...
// Create an unnamed portal to run the query
portal = CreatePortal("", true, true);
...
// Start the portal
PortalStart(portal, NULL, 0, InvalidSnapshot);
...
// Run the portal, then destroy the receiver and drop the portal
(void)PortalRun(portal, FETCH_ALL, isTopLevel, receiver, receiver, completionTag);
(*receiver->rDestroy)(receiver);
PortalDrop(portal, false);
...
// Finish (commit) the transaction command
finish_xact_command();
...
// Report command completion
EndCommand(completionTag, dest);
...
}
}
- 词法语法解析
相关数据结构
/*
* SelectStmt
*
* Parse-tree node for a SELECT statement. As the field comments below
* describe, a node is either a "leaf" (a plain SELECT or a VALUES list)
* or an upper-level node that combines two SelectStmts (larg/rarg) with a
* set operation (op/all).
*/
typedef struct SelectStmt {
NodeTag type;
/*
* These fields are used only in "leaf" SelectStmts.
*/
List *distinctClause; /* NULL, list of DISTINCT ON exprs, or
* lcons(NIL,NIL) for all (SELECT DISTINCT) */
IntoClause *intoClause; /* target for SELECT INTO */
List *targetList; /* the target list (of ResTarget) */
List *fromClause; /* the FROM clause */
Node *whereClause; /* WHERE qualification */
List *groupClause; /* GROUP BY clauses */
Node *havingClause; /* HAVING conditional-expression */
List *windowClause; /* WINDOW window_name AS (...), ... */
WithClause *withClause; /* WITH clause */
/*
* In a "leaf" node representing a VALUES list, the above fields are all
* null, and instead this field is set. Note that the elements of the
* sublists are just expressions, without ResTarget decoration. Also note
* that a list element can be DEFAULT (represented as a SetToDefault
* node), regardless of the context of the VALUES list. It's up to parse
* analysis to reject that where not valid.
*/
List *valuesLists; /* untransformed list of expression lists */
/*
* These fields are used in both "leaf" SelectStmts and upper-level
* SelectStmts.
*/
List *sortClause; /* sort clause (a list of SortBy's) */
Node *limitOffset; /* # of result tuples to skip */
Node *limitCount; /* # of result tuples to return */
List *lockingClause; /* FOR UPDATE (list of LockingClause's) */
HintState *hintState; /* NOTE(review): presumably optimizer hints attached to the statement -- confirm */
/*
* These fields are used only in upper-level SelectStmts.
*/
SetOperation op; /* type of set op */
bool all; /* ALL specified? */
struct SelectStmt *larg; /* left child */
struct SelectStmt *rarg; /* right child */
/*
* These fields are used by operator "(+)"
*/
bool hasPlus;
/* Eventually add fields for CORRESPONDING spec here */
} SelectStmt;
相关代码在parser.cpp,主要流程如下
/*
* raw_parser (abridged; "..." marks elided code)
*
* Lexical and grammatical analysis of a query string using the flex
* scanner and the bison grammar. Returns the raw parse tree list, or NIL
* if the grammar reported an error.
*/
List* raw_parser(const char* str, List** query_string_locationlist)
{
...
// Initialize the lexical scanner
yyscanner = scanner_init(str, &yyextra.core_yy_extra, ScanKeywords, NumScanKeywords);
...
// Initialize the grammar parser
parser_init(&yyextra);
// Parse the SQL text
yyresult = base_yyparse(yyscanner);
/* Clean up (release memory) */
scanner_finish(yyscanner);
if (yyresult) { /* error */
return NIL;
}
...
// Return the parse tree
return yyextra.parsetree;
}
- 转换查询树并重写
相关数据结构
/*
* Query
*
* The query tree: the result of parse analysis (parse_analyze returns a
* Query*) and the input/output of the rewriter. commandType records which
* kind of statement it is; non-optimizable statements are carried in
* utilityStmt.
*/
typedef struct Query {
NodeTag type;
CmdType commandType; /* select|insert|update|delete|merge|utility */
QuerySource querySource; /* where did I come from? */
uint64 queryId; /* query identifier (can be set by plugins) */
bool canSetTag; /* do I set the command result tag? */
Node* utilityStmt; /* non-null if this is DECLARE CURSOR or a
* non-optimizable statement */
int resultRelation; /* rtable index of target relation for
* INSERT/UPDATE/DELETE/MERGE; 0 for SELECT */
bool hasAggs; /* has aggregates in tlist or havingQual */
bool hasWindowFuncs; /* has window functions in tlist */
bool hasSubLinks; /* has subquery SubLink */
bool hasDistinctOn; /* distinctClause is from DISTINCT ON */
bool hasRecursive; /* WITH RECURSIVE was specified */
bool hasModifyingCTE; /* has INSERT/UPDATE/DELETE in WITH */
bool hasForUpdate; /* FOR UPDATE or FOR SHARE was specified */
bool hasRowSecurity; /* rewriter has applied some RLS policy */
bool hasSynonyms; /* has synonym mapping in rtable */
List* cteList; /* WITH list (of CommonTableExpr's) */
List* rtable; /* list of range table entries */
FromExpr* jointree; /* table join tree (FROM and WHERE clauses) */
List* targetList; /* target list (of TargetEntry) */
List* starStart; /* Corresponding p_star_start in ParseState */
List* starEnd; /* Corresponding p_star_end in ParseState */
List* starOnly; /* Corresponding p_star_only in ParseState */
List* returningList; /* return-values list (of TargetEntry) */
List* groupClause; /* a list of SortGroupClause's */
List* groupingSets; /* a list of GroupingSet's if present */
Node* havingQual; /* qualifications applied to groups */
List* windowClause; /* a list of WindowClause's */
List* distinctClause; /* a list of SortGroupClause's */
List* sortClause; /* a list of SortGroupClause's */
Node* limitOffset; /* # of result tuples to skip (int8 expr) */
Node* limitCount; /* # of result tuples to return (int8 expr) */
List* rowMarks; /* a list of RowMarkClause's */
Node* setOperations; /* set-operation tree if this is top level of
* a UNION/INTERSECT/EXCEPT query */
List *constraintDeps; /* a list of pg_constraint OIDs that the query
* depends on to be semantically valid */
HintState* hintState;
#ifdef PGXC
/* need this info for PGXC Planner, may be temporary */
char* sql_statement; /* original query */
bool is_local; /* enforce query execution on local node
* this is used by EXECUTE DIRECT especially. */
bool has_to_save_cmd_id; /* true if the query is such an INSERT SELECT
* that inserts into a child by selecting
* from its parent OR a WITH query that
* updates a table in main query and inserts
* a row to the same table in WITH query */
bool vec_output; /* true if it's vec output. this flag is used in FQS planning */
TdTruncCastStatus tdTruncCastStatus; /* Auto truncation Cast added, only used for stmt in stored procedure or
prepare stmt. */
List* equalVars; /* vars appears in UPDATE/DELETE clause */
#endif
ParamListInfo boundParamsQ;
int mergeTarget_relation;
List* mergeSourceTargetList;
List* mergeActionList; /* list of actions for MERGE (only) */
Query* upsertQuery; /* insert query for INSERT ON DUPLICATE KEY UPDATE (only) */
UpsertExpr* upsertClause; /* DUPLICATE KEY UPDATE [NOTHING | ...] */
bool isRowTriggerShippable; /* true if all row triggers are shippable. */
bool use_star_targets; /* true if use * for targetlist. */
bool is_from_full_join_rewrite; /* true if the query is created when doing
* full join rewrite. If true, we should not
* do some expression processing.
* Please refer to subquery_planner.
*/
uint64 uniqueSQLId; /* used by unique sql id */
bool can_push;
bool unique_check; /* true if the subquery is generated by general
* sublink pullup, and scalar output is needed */
Oid* fixed_paramTypes; /* For plpy CTAS query. CTAS is a recursive call. The CREATE query is rewritten first;
* the 2nd rewritten query is INSERT SELECT. Without this attribute, the DB raises
* an error because it has no idea about $x when the INSERT SELECT query is analyzed. */
int fixed_numParams;
} Query;
相关代码主要在analyze.cpp(pg_analyze_and_rewrite位于postgres.cpp,QueryRewrite位于rewriteHandler.cpp),主要流程如下
/*
* pg_analyze_and_rewrite (abridged; "..." marks elided code)
*
* Runs parse analysis on one raw parse tree to obtain a Query, then
* rewrites it. Returns the resulting list of query trees.
*/
List* pg_analyze_and_rewrite(Node* parsetree, const char* query_string, Oid* paramTypes, int numParams)
{
...
// (1) Parse analysis: turn the raw parse tree into a query tree
query = parse_analyze(parsetree, query_string, paramTypes, numParams);
...
/*
* (2) Rewrite the queries, as necessary
*/
querytree_list = pg_rewrite_query(query);
...
// Return the list of query trees
return querytree_list;
}
/*
* parse_analyze (abridged; "..." marks elided code)
*
* Creates a ParseState, transforms the raw parse tree into a Query via
* transformTopLevelStmt, releases the ParseState, and returns the Query.
*/
Query* parse_analyze(
Node* parseTree, const char* sourceText, Oid* paramTypes, int numParams, bool isFirstNode, bool isCreateView)
{
// Create a new ParseState for this analysis
ParseState* pstate = make_parsestate(NULL);
...
// Transform the parse tree into a query tree
query = transformTopLevelStmt(pstate, parseTree, isFirstNode, isCreateView);
...
pfree_ext(pstate->p_ref_hook_state);
free_parsestate(pstate);
...
// Return the query tree
return query;
}
/*
* transformTopLevelStmt
*
* Top-level entry for transforming a parse tree. A SELECT ... INTO is
* first converted into a CreateTableAsStmt wrapping the original
* SelectStmt; the (possibly replaced) tree is then passed to
* transformStmt, whose result is returned.
*/
Query* transformTopLevelStmt(ParseState* pstate, Node* parseTree, bool isFirstNode, bool isCreateView)
{
if (IsA(parseTree, SelectStmt)) {
// Convert SELECT ... INTO syntax into CREATE TABLE AS syntax
SelectStmt* stmt = (SelectStmt*)parseTree;
/* If it's a set-operation tree, drill down to leftmost SelectStmt */
while (stmt != NULL && stmt->op != SETOP_NONE)
stmt = stmt->larg;
AssertEreport(stmt && IsA(stmt, SelectStmt) && stmt->larg == NULL, MOD_OPT, "failure to check parseTree");
if (stmt->intoClause) {
CreateTableAsStmt* ctas = makeNode(CreateTableAsStmt);
ctas->query = parseTree;
ctas->into = stmt->intoClause;
ctas->relkind = OBJECT_TABLE;
ctas->is_select_into = true;
/*
* Remove the intoClause from the SelectStmt. This makes it safe
* for transformSelectStmt to complain if it finds intoClause set
* (implying that the INTO appeared in a disallowed place).
*/
stmt->intoClause = NULL;
parseTree = (Node*)ctas;
}
}
// Transform the statement into a query tree
return transformStmt(pstate, parseTree, isFirstNode, isCreateView);
}
/*
* transformStmt (abridged; "..." marks elided code)
*
* Dispatches on the node tag of the parse tree. Only the T_SelectStmt
* case is shown here; it hands the statement to transformSelectStmt.
* Returns the resulting query tree.
*/
Query* transformStmt(ParseState* pstate, Node* parseTree, bool isFirstNode, bool isCreateView)
{
...
switch (nodeTag(parseTree)) {
...
case T_SelectStmt: {
SelectStmt* n = (SelectStmt*)parseTree;
...
// Analyze the SELECT parse tree
result = transformSelectStmt(pstate, n, isFirstNode, isCreateView);
...
} break;
...
}
...
// Return the query tree
return result;
}
/*
* transformSelectStmt (abridged; "..." marks elided code)
*
* Transforms a SelectStmt parse tree into a Query with commandType
* CMD_SELECT. The clauses are transformed in the order shown below (WITH,
* FROM, target list, WHERE, HAVING, ORDER BY, GROUP BY, DISTINCT, LIMIT,
* window definitions, locking clauses), after which collations are
* assigned and aggregate/grouping usage is checked.
* Returns the new query tree.
*/
static Query* transformSelectStmt(ParseState* pstate, SelectStmt* stmt, bool isFirstNode, bool isCreateView)
{
Query* qry = makeNode(Query);
...
qry->commandType = CMD_SELECT; // command type: select
...
// Transform the WITH clause
if (stmt->withClause) {
qry->hasRecursive = stmt->withClause->recursive;
qry->cteList = transformWithClause(pstate, stmt->withClause);
qry->hasModifyingCTE = pstate->p_hasModifyingCTE;
}
...
/* process the FROM clause */
transformFromClause(pstate, stmt->fromClause, isFirstNode, isCreateView);
/* transform targetlist: the list of ResTarget becomes a list of TargetEntry */
qry->targetList = transformTargetList(pstate, stmt->targetList);
/* Transform operator "(+)" to outer join */
if (stmt->hasPlus && stmt->whereClause != NULL) {
transformOperatorPlus(pstate, &stmt->whereClause);
}
...
/* mark column origins: tag target-list Vars with source table OID and column number */
markTargetListOrigins(pstate, qry->targetList);
/* transform WHERE
* Only "(+)" is valid when it's in WhereClause of Select, set the flag to be true
* during transform Whereclause.
*/
setIgnorePlusFlag(pstate, true);
qual = transformWhereClause(pstate, stmt->whereClause, "WHERE");
setIgnorePlusFlag(pstate, false);
/*
* Initial processing of HAVING clause is just like WHERE clause.
*/
qry->havingQual = transformWhereClause(pstate, stmt->havingClause, "HAVING");
/*
* Transform sorting/grouping stuff. Do ORDER BY first because both
* transformGroupClause and transformDistinctClause need the results. Note
* that these functions can also change the targetList, so it's passed to
* them by reference.
*/
// Transform the ORDER BY clause
qry->sortClause = transformSortClause(
pstate, stmt->sortClause, &qry->targetList, true /* fix unknowns */, false /* allow SQL92 rules */);
/*
* Transform A_const to columnref type in group by clause, So that repeated group column
* will deleted in function transformGroupClause. If not to delete repeated column, for
* group by rollup can have error result, because we need set null to non- group column.
*
* select a, b, b
* from t1
* group by rollup(1, 2), 3;
*
* To this example, column b should not be set to null, but if not to delete repeated column
* b will be set to null and two b value is not equal.
*/
if (include_groupingset((Node*)stmt->groupClause)) {
transformGroupConstToColumn(pstate, (Node*)stmt->groupClause, qry->targetList);
}
// Transform the GROUP BY clause
qry->groupClause = transformGroupClause(pstate,
stmt->groupClause,
&qry->groupingSets,
&qry->targetList,
qry->sortClause,
false /* allow SQL92 rules */);
if (stmt->distinctClause == NIL) {
qry->distinctClause = NIL;
qry->hasDistinctOn = false;
} else if (linitial(stmt->distinctClause) == NULL) {
/* We had SELECT DISTINCT */
qry->distinctClause = transformDistinctClause(pstate, &qry->targetList, qry->sortClause, false);
qry->hasDistinctOn = false;
} else {
/* We had SELECT DISTINCT ON */
qry->distinctClause =
transformDistinctOnClause(pstate, stmt->distinctClause, &qry->targetList, qry->sortClause);
qry->hasDistinctOn = true;
}
/* transform LIMIT */
qry->limitOffset = transformLimitClause(pstate, stmt->limitOffset, "OFFSET");
qry->limitCount = transformLimitClause(pstate, stmt->limitCount, "LIMIT");
/* transform window clauses after we have seen all window functions */
qry->windowClause = transformWindowDefinitions(pstate, pstate->p_windowdefs, &qry->targetList);
/* resolve any still-unresolved output columns as being type text */
if (pstate->p_resolve_unknowns) {
resolveTargetListUnknowns(pstate, qry->targetList);
}
// Copy the range table and build the FromExpr node (join list + WHERE qual)
qry->rtable = pstate->p_rtable;
qry->jointree = makeFromExpr(pstate->p_joinlist, qual);
qry->hasSubLinks = pstate->p_hasSubLinks;
qry->hasWindowFuncs = pstate->p_hasWindowFuncs;
// Check for window functions in clauses where they are not allowed
if (pstate->p_hasWindowFuncs) {
parseCheckWindowFuncs(pstate, qry);
}
qry->hasAggs = pstate->p_hasAggs;
// Transform the FOR UPDATE/SHARE clauses
foreach (l, stmt->lockingClause) {
transformLockingClause(pstate, qry, (LockingClause*)lfirst(l), false);
}
qry->hintState = stmt->hintState;
...
// Assign collation information to the query's expressions
assign_query_collations(pstate, qry);
/*
* this must be done after collations, for reliable comparison of exprs:
* Check for aggregates where they shouldn't be and improper grouping.
*/
if (pstate->p_hasAggs || qry->groupClause || qry->groupingSets || qry->havingQual) {
parseCheckAggregates(pstate, qry);
}
// Return the query tree
return qry;
}
/*
* QueryRewrite (abridged; "..." marks elided code)
*
* Rewrites one query tree in two steps: RewriteQuery applies the
* non-SELECT rules (yielding zero or more queries), then fireRIRrules is
* applied to each resulting query. Returns the list of rewritten queries.
*/
List* QueryRewrite(Query* parsetree)
{
...
/*
* Step 1
*
* Apply all non-SELECT rules possibly getting 0 or many queries
*/
querylist = RewriteQuery(parsetree, NIL);
/*
* Step 2
*
* Apply all the RIR rules on each query
*
* This is also a handy place to mark each query with the original queryId
*/
results = NIL;
foreach (l, querylist) {
Query* query = (Query*)lfirst(l);
query = fireRIRrules(query, NIL, false);
query->queryId = input_query_id;
results = lappend(results, query);
}
...
return results;
}
- 生成计划树
相关数据结构
/*
* Plan
*
* Common data for plan-tree nodes: node ids, cost and row estimates,
* recursive-union bookkeeping, the target list and quals, child plans
* (lefttree/righttree), parameter sets, and operator memory settings.
*/
typedef struct Plan {
NodeTag type;
int plan_node_id; /* node id */
int parent_node_id; /* parent node id */
RemoteQueryExecType exec_type;
/*
* estimated execution costs for plan (see costsize.c for more info)
*/
Cost startup_cost; /* cost expended before fetching any tuples */
Cost total_cost; /* total cost (assuming all tuples fetched) */
/*
* planner's estimate of result size of this plan step
*/
double plan_rows; /* number of global rows plan is expected to emit */
double multiple;
int plan_width; /* average row width in bytes */
int dop; /* degree of parallelism of current plan */
/*
* machine learning model estimations
*/
double pred_rows;
double pred_startup_time;
double pred_total_time;
long pred_max_memory;
/*
* MPPDB Recursive-Union Support
*
* - @recursive_union_plan_nodeid
* Pointing to its belonging RecursiveUnion's plan node id to indicate if we are
* under RecursiveUnion
*
* - @recursive_union_controller
* Indicate if current Plan node is controller node in recursive-union steps
*
* - @control_plan_nodeid
* Normally, set on the top-plan node of a producer thread, to indicate which
* control-plan we need to sync up with
*
* - @is_sync_plannode
* Indicate the current producer thread is the sync-up thread in recursive union,
* normally set on producer's top plan node
*
* Please note the above four variables are meaningless if a plan node is not under
* recursive-union's recursive part
*/
/*
* plan node id of RecursiveUnion node where current plan node belongs to, 0 for
* not under recursive-union
*/
int recursive_union_plan_nodeid;
/* flag to indicate if it is controller plan node */
bool recursive_union_controller;
/* plan node id of Controller plan node, 0 for not in control */
int control_plan_nodeid;
/* flag indicate if the current plan node is the sync node (for multi-stream case) */
bool is_sync_plannode;
/*
* Common structural data for all Plan types.
*/
List* targetlist; /* target list to be computed at this node */
List* qual; /* implicitly-ANDed qual conditions */
struct Plan* lefttree; /* input plan tree(s) */
struct Plan* righttree;
bool ispwj; /* is it special for partitionwisejoin? */
int paramno; /* the partition's sn that it is scanning
* (NOTE(review): "sn" presumably means sequence number -- confirm) */
List* initPlan; /* Init Plan nodes (un-correlated expr
* subselects) */
List* distributed_keys; /* distributed on which key */
ExecNodes* exec_nodes; /* List of Datanodes where to execute this plan */
/*
* Information for management of parameter-change-driven rescanning
*
* extParam includes the paramIDs of all external PARAM_EXEC params
* affecting this plan node or its children. setParam params from the
* node's initPlans are not included, but their extParams are.
*
* allParam includes all the extParam paramIDs, plus the IDs of local
* params that affect the node (i.e., the setParams of its initplans).
* These are _all_ the PARAM_EXEC params that affect this node.
*/
Bitmapset* extParam;
Bitmapset* allParam;
// For vectorized engine, plan produce vector output
//
bool vec_output;
/*
* @hdfs
* Mark the foreign scan whether has unique results on one of its
* output columns.
*/
bool hasUniqueResults;
/*
* Mark the plan whether includes delta table or not.
*/
bool isDeltaTable;
/* used to replace work_mem, maxmem in [0], and minmem in [1] */
int operatorMemKB[2];
/* allowed max mem after spread */
int operatorMaxMem;
bool parallel_enabled; /* Is it run in parallel? */
bool hasHashFilter; /* true for this plan has a hashfilter */
List* var_list; /* Need bloom filter var list. */
List* filterIndexList; /* Need used bloomfilter array index. */
/* used to replace work_mem */
int** ng_operatorMemKBArray; /* for multiple logic cluster */
int ng_num;
double innerdistinct; /* join inner rel distinct estimation value */
double outerdistinct; /* join outer rel distinct estimation value */
} Plan;
/*
* Path
*
* Describes one candidate way of building a relation (parent): the
* scan/join method (pathtype), its estimated size and costs, the sort
* ordering of its output (pathkeys), and distribution information.
*/
typedef struct Path {
NodeTag type;
NodeTag pathtype; /* tag identifying scan/join method */
RelOptInfo* parent; /* the relation this path can build */
ParamPathInfo* param_info; /* parameterization info, or NULL if none */
/* estimated size/costs for path (see costsize.c for more info) */
double rows; /* estimated number of global result tuples */
double multiple;
Cost startup_cost; /* cost expended before fetching any tuples */
Cost total_cost; /* total cost (assuming all tuples fetched) */
Cost stream_cost; /* cost of actions invoked by stream but can't be parallelled in this path */
List* pathkeys; /* sort ordering of path's output */
List* distribute_keys; /* distribute key, Var list */
char locator_type;
Oid rangelistOid;
int dop; /* degree of parallelism */
/* pathkeys is a List of PathKey nodes; see above */
Distribution distribution;
int hint_value; /* Mark this path if be hinted, and hint kind. */
double innerdistinct; /* join inner rel distinct estimation value */
double outerdistinct; /* join outer rel distinct estimation value */
} Path;
/*
* PlannerInfo
*
* Per-query planner state: the Query being planned (parse), global
* planner info (glob), the query nesting level, base/join relation
* bookkeeping, pathkey lists, and assorted flags and workspace used while
* planning.
*/
typedef struct PlannerInfo {
NodeTag type;
Query* parse; /* the Query being planned */
PlannerGlobal* glob; /* global info for current planner run */
Index query_level; /* 1 at the outermost Query */
struct PlannerInfo* parent_root; /* NULL at outermost Query */
/*
* simple_rel_array holds pointers to "base rels" and "other rels" (see
* comments for RelOptInfo for more info). It is indexed by rangetable
* index (so entry 0 is always wasted). Entries can be NULL when an RTE
* does not correspond to a base relation, such as a join RTE or an
* unreferenced view RTE; or if the RelOptInfo hasn't been made yet.
*/
struct RelOptInfo** simple_rel_array; /* All 1-rel RelOptInfos */
int simple_rel_array_size; /* allocated size of array */
/*
* List of changed var that mutated during cost-based rewrite optimization, the
* element in the list is "struct RewriteVarMapping", for example:
* - inlist2join
* - pushjoin2union (will be implemented)
* - ...
*
*/
List* var_mappings;
Relids var_mapping_rels; /* all the relations that related to inlist2join */
/*
* simple_rte_array is the same length as simple_rel_array and holds
* pointers to the associated rangetable entries. This lets us avoid
* rt_fetch(), which can be a bit slow once large inheritance sets have
* been expanded.
*/
RangeTblEntry** simple_rte_array; /* rangetable as an array */
/*
* all_baserels is a Relids set of all base relids (but not "other"
* relids) in the query; that is, the Relids identifier of the final join
* we need to form.
*/
Relids all_baserels;
/*
* join_rel_list is a list of all join-relation RelOptInfos we have
* considered in this planning run. For small problems we just scan the
* list to do lookups, but when there are many join relations we build a
* hash table for faster lookups. The hash table is present and valid
* when join_rel_hash is not NULL. Note that we still maintain the list
* even when using the hash table for lookups; this simplifies life for
* GEQO.
*/
List* join_rel_list; /* list of join-relation RelOptInfos */
struct HTAB* join_rel_hash; /* optional hashtable for join relations */
/*
* When doing a dynamic-programming-style join search, join_rel_level[k]
* is a list of all join-relation RelOptInfos of level k, and
* join_cur_level is the current level. New join-relation RelOptInfos are
* automatically added to the join_rel_level[join_cur_level] list.
* join_rel_level is NULL if not in use.
*/
List** join_rel_level; /* lists of join-relation RelOptInfos */
int join_cur_level; /* index of list being extended */
List* init_plans; /* init SubPlans for query */
List* cte_plan_ids; /* per-CTE-item list of subplan IDs */
List* eq_classes; /* list of active EquivalenceClasses */
List* canon_pathkeys; /* list of "canonical" PathKeys */
List* left_join_clauses; /* list of RestrictInfos for
* mergejoinable outer join clauses
* w/nonnullable var on left */
List* right_join_clauses; /* list of RestrictInfos for
* mergejoinable outer join clauses
* w/nonnullable var on right */
List* full_join_clauses; /* list of RestrictInfos for
* mergejoinable full join clauses */
List* join_info_list; /* list of SpecialJoinInfos */
List* lateral_info_list; /* list of LateralJoinInfos */
List* append_rel_list; /* list of AppendRelInfos */
List* rowMarks; /* list of PlanRowMarks */
List* placeholder_list; /* list of PlaceHolderInfos */
List* query_pathkeys; /* desired pathkeys for query_planner(), and
* actual pathkeys afterwards */
List* group_pathkeys; /* groupClause pathkeys, if any */
List* window_pathkeys; /* pathkeys of bottom window, if any */
List* distinct_pathkeys; /* distinctClause pathkeys, if any */
List* sort_pathkeys; /* sortClause pathkeys, if any */
List* minmax_aggs; /* List of MinMaxAggInfos */
List* initial_rels; /* RelOptInfos we are now trying to join */
MemoryContext planner_cxt; /* context holding PlannerInfo */
double total_table_pages; /* # of pages in all tables of query */
double tuple_fraction; /* tuple_fraction passed to query_planner */
double limit_tuples; /* limit_tuples passed to query_planner */
bool hasInheritedTarget; /* true if parse->resultRelation is an
* inheritance child rel */
bool hasJoinRTEs; /* true if any RTEs are RTE_JOIN kind */
bool hasLateralRTEs; /* true if any RTEs are marked LATERAL */
bool hasHavingQual; /* true if havingQual was non-null */
bool hasPseudoConstantQuals; /* true if any RestrictInfo has
* pseudoconstant = true */
bool hasRecursion; /* true if planning a recursive WITH item */
/* Note: qualSecurityLevel is zero if there are no securityQuals */
Index qualSecurityLevel; /* minimum security_level for quals */
#ifdef PGXC
/* This field is used only when RemoteScan nodes are involved */
int rs_alias_index; /* used to build the alias reference */
/*
* In Postgres-XC Coordinators are supposed to skip the handling of
* row marks of type ROW_MARK_EXCLUSIVE & ROW_MARK_SHARE.
* In order to do that we simply remove such type
* of row marks from the list rowMarks. Instead they are saved
* in xc_rowMarks list that is then handled to add
* FOR UPDATE/SHARE in the remote query
*/
List* xc_rowMarks; /* list of PlanRowMarks of type ROW_MARK_EXCLUSIVE & ROW_MARK_SHARE */
#endif
/* These fields are used only when hasRecursion is true: */
int wt_param_id; /* PARAM_EXEC ID for the work table */
struct Plan* non_recursive_plan; /* plan for non-recursive term */
/* These fields are workspace for createplan.c */
Relids curOuterRels; /* outer rels above current node */
List* curOuterParams; /* not-yet-assigned NestLoopParams */
Index curIteratorParamIndex;
bool isPartIteratorPlanning;
int curItrs;
List* subqueryRestrictInfo; /* Subquery RestrictInfo, which is only used in window agg. */
/* optional private data for join_search_hook, e.g., GEQO */
void* join_search_private;
/* Added post-release, will be in a saner place in 9.3: */
List* plan_params; /* list of PlannerParamItems, see below */
/* For count_distinct, save null info for group by clause */
List* join_null_info;
/* for GroupingFunc fixup in setrefs */
AttrNumber* grouping_map;
/* If current query level is correlated with upper level */
bool is_correlated;
/* data redistribution for DFS table.
* dataDestRelIndex is index into the range table. This variable
* will take effect on data redistribution state.
* The effective value must be greater than 0.
*/
Index dataDestRelIndex;
/* interesting keys of current query level */
ItstDisKey dis_keys;
/*
* indicate if the subquery planning root (PlannerInfo) is under or rooted from
* recursive-cte planning.
*/
bool is_under_recursive_cte;
/*
* indicate if the subquery planning root (PlannerInfo) is under recursive-cte's
* recursive branch
*/
bool is_under_recursive_tree;
bool has_recursive_correlated_rte; /* true if any RTE correlated with recursive cte */
int subquery_type;
Bitmapset *param_upper;
bool hasRownumQual;
} PlannerInfo;
/*
 * PlannedStmt
 *
 * Planner output for a single statement: the plan tree itself plus the
 * range table, dependency lists, and the resource / parallelism bookkeeping
 * fields declared below.
 */
typedef struct PlannedStmt {
NodeTag type;
CmdType commandType; /* select|insert|update|delete */
uint64 queryId; /* query identifier, uniquely identifies this plan at runtime (copied from Query) */
bool hasReturning; /* is it insert|update|delete RETURNING? */
bool hasModifyingCTE; /* has insert|update|delete in WITH? */
bool canSetTag; /* do I set the command result tag? */
bool transientPlan; /* redo plan when TransactionXmin changes? */
bool dependsOnRole; /* is plan specific to current role? */
Plan* planTree; /* tree of Plan nodes */
List* rtable; /* list of RangeTblEntry nodes */
/* rtable indexes of target relations for INSERT/UPDATE/DELETE */
List* resultRelations; /* integer list of RT indexes, or NIL */
Node* utilityStmt; /* non-null if this is DECLARE CURSOR */
List* subplans; /* Plan trees for SubPlan expressions */
Bitmapset* rewindPlanIDs; /* indices of subplans that require REWIND */
List* rowMarks; /* a list of PlanRowMark's */
/*
 * Notice: be careful when using relationOids, as it may contain non-table
 * OIDs in some scenarios, e.g. assignment of relationOids in fix_expr_common.
 */
List* relationOids; /* contains OIDs of relations the plan depends on */
List* invalItems; /* other dependencies, as PlanInvalItems */
int nParamExec; /* number of PARAM_EXEC Params used */
int num_streams; /* number of streams that exist in the plan tree */
int max_push_sql_num; /* number of split SQL statements to push to DN for execution */
int gather_count; /* gather_count in query */
int num_nodes; /* number of data nodes */
NodeDefinition* nodesDefinition; /* definitions of all data nodes */
int instrument_option; /* used for collecting instrument data */
int num_plannodes; /* how many plan nodes are in this planstmt */
int query_mem[2]; /* how much memory the query can use, in kB */
int assigned_query_mem[2]; /* how much memory the query is assigned */
bool is_dynmaic_smp; /* NOTE(review): "dynmaic" is presumably a misspelling of "dynamic"; name kept as-is */
int dynsmp_max_cpu; /* max available cpu for this dn */
int dynsmp_avail_cpu; /* available cpu for this dn (NOTE(review): original comment repeated "max"; confirm) */
int dynsmp_cpu_util;
int dynsmp_active_statement;
double dynsmp_query_estimate_cpu_usge;
int dynsmp_plan_optimal_dop; /* the final optimized dop for the plan */
int dynsmp_plan_original_dop;
int dynsmp_dop_mem_limit; /* memory will put a limit on dop */
int dynsmp_min_non_spill_dop; /* optimal dop cannot be greater than this */
int num_bucketmaps; /* number of special bucketmaps stored in this plannedstmt */
uint2* bucketMap[MAX_SPECIAL_BUCKETMAP_NUM]; /* the map information that needs to be obtained */
char* query_string; /* conveys the query string to backend/stream thread of DataNode for debug purposes */
List* subplan_ids; /* in which plan id each subplan should be inited */
List* initPlan; /* initplan in top plan node */
/* data redistribution for DFS table.
 * dataDestRelIndex is index into the range table. This variable
 * will take effect on data redistribution state.
 */
Index dataDestRelIndex;
int MaxBloomFilterNum;
int query_dop; /* Dop of current query. */
double plannertime; /* planner execution time */
/* set true in do_query_for_planrouter() for PlannedStmt sent to
 * the compute pool
 */
bool in_compute_pool;
/* true if there is/are ForeignScan node(s) of OBS foreign table
 * in plantree.
 */
bool has_obsrel;
List* plan_hint_warning; /* hint warnings during plan generation, only used in CN */
List* noanalyze_rellist; /* relations and attributes that have no statistics, only used in CN */
int ng_num; /* nodegroup number */
NodeGroupQueryMem* ng_queryMem; /* each nodegroup's query mem */
bool ng_use_planA; /* true means I am a planA, default false */
bool isRowTriggerShippable; /* true if all row triggers are shippable. */
bool is_stream_plan;
bool multi_node_hint;
uint64 uniqueSQLId;
} PlannedStmt;
相关代码在 planner.cpp、planmain.cpp 中,主要流程如下:
/*
 * query_planner (abridged excerpt; each "..." line marks code elided here)
 *
 * Builds the access paths for a basic query. Results are returned through
 * the output parameters *cheapest_path and *sorted_path. For a query with an
 * empty join tree a trivial result path is produced and the function returns
 * early; otherwise base relations are set up, joined into one final relation
 * by make_one_rel(), and the cheapest path is chosen.
 */
void query_planner(PlannerInfo* root, List* tlist, double tuple_fraction, double limit_tuples, Path** cheapest_path,
Path** sorted_path, double* num_groups, List* rollup_groupclauses, List* rollup_lists)
{
...
/*
* If the query has an empty join tree, then it's something easy like
* "SELECT 2+2;" or "INSERT ... VALUES()". Fall through quickly.
*/
// Handle statements whose join tree is empty
if (parse->jointree->fromlist == NIL) {
/* We need a trivial path result */
*cheapest_path = (Path*)create_result_path(root, NULL, (List*)parse->jointree->quals);
*sorted_path = NULL;
/*
* We still are required to canonicalize any pathkeys, in case it's
* something like "SELECT 2+2 ORDER BY 1".
*/
root->canon_pathkeys = NIL;
canonicalize_all_pathkeys(root);
return;
}
...
// Set up the arrays used for fast access to base relations
setup_simple_rel_arrays(root);
// Build a new RelOptInfo for every base relation
add_base_rels_to_query(root, (Node*)parse->jointree);
check_scan_hint_validity(root);
// Add entries to the targetlists, create PlaceHolderInfo entries, build equivalence
// classes for provably equivalent expressions, and finally build the join list
build_base_rel_tlists(root, tlist);
find_placeholders_in_jointree(root);
find_lateral_references(root);
joinlist = deconstruct_jointree(root);
...
// Use the equivalence classes to reconsider any postponed outer-join clauses
reconsider_outer_join_clauses(root);
// Generate additional restriction clauses from the equivalence classes
generate_base_implied_equalities(root);
generate_base_implied_qualities(root);
// Canonicalize the pathkeys
canonicalize_all_pathkeys(root);
// Check the placeholder expressions
fix_placeholder_input_needed_levels(root);
// Remove outer joins that are not needed
joinlist = remove_useless_joins(root, joinlist);
// Assign placeholders to base relations
add_placeholders_to_base_rels(root);
// Compute total_table_pages; index 0 of simple_rel_array is skipped and NULL slots are ignored
total_pages = 0;
for (rti = 1; rti < (unsigned int)root->simple_rel_array_size; rti++) {
RelOptInfo* brel = root->simple_rel_array[rti];
if (brel == NULL)
continue;
AssertEreport(brel->relid == rti,
MOD_OPT,
"invalid relation oid when generating a path for a basic query."); /* sanity check on array */
// Only base rels and "other member" rels contribute to the page total
if (brel->reloptkind == RELOPT_BASEREL || brel->reloptkind == RELOPT_OTHER_MEMBER_REL)
total_pages += (double)brel->pages;
}
root->total_table_pages = total_pages;
// Find all possible access paths for executing the query; the result is a single
// relation representing the join of all base relations in the query
final_rel = make_one_rel(root, joinlist);
...
if (parse->groupClause) { // Estimate the number of result groups for grouping
...
} else if (parse->hasAggs || root->hasHavingQual || parse->groupingSets) {
// An ungrouped aggregate reads all the tuples
...
} else if (parse->distinctClause) {
// No grouping and no aggregation: estimate the number of result rows
...
} else {
// Plain non-grouped, non-aggregated query: compute the tuple fraction
...
}
// Choose the cheapest path
cheapestpath = get_cheapest_path(root, final_rel, num_groups, has_groupby);
...
*cheapest_path = cheapestpath;
*sorted_path = sortedpath;
}
- 执行器
相关数据结构
/*
 * QueryDesc
 *
 * Bundles what the executor needs to run one query: the planner output (or a
 * utility statement), source text, snapshots, result destination and
 * parameters, plus the state that ExecutorStart fills in.
 */
typedef struct QueryDesc {
/* These fields are provided by CreateQueryDesc */
CmdType operation; /* CMD_SELECT, CMD_UPDATE, etc. */
PlannedStmt* plannedstmt; /* planner's output, or null if utility */
Node* utilitystmt; /* utility statement, or null */
const char* sourceText; /* source text of the query */
Snapshot snapshot; /* snapshot to use for query */
Snapshot crosscheck_snapshot; /* crosscheck for RI update/delete */
DestReceiver* dest; /* the destination for tuple output */
ParamListInfo params; /* param values being passed in */
int instrument_options; /* OR of InstrumentOption flags */
/* These fields are set by ExecutorStart */
TupleDesc tupDesc; /* descriptor for result tuples */
EState* estate; /* executor's query-wide state */
PlanState* planstate; /* tree of per-plan-node state */
/* This is always set NULL by the core system, but plugins can change it */
struct Instrumentation* totaltime; /* total time spent in ExecutorRun */
bool executed; /* true if the query has already been executed */
#ifdef ENABLE_MOT
JitExec::JitContext* mot_jit_context; /* MOT JIT context required for executing LLVM jitted code */
#endif
} QueryDesc;
相关代码在execMain.cpp
/*
 * ExecutorStart
 *
 * Executor start-up entry point. Dispatches either to the plugin hook or to
 * standard_ExecutorStart, bracketed by gstrace entry/exit records.
 */
void ExecutorStart(QueryDesc* queryDesc, int eflags)
{
    gstrace_entry(GS_TRC_ID_ExecutorStart);

    /*
     * It's unsafe to deal with plugin hooks once shutdown has begun, as the
     * dynamic lib may already be released; in that case (or when no hook is
     * installed) fall back to the standard implementation.
     */
    if (!ExecutorStart_hook || g_instance.status > NoShutdown) {
        standard_ExecutorStart(queryDesc, eflags);
    } else {
        (*ExecutorStart_hook)(queryDesc, eflags);
    }

    gstrace_exit(GS_TRC_ID_ExecutorStart);
}
/*
 * ExecutorRun (abridged excerpt; each "..." line marks code elided here)
 *
 * Executor run entry point. The visible parts cover the post-execution
 * plan-issue analysis and the operator history statistics bookkeeping.
 */
void ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count)
{
...
// SQL self-tuning: after the query finishes executing, analyze query plan issues based on runtime information
if (u_sess->exec_cxt.need_track_resource && queryDesc != NULL && has_track_operator &&
(IS_PGXC_COORDINATOR || IS_SINGLE_NODE)) {
List *issue_results = PlanAnalyzerOperator(queryDesc, queryDesc->planstate);
// If plan issues were found, store them in the sysview gs_wlm_session_history
if (issue_results != NIL) {
RecordQueryPlanIssues(issue_results);
}
}
...
// SQL dynamic feature: operator history statistics
if (can_operator_history_statistics) {
u_sess->instr_cxt.can_record_to_table = true;
ExplainNodeFinish(queryDesc->planstate, queryDesc->plannedstmt, GetCurrentTimestamp(), false);
// On a coordinator, release the global instrumentation object and clear both pointers
if ((IS_PGXC_COORDINATOR) && u_sess->instr_cxt.global_instr != NULL) {
delete u_sess->instr_cxt.global_instr;
u_sess->instr_cxt.thread_instr = NULL;
u_sess->instr_cxt.global_instr = NULL;
}
}
// Restore the statement name saved earlier in old_stmt_name (the save is in the elided code)
u_sess->pcache_cxt.cur_stmt_name = old_stmt_name;
}
/*
 * ExecutorFinish
 *
 * Executor finish entry point. Calls the plugin hook when one is installed,
 * otherwise the standard implementation.
 */
void ExecutorFinish(QueryDesc *queryDesc)
{
    /* No hook installed: take the standard path and leave. */
    if (!ExecutorFinish_hook) {
        standard_ExecutorFinish(queryDesc);
        return;
    }

    (*ExecutorFinish_hook)(queryDesc);
}
/*
 * ExecutorEnd
 *
 * Executor end entry point. Calls the plugin hook when one is installed,
 * otherwise the standard implementation.
 */
void ExecutorEnd(QueryDesc *queryDesc)
{
    /* No hook installed: take the standard path and leave. */
    if (!ExecutorEnd_hook) {
        standard_ExecutorEnd(queryDesc);
        return;
    }

    (*ExecutorEnd_hook)(queryDesc);
}
最后,PostgresMain.cpp 中通过 ReadyForQuery 将执行结果刷出到客户端。
由以上可得出简单查询的相关函数如下图: