- 熟练掌握词法、语法的解析流程及原理
openGauss在执行SQL语句时,使用flex和bison分别对语句进行词法分析和语法分析
词法语法分析的入口函数是raw_parser(parser.cpp),raw_parser调用base_yyparse进行词法语法分析
-> scan.l: 词法文件,由flex编译生成scan.cpp
-> gram.y: 语法文件,由bison编译生成gram.cpp
-> kwlist.h: 列出所有关键字
-> keywords.cpp: 定义关键字表(ScanKeyword数组)
-> kwlookup.cpp: 使用二分查找判断当前词是否为关键字
-> scansup.cpp:词法分析相关函数
- scan.l
scan.l识别SQL语句中的关键字,标识符,常量,操作符,终结符等。
// flex由%%分为三个部分
/* 定义段 */
%{
...
%}
...
/* 规则段 */
%%
...
%%
/* 用户子程序段 */
相关数据结构
// Keyword table entry
typedef struct ScanKeyword {
const char* name; /* keyword name, in lower case */
int16 value; /* token value */
int16 category; /* keyword category */
} ScanKeyword;
定义段
%{
... // 定义宏,函数及include的文件
%}
%option reentrant // 生成可重入的扫描器API
%option bison-bridge // 生成的扫描器API能够被bison调用
%option bison-locations
%option 8bit // 8位扫描器
%option never-interactive // 非交互式
%option nodefault
%option noinput
%option nounput
%option noyywrap // 不调用yywrap()
%option noyyalloc
%option noyyrealloc
%option noyyfree
%option warn
%option prefix="core_yy" // yy开头的函数名替换为core_yy开头
// 定义开始状态,对特定的规则进行匹配
%x xb // 位串
%x xc // 扩展C样式注释
%x xd // 双引号标识符
%x xh // 16进制数字字符串
%x xe // 扩展引号字符串(支持反斜杠转义序列)
%x xq // 标准引用字符串
%x xdolq // $xxx$
%x xui // unicode转义的标识符
%x xus // unicode转义的字符串
%x xeu // 扩展引号字符串中的Unicode代理项对
// 匹配正则表达式
// 空格,换行,注释
space [ \t\n\r\f]
horiz_space [ \t\f]
newline [\n\r]
non_newline [^\n\r]
comment ("--"{non_newline}*)
whitespace ({space}+|{comment})
special_whitespace ({space}+|{comment}{newline})
horiz_whitespace ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
// 引号
quote '
quotestop {quote}{whitespace}*
quotecontinue {quote}{whitespace_with_newline}{quote}
quotefail {quote}{whitespace}*"-"
// 位串
xbstart [bB]{quote}
xbinside [^']*
// 16进制
xhstart [xX]{quote}
xhinside [^']*
// n' 这种
xnstart [nN]{quote}
/* Quoted string that allows backslash escapes */
xestart [eE]{quote} // e' 这种
xeinside [^\\']+
xeescape [\\][^0-7]
xeoctesc [\\][0-7]{1,3}
xehexesc [\\]x[0-9A-Fa-f]{1,2}
xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
/* Extended quote
* xqdouble implements embedded quote, ''''
*/
xqstart {quote}
xqdouble {quote}{quote}
xqinside [^']+
// $xxx$ 相关
dolq_start [A-Za-z\200-\377_]
dolq_cont [A-Za-z\200-\377_0-9]
dolqdelim \$({dolq_start}{dolq_cont}*)?\$
dolqfailed \${dolq_start}{dolq_cont}*
dolqinside [^$]+
// 双引号
dquote \"
xdstart {dquote}
xdstop {dquote}
xddouble {dquote}{dquote}
xdinside [^"]+
/* Unicode escapes */
uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
/* error rule to avoid backup */
uescapefail ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU])
/* Quoted identifier with Unicode escapes */
xuistart [uU]&{dquote}
xuistop1 {dquote}{whitespace}*{uescapefail}?
xuistop2 {dquote}{whitespace}*{uescape}
/* Quoted string with Unicode escapes */
xusstart [uU]&{quote}
xusstop1 {quote}{whitespace}*{uescapefail}?
xusstop2 {quote}{whitespace}*{uescape}
/* error rule to avoid backup */
xufailed [uU]&
// C样式注释
xcstart \/\*{op_chars}*
xcstop \*+\/
xcinside [^*/]+
digit [0-9]
ident_start [A-Za-z\200-\377_]
ident_cont [A-Za-z\200-\377_0-9\$\#]
identifier {ident_start}{ident_cont}*
typecast "::"
plus_join "(+)"
dot_dot \.\.
colon_equals ":="
para_equals "=>"
/*
* "self" is the set of chars that should be returned as single-character
* tokens. "op_chars" is the set of chars that can make up "Op" tokens,
* which can be one or more characters long (but if a single-char token
* appears in the "self" set, it is not to be returned as an Op). Note
* that the sets overlap, but each has some chars that are not in the other.
*
* If you change either set, adjust the character lists appearing in the
* rule for "operator"!
*/
self [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
operator {op_chars}+
/* we no longer allow unary minus in numbers.
* instead we pass it separately to parser. there it gets
* coerced via doNegate() -- Leon aug 20 1999
*
* {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
*
* {realfail1} and {realfail2} are added to prevent the need for scanner
* backup when the {real} rule fails to match completely.
*/
integer {digit}+
decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*))
decimalfail {digit}+\.\.
real ({integer}|{decimal})[Ee][-+]?{digit}+
realfail1 ({integer}|{decimal})[Ee]
realfail2 ({integer}|{decimal})[Ee][-+]
param \${integer}
newParam :({identifier}|{integer})
newArray :({integer}{space}*\])
other .
规则段
/* 规则 { 执行代码 } */
%%
{whitespace} { /* 忽略空格,换行,注释 */ }
// 匹配 /* */ 格式的注释
{xcstart} {
SET_YYLLOC(); // 设置当前位置
yyextra->xcdepth = 0;
BEGIN(xc);
/* Put back any characters past slash-star; see above */
yyless(2); // 将当前token除前2个字符外的字符返回到输入流
if (yyextra->is_hint_str)
{
startlit();
addlit(yytext, yyleng, yyscanner);
}
}
<xc>{xcstart} {
(yyextra->xcdepth)++;
/* Put back any characters past slash-star; see above */
yyless(2);
if (yyextra->is_hint_str)
{
addlit(yytext, yyleng, yyscanner);
}
}
<xc>{xcstop} {
if (yyextra->xcdepth <= 0)
BEGIN(INITIAL);
else
(yyextra->xcdepth)--;
if (yyextra->is_hint_str)
{
addlit(yytext, yyleng, yyscanner);
yylval->str = litbufdup(yyscanner);
yyextra->is_hint_str = false;
return COMMENTSTRING;
}
}
<xc>{xcinside} {
if (yyextra->is_hint_str)
{
addlit(yytext, yyleng, yyscanner);
}
}
<xc>{op_chars} {
if (yyextra->is_hint_str)
{
addlit(yytext, yyleng, yyscanner);
}
}
<xc>\*+ {
if (yyextra->is_hint_str)
{
addlit(yytext, yyleng, yyscanner);
}
}
<xc><<EOF>> { yyerror("unterminated /* comment"); }
// 匹配 b''
{xbstart} {
/* Binary bit type.
* At some point we should simply pass the string
* forward to the parser and label it there.
* In the meantime, place a leading "b" on the string
* to mark it for the input routine as a binary string.
*/
SET_YYLLOC();
BEGIN(xb);
startlit();
addlitchar('b', yyscanner);
}
<xb>{quotestop} |
<xb>{quotefail} {
yyless(1);
BEGIN(INITIAL);
yylval->str = litbufdup(yyscanner);
yyextra->is_hint_str = false;
return BCONST;
}
<xh>{xhinside} |
<xb>{xbinside} {
addlit(yytext, yyleng, yyscanner);
}
<xh>{quotecontinue} |
<xb>{quotecontinue} {
/* ignore */
}
<xb><<EOF>> { yyerror("unterminated bit string literal"); }
// 匹配16进制 x''
{xhstart} {
/* Hexadecimal bit type.
* At some point we should simply pass the string
* forward to the parser and label it there.
* In the meantime, place a leading "x" on the string
* to mark it for the input routine as a hex string.
*/
SET_YYLLOC();
BEGIN(xh);
startlit();
addlitchar('x', yyscanner);
}
<xh>{quotestop} |
<xh>{quotefail} {
yyless(1);
BEGIN(INITIAL);
yylval->str = litbufdup(yyscanner);
yyextra->is_hint_str = false;
return XCONST;
}
<xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); }
// 匹配n''
{xnstart} {
/* National character.
* We will pass this along as a normal character string,
* but preceded with an internally-generated "NCHAR".
*/
const ScanKeyword *keyword;
SET_YYLLOC();
yyless(1); /* eat only 'n' this time */
keyword = ScanKeywordLookup("nchar",
yyextra->keywords,
yyextra->num_keywords);
if (keyword != NULL)
{
yylval->keyword = keyword->name;
yyextra->is_hint_str = false;
return keyword->value;
}
else
{
/* If NCHAR isn't a keyword, just return "n" */
yylval->str = pstrdup("n");
yyextra->ident_quoted = false;
yyextra->is_hint_str = false;
return IDENT;
}
}
// 匹配 ''''
{xqstart} {
yyextra->warn_on_first_escape = true;
yyextra->saw_non_ascii = false;
SET_YYLLOC();
if (u_sess->attr.attr_sql.standard_conforming_strings)
BEGIN(xq);
else
BEGIN(xe);
startlit();
}
{xestart} {
yyextra->warn_on_first_escape = false;
yyextra->saw_non_ascii = false;
SET_YYLLOC();
BEGIN(xe);
startlit();
}
{xusstart} {
SET_YYLLOC();
if (!u_sess->attr.attr_sql.standard_conforming_strings)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("unsafe use of string constant with Unicode escapes"),
errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
lexer_errposition()));
BEGIN(xus);
startlit();
}
<xq,xe>{quotestop} |
<xq,xe>{quotefail} {
yyless(1);
BEGIN(INITIAL);
/*
* check that the data remains valid if it might have been
* made invalid by unescaping any chars.
*/
if (yyextra->saw_non_ascii)
pg_verifymbstr(yyextra->literalbuf,
yyextra->literallen,
false);
yylval->str = litbufdup(yyscanner);
yyextra->is_hint_str = false;
return SCONST;
}
<xus>{xusstop1} {
/* throw back all but the quote */
yyless(1);
BEGIN(INITIAL);
yylval->str = litbuf_udeescape('\\', yyscanner);
yyextra->is_hint_str = false;
return SCONST;
}
<xus>{xusstop2} {
BEGIN(INITIAL);
yylval->str = litbuf_udeescape(yytext[yyleng-2], yyscanner);
yyextra->is_hint_str = false;
return SCONST;
}
<xq,xe,xus>{xqdouble} {
addlitchar('\'', yyscanner);
}
<xq,xus>{xqinside} {
addlit(yytext, yyleng, yyscanner);
}
<xe>{xeinside} {
addlit(yytext, yyleng, yyscanner);
}
<xe>{xeunicode} {
pg_wchar c = strtoul(yytext+2, NULL, 16);
check_escape_warning(yyscanner);
if (is_utf16_surrogate_first(c))
{
yyextra->utf16_first_part = c;
BEGIN(xeu);
}
else if (is_utf16_surrogate_second(c))
yyerror("invalid Unicode surrogate pair");
else
addunicode(c, yyscanner);
}
<xeu>{xeunicode} {
pg_wchar c = strtoul(yytext+2, NULL, 16);
if (!is_utf16_surrogate_second(c))
yyerror("invalid Unicode surrogate pair");
c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
addunicode(c, yyscanner);
BEGIN(xe);
}
<xeu>. { yyerror("invalid Unicode surrogate pair"); }
<xeu>\n { yyerror("invalid Unicode surrogate pair"); }
<xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); }
<xe,xeu>{xeunicodefail} {
ereport(ERROR,
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
errmsg("invalid Unicode escape"),
errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
lexer_errposition()));
}
<xe>{xeescape} {
if (yytext[1] == '\'')
{
if (u_sess->attr.attr_sql.backslash_quote == BACKSLASH_QUOTE_OFF ||
(u_sess->attr.attr_sql.backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
ereport(ERROR,
(errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
errmsg("unsafe use of \\' in a string literal"),
errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
lexer_errposition()));
}
check_string_escape_warning(yytext[1], yyscanner);
addlitchar(unescape_single_char(yytext[1], yyscanner),
yyscanner);
}
<xe>{xeoctesc} {
unsigned char c = strtoul(yytext+1, NULL, 8);
check_escape_warning(yyscanner);
addlitchar(c, yyscanner);
if (c == '\0' || IS_HIGHBIT_SET(c))
yyextra->saw_non_ascii = true;
}
<xe>{xehexesc} {
unsigned char c = strtoul(yytext+2, NULL, 16);
check_escape_warning(yyscanner);
addlitchar(c, yyscanner);
if (c == '\0' || IS_HIGHBIT_SET(c))
yyextra->saw_non_ascii = true;
}
<xq,xe,xus>{quotecontinue} {
/* ignore */
}
<xe>. {
/* This is only needed for \ just before EOF */
addlitchar(yytext[0], yyscanner);
}
<xq,xe,xus><<EOF>> { yyerror("unterminated quoted string"); }
// 匹配$xxx$
{dolqdelim} {
SET_YYLLOC();
yyextra->dolqstart = pstrdup(yytext);
BEGIN(xdolq);
startlit();
}
{dolqfailed} {
SET_YYLLOC();
/* throw back all but the initial "$" */
yyless(1);
/* and treat it as {other} */
yyextra->is_hint_str = false;
return yytext[0];
}
<xdolq>{dolqdelim} {
if (strcmp(yytext, yyextra->dolqstart) == 0)
{
FREE_POINTER(yyextra->dolqstart);
yyextra->dolqstart = NULL;
BEGIN(INITIAL);
yylval->str = litbufdup(yyscanner);
yyextra->is_hint_str = false;
return SCONST;
}
else
{
/*
* When we fail to match $...$ to dolqstart, transfer
* the $... part to the output, but put back the final
* $ for rescanning. Consider $delim$...$junk$delim$
*/
addlit(yytext, yyleng-1, yyscanner);
yyless(yyleng-1);
}
}
<xdolq>{dolqinside} {
addlit(yytext, yyleng, yyscanner);
}
<xdolq>{dolqfailed} {
addlit(yytext, yyleng, yyscanner);
}
<xdolq>. {
/* This is only needed for $ inside the quoted text */
addlitchar(yytext[0], yyscanner);
}
<xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); }
{xdstart} {
SET_YYLLOC();
BEGIN(xd);
startlit();
}
{xuistart} {
SET_YYLLOC();
BEGIN(xui);
startlit();
}
<xd>{xdstop} {
char *ident;
BEGIN(INITIAL);
if (yyextra->literallen == 0)
yyerror("zero-length delimited identifier");
ident = litbufdup(yyscanner);
if (yyextra->literallen >= NAMEDATALEN)
truncate_identifier(ident, yyextra->literallen, yyextra->warnOnTruncateIdent);
yylval->str = ident;
yyextra->ident_quoted = true;
yyextra->is_hint_str = false;
return IDENT;
}
<xui>{xuistop1} {
char *ident;
int identlen;
BEGIN(INITIAL);
if (yyextra->literallen == 0)
yyerror("zero-length delimited identifier");
ident = litbuf_udeescape('\\', yyscanner);
identlen = strlen(ident);
if (identlen >= NAMEDATALEN)
truncate_identifier(ident, identlen, yyextra->warnOnTruncateIdent);
yylval->str = ident;
/* throw back all but the quote */
yyless(1);
yyextra->ident_quoted = false;
yyextra->is_hint_str = false;
return IDENT;
}
<xui>{xuistop2} {
char *ident;
int identlen;
BEGIN(INITIAL);
if (yyextra->literallen == 0)
yyerror("zero-length delimited identifier");
ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
identlen = strlen(ident);
if (identlen >= NAMEDATALEN)
truncate_identifier(ident, identlen, yyextra->warnOnTruncateIdent);
yylval->str = ident;
yyextra->ident_quoted = false;
yyextra->is_hint_str = false;
return IDENT;
}
<xd,xui>{xddouble} {
addlitchar('"', yyscanner);
}
<xd,xui>{xdinside} {
addlit(yytext, yyleng, yyscanner);
}
<xd,xui><<EOF>> { yyerror("unterminated quoted identifier"); }
{xufailed} {
char *ident;
SET_YYLLOC();
/* throw back all but the initial u/U */
yyless(1);
/* and treat it as {identifier} */
ident = downcase_truncate_identifier(yytext, yyleng, yyextra->warnOnTruncateIdent);
yylval->str = ident;
yyextra->ident_quoted = false;
yyextra->is_hint_str = false;
return IDENT;
}
// 匹配 ::
{typecast} {
SET_YYLLOC();
yyextra->is_hint_str = false;
return TYPECAST;
}
// 匹配(+)
{plus_join} {
SET_YYLLOC();
yyextra->is_hint_str = false;
return ORA_JOINOP;
}
// 匹配 ..
{dot_dot} {
SET_YYLLOC();
yyextra->is_hint_str = false;
return DOT_DOT;
}
// 匹配 :=
{colon_equals} {
SET_YYLLOC();
yyextra->is_hint_str = false;
return COLON_EQUALS;
}
// 匹配 =>
{para_equals} {
SET_YYLLOC();
yyextra->is_hint_str = false;
return PARA_EQUALS;
}
// 匹配单字符
{self} {
SET_YYLLOC();
/*
* Get the semicolon which is not in proc body nor in the '( )', treat it
* as end flag of a single query and store it in locationlist.
*/
if (yyextra->dolqstart == NULL)
{
if (yytext[0] == '(')
yyextra->paren_depth++;
else if (yytext[0] == ')' && yyextra->paren_depth > 0)
yyextra->paren_depth--;
else if (yytext[0] == ';' && yyextra->paren_depth == 0 && !yyextra->in_slash_proc_body)
yyextra->query_string_locationlist = lappend_int(yyextra->query_string_locationlist, *yylloc);
}
yyextra->is_hint_str = false;
return yytext[0];
}
// 匹配操作符
{operator} {
/*
* Check for embedded slash-star or dash-dash; those
* are comment starts, so operator must stop there.
* Note that slash-star or dash-dash at the first
* character will match a prior rule, not this one.
*/
int nchars = yyleng;
char *slashstar = strstr(yytext, "/*");
char *dashdash = strstr(yytext, "--");
if (slashstar && dashdash)
{
/* if both appear, take the first one */
if (slashstar > dashdash)
slashstar = dashdash;
}
else if (!slashstar)
slashstar = dashdash;
if (slashstar)
nchars = slashstar - yytext;
/*
* For SQL compatibility, '+' and '-' cannot be the
* last char of a multi-char operator unless the operator
* contains chars that are not in SQL operators.
* The idea is to lex '=-' as two operators, but not
* to forbid operator names like '?-' that could not be
* sequences of SQL operators.
*/
while (nchars > 1 &&
(yytext[nchars-1] == '+' ||
yytext[nchars-1] == '-'))
{
int ic;
for (ic = nchars-2; ic >= 0; ic--)
{
if (strchr("~!@#^&|`?%", yytext[ic]))
break;
}
if (ic >= 0)
break; /* found a char that makes it OK */
nchars--; /* else remove the +/-, and check again */
}
SET_YYLLOC();
if (nchars < (int)yyleng)
{
/* Strip the unwanted chars from the token */
yyless(nchars);
/*
* If what we have left is only one char, and it's
* one of the characters matching "self", then
* return it as a character token the same way
* that the "self" rule would have.
*/
if (nchars == 1 &&
strchr(",()[].;:+-*/%^<>=", yytext[0]))
{
yyextra->is_hint_str = false;
return yytext[0];
}
}
/*
* Complain if operator is too long. Unlike the case
* for identifiers, we make this an error not a notice-
* and-truncate, because the odds are we are looking at
* a syntactic mistake anyway.
*/
if (nchars >= NAMEDATALEN)
yyerror("operator too long");
/* Convert "!=" operator to "<>" for compatibility */
if (strcmp(yytext, "!=") == 0 || strcmp(yytext, "^=") == 0)
{
yylval->str = pstrdup("<>");
yyextra->is_hint_str = false;
return CmpOp;
}
else if (strcmp(yytext, ">=") == 0 || strcmp(yytext, "<=") == 0 || strcmp(yytext, "<>") == 0)
{
yylval->str = pstrdup(yytext);
yyextra->is_hint_str = false;
return CmpOp;
}
else
yylval->str = pstrdup(yytext);
yyextra->is_hint_str = false;
return Op;
}
{newArray} {
yyless(1);
yyextra->is_hint_str = false;
return yytext[0];
}
// 匹配 $n
{param} {
SET_YYLLOC();
yylval->ival = getDynaParamSeq(yytext + 1, false, false, yyscanner);
yyextra->is_hint_str = false;
return PARAM;
}
{newParam} {
SET_YYLLOC();
yylval->ival = getDynaParamSeq(yytext + 1, false, true, yyscanner);
yyextra->is_hint_str = false;
return PARAM;
}
// 匹配整数
{integer} {
SET_YYLLOC();
yyextra->is_hint_str = false;
return process_integer_literal(yytext, yylval);
}
// 匹配浮点数
{decimal} {
SET_YYLLOC();
yylval->str = pstrdup(yytext);
yyextra->is_hint_str = false;
return FCONST;
}
{decimalfail} {
/* throw back the .., and treat as integer */
yyless(yyleng-2);
SET_YYLLOC();
yyextra->is_hint_str = false;
return process_integer_literal(yytext, yylval);
}
{real} {
SET_YYLLOC();
yylval->str = pstrdup(yytext);
yyextra->is_hint_str = false;
return FCONST;
}
{realfail1} {
/*
* throw back the [Ee], and treat as {decimal}. Note
* that it is possible the input is actually {integer},
* but since this case will almost certainly lead to a
* syntax error anyway, we don't bother to distinguish.
*/
yyless(yyleng-1);
SET_YYLLOC();
yylval->str = pstrdup(yytext);
yyextra->is_hint_str = false;
return FCONST;
}
{realfail2} {
/* throw back the [Ee][+-], and proceed as above */
yyless(yyleng-2);
SET_YYLLOC();
yylval->str = pstrdup(yytext);
yyextra->is_hint_str = false;
return FCONST;
}
// 匹配关键字
{identifier} {
const ScanKeyword *keyword;
char *ident;
SET_YYLLOC();
/* 二分法确认是否关键字 */
keyword = ScanKeywordLookup(yytext,
yyextra->keywords,
yyextra->num_keywords);
yyextra->is_hint_str = false;
if (keyword != NULL)
{
yylval->keyword = keyword->name;
/* Find the CREATE PROCEDURE syntax and set dolqstart. */
if (keyword->value == CREATE)
{
yyextra->is_createstmt = true;
}
else if (keyword->value == TRIGGER && yyextra->is_createstmt)
{
/* Create trigger don't need set dolqstart */
yyextra->is_createstmt = false;
}
else if ((keyword->value == PROCEDURE || keyword->value == FUNCTION)
&& yyextra->is_createstmt)
{
/* Make yyextra->dolqstart not NULL means its in a proc with $$. */
yyextra->dolqstart = "";
}
else if (keyword->value == BEGIN_P)
{
/* cases that have to be a trans stmt and fall quickly */
if (yyg->yy_hold_char == ';' || /* found ';' after 'begin' */
yyg->yy_hold_char == '\0') /* found '\0' after 'begin' */
return BEGIN_NON_ANOYBLOCK;
/* look for other transaction stmt */
if (is_trans_stmt(yyextra->scanbuf, yyextra->scanbuflen))
return BEGIN_NON_ANOYBLOCK;
}
else if (keyword->value == SELECT ||
keyword->value == UPDATE||
keyword->value == INSERT ||
keyword->value == DELETE_P ||
keyword->value == MERGE)
{
yyextra->is_hint_str = true;
}
return keyword->value;
}
/* 不是关键字:转换为小写,长度达到或超过NAMEDATALEN(64)时进行截断 */
ident = downcase_truncate_identifier(yytext, yyleng, yyextra->warnOnTruncateIdent);
yylval->str = ident;
yyextra->ident_quoted = false;
return IDENT;
}
{other} {
SET_YYLLOC();
yyextra->is_hint_str = false;
return yytext[0];
}
<<EOF>> {
SET_YYLLOC();
yyterminate();
}
%%
用户子程序段
/*
* Arrange access to yyextra for subroutines of the main yylex() function.
* We expect each subroutine to have a yyscanner parameter. Rather than
* use the yyget_xxx functions, which might or might not get inlined by the
* compiler, we cheat just a bit and cast yyscanner to the right type.
*/
#undef yyextra
#define yyextra (((struct yyguts_t *) yyscanner)->yyextra_r)
/* Likewise for a couple of other things we need. */
#undef yylloc
#define yylloc (((struct yyguts_t *) yyscanner)->yylloc_r)
#undef yyleng
#define yyleng (((struct yyguts_t *) yyscanner)->yyleng_r)
/*
 * scanner_errposition
 *   Report the cursor position of a lexer or parser error.
 *
 * location is a byte offset into the scan buffer.  A negative value means
 * the position is unknown, in which case nothing is reported and 0 is
 * returned.  Otherwise the byte offset is converted to a character number
 * (counting from 1) and handed to errposition(), whose result is returned.
 */
int
scanner_errposition(int location, core_yyscan_t yyscanner)
{
    if (location >= 0)
    {
        /* byte offset -> character number */
        int charno = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;

        /* hand the position over to the ereport machinery */
        return errposition(charno);
    }

    return 0; /* location unknown: no-op */
}
/*
 * scanner_yyerror
 *   Raise a syntax error for the token at the current scan location.
 *
 * message is the (untranslated) error text.  The report says
 * "... at end of input" when the current location points at flex's
 * end-of-buffer marker, and "... at or near <text>" otherwise, where <text>
 * is the scan buffer from the current location onward.
 */
void
scanner_yyerror(const char *message, core_yyscan_t yyscanner)
{
    const char *near_text = yyextra->scanbuf + *yylloc;

    if (*near_text != YY_END_OF_BUFFER_CHAR)
    {
        ereport(ERROR,
                (errcode(ERRCODE_SYNTAX_ERROR),
                 /* translator: first %s is typically the translation of "syntax error" */
                 errmsg("%s at or near \"%s\"", _(message), near_text),
                 lexer_errposition()));
    }
    else
    {
        ereport(ERROR,
                (errcode(ERRCODE_SYNTAX_ERROR),
                 /* translator: %s is typically the translation of "syntax error" */
                 errmsg("%s at end of input", _(message)),
                 lexer_errposition()));
    }
}
/*
 * scanner_init
 *   Create and initialize a flex scanner for the query text in str.
 *
 * str          - NUL-terminated query string to scan
 * yyext        - caller-provided extra-data struct; every field used by the
 *                lexer is initialized here
 * keywords     - keyword table to use for keyword lookup
 * num_keywords - number of entries in keywords
 *
 * Returns the new scanner handle.  Raises an ERROR if yylex_init() fails.
 */
core_yyscan_t
scanner_init(const char *str,
core_yy_extra_type *yyext,
const ScanKeyword *keywords,
int num_keywords)
{
Size slen = strlen(str);
yyscan_t scanner;
// create the flex scanner object
if (yylex_init(&scanner) != 0)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("yylex_init() failed: %m")));
core_yyset_extra(yyext, scanner); // attach yyext as the scanner's yyextra
yyext->keywords = keywords; // keyword table
yyext->num_keywords = num_keywords; // number of keywords
yyext->in_slash_proc_body = false;
yyext->paren_depth = 0;
yyext->query_string_locationlist = NIL;
yyext->is_createstmt = false;
yyext->dolqstart = NULL;
yyext->is_hint_str = false;
yyext->parameter_list = NIL;
/*
* Make a scan buffer with special termination needed by flex.
* The query text is copied and followed by two YY_END_OF_BUFFER_CHAR bytes.
*/
yyext->scanbuf = (char *) palloc(slen + 2);
yyext->scanbuflen = slen;
memcpy(yyext->scanbuf, str, slen);
yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
/* initialize literal buffer to a reasonable but expansible size */
yyext->literalalloc = 1024;
yyext->literalbuf = (char *) palloc(yyext->literalalloc);
yyext->literallen = 0;
yyext->warnOnTruncateIdent = true;
// Added CALL for procedure and function
// (with initflag = true this call only resets the session's
// has_dollar / has_placeholder flags)
getDynaParamSeq("init", true, true, NULL);
return scanner;
}
/*
 * scanner_finish
 *   Release scanner memory once parsing is complete.
 *
 * When clear_key_memory is set, the scan buffer and the used part of the
 * literal buffer are first overwritten with 0x7F.  Large buffers are then
 * freed, and the placeholder parameter list is always freed.
 */
void
scanner_finish(core_yyscan_t yyscanner)
{
if (t_thrd.postgres_cxt.clear_key_memory)
{
errno_t rc = EOK;
memset(yyextra->scanbuf, 0x7F, yyextra->scanbuflen);
/*
* NOTE(review): volatile self-assignment of the first byte; presumably
* here to keep the compiler from discarding the memset above -- confirm.
*/
*(volatile char*)(yyextra->scanbuf) = *(volatile char*)(yyextra->scanbuf);
/* only the first literallen bytes are overwritten, not all of literalalloc */
rc = memset_s(yyextra->literalbuf, yyextra->literallen, 0x7F, yyextra->literallen);
securec_check(rc, "\0", "\0");
}
/*
* We don't bother to call yylex_destroy(), because all it would do
* is pfree a small amount of control storage. It's cheaper to leak
* the storage until the parsing context is destroyed. The amount of
* space involved is usually negligible compared to the output parse
* tree anyway.
*
* We do bother to pfree the scanbuf and literal buffer, but only if they
* represent a nontrivial amount of space. The 8K cutoff is arbitrary.
*/
if (yyextra->scanbuflen >= 8192)
FREE_POINTER(yyextra->scanbuf);
if (yyextra->literalalloc >= 8192)
FREE_POINTER(yyextra->literalbuf);
/* free the parameter names collected by getDynaParamSeq() */
if (yyextra->parameter_list)
{
list_free_deep(yyextra->parameter_list);
yyextra->parameter_list = NIL;
}
}
/*
 * addlit
 *   Append yleng bytes starting at ytext to the scanner's literal buffer.
 *
 * The buffer size is doubled until the existing contents plus the new bytes
 * fit with at least one byte to spare.
 */
static void
addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
{
    /* grow the buffer first if the new data would not fit */
    if (yyextra->literalalloc <= (yyextra->literallen + yleng))
    {
        while (yyextra->literalalloc <= (yyextra->literallen + yleng))
            yyextra->literalalloc *= 2;

        /*
         * Once the text exceeds 512M its doubled size exceeds 1G, so the
         * huge variant of repalloc is required here.
         */
        yyextra->literalbuf = (char *) repalloc_huge(yyextra->literalbuf,
                                                     yyextra->literalalloc);
    }

    /* copy the new bytes behind the current contents */
    memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
    yyextra->literallen += yleng;
}
/*
 * addlitchar
 *   Append a single byte to the scanner's literal buffer.
 *
 * The buffer is doubled when the new byte would not fit with one byte to
 * spare.  Growth goes through repalloc_huge(), exactly as addlit() does for
 * the same buffer: once the literal has grown past 512M, doubling exceeds
 * 1G, so the ordinary repalloc() cannot be used here.
 */
static void
addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
{
    /* enlarge buffer if needed */
    if ((yyextra->literallen + 1) >= yyextra->literalalloc)
    {
        yyextra->literalalloc *= 2;
        yyextra->literalbuf = (char *) repalloc_huge(yyextra->literalbuf,
                                                     yyextra->literalalloc);
    }

    /* append new data */
    yyextra->literalbuf[yyextra->literallen] = ychar;
    yyextra->literallen += 1;
}
/*
 * litbufdup
 *   Return a freshly palloc'd, NUL-terminated copy of the bytes currently
 *   held in the literal buffer.
 */
static char *
litbufdup(core_yyscan_t yyscanner)
{
    int nbytes = yyextra->literallen;
    char *copy = (char *)palloc(nbytes + 1);

    memcpy(copy, yyextra->literalbuf, nbytes);
    copy[nbytes] = '\0';

    return copy;
}
/*
 * process_integer_literal
 *   Classify a digit-string token as ICONST or FCONST.
 *
 * If the token parses completely as a decimal long without a range error
 * (and, where long is 64 bits, also fits in int32), the value is stored in
 * lval->ival and ICONST is returned.  Otherwise a copy of the text is stored
 * in lval->str and FCONST is returned.
 */
static int
process_integer_literal(const char *token, YYSTYPE *lval)
{
    char *stop;
    long parsed;

    errno = 0;
    parsed = strtol(token, &stop, 10);

    if (*stop == '\0' && errno != ERANGE
#ifdef HAVE_LONG_INT_64
        /* if long > 32 bits, the value must also survive a round trip via int4 */
        && parsed == (long) ((int32) parsed)
#endif
        )
    {
        lval->ival = parsed;
        return ICONST;
    }

    /* integer too large, treat it as a float */
    lval->str = pstrdup(token);
    return FCONST;
}
/*
 * hexval
 *   Return the numeric value (0..15) of one hexadecimal digit character.
 *   Any other character raises an ERROR.
 */
static unsigned int
hexval(unsigned char c)
{
    if ('0' <= c && c <= '9')
        return c - '0';
    else if ('a' <= c && c <= 'f')
        return 0xA + (c - 'a');
    else if ('A' <= c && c <= 'F')
        return 0xA + (c - 'A');

    ereport(ERROR,
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
             errmsg("invalid hexadecimal digit")));
    return 0; /* not reached */
}
/*
 * check_unicode_value
 *   Reject code points above 0x7F unless the database encoding is UTF8.
 *
 * loc points at the offending escape inside the literal buffer; it is used
 * to advance the reported error location before yyerror() is called.
 */
static void
check_unicode_value(pg_wchar c, const char *loc, core_yyscan_t yyscanner)
{
    if (GetDatabaseEncoding() != PG_UTF8 && c > 0x7F)
    {
        ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3); /* 3 for U&" */
        yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
    }
}
/* True when c lies in the range 0xD800..0xDBFF (first half of a UTF-16 surrogate pair). */
static bool
is_utf16_surrogate_first(pg_wchar c)
{
    return !(c < 0xD800 || c > 0xDBFF);
}
/* True when c lies in the range 0xDC00..0xDFFF (second half of a UTF-16 surrogate pair). */
static bool
is_utf16_surrogate_second(pg_wchar c)
{
    return !(c < 0xDC00 || c > 0xDFFF);
}
/*
 * surrogate_pair_to_codepoint
 *   Combine the two halves of a UTF-16 surrogate pair into one code point:
 *   the low 10 bits of first become bits 10..19, the low 10 bits of second
 *   become bits 0..9, and 0x10000 is added.
 */
static pg_wchar
surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
{
    pg_wchar high_bits = (first & 0x3FF) << 10;
    pg_wchar low_bits = second & 0x3FF;

    return high_bits + 0x10000 + low_bits;
}
/*
 * addunicode
 *   Append code point c to the literal buffer in UTF-8 form.
 *
 * Code point 0 and values above 0x10FFFF are rejected.  Values above 0x7F
 * are rejected unless the database encoding is UTF8; when accepted they mark
 * the literal as containing non-ASCII data.
 */
static void
addunicode(pg_wchar c, core_yyscan_t yyscanner)
{
    char utf8[8];

    if (c == 0 || c > 0x10FFFF)
        yyerror("invalid Unicode escape value");

    if (c > 0x7F)
    {
        if (GetDatabaseEncoding() != PG_UTF8)
            yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
        yyextra->saw_non_ascii = true;
    }

    unicode_to_utf8(c, (unsigned char *) utf8);
    addlit(utf8, pg_mblen(utf8), yyscanner);
}
/*
 * litbuf_udeescape
 *   Expand the Unicode escapes in the current literal buffer and return the
 *   result as a newly palloc'd, NUL-terminated string.
 *
 * escape is the escape character in effect.  Three forms are recognized:
 *   <escape><escape>   a literal escape character
 *   <escape>XXXX       a code point given as 4 hex digits
 *   <escape>+XXXXXX    a code point given as 6 hex digits
 * Code points are written with unicode_to_utf8().  The first half of a
 * UTF-16 surrogate pair is remembered in pair_first and must be followed
 * directly by an escape holding the second half.
 *
 * Every problem is reported through yyerror(), after the error location has
 * been advanced towards the offending position.  The finished string is
 * checked with pg_verifymbstr() before it is returned.
 */
static char *
litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
{
char *newm;
char *litbuf, *in, *out;
pg_wchar pair_first = 0;
/* reject escape characters that would be ambiguous with the escape syntax */
if (isxdigit(escape)
|| escape == '+'
|| escape == '\''
|| escape == '"'
|| scanner_isspace(escape))
{
ADVANCE_YYLLOC(yyextra->literallen + yyleng + 1);
yyerror("invalid Unicode escape character");
}
/* Make literalbuf null-terminated to simplify the scanning loop */
litbuf = yyextra->literalbuf;
litbuf[yyextra->literallen] = '\0';
/*
* This relies on the subtle assumption that a UTF-8 expansion
* cannot be longer than its escaped representation.
*/
newm = (char *)palloc(yyextra->literallen + 1);
in = litbuf;
out = newm;
while (*in)
{
if (in[0] == escape)
{
/* doubled escape character: emit one literal escape character */
if (in[1] == escape)
{
if (pair_first)
{
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
yyerror("invalid Unicode surrogate pair");
}
*out++ = escape;
in += 2;
}
/* escape followed by exactly four hex digits */
else if (isxdigit((unsigned char) in[1]) &&
isxdigit((unsigned char) in[2]) &&
isxdigit((unsigned char) in[3]) &&
isxdigit((unsigned char) in[4]))
{
pg_wchar unicode;
unicode = (hexval(in[1]) << 12) +
(hexval(in[2]) << 8) +
(hexval(in[3]) << 4) +
hexval(in[4]);
check_unicode_value(unicode, in, yyscanner);
if (pair_first)
{
/* a pending first surrogate must be completed by a second one */
if (is_utf16_surrogate_second(unicode))
{
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
pair_first = 0;
}
else
{
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
yyerror("invalid Unicode surrogate pair");
}
}
else if (is_utf16_surrogate_second(unicode))
yyerror("invalid Unicode surrogate pair");
/* hold back a first surrogate until its partner has been seen */
if (is_utf16_surrogate_first(unicode))
pair_first = unicode;
else
{
unicode_to_utf8(unicode, (unsigned char *) out);
out += pg_mblen(out);
}
in += 5;
}
/* escape, '+', then exactly six hex digits */
else if (in[1] == '+' &&
isxdigit((unsigned char) in[2]) &&
isxdigit((unsigned char) in[3]) &&
isxdigit((unsigned char) in[4]) &&
isxdigit((unsigned char) in[5]) &&
isxdigit((unsigned char) in[6]) &&
isxdigit((unsigned char) in[7]))
{
pg_wchar unicode;
unicode = (hexval(in[2]) << 20) +
(hexval(in[3]) << 16) +
(hexval(in[4]) << 12) +
(hexval(in[5]) << 8) +
(hexval(in[6]) << 4) +
hexval(in[7]);
check_unicode_value(unicode, in, yyscanner);
if (pair_first)
{
/* a pending first surrogate must be completed by a second one */
if (is_utf16_surrogate_second(unicode))
{
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
pair_first = 0;
}
else
{
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
yyerror("invalid Unicode surrogate pair");
}
}
else if (is_utf16_surrogate_second(unicode))
yyerror("invalid Unicode surrogate pair");
/* hold back a first surrogate until its partner has been seen */
if (is_utf16_surrogate_first(unicode))
pair_first = unicode;
else
{
unicode_to_utf8(unicode, (unsigned char *) out);
out += pg_mblen(out);
}
in += 8;
}
/* escape character not followed by any valid escape form */
else
{
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
yyerror("invalid Unicode escape value");
}
}
else
{
/* ordinary character: not allowed while a first surrogate is pending */
if (pair_first)
{
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
yyerror("invalid Unicode surrogate pair");
}
*out++ = *in++;
}
}
/* unfinished surrogate pair? */
if (pair_first)
{
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
yyerror("invalid Unicode surrogate pair");
}
*out = '\0';
/*
* We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
* codes; but it's probably not worth the trouble, since this isn't
* likely to be a performance-critical path.
*/
pg_verifymbstr(newm, out - newm, false);
return newm;
}
/*
 * unescape_single_char
 *   Translate the character following a backslash into the byte it stands
 *   for.
 *
 * b, f, n, r and t map to the matching control characters.  Every other
 * character is returned unchanged; if it is NUL or has its high bit set, the
 * literal is flagged as containing non-ASCII data.
 */
static unsigned char
unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
{
    if (c == 'b')
        return '\b';
    if (c == 'f')
        return '\f';
    if (c == 'n')
        return '\n';
    if (c == 'r')
        return '\r';
    if (c == 't')
        return '\t';

    /* check for backslash followed by non-7-bit-ASCII */
    if (c == '\0' || IS_HIGHBIT_SET(c))
        yyextra->saw_non_ascii = true;

    return c;
}
/*
 * check_string_escape_warning
 *   Emit the "nonstandard use of ..." warning that matches the escaped
 *   character ychar.
 *
 * \' and \\ have their own messages; any other character is passed on to
 * check_escape_warning().  A warning is issued only while
 * warn_on_first_escape is set and escape_string_warning is enabled, and
 * warn_on_first_escape is cleared afterwards.
 */
static void
check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
{
    switch (ychar)
    {
        case '\'':
            if (yyextra->warn_on_first_escape && u_sess->attr.attr_sql.escape_string_warning)
                ereport(WARNING,
                        (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
                         errmsg("nonstandard use of \\' in a string literal"),
                         errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
                         lexer_errposition()));
            yyextra->warn_on_first_escape = false; /* warn only once per string */
            break;

        case '\\':
            if (yyextra->warn_on_first_escape && u_sess->attr.attr_sql.escape_string_warning)
                ereport(WARNING,
                        (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
                         errmsg("nonstandard use of \\\\ in a string literal"),
                         errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
                         lexer_errposition()));
            yyextra->warn_on_first_escape = false; /* warn only once per string */
            break;

        default:
            check_escape_warning(yyscanner);
            break;
    }
}
/*
 * check_escape_warning
 *   Emit the generic "nonstandard use of escape" warning.
 *
 * The warning is issued only while warn_on_first_escape is set and
 * escape_string_warning is enabled; warn_on_first_escape is cleared in
 * every case.
 */
static void
check_escape_warning(core_yyscan_t yyscanner)
{
    if (yyextra->warn_on_first_escape)
    {
        if (u_sess->attr.attr_sql.escape_string_warning)
            ereport(WARNING,
                    (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
                     errmsg("nonstandard use of escape in a string literal"),
                     errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
                     lexer_errposition()));
    }

    yyextra->warn_on_first_escape = false; /* warn only once per string */
}
/*
* Interface functions to make flex use palloc() instead of malloc().
* It'd be better to make these static, but flex insists otherwise.
*/
/* Allocation hook for flex: obtain bytes bytes from palloc(). */
void *
core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
{
    void *chunk = palloc(bytes);

    return chunk;
}
/*
 * Resize a scanner buffer.  A NULL ptr is treated as a fresh allocation
 * (palloc); otherwise the existing chunk is resized with repalloc.
 */
void *
core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
{
	if (ptr == NULL)
		return palloc(bytes);

	return repalloc(ptr, bytes);
}
/* Release a scanner buffer; a NULL pointer is silently ignored. */
void
core_yyfree(void *ptr, core_yyscan_t yyscanner)
{
	if (ptr == NULL)
		return;

	FREE_POINTER(ptr);
}
/*
 * @Description: resolve the sequence number of a dynamic-SQL bind parameter
 * @in string: parameter text (a name for placeholders, digits for $n form)
 * @in initflag: when true, only reset the per-session has_dollar /
 *               has_placeholder flags and return 0
 * @in placeholder: true if the parameter is a named placeholder, false if it
 *                  is the numeric dollar form
 * @in yyscanner: scanner handle, used to reach yyextra->parameter_list
 * @return - for the dollar form, atol(string); for a placeholder, its 1-based
 *           position in parameter_list (appended if not yet present)
 *
 * Mixing the two forms in one statement raises a syntax error.
 */
long
getDynaParamSeq(const char *string, bool initflag, bool placeholder, core_yyscan_t yyscanner)
{
	const ListCell *lc;
	int			position = 0;

	/* initialisation request: clear both "seen" flags */
	if (initflag)
	{
		u_sess->parser_cxt.has_placeholder = false;
		u_sess->parser_cxt.has_dollar = false;
		return 0;
	}

	/* numeric dollar form: the number itself is the sequence */
	if (!placeholder)
	{
		if (u_sess->parser_cxt.has_placeholder)
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("It is forbidden to use placeholder and dollar quoting together.")));

		u_sess->parser_cxt.has_dollar = true;
		return atol(string);
	}

	/* named placeholder */
	u_sess->parser_cxt.has_placeholder = true;
	if (u_sess->parser_cxt.has_dollar)
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 errmsg("It is forbidden to use placeholder and dollar quoting together.")));

	/* reuse the position of a name we have already seen */
	foreach(lc, yyextra->parameter_list)
	{
		position++;
		if (strcmp((char *) lfirst(lc), string) == 0)
			return position;
	}

	/* first occurrence: remember a private copy at the end of the list */
	yyextra->parameter_list = lappend(yyextra->parameter_list, (void *) pstrdup(string));
	return position + 1;
}
/*
 * @Description: after BEGIN has been found, decide whether it starts a
 *               transaction statement
 * @param[IN] haystack: the given source string
 * @param[IN] haystack_len: the length of haystack. Note that haystack may
 *            have been separated into words by '\0', so the length must be
 *            passed explicitly.
 * @return: true if it is a transaction stmt, false if not.
 *
 * Tricky case: we may receive SQL like "begin   " which is not terminated
 * with ';' and is followed by several blank characters.  The variable
 * 'found_non_blank_char' handles this: if no non-blank character is found
 * in the second segment, the input is considered a transaction stmt.
 */
static bool
is_trans_stmt(const char *haystack, int haystack_len)
{
	char	   *tempstr = (char *) palloc0(haystack_len + 1);
	char	   *temp = tempstr;
	char	   *end = tempstr + haystack_len;
	int			line = 1;		/* index of the '\0'-separated segment */
	bool		found_non_blank_char = false;	/* non-blank seen after begin? */
	bool		matched = false;	/* transaction token seen after begin? */
	errno_t		rc = EOK;

	/*
	 * Work on a private copy.  The buffer is one byte longer than the input
	 * and zero-filled, so the strlen() below always finds a terminator
	 * inside the buffer.
	 */
	rc = memcpy_s(tempstr, haystack_len + 1, haystack, haystack_len);
	securec_check_ss(rc, "\0", "\0");

	/* find if the 2nd segment is prefixed by a valid transaction token */
	while (temp < end)
	{
		if (*temp == '\0')
		{
			/* embedded '\0' marks a segment boundary; skip it */
			temp++;
			line++;
			/* we only search the 2nd segment */
			if (line > 2)
				break;
		}
		else if (isspace((unsigned char) *temp))
		{
			/*
			 * Cast is required: passing a negative plain char (e.g. a byte
			 * of a multibyte character) to isspace() is undefined behavior.
			 */
			temp++;
		}
		else
		{
			if (line == 2)
			{
				/* a non-blank char follows begin, do further checking */
				found_non_blank_char = true;

				/* all tokens that may follow BEGIN in a transaction stmt */
				if (pg_strncasecmp(temp, "transaction", strlen("transaction")) == 0 ||
					pg_strncasecmp(temp, "work", strlen("work")) == 0 ||
					pg_strncasecmp(temp, "isolation", strlen("isolation")) == 0 ||
					pg_strncasecmp(temp, "read", strlen("read")) == 0 ||
					pg_strncasecmp(temp, "deferrable", strlen("deferrable")) == 0 ||
					pg_strncasecmp(temp, "not", strlen("not")) == 0 ||
					pg_strncasecmp(temp, ";", strlen(";")) == 0)
				{
					matched = true;
					break;
				}
			}
			/* jump over the rest of this word */
			temp += strlen(temp);
		}
	}

	/* single release point for the working copy */
	pfree(tempstr);

	/*
	 * A matching token means a transaction stmt.  Otherwise, if everything
	 * after begin is blank it is still a transaction stmt; else it is an
	 * anonymous block stmt.
	 */
	return matched || !found_non_blank_char;
}
- gram.y
gram.y使用词法分析出的词(token)去匹配相应的语法规则,如果匹配成功,则生成抽象语法树。
由于语法较多,这里以select语句解析为例
相关数据结构
/*
 * Parse-tree node for a SELECT statement.  A node is either a "leaf"
 * (a plain SELECT or a VALUES list) or an upper-level node that combines
 * two SelectStmts (larg/rarg) with a set operation.
 */
typedef struct SelectStmt {
NodeTag type;
/*
* These fields are used only in "leaf" SelectStmts.
*/
List *distinctClause; /* DISTINCT clause */
IntoClause *intoClause; /* target of SELECT INTO */
List *targetList; /* the columns/expressions to be selected */
List *fromClause; /* FROM clause */
Node *whereClause; /* WHERE clause */
List *groupClause; /* GROUP BY clause */
Node *havingClause; /* HAVING condition */
List *windowClause; /* window definitions */
WithClause *withClause; /* WITH clause */
/*
* In a "leaf" node representing a VALUES list, the above fields are all
* null, and instead this field is set. Note that the elements of the
* sublists are just expressions, without ResTarget decoration. Also note
* that a list element can be DEFAULT (represented as a SetToDefault
* node), regardless of the context of the VALUES list. It's up to parse
* analysis to reject that where not valid.
*/
List *valuesLists; /* untransformed list of expression lists */
/*
* These fields are used in both "leaf" SelectStmts and upper-level
* SelectStmts.
*/
List *sortClause; /* ORDER BY clause */
Node *limitOffset; /* LIMIT offset (rows to skip) */
Node *limitCount; /* LIMIT count (rows to return) */
List *lockingClause; /* locking clause (FOR UPDATE / FOR SHARE) */
HintState *hintState;
/*
* These fields are used only in upper-level SelectStmts.
*/
SetOperation op; /* type of set operation */
bool all; /* ALL specified? */
struct SelectStmt *larg; /* left child */
struct SelectStmt *rarg; /* right child */
/*
* These fields are used by operator "(+)"
*/
bool hasPlus;
/* Eventually add fields for CORRESPONDING spec here */
} SelectStmt;
定义段
%{
... 定义宏,数据结构,函数及include文件
%}
%pure-parser
%expect 0
%name-prefix="base_yy"
%locations
%parse-param {core_yyscan_t yyscanner}
%lex-param {core_yyscan_t yyscanner}
// 修改yylval的类型
%union
{
core_YYSTYPE core_yystype;
/* these fields must match core_YYSTYPE: */
int ival;
char *str;
const char *keyword;
char chr;
bool boolean;
JoinType jtype;
DropBehavior dbehavior;
OnCommitAction oncommit;
List *list;
Node *node;
Value *value;
ObjectType objtype;
TypeName *typnam;
FunctionParameter *fun_param;
FunctionParameterMode fun_param_mode;
FuncWithArgs *funwithargs;
DefElem *defelt;
SortBy *sortby;
WindowDef *windef;
JoinExpr *jexpr;
IndexElem *ielem;
Alias *alias;
RangeVar *range;
IntoClause *into;
WithClause *with;
A_Indices *aind;
ResTarget *target;
struct PrivTarget *privtarget;
AccessPriv *accesspriv;
InsertStmt *istmt;
VariableSetStmt *vsetstmt;
/* PGXC_BEGIN */
DistributeBy *distby;
PGXCSubCluster *subclus;
/* PGXC_END */
ForeignPartState *foreignpartby;
MergeWhenClause *mergewhen;
UpsertClause *upsert;
EncryptionType algtype;
}
// %type: declare which %union member holds the semantic value of each nonterminal
%type <node> stmt schema_stmt
AlterDatabaseStmt AlterDatabaseSetStmt AlterDataSourceStmt
...
// %token: declare the terminals returned by the lexer, together with the
// %union member that carries their value
%token <str> IDENT FCONST SCONST BCONST XCONST Op CmpOp COMMENTSTRING
...
// keyword tokens
%token <keyword> ABORT_P ABSOLUTE_P ACCESS ACCOUNT ACTION ADD_P ADMIN AFTER
...
/* Precedence: lowest to highest */
%nonassoc PARTIAL_EMPTY_PREC
%nonassoc CLUSTER
%nonassoc SET /* see relation_expr_opt_alias */
%left UNION EXCEPT MINUS_P
%left INTERSECT
%left OR
%left AND
%right NOT
%right '='
%nonassoc '<' '>' CmpOp
%nonassoc LIKE ILIKE SIMILAR
%nonassoc ESCAPE
%nonassoc OVERLAPS
%nonassoc BETWEEN
%nonassoc IN_P
%left POSTFIXOP /* dummy for postfix Op rules */
/*
* To support target_el without AS, we must give IDENT an explicit priority
* between POSTFIXOP and Op. We can safely assign the same priority to
* various unreserved keywords as needed to resolve ambiguities (this can't
* have any bad effects since obviously the keywords will still behave the
* same as if they weren't keywords). We need to do this for PARTITION,
* RANGE, ROWS to support opt_existing_window_name; and for RANGE, ROWS
* so that they can follow a_expr without creating postfix-operator problems;
* and for NULL so that it can follow b_expr in ColQualList without creating
* postfix-operator problems.
*
* To support CUBE and ROLLUP in GROUP BY without reserving them, we give them
* an explicit priority lower than '(', so that a rule with CUBE '(' will shift
* rather than reducing a conflicting rule that takes CUBE as a function name.
* Using the same precedence as IDENT seems right for the reasons given above.
*
* The frame_bound productions UNBOUNDED PRECEDING and UNBOUNDED FOLLOWING
* are even messier: since UNBOUNDED is an unreserved keyword (per spec!),
* there is no principled way to distinguish these from the productions
* a_expr PRECEDING/FOLLOWING. We hack this up by giving UNBOUNDED slightly
* lower precedence than PRECEDING and FOLLOWING. At present this doesn't
* appear to cause UNBOUNDED to be treated differently from other unreserved
* keywords anywhere else in the grammar, but it's definitely risky. We can
* blame any funny behavior of UNBOUNDED on the SQL standard, though.
*/
%nonassoc UNBOUNDED /* ideally should have same precedence as IDENT */
%nonassoc IDENT NULL_P PARTITION RANGE ROWS PRECEDING FOLLOWING CUBE ROLLUP
%left Op OPERATOR /* multi-character ops and user-defined operators */
%nonassoc NOTNULL
%nonassoc ISNULL
%nonassoc IS /* sets precedence for IS NULL, etc */
%left '+' '-'
%left '*' '/' '%'
%left '^'
/* Unary Operators */
%left AT /* sets precedence for AT TIME ZONE */
%left COLLATE
%right UMINUS
%left '[' ']'
%left '(' ')'
%left TYPECAST
%left '.'
/*
* These might seem to be low-precedence, but actually they are not part
* of the arithmetic hierarchy at all in their use as JOIN operators.
* We make them high-precedence to support their use as function names.
* They wouldn't be given a precedence at all, were it not that we need
* left-associativity among the JOIN rules themselves.
*/
%left JOIN CROSS LEFT FULL RIGHT INNER_P NATURAL ENCRYPTED
/* kluge to keep xml_whitespace_option from causing shift/reduce conflicts */
%right PRESERVE STRIP_P
规则段
// top-level rule: once parsing is complete, store the statement list in the
// parser's extra data (parsetree)
stmtblock: stmtmulti
{
pg_yyget_extra(yyscanner)->parsetree = $1;
}
;
...
// SELECT statement: either form, with or without surrounding parentheses
SelectStmt: select_no_parens %prec UMINUS
| select_with_parens %prec UMINUS
;
select_with_parens:
'(' select_no_parens ')' { $$ = $2; }
| '(' select_with_parens ')' { $$ = $2; }
;
// A SELECT that is not wrapped in parentheses.  Every alternative other than
// the plain simple_select passes its optional clauses (sort, locking, limit,
// WITH) to insertSelectOptions(), which attaches them to the SelectStmt.
select_no_parens:
simple_select { $$ = $1; } // plain query
| select_clause sort_clause // with a sort clause
{
insertSelectOptions((SelectStmt *) $1, $2, NIL,
NULL, NULL, NULL,
yyscanner);
$$ = $1;
}
| select_clause opt_sort_clause for_locking_clause opt_select_limit // with a locking clause
{
insertSelectOptions((SelectStmt *) $1, $2, $3,
(Node*)list_nth($4, 0), (Node*)list_nth($4, 1),
NULL,
yyscanner);
$$ = $1;
}
| select_clause opt_sort_clause select_limit opt_for_locking_clause // with a limit clause
{
insertSelectOptions((SelectStmt *) $1, $2, $4,
(Node*)list_nth($3, 0), (Node*)list_nth($3, 1),
NULL,
yyscanner);
$$ = $1;
}
| with_clause select_clause // WITH (CTE)
{
insertSelectOptions((SelectStmt *) $2, NULL, NIL,
NULL, NULL,
$1,
yyscanner);
$$ = $2;
}
| with_clause select_clause sort_clause // CTE + sort clause
{
insertSelectOptions((SelectStmt *) $2, $3, NIL,
NULL, NULL,
$1,
yyscanner);
$$ = $2;
}
| with_clause select_clause opt_sort_clause for_locking_clause opt_select_limit // CTE + locking clause
{
insertSelectOptions((SelectStmt *) $2, $3, $4,
(Node*)list_nth($5, 0), (Node*)list_nth($5, 1),
$1,
yyscanner);
$$ = $2;
}
| with_clause select_clause opt_sort_clause select_limit opt_for_locking_clause // CTE + limit clause
{
insertSelectOptions((SelectStmt *) $2, $3, $5,
(Node*)list_nth($4, 0), (Node*)list_nth($4, 1),
$1,
yyscanner);
$$ = $2;
}
;
select_clause:
simple_select { $$ = $1; }
| select_with_parens { $$ = $1; }
;
// Simple query: a plain SELECT, a VALUES list, TABLE relation_expr, or two
// select_clauses joined by a set operator.
simple_select:
SELECT hint_string opt_distinct target_list
into_clause from_clause where_clause
group_clause having_clause window_clause
{
// create a new SelectStmt node and fill it from the matched clauses
SelectStmt *n = makeNode(SelectStmt);
n->distinctClause = $3;
n->targetList = $4;
n->intoClause = $5;
n->fromClause = $6;
n->whereClause = $7;
n->groupClause = $8;
n->havingClause = $9;
n->windowClause = $10;
n->hintState = create_hintstate($2);
n->hasPlus = getOperatorPlusFlag();
$$ = (Node *)n;
}
| values_clause { $$ = $1; }
| TABLE relation_expr
{
/* same as SELECT * FROM relation_expr */
ColumnRef *cr = makeNode(ColumnRef);
ResTarget *rt = makeNode(ResTarget);
SelectStmt *n = makeNode(SelectStmt);
cr->fields = list_make1(makeNode(A_Star));
cr->location = -1;
rt->name = NULL;
rt->indirection = NIL;
rt->val = (Node *)cr;
rt->location = -1;
n->targetList = list_make1(rt);
n->fromClause = list_make1($2);
$$ = (Node *)n;
}
| select_clause UNION opt_all select_clause // select UNION select
{
$$ = makeSetOp(SETOP_UNION, $3, $1, $4);
}
| select_clause INTERSECT opt_all select_clause // select INTERSECT select
{
$$ = makeSetOp(SETOP_INTERSECT, $3, $1, $4);
}
| select_clause EXCEPT opt_all select_clause // select EXCEPT select
{
$$ = makeSetOp(SETOP_EXCEPT, $3, $1, $4);
}
| select_clause MINUS_P opt_all select_clause // select MINUS select; built as SETOP_EXCEPT, same as EXCEPT
{
$$ = makeSetOp(SETOP_EXCEPT, $3, $1, $4);
}
;
hint_string: // optional hint: the COMMENTSTRING token value, or NULL when absent
COMMENTSTRING
{
$$ = $1;
}
|
{
$$ = NULL;
}
;
/*
* SQL standard WITH clause looks like:
*
* WITH [ RECURSIVE ] <query name> [ (<column>,...) ]
* AS (query) [ SEARCH or CYCLE clause ]
*
* We don't currently support the SEARCH or CYCLE clause.
*/
with_clause: // common table expressions (CTE); builds a WithClause node
WITH cte_list
{
$$ = makeNode(WithClause);
$$->ctes = $2;
$$->recursive = false;
$$->location = @1;
}
| WITH RECURSIVE cte_list
{
$$ = makeNode(WithClause);
$$->ctes = $3;
$$->recursive = true;
$$->location = @1;
}
;
cte_list:
common_table_expr { $$ = list_make1($1); }
| cte_list ',' common_table_expr { $$ = lappend($1, $3); }
;
common_table_expr: name opt_name_list AS '(' PreparableStmt ')'
{
CommonTableExpr *n = makeNode(CommonTableExpr);
n->ctename = $1;
n->aliascolnames = $2;
n->ctequery = $5;
n->location = @1;
n->locator_type = LOCATOR_TYPE_NONE;
$$ = (Node *) n;
}
;
opt_with_clause:
with_clause { $$ = $1; }
| /*EMPTY*/ { $$ = NULL; }
;
into_clause:
INTO OptTempTableName
{
$$ = makeNode(IntoClause);
$$->rel = $2;
$$->colNames = NIL;
$$->options = NIL;
$$->onCommit = ONCOMMIT_NOOP;
/* Here $$ is a temp table, so row_compress can be any value. To be safe, REL_CMPRS_PAGE_PLAIN is used. */
$$->row_compress = REL_CMPRS_PAGE_PLAIN;
$$->tableSpaceName = NULL;
$$->skipData = false;
$$->relkind = INTO_CLAUSE_RELKIND_DEFAULT;
}
| /*EMPTY*/
{ $$ = NULL; }
;
/*
* Redundancy here is needed to avoid shift/reduce conflicts,
* since TEMP is not a reserved word. See also OptTemp.
*/
OptTempTableName:
TEMPORARY opt_table qualified_name
{
$$ = $3;
$$->relpersistence = RELPERSISTENCE_TEMP;
}
| TEMP opt_table qualified_name
{
$$ = $3;
$$->relpersistence = RELPERSISTENCE_TEMP;
}
| LOCAL TEMPORARY opt_table qualified_name
{
$$ = $4;
$$->relpersistence = RELPERSISTENCE_TEMP;
}
| LOCAL TEMP opt_table qualified_name
{
$$ = $4;
$$->relpersistence = RELPERSISTENCE_TEMP;
}
| GLOBAL TEMPORARY opt_table qualified_name
{
$$ = $4;
#ifdef ENABLE_MULTIPLE_NODES
ereport(WARNING,
(errmsg("GLOBAL is deprecated in temporary table creation"),
parser_errposition(@1)));
$$->relpersistence = RELPERSISTENCE_TEMP;
#else
$$->relpersistence = RELPERSISTENCE_GLOBAL_TEMP;
#endif
}
| GLOBAL TEMP opt_table qualified_name
{
$$ = $4;
#ifdef ENABLE_MULTIPLE_NODES
ereport(WARNING,
(errmsg("GLOBAL is deprecated in temporary table creation"),
parser_errposition(@1)));
$$->relpersistence = RELPERSISTENCE_TEMP;
#else
$$->relpersistence = RELPERSISTENCE_GLOBAL_TEMP;
#endif
}
| UNLOGGED opt_table qualified_name
{
$$ = $3;
$$->relpersistence = RELPERSISTENCE_UNLOGGED;
}
| TABLE qualified_name
{
$$ = $2;
$$->relpersistence = RELPERSISTENCE_PERMANENT;
}
| qualified_name
{
$$ = $1;
$$->relpersistence = RELPERSISTENCE_PERMANENT;
}
;
opt_table: TABLE {}
| /*EMPTY*/ {}
;
opt_all: ALL { $$ = TRUE; }
| DISTINCT { $$ = FALSE; }
| /*EMPTY*/ { $$ = FALSE; }
;
/* We use (NIL) as a placeholder to indicate that all target expressions
* should be placed in the DISTINCT list during parsetree analysis.
*/
opt_distinct: // DISTINCT clause; ALL and the empty case both yield NIL
DISTINCT { $$ = list_make1(NIL); }
| DISTINCT ON '(' expr_list ')' { $$ = $4; }
| ALL { $$ = NIL; }
| /*EMPTY*/ { $$ = NIL; }
;
opt_sort_clause:
sort_clause { $$ = $1;}
| /*EMPTY*/ { $$ = NIL; }
;
sort_clause: // sort clause: ORDER BY followed by a list of sortby items
ORDER BY sortby_list { $$ = $3; }
;
sortby_list:
sortby { $$ = list_make1($1); }
| sortby_list ',' sortby { $$ = lappend($1, $3); }
;
sortby: a_expr USING qual_all_Op opt_nulls_order
{
$$ = makeNode(SortBy);
$$->node = $1;
$$->sortby_dir = SORTBY_USING;
$$->sortby_nulls = (SortByNulls)$4;
$$->useOp = $3;
$$->location = @3;
}
| a_expr opt_asc_desc opt_nulls_order
{
$$ = makeNode(SortBy);
$$->node = $1;
$$->sortby_dir = (SortByDir)$2;
$$->sortby_nulls = (SortByNulls)$3;
$$->useOp = NIL;
$$->location = -1; /* no operator */
}
| NLSSORT '(' a_expr ',' Sconst ')' opt_asc_desc opt_nulls_order
{
if (checkNlssortArgs($5))
{
Node *c = NULL;
FuncCall *n = makeNode(FuncCall);
c = $3;
n->funcname = SystemFuncName("convert_to_nocase");
n->args =list_make2(c,makeStringConst("gbk",-1));
n->agg_order = NIL;
n->agg_star = FALSE;
n->agg_distinct = FALSE;
n->func_variadic = FALSE;
n->over = NULL;
n->location = @1;
n->call_func = false;
$$ = makeNode(SortBy);
$$->node = (Node*)n;
$$->sortby_dir = (SortByDir)$7;
$$->sortby_nulls = (SortByNulls)$8;
$$->useOp = NIL;
$$->location = @1;
}
else
{
$$ = NULL;
ereport(ERROR,(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("Sort method %s is not supported!",$5)));
}
}
;
select_limit: // limit clause: yields a two-element list, offset value first, limit value second
limit_clause offset_clause { $$ = list_make2($2, $1); }
| offset_clause limit_clause { $$ = list_make2($1, $2); }
| limit_clause { $$ = list_make2(NULL, $1); }
| limit_offcnt_clause { $$ = $1; }
| offset_clause { $$ = list_make2($1, NULL); }
;
opt_select_limit:
select_limit { $$ = $1; }
| /* EMPTY */ { $$ = list_make2(NULL,NULL); }
;
/*
 * Optional LIMIT for DELETE.  Yields a two-element list in the same layout
 * as opt_select_limit: the first element is always NULL, the second is the
 * limit expression (NULL when no LIMIT was given).
 */
opt_delete_limit:
LIMIT a_expr { $$ = list_make2(NULL, $2); }
| /* EMPTY */ { $$ = list_make2(NULL, NULL); }
;
limit_clause:
LIMIT select_limit_value
{ $$ = $2; }
/* SQL:2008 syntax */
| FETCH first_or_next opt_select_fetch_first_value row_or_rows ONLY
{ $$ = $3; }
;
limit_offcnt_clause:
LIMIT select_offset_value ',' select_limit_value
{
$$ = list_make2($2, $4);
}
;
offset_clause:
OFFSET select_offset_value
{ $$ = $2; }
/* SQL:2008 syntax */
| OFFSET select_offset_value2 row_or_rows
{ $$ = $2; }
;
select_limit_value:
a_expr { $$ = $1; }
| ALL
{
/* LIMIT ALL is represented as a NULL constant */
$$ = makeNullAConst(@1);
}
;
select_offset_value:
a_expr { $$ = $1; }
;
/*
* Allowing full expressions without parentheses causes various parsing
* problems with the trailing ROW/ROWS key words. SQL only calls for
* constants, so we allow the rest only with parentheses. If omitted,
* default to 1.
*/
opt_select_fetch_first_value:
SignedIconst { $$ = makeIntConst($1, @1); }
| '(' a_expr ')' { $$ = $2; }
| /*EMPTY*/ { $$ = makeIntConst(1, -1); }
;
/*
* Again, the trailing ROW/ROWS in this case prevent the full expression
* syntax. c_expr is the best we can do.
*/
select_offset_value2:
c_expr { $$ = $1; }
;
/* noise words */
row_or_rows: ROW { $$ = 0; }
| ROWS { $$ = 0; }
;
first_or_next: FIRST_P { $$ = 0; }
| NEXT { $$ = 0; }
;
/*
* This syntax for group_clause tries to follow the spec quite closely.
* However, the spec allows only column references, not expressions,
* which introduces an ambiguity between implicit row constructors
* (a,b) and lists of column references.
*
* We handle this by using the a_expr production for what the spec calls
* <ordinary grouping set>, which in the spec represents either one column
* reference or a parenthesized list of column references. Then, we check the
* top node of the a_expr to see if it's an implicit RowExpr, and if so, just
* grab and use the list, discarding the node. (this is done in parse analysis,
* not here)
*
* (we abuse the row_format field of RowExpr to distinguish implicit and
* explicit row constructors; it's debatable if anyone sanely wants to use them
* in a group clause, but if they have a reason to, we make it possible.)
*
* Each item in the group_clause list is either an expression tree or a
* GroupingSet node of some type.
*/
group_clause: // GROUP BY clause; NIL when absent
GROUP_P BY group_by_list { $$ = $3; }
| /*EMPTY*/ { $$ = NIL; }
;
group_by_list:
group_by_item { $$ = list_make1($1); }
| group_by_list ',' group_by_item { $$ = lappend($1,$3); }
;
group_by_item:
a_expr { $$ = $1; }
| empty_grouping_set { $$ = $1; }
| cube_clause { $$ = $1; }
| rollup_clause { $$ = $1; }
| grouping_sets_clause { $$ = $1; }
;
empty_grouping_set:
'(' ')'
{
$$ = (Node *) makeGroupingSet(GROUPING_SET_EMPTY, NIL, @1);
}
;
/*
* These hacks rely on setting precedence of CUBE and ROLLUP below that of '(',
* so that they shift in these rules rather than reducing the conflicting
* unreserved_keyword rule.
*/
rollup_clause:
ROLLUP '(' expr_list ')'
{
$$ = (Node *) makeGroupingSet(GROUPING_SET_ROLLUP, $3, @1);
}
;
cube_clause:
CUBE '(' expr_list ')'
{
$$ = (Node *) makeGroupingSet(GROUPING_SET_CUBE, $3, @1);
}
;
grouping_sets_clause:
GROUPING_P SETS '(' group_by_list ')'
{
$$ = (Node *) makeGroupingSet(GROUPING_SET_SETS, $4, @1);
}
;
having_clause:
HAVING a_expr { $$ = $2; }
| /*EMPTY*/ { $$ = NULL; }
;
for_locking_clause: // locking clause; FOR READ ONLY yields an empty list
for_locking_items { $$ = $1; }
| FOR READ ONLY { $$ = NIL; }
;
opt_for_locking_clause:
for_locking_clause { $$ = $1; }
| /* EMPTY */ { $$ = NIL; }
;
for_locking_items:
for_locking_item { $$ = list_make1($1); }
| for_locking_items for_locking_item { $$ = lappend($1, $2); }
;
for_locking_item:
FOR UPDATE locked_rels_list opt_nowait
{
LockingClause *n = makeNode(LockingClause);
n->lockedRels = $3;
n->forUpdate = TRUE;
n->noWait = $4;
$$ = (Node *) n;
}
| FOR SHARE locked_rels_list opt_nowait
{
LockingClause *n = makeNode(LockingClause);
n->lockedRels = $3;
n->forUpdate = FALSE;
n->noWait = $4;
$$ = (Node *) n;
}
;
locked_rels_list:
OF qualified_name_list { $$ = $2; }
| /* EMPTY */ { $$ = NIL; }
;
values_clause: // VALUES clause: each row is appended to valuesLists of a single SelectStmt
VALUES ctext_row
{
SelectStmt *n = makeNode(SelectStmt);
n->valuesLists = list_make1($2);
$$ = (Node *) n;
}
| values_clause ',' ctext_row
{
SelectStmt *n = (SelectStmt *) $1;
n->valuesLists = lappend(n->valuesLists, $3);
$$ = (Node *) n;
}
;
/*****************************************************************************
*
* clauses common to all Optimizable Stmts:
* from_clause - allow list of both JOIN expressions and table names
* where_clause - qualifications for joins or restrictions
*
*****************************************************************************/
from_clause: // FROM clause; NIL when absent
FROM from_list { $$ = $2; }
| /*EMPTY*/ { $$ = NIL; }
;
from_list:
table_ref { $$ = list_make1($1); }
| from_list ',' table_ref { $$ = lappend($1, $3); }
;
/*
* table_ref is where an alias clause can be attached. Note we cannot make
* alias_clause have an empty production because that causes parse conflicts
* between table_ref := '(' joined_table ')' alias_clause
* and joined_table := '(' joined_table ')'. So, we must have the
* redundant-looking productions here instead.
*/
// A single item of the FROM list: a relation (optionally with alias,
// tablesample, partition or bucket selection), a table function, a
// parenthesized sub-select, or a join.
table_ref: relation_expr
{
$$ = (Node *) $1;
}
| relation_expr alias_clause
{
$1->alias = $2;
$$ = (Node *) $1;
}
| relation_expr opt_alias_clause tablesample_clause
{
RangeTableSample *n = (RangeTableSample *) $3;
$1->alias = $2;
/* relation_expr goes inside the RangeTableSample node */
n->relation = (Node *) $1;
$$ = (Node *) n;
}
| relation_expr PARTITION '(' name ')'
{
$1->partitionname = $4;
$1->ispartition = true;
$$ = (Node *)$1;
}
| relation_expr BUCKETS '(' bucket_list ')'
{
$1->buckets = $4;
$1->isbucket = true;
$$ = (Node *)$1;
}
| relation_expr PARTITION_FOR '(' maxValueList ')'
{
$1->partitionKeyValuesList = $4;
$1->ispartition = true;
$$ = (Node *)$1;
}
| relation_expr PARTITION '(' name ')' alias_clause
{
$1->partitionname = $4;
$1->alias = $6;
$1->ispartition = true;
$$ = (Node *)$1;
}
| relation_expr PARTITION_FOR '(' maxValueList ')' alias_clause
{
$1->partitionKeyValuesList = $4;
$1->alias = $6;
$1->ispartition = true;
$$ = (Node *)$1;
}
| func_table
{
RangeFunction *n = makeNode(RangeFunction);
n->funccallnode = $1;
n->coldeflist = NIL;
$$ = (Node *) n;
}
| func_table alias_clause
{
RangeFunction *n = makeNode(RangeFunction);
n->funccallnode = $1;
n->alias = $2;
n->coldeflist = NIL;
$$ = (Node *) n;
}
| func_table AS '(' TableFuncElementList ')'
{
RangeFunction *n = makeNode(RangeFunction);
n->funccallnode = $1;
n->coldeflist = $4;
$$ = (Node *) n;
}
| func_table AS ColId '(' TableFuncElementList ')'
{
RangeFunction *n = makeNode(RangeFunction);
Alias *a = makeNode(Alias);
n->funccallnode = $1;
a->aliasname = $3;
n->alias = a;
n->coldeflist = $5;
$$ = (Node *) n;
}
| func_table ColId '(' TableFuncElementList ')'
{
RangeFunction *n = makeNode(RangeFunction);
Alias *a = makeNode(Alias);
n->funccallnode = $1;
a->aliasname = $2;
n->alias = a;
n->coldeflist = $4;
$$ = (Node *) n;
}
| select_with_parens
{
/*
* The SQL spec does not permit a subselect
* (<derived_table>) without an alias clause.
* A VALUES list in FROM is still rejected here
* with an error message that's better than
* "syntax error".
*/
/*
* For any other sub-select, select_with_parens without
* alias_clause is accepted, for compatibility with "A db"
* (per the original note: for procedure debugging).
*/
$$ = NULL;
if (IsA($1, SelectStmt) &&
((SelectStmt *) $1)->valuesLists)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("VALUES in FROM must have an alias"),
errhint("For example, FROM (VALUES ...) [AS] foo."),
parser_errposition(@1)));
else
{
/*
* Add an anonymous table name for this subquery:
* to mimic "A db" support for subqueries without an alias,
* give the subquery the default name "__unnamed_subquery__".
*/
RangeSubselect *n = makeNode(RangeSubselect);
Alias *a = makeNode(Alias);
n->subquery = $1;
n->alias = NULL;
a->aliasname = pstrdup("__unnamed_subquery__");
n->alias = a;
$$ = (Node *) n;
}
}
| select_with_parens alias_clause
{
RangeSubselect *n = makeNode(RangeSubselect);
n->subquery = $1;
n->alias = $2;
$$ = (Node *) n;
}
| joined_table
{
$$ = (Node *) $1;
}
| '(' joined_table ')' alias_clause
{
$2->alias = $4;
$$ = (Node *) $2;
}
;
/*
* It may seem silly to separate joined_table from table_ref, but there is
* method in SQL92's madness: if you don't do it this way you get reduce-
* reduce conflicts, because it's not clear to the parser generator whether
* to expect alias_clause after ')' or not. For the same reason we must
* treat 'JOIN' and 'join_type JOIN' separately, rather than allowing
* join_type to expand to empty; if we try it, the parser generator can't
* figure out when to reduce an empty join_type right after table_ref.
*
* Note that a CROSS JOIN is the same as an unqualified
* INNER JOIN, and an INNER JOIN/ON has the same shape
* but a qualification expression to limit membership.
* A NATURAL JOIN implicitly matches column names between
* tables and the shape is determined by which columns are
* in common. We'll collect columns during the later transformations.
*/
joined_table: // joins: each alternative builds a JoinExpr from two table_refs
'(' joined_table ')'
{
$$ = $2;
}
| table_ref CROSS JOIN table_ref
{
/* CROSS JOIN is same as unqualified inner join */
JoinExpr *n = makeNode(JoinExpr);
n->jointype = JOIN_INNER;
n->isNatural = FALSE;
n->larg = $1;
n->rarg = $4;
n->usingClause = NIL;
n->quals = NULL;
$$ = n;
}
| table_ref join_type JOIN table_ref join_qual
{
JoinExpr *n = makeNode(JoinExpr);
n->jointype = $2;
n->isNatural = FALSE;
n->larg = $1;
n->rarg = $4;
if ($5 != NULL && IsA($5, List))
n->usingClause = (List *) $5; /* USING clause */
else
n->quals = $5; /* ON clause */
$$ = n;
}
| table_ref JOIN table_ref join_qual
{
/* letting join_type reduce to empty doesn't work */
JoinExpr *n = makeNode(JoinExpr);
n->jointype = JOIN_INNER;
n->isNatural = FALSE;
n->larg = $1;
n->rarg = $3;
if ($4 != NULL && IsA($4, List))
n->usingClause = (List *) $4; /* USING clause */
else
n->quals = $4; /* ON clause */
$$ = n;
}
| table_ref NATURAL join_type JOIN table_ref
{
JoinExpr *n = makeNode(JoinExpr);
n->jointype = $3;
n->isNatural = TRUE;
n->larg = $1;
n->rarg = $5;
n->usingClause = NIL; /* figure out which columns later... */
n->quals = NULL; /* fill later */
$$ = n;
}
| table_ref NATURAL JOIN table_ref
{
/* letting join_type reduce to empty doesn't work */
JoinExpr *n = makeNode(JoinExpr);
n->jointype = JOIN_INNER;
n->isNatural = TRUE;
n->larg = $1;
n->rarg = $4;
n->usingClause = NIL; /* figure out which columns later... */
n->quals = NULL; /* fill later */
$$ = n;
}
;
alias_clause: // alias: name with optional column-name list; the AS keyword is optional
AS ColId '(' name_list ')'
{
$$ = makeNode(Alias);
$$->aliasname = $2;
$$->colnames = $4;
}
| AS ColId
{
$$ = makeNode(Alias);
$$->aliasname = $2;
}
| ColId '(' name_list ')'
{
$$ = makeNode(Alias);
$$->aliasname = $1;
$$->colnames = $3;
}
| ColId
{
$$ = makeNode(Alias);
$$->aliasname = $1;
}
;
opt_alias_clause: alias_clause { $$ = $1; }
| /*EMPTY*/ { $$ = NULL; }
;
join_type: FULL join_outer { $$ = JOIN_FULL; }
| LEFT join_outer { $$ = JOIN_LEFT; }
| RIGHT join_outer { $$ = JOIN_RIGHT; }
| INNER_P { $$ = JOIN_INNER; }
;
/* OUTER is just noise... */
join_outer: OUTER_P { $$ = NULL; }
| /*EMPTY*/ { $$ = NULL; }
;
/* JOIN qualification clauses
* Possibilities are:
* USING ( column list ) allows only unqualified column names,
* which must match between tables.
* ON expr allows more general qualifications.
*
* We return USING as a List node, while an ON-expr will not be a List.
*/
join_qual: USING '(' name_list ')' { $$ = (Node *) $3; }
| ON a_expr { $$ = $2; }
;
relation_expr:
qualified_name
{
/* default inheritance */
$$ = $1;
$$->inhOpt = INH_DEFAULT;
$$->alias = NULL;
}
| qualified_name '*'
{
/* inheritance query */
$$ = $1;
$$->inhOpt = INH_YES;
$$->alias = NULL;
}
| ONLY qualified_name
{
/* no inheritance */
$$ = $2;
$$->inhOpt = INH_NO;
$$->alias = NULL;
}
| ONLY '(' qualified_name ')'
{
/* no inheritance, SQL99-style syntax */
$$ = $3;
$$->inhOpt = INH_NO;
$$->alias = NULL;
}
;
relation_expr_list:
relation_expr { $$ = list_make1($1); }
| relation_expr_list ',' relation_expr { $$ = lappend($1, $3); }
;
...
where_clause: // WHERE clause; NULL when absent
WHERE a_expr { $$ = $2; }
| /*EMPTY*/ { $$ = NULL; }
;
...
程序段
...
/*
 * parser_init()
 *	Reset the parser's per-query state before parsing one query string:
 *	the scanner's parenthesis depth and query-string location list, and
 *	the result parse tree.
 */
void
parser_init(base_yy_extra_type *yyext)
{
	/* scanner-side state */
	yyext->core_yy_extra.paren_depth = 0;
	yyext->core_yy_extra.query_string_locationlist = NIL;

	/* result slot; cleared in case the grammar forgets to set it */
	yyext->parsetree = NIL;
}
...
- 词法语法解析流程
这里用以下查询语句进行分析
postgres=# select * from a where id < 100 order by id;
语句执行流程图如下:
- 词法语法解析入口函数raw_parser,调用base_yyparse开始解析
- 首先词法解析到SELECT关键字
simple_select:
SELECT hint_string opt_distinct target_list
into_clause from_clause where_clause
group_clause having_clause window_clause
{
SelectStmt *n = makeNode(SelectStmt);
n->distinctClause = $3;
n->targetList = $4;
n->intoClause = $5;
n->fromClause = $6;
n->whereClause = $7;
n->groupClause = $8;
n->havingClause = $9;
n->windowClause = $10;
n->hintState = create_hintstate($2);
n->hasPlus = getOperatorPlusFlag();
$$ = (Node *)n;
}
(1) 由SELECT关键字匹配到simple_select语法规则
(2) hint_string, opt_distinct 返回空
(3) target_list匹配到 '*' 字符,构建ColumnRef(其fields为A_Star),再包装为ResTarget后加入到list
(4) into_clause 返回空
(5) 匹配FROM关键字,匹配表名,构建RangeVar,加入到list
(6) 匹配WHERE关键字,匹配字段名,构建ColumnRef,匹配int常量,匹配<表达式,构建A_Expr
(7) group_clause,having_clause,window_clause 返回空
(8) 最后构建 SelectStmt
- 匹配order by
sort_clause:
ORDER BY sortby_list { $$ = $3; }
;
sortby_list:
sortby { $$ = list_make1($1); }
| sortby_list ',' sortby { $$ = lappend($1, $3); }
;
sortby: a_expr USING qual_all_Op opt_nulls_order
{
$$ = makeNode(SortBy);
$$->node = $1;
$$->sortby_dir = SORTBY_USING;
$$->sortby_nulls = (SortByNulls)$4;
$$->useOp = $3;
$$->location = @3;
}
| a_expr opt_asc_desc opt_nulls_order
{
$$ = makeNode(SortBy);
$$->node = $1;
$$->sortby_dir = (SortByDir)$2;
$$->sortby_nulls = (SortByNulls)$3;
$$->useOp = NIL;
$$->location = -1; /* no operator */
}
...
select_no_parens:
simple_select { $$ = $1; }
| select_clause sort_clause
{
insertSelectOptions((SelectStmt *) $1, $2, NIL,
NULL, NULL, NULL,
yyscanner);
$$ = $1;
}
(1) 读取到ORDER, BY关键字,匹配sort_clause语法规则
(2) 匹配字段名,构建ColumnRef,构建SortBy节点
(3) 匹配 select_clause sort_clause 规则,将sort_clause中构建的SortBy节点加入到上一步的SelectStmt中
- 返回抽象语法树
stmtblock: stmtmulti
{
pg_yyget_extra(yyscanner)->parsetree = $1;
}
;
/* the thrashing around here is to discard "empty" statements... */
stmtmulti: stmtmulti ';' stmt
{
if ($3 != NULL)
{
if (IsA($3, List))
{
$$ = list_concat($1, (List*)$3);
}
else
{
$$ = lappend($1, $3);
}
}
else
$$ = $1;
}
| stmt
{
if ($1 != NULL)
{
if (IsA($1, List))
{
$$ = (List*)$1;
}
else
{
$$ = list_make1($1);
}
}
else
$$ = NIL;
}
;
(1) 将上述SelectStmt加入list,赋值给yyextra.parsetree
(2) raw_parser函数将parsetree返回给上层调用函数